/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86) && defined(__linux__)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
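
/*
 * Illustrative example (not part of the original source): reading a single
 * dword from VRAM offset 0x1000 into a CPU variable with the helper above.
 * The aperture path is tried first and MM_INDEX/MM_DATA covers any remainder;
 * the offset and size must stay dword aligned.
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 */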

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
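
/*
 * Illustrative sketch (not from the original source): an ASIC specific
 * pcie_rreg/pcie_wreg callback is typically a thin wrapper around the helpers
 * above, passing the MMIO offsets of that ASIC's index/data register pair.
 * The offsets used here are hypothetical placeholders.
 *
 *   static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg_addr)
 *   {
 *           u32 pcie_index = 0x38; // hypothetical INDEX register offset
 *           u32 pcie_data = 0x3c;  // hypothetical DATA register offset
 *
 *           return amdgpu_device_indirect_rreg(adev, pcie_index, pcie_data,
 *                                              reg_addr);
 *   }
 */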

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

#ifdef __linux__
	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;
#endif

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
#ifdef __linux__
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
#endif

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment + 1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should be increased by one page (0x400 in dwords).
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

#ifdef __linux__
	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;
#endif

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
#ifdef __linux__
	iounmap(adev->doorbell.ptr);
#else
	if (adev->doorbell.size > 0)
		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
				adev->doorbell.size);
#endif
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
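
/*
 * Illustrative example (not from the original source): a typical writeback
 * consumer allocates a slot, derives the CPU and GPU addresses from the
 * returned dword offset, and releases the slot when done. Error handling is
 * omitted for brevity.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile uint32_t *cpu_addr = &adev->wb.wb[wb];
 *           uint64_t gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           *cpu_addr = 0; // the GPU can later post status to gpu_addr
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */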

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
#ifdef __linux__
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
#endif /* __linux__ */

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu
		 * hangs. smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
#ifdef __linux__
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
#else
	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
#endif
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpu_info *ci = curcpu();

	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
#ifdef notyet
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
#endif

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of 9
 * bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
#ifdef __linux__
	struct sysinfo si;
#endif
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
#ifdef __linux__
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;
#else
	total_memory = ptoa(physmem);
#endif

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

#ifdef __linux__
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}
#endif /* __linux__ */

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
#ifdef notyet
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
#endif
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
1948 */ 1949 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1950 { 1951 adev->enable_virtual_display = false; 1952 1953 #ifdef notyet 1954 if (amdgpu_virtual_display) { 1955 const char *pci_address_name = pci_name(adev->pdev); 1956 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1957 1958 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1959 pciaddstr_tmp = pciaddstr; 1960 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1961 pciaddname = strsep(&pciaddname_tmp, ","); 1962 if (!strcmp("all", pciaddname) 1963 || !strcmp(pci_address_name, pciaddname)) { 1964 long num_crtc; 1965 int res = -1; 1966 1967 adev->enable_virtual_display = true; 1968 1969 if (pciaddname_tmp) 1970 res = kstrtol(pciaddname_tmp, 10, 1971 &num_crtc); 1972 1973 if (!res) { 1974 if (num_crtc < 1) 1975 num_crtc = 1; 1976 if (num_crtc > 6) 1977 num_crtc = 6; 1978 adev->mode_info.num_crtc = num_crtc; 1979 } else { 1980 adev->mode_info.num_crtc = 1; 1981 } 1982 break; 1983 } 1984 } 1985 1986 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1987 amdgpu_virtual_display, pci_address_name, 1988 adev->enable_virtual_display, adev->mode_info.num_crtc); 1989 1990 kfree(pciaddstr); 1991 } 1992 #endif 1993 } 1994 1995 /** 1996 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1997 * 1998 * @adev: amdgpu_device pointer 1999 * 2000 * Parses the asic configuration parameters specified in the gpu info 2001 * firmware and makes them availale to the driver for use in configuring 2002 * the asic. 2003 * Returns 0 on success, -EINVAL on failure. 2004 */ 2005 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2006 { 2007 const char *chip_name; 2008 char fw_name[40]; 2009 int err; 2010 const struct gpu_info_firmware_header_v1_0 *hdr; 2011 2012 adev->firmware.gpu_info_fw = NULL; 2013 2014 if (adev->mman.discovery_bin) { 2015 /* 2016 * FIXME: The bounding box is still needed by Navi12, so 2017 * temporarily read it from gpu_info firmware. Should be dropped 2018 * when DAL no longer needs it. 
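 *
 * In other words, when an IP discovery binary is present only Navi12
 * continues below, and only to obtain the SoC bounding box used by DAL.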
2019 */ 2020 if (adev->asic_type != CHIP_NAVI12) 2021 return 0; 2022 } 2023 2024 switch (adev->asic_type) { 2025 default: 2026 return 0; 2027 case CHIP_VEGA10: 2028 chip_name = "vega10"; 2029 break; 2030 case CHIP_VEGA12: 2031 chip_name = "vega12"; 2032 break; 2033 case CHIP_RAVEN: 2034 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2035 chip_name = "raven2"; 2036 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2037 chip_name = "picasso"; 2038 else 2039 chip_name = "raven"; 2040 break; 2041 case CHIP_ARCTURUS: 2042 chip_name = "arcturus"; 2043 break; 2044 case CHIP_NAVI12: 2045 chip_name = "navi12"; 2046 break; 2047 } 2048 2049 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2050 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2051 if (err) { 2052 dev_err(adev->dev, 2053 "Failed to load gpu_info firmware \"%s\"\n", 2054 fw_name); 2055 goto out; 2056 } 2057 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2058 if (err) { 2059 dev_err(adev->dev, 2060 "Failed to validate gpu_info firmware \"%s\"\n", 2061 fw_name); 2062 goto out; 2063 } 2064 2065 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2066 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2067 2068 switch (hdr->version_major) { 2069 case 1: 2070 { 2071 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2072 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2073 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2074 2075 /* 2076 * Should be droped when DAL no longer needs it. 2077 */ 2078 if (adev->asic_type == CHIP_NAVI12) 2079 goto parse_soc_bounding_box; 2080 2081 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2082 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2083 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2084 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2085 adev->gfx.config.max_texture_channel_caches = 2086 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2087 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2088 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2089 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2090 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2091 adev->gfx.config.double_offchip_lds_buf = 2092 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2093 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2094 adev->gfx.cu_info.max_waves_per_simd = 2095 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2096 adev->gfx.cu_info.max_scratch_slots_per_cu = 2097 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2098 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2099 if (hdr->version_minor >= 1) { 2100 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2101 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2102 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2103 adev->gfx.config.num_sc_per_sh = 2104 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2105 adev->gfx.config.num_packer_per_sc = 2106 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2107 } 2108 2109 parse_soc_bounding_box: 2110 /* 2111 * soc bounding box info is not integrated in disocovery table, 2112 * we always need to parse it from gpu info firmware if needed. 
2113 */ 2114 if (hdr->version_minor == 2) { 2115 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2116 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2117 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2118 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2119 } 2120 break; 2121 } 2122 default: 2123 dev_err(adev->dev, 2124 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2125 err = -EINVAL; 2126 goto out; 2127 } 2128 out: 2129 return err; 2130 } 2131 2132 /** 2133 * amdgpu_device_ip_early_init - run early init for hardware IPs 2134 * 2135 * @adev: amdgpu_device pointer 2136 * 2137 * Early initialization pass for hardware IPs. The hardware IPs that make 2138 * up each asic are discovered each IP's early_init callback is run. This 2139 * is the first stage in initializing the asic. 2140 * Returns 0 on success, negative error code on failure. 2141 */ 2142 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2143 { 2144 struct drm_device *dev = adev_to_drm(adev); 2145 struct pci_dev *parent; 2146 int i, r; 2147 2148 amdgpu_device_enable_virtual_display(adev); 2149 2150 if (amdgpu_sriov_vf(adev)) { 2151 r = amdgpu_virt_request_full_gpu(adev, true); 2152 if (r) 2153 return r; 2154 } 2155 2156 switch (adev->asic_type) { 2157 #ifdef CONFIG_DRM_AMDGPU_SI 2158 case CHIP_VERDE: 2159 case CHIP_TAHITI: 2160 case CHIP_PITCAIRN: 2161 case CHIP_OLAND: 2162 case CHIP_HAINAN: 2163 adev->family = AMDGPU_FAMILY_SI; 2164 r = si_set_ip_blocks(adev); 2165 if (r) 2166 return r; 2167 break; 2168 #endif 2169 #ifdef CONFIG_DRM_AMDGPU_CIK 2170 case CHIP_BONAIRE: 2171 case CHIP_HAWAII: 2172 case CHIP_KAVERI: 2173 case CHIP_KABINI: 2174 case CHIP_MULLINS: 2175 if (adev->flags & AMD_IS_APU) 2176 adev->family = AMDGPU_FAMILY_KV; 2177 else 2178 adev->family = AMDGPU_FAMILY_CI; 2179 2180 r = cik_set_ip_blocks(adev); 2181 if (r) 2182 return r; 2183 break; 2184 #endif 2185 case CHIP_TOPAZ: 2186 case CHIP_TONGA: 2187 case CHIP_FIJI: 2188 case CHIP_POLARIS10: 2189 case CHIP_POLARIS11: 2190 case CHIP_POLARIS12: 2191 case CHIP_VEGAM: 2192 case CHIP_CARRIZO: 2193 case CHIP_STONEY: 2194 if (adev->flags & AMD_IS_APU) 2195 adev->family = AMDGPU_FAMILY_CZ; 2196 else 2197 adev->family = AMDGPU_FAMILY_VI; 2198 2199 r = vi_set_ip_blocks(adev); 2200 if (r) 2201 return r; 2202 break; 2203 default: 2204 r = amdgpu_discovery_set_ip_blocks(adev); 2205 if (r) 2206 return r; 2207 break; 2208 } 2209 2210 if (amdgpu_has_atpx() && 2211 (amdgpu_is_atpx_hybrid() || 2212 amdgpu_has_atpx_dgpu_power_cntl()) && 2213 ((adev->flags & AMD_IS_APU) == 0) && 2214 !pci_is_thunderbolt_attached(dev->pdev)) 2215 adev->flags |= AMD_IS_PX; 2216 2217 if (!(adev->flags & AMD_IS_APU)) { 2218 parent = pci_upstream_bridge(adev->pdev); 2219 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2220 } 2221 2222 amdgpu_amdkfd_device_probe(adev); 2223 2224 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2225 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2226 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2227 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2228 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2229 2230 for (i = 0; i < adev->num_ip_blocks; i++) { 2231 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2232 DRM_ERROR("disabled ip block: %d <%s>\n", 2233 i, adev->ip_blocks[i].version->funcs->name); 2234 adev->ip_blocks[i].status.valid = false; 2235 } else { 2236 if (adev->ip_blocks[i].version->funcs->early_init) { 2237 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2238 if (r == -ENOENT) { 2239 adev->ip_blocks[i].status.valid = false; 2240 } else if (r) { 2241 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2242 adev->ip_blocks[i].version->funcs->name, r); 2243 return r; 2244 } else { 2245 adev->ip_blocks[i].status.valid = true; 2246 } 2247 } else { 2248 adev->ip_blocks[i].status.valid = true; 2249 } 2250 } 2251 /* get the vbios after the asic_funcs are set up */ 2252 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2253 r = amdgpu_device_parse_gpu_info_fw(adev); 2254 if (r) 2255 return r; 2256 2257 /* Read BIOS */ 2258 if (!amdgpu_get_bios(adev)) 2259 return -EINVAL; 2260 2261 r = amdgpu_atombios_init(adev); 2262 if (r) { 2263 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2264 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2265 return r; 2266 } 2267 2268 /*get pf2vf msg info at it's earliest time*/ 2269 if (amdgpu_sriov_vf(adev)) 2270 amdgpu_virt_init_data_exchange(adev); 2271 2272 } 2273 } 2274 2275 adev->cg_flags &= amdgpu_cg_mask; 2276 adev->pg_flags &= amdgpu_pg_mask; 2277 2278 return 0; 2279 } 2280 2281 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2282 { 2283 int i, r; 2284 2285 for (i = 0; i < adev->num_ip_blocks; i++) { 2286 if (!adev->ip_blocks[i].status.sw) 2287 continue; 2288 if (adev->ip_blocks[i].status.hw) 2289 continue; 2290 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2291 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2292 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2293 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2294 if (r) { 2295 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2296 adev->ip_blocks[i].version->funcs->name, r); 2297 return r; 2298 } 2299 adev->ip_blocks[i].status.hw = true; 2300 } 2301 } 2302 2303 return 0; 2304 } 2305 2306 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2307 { 2308 int i, r; 2309 2310 for (i = 0; i < adev->num_ip_blocks; i++) { 2311 if (!adev->ip_blocks[i].status.sw) 2312 continue; 2313 if (adev->ip_blocks[i].status.hw) 2314 continue; 2315 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2316 if (r) { 2317 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2318 adev->ip_blocks[i].version->funcs->name, r); 2319 return r; 2320 } 2321 adev->ip_blocks[i].status.hw = true; 2322 } 2323 2324 return 0; 2325 } 2326 2327 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2328 { 2329 int r = 0; 2330 int i; 2331 uint32_t smu_version; 2332 2333 if (adev->asic_type >= CHIP_VEGA10) { 2334 for (i = 0; i < adev->num_ip_blocks; i++) { 2335 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2336 continue; 2337 2338 if 
(!adev->ip_blocks[i].status.sw) 2339 continue; 2340 2341 /* no need to do the fw loading again if already done*/ 2342 if (adev->ip_blocks[i].status.hw == true) 2343 break; 2344 2345 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2346 r = adev->ip_blocks[i].version->funcs->resume(adev); 2347 if (r) { 2348 DRM_ERROR("resume of IP block <%s> failed %d\n", 2349 adev->ip_blocks[i].version->funcs->name, r); 2350 return r; 2351 } 2352 } else { 2353 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2354 if (r) { 2355 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2356 adev->ip_blocks[i].version->funcs->name, r); 2357 return r; 2358 } 2359 } 2360 2361 adev->ip_blocks[i].status.hw = true; 2362 break; 2363 } 2364 } 2365 2366 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2367 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2368 2369 return r; 2370 } 2371 2372 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2373 { 2374 long timeout; 2375 int r, i; 2376 2377 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2378 struct amdgpu_ring *ring = adev->rings[i]; 2379 2380 /* No need to setup the GPU scheduler for rings that don't need it */ 2381 if (!ring || ring->no_scheduler) 2382 continue; 2383 2384 switch (ring->funcs->type) { 2385 case AMDGPU_RING_TYPE_GFX: 2386 timeout = adev->gfx_timeout; 2387 break; 2388 case AMDGPU_RING_TYPE_COMPUTE: 2389 timeout = adev->compute_timeout; 2390 break; 2391 case AMDGPU_RING_TYPE_SDMA: 2392 timeout = adev->sdma_timeout; 2393 break; 2394 default: 2395 timeout = adev->video_timeout; 2396 break; 2397 } 2398 2399 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2400 ring->num_hw_submission, amdgpu_job_hang_limit, 2401 timeout, adev->reset_domain->wq, 2402 ring->sched_score, ring->name, 2403 adev->dev); 2404 if (r) { 2405 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2406 ring->name); 2407 return r; 2408 } 2409 } 2410 2411 return 0; 2412 } 2413 2414 2415 /** 2416 * amdgpu_device_ip_init - run init for hardware IPs 2417 * 2418 * @adev: amdgpu_device pointer 2419 * 2420 * Main initialization pass for hardware IPs. The list of all the hardware 2421 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2422 * are run. sw_init initializes the software state associated with each IP 2423 * and hw_init initializes the hardware associated with each IP. 2424 * Returns 0 on success, negative error code on failure. 
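 *
 * The ordering below matters: sw_init runs for every valid block first,
 * the COMMON and GMC blocks get an early hw_init so GPU memory (VRAM
 * scratch, writeback, CSA) can be set up, and the remaining hw_init work
 * is split into two phases with firmware loading in between.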
2425 */ 2426 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2427 { 2428 int i, r; 2429 2430 r = amdgpu_ras_init(adev); 2431 if (r) 2432 return r; 2433 2434 for (i = 0; i < adev->num_ip_blocks; i++) { 2435 if (!adev->ip_blocks[i].status.valid) 2436 continue; 2437 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2438 if (r) { 2439 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2440 adev->ip_blocks[i].version->funcs->name, r); 2441 goto init_failed; 2442 } 2443 adev->ip_blocks[i].status.sw = true; 2444 2445 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2446 /* need to do common hw init early so everything is set up for gmc */ 2447 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2448 if (r) { 2449 DRM_ERROR("hw_init %d failed %d\n", i, r); 2450 goto init_failed; 2451 } 2452 adev->ip_blocks[i].status.hw = true; 2453 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2454 /* need to do gmc hw init early so we can allocate gpu mem */ 2455 /* Try to reserve bad pages early */ 2456 if (amdgpu_sriov_vf(adev)) 2457 amdgpu_virt_exchange_data(adev); 2458 2459 r = amdgpu_device_vram_scratch_init(adev); 2460 if (r) { 2461 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2462 goto init_failed; 2463 } 2464 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2465 if (r) { 2466 DRM_ERROR("hw_init %d failed %d\n", i, r); 2467 goto init_failed; 2468 } 2469 r = amdgpu_device_wb_init(adev); 2470 if (r) { 2471 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2472 goto init_failed; 2473 } 2474 adev->ip_blocks[i].status.hw = true; 2475 2476 /* right after GMC hw init, we create CSA */ 2477 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2478 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2479 AMDGPU_GEM_DOMAIN_VRAM, 2480 AMDGPU_CSA_SIZE); 2481 if (r) { 2482 DRM_ERROR("allocate CSA failed %d\n", r); 2483 goto init_failed; 2484 } 2485 } 2486 } 2487 } 2488 2489 if (amdgpu_sriov_vf(adev)) 2490 amdgpu_virt_init_data_exchange(adev); 2491 2492 r = amdgpu_ib_pool_init(adev); 2493 if (r) { 2494 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2495 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2496 goto init_failed; 2497 } 2498 2499 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2500 if (r) 2501 goto init_failed; 2502 2503 r = amdgpu_device_ip_hw_init_phase1(adev); 2504 if (r) 2505 goto init_failed; 2506 2507 r = amdgpu_device_fw_loading(adev); 2508 if (r) 2509 goto init_failed; 2510 2511 r = amdgpu_device_ip_hw_init_phase2(adev); 2512 if (r) 2513 goto init_failed; 2514 2515 /* 2516 * retired pages will be loaded from eeprom and reserved here, 2517 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2518 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2519 * for I2C communication which only true at this point. 2520 * 2521 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2522 * failure from bad gpu situation and stop amdgpu init process 2523 * accordingly. For other failed cases, it will still release all 2524 * the resource and print error message, rather than returning one 2525 * negative value to upper level. 
2526 * 2527 * Note: theoretically, this should be called before all vram allocations 2528 * to protect retired page from abusing 2529 */ 2530 r = amdgpu_ras_recovery_init(adev); 2531 if (r) 2532 goto init_failed; 2533 2534 /** 2535 * In case of XGMI grab extra reference for reset domain for this device 2536 */ 2537 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2538 if (amdgpu_xgmi_add_device(adev) == 0) { 2539 if (!amdgpu_sriov_vf(adev)) { 2540 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2541 2542 if (WARN_ON(!hive)) { 2543 r = -ENOENT; 2544 goto init_failed; 2545 } 2546 2547 if (!hive->reset_domain || 2548 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2549 r = -ENOENT; 2550 amdgpu_put_xgmi_hive(hive); 2551 goto init_failed; 2552 } 2553 2554 /* Drop the early temporary reset domain we created for device */ 2555 amdgpu_reset_put_reset_domain(adev->reset_domain); 2556 adev->reset_domain = hive->reset_domain; 2557 amdgpu_put_xgmi_hive(hive); 2558 } 2559 } 2560 } 2561 2562 r = amdgpu_device_init_schedulers(adev); 2563 if (r) 2564 goto init_failed; 2565 2566 /* Don't init kfd if whole hive need to be reset during init */ 2567 if (!adev->gmc.xgmi.pending_reset) 2568 amdgpu_amdkfd_device_init(adev); 2569 2570 amdgpu_fru_get_product_info(adev); 2571 2572 init_failed: 2573 2574 return r; 2575 } 2576 2577 /** 2578 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2579 * 2580 * @adev: amdgpu_device pointer 2581 * 2582 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2583 * this function before a GPU reset. If the value is retained after a 2584 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2585 */ 2586 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2587 { 2588 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2589 } 2590 2591 /** 2592 * amdgpu_device_check_vram_lost - check if vram is valid 2593 * 2594 * @adev: amdgpu_device pointer 2595 * 2596 * Checks the reset magic value written to the gart pointer in VRAM. 2597 * The driver calls this after a GPU reset to see if the contents of 2598 * VRAM is lost or now. 2599 * returns true if vram is lost, false if not. 2600 */ 2601 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2602 { 2603 if (memcmp(adev->gart.ptr, adev->reset_magic, 2604 AMDGPU_RESET_MAGIC_NUM)) 2605 return true; 2606 2607 if (!amdgpu_in_reset(adev)) 2608 return false; 2609 2610 /* 2611 * For all ASICs with baco/mode1 reset, the VRAM is 2612 * always assumed to be lost. 2613 */ 2614 switch (amdgpu_asic_reset_method(adev)) { 2615 case AMD_RESET_METHOD_BACO: 2616 case AMD_RESET_METHOD_MODE1: 2617 return true; 2618 default: 2619 return false; 2620 } 2621 } 2622 2623 /** 2624 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2625 * 2626 * @adev: amdgpu_device pointer 2627 * @state: clockgating state (gate or ungate) 2628 * 2629 * The list of all the hardware IPs that make up the asic is walked and the 2630 * set_clockgating_state callbacks are run. 2631 * Late initialization pass enabling clockgating for hardware IPs. 2632 * Fini or suspend, pass disabling clockgating for hardware IPs. 2633 * Returns 0 on success, negative error code on failure. 
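 *
 * For example, amdgpu_device_ip_late_init() gates clocks with
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE), while the fini and
 * suspend paths pass AMD_CG_STATE_UNGATE.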
2634 */ 2635 2636 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2637 enum amd_clockgating_state state) 2638 { 2639 int i, j, r; 2640 2641 if (amdgpu_emu_mode == 1) 2642 return 0; 2643 2644 for (j = 0; j < adev->num_ip_blocks; j++) { 2645 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2646 if (!adev->ip_blocks[i].status.late_initialized) 2647 continue; 2648 /* skip CG for GFX on S0ix */ 2649 if (adev->in_s0ix && 2650 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2651 continue; 2652 /* skip CG for VCE/UVD, it's handled specially */ 2653 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2654 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2655 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2656 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2657 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2658 /* enable clockgating to save power */ 2659 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2660 state); 2661 if (r) { 2662 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2663 adev->ip_blocks[i].version->funcs->name, r); 2664 return r; 2665 } 2666 } 2667 } 2668 2669 return 0; 2670 } 2671 2672 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2673 enum amd_powergating_state state) 2674 { 2675 int i, j, r; 2676 2677 if (amdgpu_emu_mode == 1) 2678 return 0; 2679 2680 for (j = 0; j < adev->num_ip_blocks; j++) { 2681 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2682 if (!adev->ip_blocks[i].status.late_initialized) 2683 continue; 2684 /* skip PG for GFX on S0ix */ 2685 if (adev->in_s0ix && 2686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2687 continue; 2688 /* skip CG for VCE/UVD, it's handled specially */ 2689 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2690 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2691 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2692 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2693 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2694 /* enable powergating to save power */ 2695 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2696 state); 2697 if (r) { 2698 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2699 adev->ip_blocks[i].version->funcs->name, r); 2700 return r; 2701 } 2702 } 2703 } 2704 return 0; 2705 } 2706 2707 static int amdgpu_device_enable_mgpu_fan_boost(void) 2708 { 2709 struct amdgpu_gpu_instance *gpu_ins; 2710 struct amdgpu_device *adev; 2711 int i, ret = 0; 2712 2713 mutex_lock(&mgpu_info.mutex); 2714 2715 /* 2716 * MGPU fan boost feature should be enabled 2717 * only when there are two or more dGPUs in 2718 * the system 2719 */ 2720 if (mgpu_info.num_dgpu < 2) 2721 goto out; 2722 2723 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2724 gpu_ins = &(mgpu_info.gpu_ins[i]); 2725 adev = gpu_ins->adev; 2726 if (!(adev->flags & AMD_IS_APU) && 2727 !gpu_ins->mgpu_fan_enabled) { 2728 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2729 if (ret) 2730 break; 2731 2732 gpu_ins->mgpu_fan_enabled = 1; 2733 } 2734 } 2735 2736 out: 2737 mutex_unlock(&mgpu_info.mutex); 2738 2739 return ret; 2740 } 2741 2742 /** 2743 * amdgpu_device_ip_late_init - run late init for hardware IPs 2744 * 2745 * @adev: amdgpu_device pointer 2746 * 2747 * Late initialization pass for hardware IPs. 
The list of all the hardware 2748 * IPs that make up the asic is walked and the late_init callbacks are run. 2749 * late_init covers any special initialization that an IP requires 2750 * after all of the IPs have been initialized or something that needs to happen 2751 * late in the init process. 2752 * Returns 0 on success, negative error code on failure. 2753 */ 2754 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2755 { 2756 struct amdgpu_gpu_instance *gpu_instance; 2757 int i = 0, r; 2758 2759 for (i = 0; i < adev->num_ip_blocks; i++) { 2760 if (!adev->ip_blocks[i].status.hw) 2761 continue; 2762 if (adev->ip_blocks[i].version->funcs->late_init) { 2763 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2764 if (r) { 2765 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2766 adev->ip_blocks[i].version->funcs->name, r); 2767 return r; 2768 } 2769 } 2770 adev->ip_blocks[i].status.late_initialized = true; 2771 } 2772 2773 r = amdgpu_ras_late_init(adev); 2774 if (r) { 2775 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2776 return r; 2777 } 2778 2779 amdgpu_ras_set_error_query_ready(adev, true); 2780 2781 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2782 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2783 2784 amdgpu_device_fill_reset_magic(adev); 2785 2786 r = amdgpu_device_enable_mgpu_fan_boost(); 2787 if (r) 2788 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2789 2790 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 2791 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| 2792 adev->asic_type == CHIP_ALDEBARAN )) 2793 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2794 2795 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2796 mutex_lock(&mgpu_info.mutex); 2797 2798 /* 2799 * Reset device p-state to low as it was booted with high. 2800 * 2801 * This should be performed only after all devices from the same 2802 * hive get initialized. 2803 * 2804 * However, the number of devices in the hive is not known in advance; 2805 * it is counted one by one as the devices are initialized. 2806 * 2807 * So, we wait for all XGMI interlinked devices to be initialized. 2808 * This may bring some delays as those devices may come from 2809 * different hives. But that should be OK.
2810 */ 2811 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2812 for (i = 0; i < mgpu_info.num_gpu; i++) { 2813 gpu_instance = &(mgpu_info.gpu_ins[i]); 2814 if (gpu_instance->adev->flags & AMD_IS_APU) 2815 continue; 2816 2817 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2818 AMDGPU_XGMI_PSTATE_MIN); 2819 if (r) { 2820 DRM_ERROR("pstate setting failed (%d).\n", r); 2821 break; 2822 } 2823 } 2824 } 2825 2826 mutex_unlock(&mgpu_info.mutex); 2827 } 2828 2829 return 0; 2830 } 2831 2832 /** 2833 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2834 * 2835 * @adev: amdgpu_device pointer 2836 * 2837 * For ASICs need to disable SMC first 2838 */ 2839 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2840 { 2841 int i, r; 2842 2843 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2844 return; 2845 2846 for (i = 0; i < adev->num_ip_blocks; i++) { 2847 if (!adev->ip_blocks[i].status.hw) 2848 continue; 2849 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2850 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2851 /* XXX handle errors */ 2852 if (r) { 2853 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2854 adev->ip_blocks[i].version->funcs->name, r); 2855 } 2856 adev->ip_blocks[i].status.hw = false; 2857 break; 2858 } 2859 } 2860 } 2861 2862 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2863 { 2864 int i, r; 2865 2866 for (i = 0; i < adev->num_ip_blocks; i++) { 2867 if (!adev->ip_blocks[i].version->funcs->early_fini) 2868 continue; 2869 2870 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2871 if (r) { 2872 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2873 adev->ip_blocks[i].version->funcs->name, r); 2874 } 2875 } 2876 2877 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2878 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2879 2880 amdgpu_amdkfd_suspend(adev, false); 2881 2882 /* Workaroud for ASICs need to disable SMC first */ 2883 amdgpu_device_smu_fini_early(adev); 2884 2885 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2886 if (!adev->ip_blocks[i].status.hw) 2887 continue; 2888 2889 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2890 /* XXX handle errors */ 2891 if (r) { 2892 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2893 adev->ip_blocks[i].version->funcs->name, r); 2894 } 2895 2896 adev->ip_blocks[i].status.hw = false; 2897 } 2898 2899 if (amdgpu_sriov_vf(adev)) { 2900 if (amdgpu_virt_release_full_gpu(adev, false)) 2901 DRM_ERROR("failed to release exclusive mode on fini\n"); 2902 } 2903 2904 return 0; 2905 } 2906 2907 /** 2908 * amdgpu_device_ip_fini - run fini for hardware IPs 2909 * 2910 * @adev: amdgpu_device pointer 2911 * 2912 * Main teardown pass for hardware IPs. The list of all the hardware 2913 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2914 * are run. hw_fini tears down the hardware associated with each IP 2915 * and sw_fini tears down any software state associated with each IP. 2916 * Returns 0 on success, negative error code on failure. 
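 *
 * Blocks are torn down in reverse order; when the GMC block is reached,
 * the buffers created during init (ucode BO, static CSA, writeback, VRAM
 * scratch and the IB pool) are released before its sw_fini runs.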
2917 */ 2918 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2919 { 2920 int i, r; 2921 2922 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2923 amdgpu_virt_release_ras_err_handler_data(adev); 2924 2925 if (adev->gmc.xgmi.num_physical_nodes > 1) 2926 amdgpu_xgmi_remove_device(adev); 2927 2928 amdgpu_amdkfd_device_fini_sw(adev); 2929 2930 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2931 if (!adev->ip_blocks[i].status.sw) 2932 continue; 2933 2934 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2935 amdgpu_ucode_free_bo(adev); 2936 amdgpu_free_static_csa(&adev->virt.csa_obj); 2937 amdgpu_device_wb_fini(adev); 2938 amdgpu_device_vram_scratch_fini(adev); 2939 amdgpu_ib_pool_fini(adev); 2940 } 2941 2942 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2943 /* XXX handle errors */ 2944 if (r) { 2945 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2946 adev->ip_blocks[i].version->funcs->name, r); 2947 } 2948 adev->ip_blocks[i].status.sw = false; 2949 adev->ip_blocks[i].status.valid = false; 2950 } 2951 2952 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2953 if (!adev->ip_blocks[i].status.late_initialized) 2954 continue; 2955 if (adev->ip_blocks[i].version->funcs->late_fini) 2956 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2957 adev->ip_blocks[i].status.late_initialized = false; 2958 } 2959 2960 amdgpu_ras_fini(adev); 2961 2962 return 0; 2963 } 2964 2965 /** 2966 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2967 * 2968 * @work: work_struct. 2969 */ 2970 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2971 { 2972 struct amdgpu_device *adev = 2973 container_of(work, struct amdgpu_device, delayed_init_work.work); 2974 int r; 2975 2976 r = amdgpu_ib_ring_tests(adev); 2977 if (r) 2978 DRM_ERROR("ib ring test failed (%d).\n", r); 2979 } 2980 2981 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2982 { 2983 struct amdgpu_device *adev = 2984 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2985 2986 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2987 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2988 2989 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2990 adev->gfx.gfx_off_state = true; 2991 } 2992 2993 /** 2994 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2995 * 2996 * @adev: amdgpu_device pointer 2997 * 2998 * Main suspend function for hardware IPs. The list of all the hardware 2999 * IPs that make up the asic is walked, clockgating is disabled and the 3000 * suspend callbacks are run. suspend puts the hardware and software state 3001 * in each IP into a state suitable for suspend. 3002 * Returns 0 on success, negative error code on failure. 3003 */ 3004 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3005 { 3006 int i, r; 3007 3008 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3009 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3010 3011 /* 3012 * Per PMFW team's suggestion, driver needs to handle gfxoff 3013 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3014 * scenario. Add the missing df cstate disablement here. 
3015 */ 3016 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3017 dev_warn(adev->dev, "Failed to disallow df cstate"); 3018 3019 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3020 if (!adev->ip_blocks[i].status.valid) 3021 continue; 3022 3023 /* displays are handled separately */ 3024 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3025 continue; 3026 3027 /* XXX handle errors */ 3028 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3029 /* XXX handle errors */ 3030 if (r) { 3031 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3032 adev->ip_blocks[i].version->funcs->name, r); 3033 return r; 3034 } 3035 3036 adev->ip_blocks[i].status.hw = false; 3037 } 3038 3039 return 0; 3040 } 3041 3042 /** 3043 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3044 * 3045 * @adev: amdgpu_device pointer 3046 * 3047 * Main suspend function for hardware IPs. The list of all the hardware 3048 * IPs that make up the asic is walked, clockgating is disabled and the 3049 * suspend callbacks are run. suspend puts the hardware and software state 3050 * in each IP into a state suitable for suspend. 3051 * Returns 0 on success, negative error code on failure. 3052 */ 3053 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3054 { 3055 int i, r; 3056 3057 if (adev->in_s0ix) 3058 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3059 3060 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3061 if (!adev->ip_blocks[i].status.valid) 3062 continue; 3063 /* displays are handled in phase1 */ 3064 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3065 continue; 3066 /* PSP lost connection when err_event_athub occurs */ 3067 if (amdgpu_ras_intr_triggered() && 3068 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3069 adev->ip_blocks[i].status.hw = false; 3070 continue; 3071 } 3072 3073 /* skip unnecessary suspend if we do not initialize them yet */ 3074 if (adev->gmc.xgmi.pending_reset && 3075 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3076 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3077 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3078 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3079 adev->ip_blocks[i].status.hw = false; 3080 continue; 3081 } 3082 3083 /* skip suspend of gfx/mes and psp for S0ix 3084 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3085 * like at runtime. PSP is also part of the always on hardware 3086 * so no need to suspend it. 3087 */ 3088 if (adev->in_s0ix && 3089 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3090 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3091 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3092 continue; 3093 3094 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3095 if (adev->in_s0ix && 3096 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3097 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3098 continue; 3099 3100 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3101 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3102 * from this location and RLC Autoload automatically also gets loaded 3103 * from here based on PMFW -> PSP message during re-init sequence. 3104 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3105 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3106 */ 3107 if (amdgpu_in_reset(adev) && 3108 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3109 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3110 continue; 3111 3112 /* XXX handle errors */ 3113 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3114 /* XXX handle errors */ 3115 if (r) { 3116 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3117 adev->ip_blocks[i].version->funcs->name, r); 3118 } 3119 adev->ip_blocks[i].status.hw = false; 3120 /* handle putting the SMC in the appropriate state */ 3121 if(!amdgpu_sriov_vf(adev)){ 3122 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3123 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3124 if (r) { 3125 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3126 adev->mp1_state, r); 3127 return r; 3128 } 3129 } 3130 } 3131 } 3132 3133 return 0; 3134 } 3135 3136 /** 3137 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3138 * 3139 * @adev: amdgpu_device pointer 3140 * 3141 * Main suspend function for hardware IPs. The list of all the hardware 3142 * IPs that make up the asic is walked, clockgating is disabled and the 3143 * suspend callbacks are run. suspend puts the hardware and software state 3144 * in each IP into a state suitable for suspend. 3145 * Returns 0 on success, negative error code on failure. 3146 */ 3147 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3148 { 3149 int r; 3150 3151 if (amdgpu_sriov_vf(adev)) { 3152 amdgpu_virt_fini_data_exchange(adev); 3153 amdgpu_virt_request_full_gpu(adev, false); 3154 } 3155 3156 r = amdgpu_device_ip_suspend_phase1(adev); 3157 if (r) 3158 return r; 3159 r = amdgpu_device_ip_suspend_phase2(adev); 3160 3161 if (amdgpu_sriov_vf(adev)) 3162 amdgpu_virt_release_full_gpu(adev, false); 3163 3164 return r; 3165 } 3166 3167 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3168 { 3169 int i, r; 3170 3171 static enum amd_ip_block_type ip_order[] = { 3172 AMD_IP_BLOCK_TYPE_COMMON, 3173 AMD_IP_BLOCK_TYPE_GMC, 3174 AMD_IP_BLOCK_TYPE_PSP, 3175 AMD_IP_BLOCK_TYPE_IH, 3176 }; 3177 3178 for (i = 0; i < adev->num_ip_blocks; i++) { 3179 int j; 3180 struct amdgpu_ip_block *block; 3181 3182 block = &adev->ip_blocks[i]; 3183 block->status.hw = false; 3184 3185 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3186 3187 if (block->version->type != ip_order[j] || 3188 !block->status.valid) 3189 continue; 3190 3191 r = block->version->funcs->hw_init(adev); 3192 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3193 if (r) 3194 return r; 3195 block->status.hw = true; 3196 } 3197 } 3198 3199 return 0; 3200 } 3201 3202 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3203 { 3204 int i, r; 3205 3206 static enum amd_ip_block_type ip_order[] = { 3207 AMD_IP_BLOCK_TYPE_SMC, 3208 AMD_IP_BLOCK_TYPE_DCE, 3209 AMD_IP_BLOCK_TYPE_GFX, 3210 AMD_IP_BLOCK_TYPE_SDMA, 3211 AMD_IP_BLOCK_TYPE_UVD, 3212 AMD_IP_BLOCK_TYPE_VCE, 3213 AMD_IP_BLOCK_TYPE_VCN 3214 }; 3215 3216 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3217 int j; 3218 struct amdgpu_ip_block *block; 3219 3220 for (j = 0; j < adev->num_ip_blocks; j++) { 3221 block = &adev->ip_blocks[j]; 3222 3223 if (block->version->type != ip_order[i] || 3224 !block->status.valid || 3225 block->status.hw) 3226 continue; 3227 3228 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3229 r = block->version->funcs->resume(adev); 3230 else 3231 r = block->version->funcs->hw_init(adev); 3232 3233 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3234 if (r) 3235 return r; 3236 block->status.hw = true; 3237 } 3238 } 3239 3240 return 0; 3241 } 3242 3243 /** 3244 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3245 * 3246 * @adev: amdgpu_device pointer 3247 * 3248 * First resume function for hardware IPs. The list of all the hardware 3249 * IPs that make up the asic is walked and the resume callbacks are run for 3250 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3251 * after a suspend and updates the software state as necessary. This 3252 * function is also used for restoring the GPU after a GPU reset. 3253 * Returns 0 on success, negative error code on failure. 3254 */ 3255 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3256 { 3257 int i, r; 3258 3259 for (i = 0; i < adev->num_ip_blocks; i++) { 3260 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3261 continue; 3262 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3263 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3264 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3265 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3266 3267 r = adev->ip_blocks[i].version->funcs->resume(adev); 3268 if (r) { 3269 DRM_ERROR("resume of IP block <%s> failed %d\n", 3270 adev->ip_blocks[i].version->funcs->name, r); 3271 return r; 3272 } 3273 adev->ip_blocks[i].status.hw = true; 3274 } 3275 } 3276 3277 return 0; 3278 } 3279 3280 /** 3281 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3282 * 3283 * @adev: amdgpu_device pointer 3284 * 3285 * First resume function for hardware IPs. The list of all the hardware 3286 * IPs that make up the asic is walked and the resume callbacks are run for 3287 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3288 * functional state after a suspend and updates the software state as 3289 * necessary. This function is also used for restoring the GPU after a GPU 3290 * reset. 3291 * Returns 0 on success, negative error code on failure. 3292 */ 3293 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3294 { 3295 int i, r; 3296 3297 for (i = 0; i < adev->num_ip_blocks; i++) { 3298 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3299 continue; 3300 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3301 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3302 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3303 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3304 continue; 3305 r = adev->ip_blocks[i].version->funcs->resume(adev); 3306 if (r) { 3307 DRM_ERROR("resume of IP block <%s> failed %d\n", 3308 adev->ip_blocks[i].version->funcs->name, r); 3309 return r; 3310 } 3311 adev->ip_blocks[i].status.hw = true; 3312 3313 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3314 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3315 * amdgpu_device_resume() after IP resume. 3316 */ 3317 amdgpu_gfx_off_ctrl(adev, false); 3318 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3319 } 3320 3321 } 3322 3323 return 0; 3324 } 3325 3326 /** 3327 * amdgpu_device_ip_resume - run resume for hardware IPs 3328 * 3329 * @adev: amdgpu_device pointer 3330 * 3331 * Main resume function for hardware IPs. 
The hardware IPs 3332 * are split into two resume functions because they are 3333 * also used in recovering from a GPU reset and some additional 3334 * steps need to be taken between them. In this case (S3/S4) they are 3335 * run sequentially. 3336 * Returns 0 on success, negative error code on failure. 3337 */ 3338 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3339 { 3340 int r; 3341 3342 r = amdgpu_amdkfd_resume_iommu(adev); 3343 if (r) 3344 return r; 3345 3346 r = amdgpu_device_ip_resume_phase1(adev); 3347 if (r) 3348 return r; 3349 3350 r = amdgpu_device_fw_loading(adev); 3351 if (r) 3352 return r; 3353 3354 r = amdgpu_device_ip_resume_phase2(adev); 3355 3356 return r; 3357 } 3358 3359 /** 3360 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3361 * 3362 * @adev: amdgpu_device pointer 3363 * 3364 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3365 */ 3366 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3367 { 3368 if (amdgpu_sriov_vf(adev)) { 3369 if (adev->is_atom_fw) { 3370 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3371 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3372 } else { 3373 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3374 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3375 } 3376 3377 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3378 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3379 } 3380 } 3381 3382 /** 3383 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3384 * 3385 * @asic_type: AMD asic type 3386 * 3387 * Check if there is DC (new modesetting infrastructure) support for an asic. 3388 * Returns true if DC has support, false if not. 3389 */ 3390 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3391 { 3392 switch (asic_type) { 3393 #ifdef CONFIG_DRM_AMDGPU_SI 3394 case CHIP_HAINAN: 3395 #endif 3396 case CHIP_TOPAZ: 3397 /* chips with no display hardware */ 3398 return false; 3399 #if defined(CONFIG_DRM_AMD_DC) 3400 case CHIP_TAHITI: 3401 case CHIP_PITCAIRN: 3402 case CHIP_VERDE: 3403 case CHIP_OLAND: 3404 /* 3405 * We have systems in the wild with these ASICs that require 3406 * LVDS and VGA support which is not supported with DC. 3407 * 3408 * Fallback to the non-DC driver here by default so as not to 3409 * cause regressions. 3410 */ 3411 #if defined(CONFIG_DRM_AMD_DC_SI) 3412 return amdgpu_dc > 0; 3413 #else 3414 return false; 3415 #endif 3416 case CHIP_BONAIRE: 3417 case CHIP_KAVERI: 3418 case CHIP_KABINI: 3419 case CHIP_MULLINS: 3420 /* 3421 * We have systems in the wild with these ASICs that require 3422 * VGA support which is not supported with DC. 3423 * 3424 * Fallback to the non-DC driver here by default so as not to 3425 * cause regressions.
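 *
 * Returning "amdgpu_dc > 0" means DC is used for these ASICs only when it
 * was explicitly requested, while the default case below also accepts the
 * automatic setting ("amdgpu_dc != 0").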
3426 */ 3427 return amdgpu_dc > 0; 3428 default: 3429 return amdgpu_dc != 0; 3430 #else 3431 default: 3432 if (amdgpu_dc > 0) 3433 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3434 "but isn't supported by ASIC, ignoring\n"); 3435 return false; 3436 #endif 3437 } 3438 } 3439 3440 /** 3441 * amdgpu_device_has_dc_support - check if dc is supported 3442 * 3443 * @adev: amdgpu_device pointer 3444 * 3445 * Returns true for supported, false for not supported 3446 */ 3447 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3448 { 3449 if (amdgpu_sriov_vf(adev) || 3450 adev->enable_virtual_display || 3451 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3452 return false; 3453 3454 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3455 } 3456 3457 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3458 { 3459 struct amdgpu_device *adev = 3460 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3461 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3462 3463 /* It's a bug to not have a hive within this function */ 3464 if (WARN_ON(!hive)) 3465 return; 3466 3467 /* 3468 * Use task barrier to synchronize all xgmi reset works across the 3469 * hive. task_barrier_enter and task_barrier_exit will block 3470 * until all the threads running the xgmi reset works reach 3471 * those points. task_barrier_full will do both blocks. 3472 */ 3473 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3474 3475 task_barrier_enter(&hive->tb); 3476 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3477 3478 if (adev->asic_reset_res) 3479 goto fail; 3480 3481 task_barrier_exit(&hive->tb); 3482 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3483 3484 if (adev->asic_reset_res) 3485 goto fail; 3486 3487 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3488 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3489 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3490 } else { 3491 3492 task_barrier_full(&hive->tb); 3493 adev->asic_reset_res = amdgpu_asic_reset(adev); 3494 } 3495 3496 fail: 3497 if (adev->asic_reset_res) 3498 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3499 adev->asic_reset_res, adev_to_drm(adev)->unique); 3500 amdgpu_put_xgmi_hive(hive); 3501 } 3502 3503 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3504 { 3505 char *input = amdgpu_lockup_timeout; 3506 char *timeout_setting = NULL; 3507 int index = 0; 3508 long timeout; 3509 int ret = 0; 3510 3511 /* 3512 * By default timeout for non compute jobs is 10000 3513 * and 60000 for compute jobs. 3514 * In SR-IOV or passthrough mode, timeout for compute 3515 * jobs are 60000 by default. 3516 */ 3517 adev->gfx_timeout = msecs_to_jiffies(10000); 3518 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3519 if (amdgpu_sriov_vf(adev)) 3520 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3521 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3522 else 3523 adev->compute_timeout = msecs_to_jiffies(60000); 3524 3525 #ifdef notyet 3526 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3527 while ((timeout_setting = strsep(&input, ",")) && 3528 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3529 ret = kstrtol(timeout_setting, 0, &timeout); 3530 if (ret) 3531 return ret; 3532 3533 if (timeout == 0) { 3534 index++; 3535 continue; 3536 } else if (timeout < 0) { 3537 timeout = MAX_SCHEDULE_TIMEOUT; 3538 dev_warn(adev->dev, "lockup timeout disabled"); 3539 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3540 } else { 3541 timeout = msecs_to_jiffies(timeout); 3542 } 3543 3544 switch (index++) { 3545 case 0: 3546 adev->gfx_timeout = timeout; 3547 break; 3548 case 1: 3549 adev->compute_timeout = timeout; 3550 break; 3551 case 2: 3552 adev->sdma_timeout = timeout; 3553 break; 3554 case 3: 3555 adev->video_timeout = timeout; 3556 break; 3557 default: 3558 break; 3559 } 3560 } 3561 /* 3562 * There is only one value specified and 3563 * it should apply to all non-compute jobs. 3564 */ 3565 if (index == 1) { 3566 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3567 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3568 adev->compute_timeout = adev->gfx_timeout; 3569 } 3570 } 3571 #endif 3572 3573 return ret; 3574 } 3575 3576 /** 3577 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3578 * 3579 * @adev: amdgpu_device pointer 3580 * 3581 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3582 */ 3583 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3584 { 3585 #ifdef notyet 3586 struct iommu_domain *domain; 3587 3588 domain = iommu_get_domain_for_dev(adev->dev); 3589 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3590 #endif 3591 adev->ram_is_direct_mapped = true; 3592 } 3593 3594 static const struct attribute *amdgpu_dev_attributes[] = { 3595 &dev_attr_product_name.attr, 3596 &dev_attr_product_number.attr, 3597 &dev_attr_serial_number.attr, 3598 &dev_attr_pcie_replay_count.attr, 3599 NULL 3600 }; 3601 3602 /** 3603 * amdgpu_device_init - initialize the driver 3604 * 3605 * @adev: amdgpu_device pointer 3606 * @flags: driver flags 3607 * 3608 * Initializes the driver info and hw (all asics). 3609 * Returns 0 for success or an error on failure. 3610 * Called at driver startup. 
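 *
 * Broadly, the code below prepares software state first (locks, work
 * items, MMIO mapping, the reset domain), runs early IP init, optionally
 * resets and posts the asic, initializes ATOM clock info, and finally
 * brings up the fence driver, mode config and the full IP init sequence.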
3611 */ 3612 int amdgpu_device_init(struct amdgpu_device *adev, 3613 uint32_t flags) 3614 { 3615 struct drm_device *ddev = adev_to_drm(adev); 3616 struct pci_dev *pdev = adev->pdev; 3617 int r, i; 3618 bool px = false; 3619 u32 max_MBps; 3620 int tmp; 3621 3622 adev->shutdown = false; 3623 adev->flags = flags; 3624 3625 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3626 adev->asic_type = amdgpu_force_asic_type; 3627 else 3628 adev->asic_type = flags & AMD_ASIC_MASK; 3629 3630 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3631 if (amdgpu_emu_mode == 1) 3632 adev->usec_timeout *= 10; 3633 adev->gmc.gart_size = 512 * 1024 * 1024; 3634 adev->accel_working = false; 3635 adev->num_rings = 0; 3636 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3637 adev->mman.buffer_funcs = NULL; 3638 adev->mman.buffer_funcs_ring = NULL; 3639 adev->vm_manager.vm_pte_funcs = NULL; 3640 adev->vm_manager.vm_pte_num_scheds = 0; 3641 adev->gmc.gmc_funcs = NULL; 3642 adev->harvest_ip_mask = 0x0; 3643 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3644 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3645 3646 adev->smc_rreg = &amdgpu_invalid_rreg; 3647 adev->smc_wreg = &amdgpu_invalid_wreg; 3648 adev->pcie_rreg = &amdgpu_invalid_rreg; 3649 adev->pcie_wreg = &amdgpu_invalid_wreg; 3650 adev->pciep_rreg = &amdgpu_invalid_rreg; 3651 adev->pciep_wreg = &amdgpu_invalid_wreg; 3652 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3653 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3654 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3655 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3656 adev->didt_rreg = &amdgpu_invalid_rreg; 3657 adev->didt_wreg = &amdgpu_invalid_wreg; 3658 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3659 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3660 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3661 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3662 3663 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3664 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3665 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3666 3667 /* mutex initialization are all done here so we 3668 * can recall function without having locking issues */ 3669 rw_init(&adev->firmware.mutex, "agfw"); 3670 rw_init(&adev->pm.mutex, "agpm"); 3671 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3672 rw_init(&adev->srbm_mutex, "srbm"); 3673 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3674 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3675 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3676 rw_init(&adev->mn_lock, "agpumn"); 3677 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3678 hash_init(adev->mn_hash); 3679 rw_init(&adev->psp.mutex, "agpsp"); 3680 rw_init(&adev->notifier_lock, "agnf"); 3681 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3682 rw_init(&adev->benchmark_mutex, "agbm"); 3683 3684 amdgpu_device_init_apu_flags(adev); 3685 3686 r = amdgpu_device_check_arguments(adev); 3687 if (r) 3688 return r; 3689 3690 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3691 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3692 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3693 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3694 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3695 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3696 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3697 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3698 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3699 3700 INIT_LIST_HEAD(&adev->shadow_list); 3701 
rw_init(&adev->shadow_list_lock, "sdwlst"); 3702 3703 INIT_LIST_HEAD(&adev->reset_list); 3704 3705 INIT_LIST_HEAD(&adev->ras_list); 3706 3707 INIT_DELAYED_WORK(&adev->delayed_init_work, 3708 amdgpu_device_delayed_init_work_handler); 3709 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3710 amdgpu_device_delay_enable_gfx_off); 3711 3712 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3713 3714 adev->gfx.gfx_off_req_count = 1; 3715 adev->gfx.gfx_off_residency = 0; 3716 adev->gfx.gfx_off_entrycount = 0; 3717 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3718 3719 atomic_set(&adev->throttling_logging_enabled, 1); 3720 /* 3721 * If throttling continues, logging will be performed every minute 3722 * to avoid log flooding. "-1" is subtracted since the thermal 3723 * throttling interrupt comes every second. Thus, the total logging 3724 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3725 * for the throttling interrupt) = 60 seconds. 3726 */ 3727 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3728 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3729 3730 #ifdef __linux__ 3731 /* Registers mapping */ 3732 /* TODO: block userspace mapping of io register */ 3733 if (adev->asic_type >= CHIP_BONAIRE) { 3734 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3735 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3736 } else { 3737 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3738 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3739 } 3740 3741 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3742 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3743 3744 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3745 if (adev->rmmio == NULL) { 3746 return -ENOMEM; 3747 } 3748 #endif 3749 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3750 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3751 3752 amdgpu_device_get_pcie_info(adev); 3753 3754 if (amdgpu_mcbp) 3755 DRM_INFO("MCBP is enabled\n"); 3756 3757 /* 3758 * The reset domain needs to be present early, before the XGMI hive is discovered 3759 * (if any) and initialized, so the reset sem and in_gpu reset flag can be used 3760 * early on during init and before calling RREG32.
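 *
 * For XGMI parts this per-device domain is temporary: once the hive is
 * discovered during IP init it is dropped and replaced by the hive-wide
 * reset domain (see amdgpu_device_ip_init()).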
3761 */ 3762 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3763 if (!adev->reset_domain) 3764 return -ENOMEM; 3765 3766 /* detect hw virtualization here */ 3767 amdgpu_detect_virtualization(adev); 3768 3769 r = amdgpu_device_get_job_timeout_settings(adev); 3770 if (r) { 3771 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3772 return r; 3773 } 3774 3775 /* early init functions */ 3776 r = amdgpu_device_ip_early_init(adev); 3777 if (r) 3778 return r; 3779 3780 /* Get rid of things like offb */ 3781 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3782 if (r) 3783 return r; 3784 3785 /* Enable TMZ based on IP_VERSION */ 3786 amdgpu_gmc_tmz_set(adev); 3787 3788 amdgpu_gmc_noretry_set(adev); 3789 /* Need to get xgmi info early to decide the reset behavior*/ 3790 if (adev->gmc.xgmi.supported) { 3791 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3792 if (r) 3793 return r; 3794 } 3795 3796 /* enable PCIE atomic ops */ 3797 #ifdef notyet 3798 if (amdgpu_sriov_vf(adev)) 3799 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3800 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3801 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3802 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3803 * internal path natively support atomics, set have_atomics_support to true. 3804 */ 3805 else if ((adev->flags & AMD_IS_APU) && 3806 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3807 adev->have_atomics_support = true; 3808 else 3809 adev->have_atomics_support = 3810 !pci_enable_atomic_ops_to_root(adev->pdev, 3811 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3812 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3813 if (!adev->have_atomics_support) 3814 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3815 #else 3816 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3817 * internal path natively support atomics, set have_atomics_support to true. 3818 */ 3819 if ((adev->flags & AMD_IS_APU) && 3820 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3821 adev->have_atomics_support = true; 3822 else 3823 adev->have_atomics_support = false; 3824 #endif 3825 3826 /* doorbell bar mapping and doorbell index init*/ 3827 amdgpu_device_doorbell_init(adev); 3828 3829 if (amdgpu_emu_mode == 1) { 3830 /* post the asic on emulation mode */ 3831 emu_soc_asic_init(adev); 3832 goto fence_driver_init; 3833 } 3834 3835 amdgpu_reset_init(adev); 3836 3837 /* detect if we are with an SRIOV vbios */ 3838 amdgpu_device_detect_sriov_bios(adev); 3839 3840 /* check if we need to reset the asic 3841 * E.g., driver was not cleanly unloaded previously, etc. 
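 * For XGMI hives the reset is deferred (gmc.xgmi.pending_reset) so that all nodes in the hive can be reset together later.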
3842 */ 3843 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3844 if (adev->gmc.xgmi.num_physical_nodes) { 3845 dev_info(adev->dev, "Pending hive reset.\n"); 3846 adev->gmc.xgmi.pending_reset = true; 3847 /* Only need to init necessary block for SMU to handle the reset */ 3848 for (i = 0; i < adev->num_ip_blocks; i++) { 3849 if (!adev->ip_blocks[i].status.valid) 3850 continue; 3851 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3852 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3853 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3854 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3855 DRM_DEBUG("IP %s disabled for hw_init.\n", 3856 adev->ip_blocks[i].version->funcs->name); 3857 adev->ip_blocks[i].status.hw = true; 3858 } 3859 } 3860 } else { 3861 tmp = amdgpu_reset_method; 3862 /* It should do a default reset when loading or reloading the driver, 3863 * regardless of the module parameter reset_method. 3864 */ 3865 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3866 r = amdgpu_asic_reset(adev); 3867 amdgpu_reset_method = tmp; 3868 if (r) { 3869 dev_err(adev->dev, "asic reset on init failed\n"); 3870 goto failed; 3871 } 3872 } 3873 } 3874 3875 pci_enable_pcie_error_reporting(adev->pdev); 3876 3877 /* Post card if necessary */ 3878 if (amdgpu_device_need_post(adev)) { 3879 if (!adev->bios) { 3880 dev_err(adev->dev, "no vBIOS found\n"); 3881 r = -EINVAL; 3882 goto failed; 3883 } 3884 DRM_INFO("GPU posting now...\n"); 3885 r = amdgpu_device_asic_init(adev); 3886 if (r) { 3887 dev_err(adev->dev, "gpu post error!\n"); 3888 goto failed; 3889 } 3890 } 3891 3892 if (adev->is_atom_fw) { 3893 /* Initialize clocks */ 3894 r = amdgpu_atomfirmware_get_clock_info(adev); 3895 if (r) { 3896 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3897 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3898 goto failed; 3899 } 3900 } else { 3901 /* Initialize clocks */ 3902 r = amdgpu_atombios_get_clock_info(adev); 3903 if (r) { 3904 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3905 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3906 goto failed; 3907 } 3908 /* init i2c buses */ 3909 if (!amdgpu_device_has_dc_support(adev)) 3910 amdgpu_atombios_i2c_init(adev); 3911 } 3912 3913 fence_driver_init: 3914 /* Fence driver */ 3915 r = amdgpu_fence_driver_sw_init(adev); 3916 if (r) { 3917 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3918 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3919 goto failed; 3920 } 3921 3922 /* init the mode config */ 3923 drm_mode_config_init(adev_to_drm(adev)); 3924 3925 r = amdgpu_device_ip_init(adev); 3926 if (r) { 3927 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3928 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3929 goto release_ras_con; 3930 } 3931 3932 amdgpu_fence_driver_hw_init(adev); 3933 3934 dev_info(adev->dev, 3935 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3936 adev->gfx.config.max_shader_engines, 3937 adev->gfx.config.max_sh_per_se, 3938 adev->gfx.config.max_cu_per_sh, 3939 adev->gfx.cu_info.number); 3940 3941 #ifdef __OpenBSD__ 3942 { 3943 const char *chip_name; 3944 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3945 int maj, min, rev; 3946 3947 switch (adev->asic_type) { 3948 case CHIP_RAVEN: 3949 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3950 chip_name = "RAVEN2"; 3951 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3952 
chip_name = "PICASSO"; 3953 else 3954 chip_name = "RAVEN"; 3955 break; 3956 case CHIP_RENOIR: 3957 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3958 chip_name = "RENOIR"; 3959 else 3960 chip_name = "GREEN_SARDINE"; 3961 break; 3962 default: 3963 chip_name = amdgpu_asic_name[adev->asic_type]; 3964 } 3965 3966 printf("%s: %s", adev->self.dv_xname, chip_name); 3967 /* show graphics/compute ip block version, not set on < GFX9 */ 3968 if (version) { 3969 maj = IP_VERSION_MAJ(version); 3970 min = IP_VERSION_MIN(version); 3971 rev = IP_VERSION_REV(version); 3972 printf(" GC %d.%d.%d", maj, min, rev); 3973 } 3974 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3975 } 3976 #endif 3977 3978 adev->accel_working = true; 3979 3980 amdgpu_vm_check_compute_bug(adev); 3981 3982 /* Initialize the buffer migration limit. */ 3983 if (amdgpu_moverate >= 0) 3984 max_MBps = amdgpu_moverate; 3985 else 3986 max_MBps = 8; /* Allow 8 MB/s. */ 3987 /* Get a log2 for easy divisions. */ 3988 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3989 3990 r = amdgpu_pm_sysfs_init(adev); 3991 if (r) { 3992 adev->pm_sysfs_en = false; 3993 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3994 } else 3995 adev->pm_sysfs_en = true; 3996 3997 r = amdgpu_ucode_sysfs_init(adev); 3998 if (r) { 3999 adev->ucode_sysfs_en = false; 4000 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4001 } else 4002 adev->ucode_sysfs_en = true; 4003 4004 r = amdgpu_psp_sysfs_init(adev); 4005 if (r) { 4006 adev->psp_sysfs_en = false; 4007 if (!amdgpu_sriov_vf(adev)) 4008 DRM_ERROR("Creating psp sysfs failed\n"); 4009 } else 4010 adev->psp_sysfs_en = true; 4011 4012 /* 4013 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4014 * Otherwise the mgpu fan boost feature will be skipped due to the 4015 * gpu instance is counted less. 4016 */ 4017 amdgpu_register_gpu_instance(adev); 4018 4019 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4020 * explicit gating rather than handling it automatically. 4021 */ 4022 if (!adev->gmc.xgmi.pending_reset) { 4023 r = amdgpu_device_ip_late_init(adev); 4024 if (r) { 4025 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4026 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4027 goto release_ras_con; 4028 } 4029 /* must succeed. 
*/ 4030 amdgpu_ras_resume(adev); 4031 queue_delayed_work(system_wq, &adev->delayed_init_work, 4032 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4033 } 4034 4035 if (amdgpu_sriov_vf(adev)) { 4036 amdgpu_virt_release_full_gpu(adev, true); 4037 flush_delayed_work(&adev->delayed_init_work); 4038 } 4039 4040 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4041 if (r) 4042 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4043 4044 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4045 r = amdgpu_pmu_init(adev); 4046 if (r) 4047 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4048 4049 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4050 if (amdgpu_device_cache_pci_state(adev->pdev)) 4051 pci_restore_state(pdev); 4052 4053 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4054 /* this will fail for cards that aren't VGA class devices, just 4055 * ignore it */ 4056 #ifdef notyet 4057 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4058 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4059 #endif 4060 4061 px = amdgpu_device_supports_px(ddev); 4062 4063 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4064 apple_gmux_detect(NULL, NULL))) 4065 vga_switcheroo_register_client(adev->pdev, 4066 &amdgpu_switcheroo_ops, px); 4067 4068 if (px) 4069 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4070 4071 if (adev->gmc.xgmi.pending_reset) 4072 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4073 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4074 4075 amdgpu_device_check_iommu_direct_map(adev); 4076 4077 return 0; 4078 4079 release_ras_con: 4080 if (amdgpu_sriov_vf(adev)) 4081 amdgpu_virt_release_full_gpu(adev, true); 4082 4083 /* failed in exclusive mode due to timeout */ 4084 if (amdgpu_sriov_vf(adev) && 4085 !amdgpu_sriov_runtime(adev) && 4086 amdgpu_virt_mmio_blocked(adev) && 4087 !amdgpu_virt_wait_reset(adev)) { 4088 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4089 /* Don't send request since VF is inactive. 
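 * Clearing virt.ops below prevents any further requests to the host while the VF is inactive.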
*/ 4090 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4091 adev->virt.ops = NULL; 4092 r = -EAGAIN; 4093 } 4094 amdgpu_release_ras_context(adev); 4095 4096 failed: 4097 amdgpu_vf_error_trans_all(adev); 4098 4099 return r; 4100 } 4101 4102 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4103 { 4104 STUB(); 4105 #ifdef notyet 4106 /* Clear all CPU mappings pointing to this device */ 4107 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4108 #endif 4109 4110 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4111 amdgpu_device_doorbell_fini(adev); 4112 4113 #ifdef __linux__ 4114 iounmap(adev->rmmio); 4115 adev->rmmio = NULL; 4116 if (adev->mman.aper_base_kaddr) 4117 iounmap(adev->mman.aper_base_kaddr); 4118 adev->mman.aper_base_kaddr = NULL; 4119 #else 4120 if (adev->rmmio_size > 0) 4121 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4122 adev->rmmio_size); 4123 adev->rmmio_size = 0; 4124 adev->rmmio = NULL; 4125 if (adev->mman.aper_base_kaddr) 4126 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4127 adev->gmc.visible_vram_size); 4128 adev->mman.aper_base_kaddr = NULL; 4129 #endif 4130 4131 /* Memory manager related */ 4132 if (!adev->gmc.xgmi.connected_to_cpu) { 4133 #ifdef __linux__ 4134 arch_phys_wc_del(adev->gmc.vram_mtrr); 4135 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4136 #else 4137 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4138 #endif 4139 } 4140 } 4141 4142 /** 4143 * amdgpu_device_fini_hw - tear down the driver 4144 * 4145 * @adev: amdgpu_device pointer 4146 * 4147 * Tear down the driver info (all asics). 4148 * Called at driver shutdown. 4149 */ 4150 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4151 { 4152 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4153 flush_delayed_work(&adev->delayed_init_work); 4154 adev->shutdown = true; 4155 4156 /* make sure IB test finished before entering exclusive mode 4157 * to avoid preemption on IB test 4158 * */ 4159 if (amdgpu_sriov_vf(adev)) { 4160 amdgpu_virt_request_full_gpu(adev, false); 4161 amdgpu_virt_fini_data_exchange(adev); 4162 } 4163 4164 /* disable all interrupts */ 4165 amdgpu_irq_disable_all(adev); 4166 if (adev->mode_info.mode_config_initialized){ 4167 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4168 drm_helper_force_disable_all(adev_to_drm(adev)); 4169 else 4170 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4171 } 4172 amdgpu_fence_driver_hw_fini(adev); 4173 4174 if (adev->mman.initialized) { 4175 flush_delayed_work(&adev->mman.bdev.wq); 4176 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4177 } 4178 4179 if (adev->pm_sysfs_en) 4180 amdgpu_pm_sysfs_fini(adev); 4181 if (adev->ucode_sysfs_en) 4182 amdgpu_ucode_sysfs_fini(adev); 4183 if (adev->psp_sysfs_en) 4184 amdgpu_psp_sysfs_fini(adev); 4185 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4186 4187 /* disable ras feature must before hw fini */ 4188 amdgpu_ras_pre_fini(adev); 4189 4190 amdgpu_device_ip_fini_early(adev); 4191 4192 amdgpu_irq_fini_hw(adev); 4193 4194 if (adev->mman.initialized) 4195 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4196 4197 amdgpu_gart_dummy_page_fini(adev); 4198 4199 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4200 amdgpu_device_unmap_mmio(adev); 4201 4202 } 4203 4204 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4205 { 4206 int idx; 4207 bool px; 4208 4209 amdgpu_fence_driver_sw_fini(adev); 4210 amdgpu_device_ip_fini(adev); 4211 release_firmware(adev->firmware.gpu_info_fw); 4212 
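/* release the cached gpu_info firmware and clear the stale pointer */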
adev->firmware.gpu_info_fw = NULL; 4213 adev->accel_working = false; 4214 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4215 4216 amdgpu_reset_fini(adev); 4217 4218 /* free i2c buses */ 4219 if (!amdgpu_device_has_dc_support(adev)) 4220 amdgpu_i2c_fini(adev); 4221 4222 if (amdgpu_emu_mode != 1) 4223 amdgpu_atombios_fini(adev); 4224 4225 kfree(adev->bios); 4226 adev->bios = NULL; 4227 4228 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4229 4230 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4231 apple_gmux_detect(NULL, NULL))) 4232 vga_switcheroo_unregister_client(adev->pdev); 4233 4234 if (px) 4235 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4236 4237 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4238 vga_client_unregister(adev->pdev); 4239 4240 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4241 #ifdef __linux__ 4242 iounmap(adev->rmmio); 4243 adev->rmmio = NULL; 4244 #else 4245 if (adev->rmmio_size > 0) 4246 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4247 adev->rmmio_size); 4248 adev->rmmio_size = 0; 4249 adev->rmmio = NULL; 4250 #endif 4251 amdgpu_device_doorbell_fini(adev); 4252 drm_dev_exit(idx); 4253 } 4254 4255 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4256 amdgpu_pmu_fini(adev); 4257 if (adev->mman.discovery_bin) 4258 amdgpu_discovery_fini(adev); 4259 4260 amdgpu_reset_put_reset_domain(adev->reset_domain); 4261 adev->reset_domain = NULL; 4262 4263 kfree(adev->pci_state); 4264 4265 } 4266 4267 /** 4268 * amdgpu_device_evict_resources - evict device resources 4269 * @adev: amdgpu device object 4270 * 4271 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4272 * of the vram memory type. Mainly used for evicting device resources 4273 * at suspend time. 4274 * 4275 */ 4276 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4277 { 4278 int ret; 4279 4280 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4281 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4282 return 0; 4283 4284 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4285 if (ret) 4286 DRM_WARN("evicting device resources failed\n"); 4287 return ret; 4288 } 4289 4290 /* 4291 * Suspend & resume. 4292 */ 4293 /** 4294 * amdgpu_device_suspend - initiate device suspend 4295 * 4296 * @dev: drm dev pointer 4297 * @fbcon : notify the fbdev of suspend 4298 * 4299 * Puts the hw in the suspend state (all asics). 4300 * Returns 0 for success or an error on failure. 4301 * Called at driver suspend. 
4302 */ 4303 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4304 { 4305 struct amdgpu_device *adev = drm_to_adev(dev); 4306 int r = 0; 4307 4308 if (adev->shutdown) 4309 return 0; 4310 4311 #ifdef notyet 4312 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4313 return 0; 4314 #endif 4315 4316 adev->in_suspend = true; 4317 4318 if (amdgpu_sriov_vf(adev)) { 4319 amdgpu_virt_fini_data_exchange(adev); 4320 r = amdgpu_virt_request_full_gpu(adev, false); 4321 if (r) 4322 return r; 4323 } 4324 4325 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4326 DRM_WARN("smart shift update failed\n"); 4327 4328 drm_kms_helper_poll_disable(dev); 4329 4330 if (fbcon) 4331 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4332 4333 cancel_delayed_work_sync(&adev->delayed_init_work); 4334 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4335 4336 amdgpu_ras_suspend(adev); 4337 4338 amdgpu_device_ip_suspend_phase1(adev); 4339 4340 if (!adev->in_s0ix) 4341 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4342 4343 r = amdgpu_device_evict_resources(adev); 4344 if (r) 4345 return r; 4346 4347 amdgpu_fence_driver_hw_fini(adev); 4348 4349 amdgpu_device_ip_suspend_phase2(adev); 4350 4351 if (amdgpu_sriov_vf(adev)) 4352 amdgpu_virt_release_full_gpu(adev, false); 4353 4354 return 0; 4355 } 4356 4357 /** 4358 * amdgpu_device_resume - initiate device resume 4359 * 4360 * @dev: drm dev pointer 4361 * @fbcon : notify the fbdev of resume 4362 * 4363 * Bring the hw back to operating state (all asics). 4364 * Returns 0 for success or an error on failure. 4365 * Called at driver resume. 4366 */ 4367 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4368 { 4369 struct amdgpu_device *adev = drm_to_adev(dev); 4370 int r = 0; 4371 4372 if (amdgpu_sriov_vf(adev)) { 4373 r = amdgpu_virt_request_full_gpu(adev, true); 4374 if (r) 4375 return r; 4376 } 4377 4378 #ifdef notyet 4379 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4380 return 0; 4381 #endif 4382 4383 if (adev->in_s0ix) 4384 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4385 4386 /* post card */ 4387 if (amdgpu_device_need_post(adev)) { 4388 r = amdgpu_device_asic_init(adev); 4389 if (r) 4390 dev_err(adev->dev, "amdgpu asic init failed\n"); 4391 } 4392 4393 r = amdgpu_device_ip_resume(adev); 4394 4395 /* no matter what r is, always need to properly release full GPU */ 4396 if (amdgpu_sriov_vf(adev)) { 4397 amdgpu_virt_init_data_exchange(adev); 4398 amdgpu_virt_release_full_gpu(adev, true); 4399 } 4400 4401 if (r) { 4402 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4403 return r; 4404 } 4405 amdgpu_fence_driver_hw_init(adev); 4406 4407 r = amdgpu_device_ip_late_init(adev); 4408 if (r) 4409 return r; 4410 4411 queue_delayed_work(system_wq, &adev->delayed_init_work, 4412 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4413 4414 if (!adev->in_s0ix) { 4415 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4416 if (r) 4417 return r; 4418 } 4419 4420 /* Make sure IB tests flushed */ 4421 flush_delayed_work(&adev->delayed_init_work); 4422 4423 if (adev->in_s0ix) { 4424 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4425 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 
4426 */ 4427 amdgpu_gfx_off_ctrl(adev, true); 4428 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4429 } 4430 if (fbcon) 4431 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4432 4433 drm_kms_helper_poll_enable(dev); 4434 4435 amdgpu_ras_resume(adev); 4436 4437 /* 4438 * Most of the connector probing functions try to acquire runtime pm 4439 * refs to ensure that the GPU is powered on when connector polling is 4440 * performed. Since we're calling this from a runtime PM callback, 4441 * trying to acquire rpm refs will cause us to deadlock. 4442 * 4443 * Since we're guaranteed to be holding the rpm lock, it's safe to 4444 * temporarily disable the rpm helpers so this doesn't deadlock us. 4445 */ 4446 #if defined(CONFIG_PM) && defined(__linux__) 4447 dev->dev->power.disable_depth++; 4448 #endif 4449 if (!amdgpu_device_has_dc_support(adev)) 4450 drm_helper_hpd_irq_event(dev); 4451 else 4452 drm_kms_helper_hotplug_event(dev); 4453 #if defined(CONFIG_PM) && defined(__linux__) 4454 dev->dev->power.disable_depth--; 4455 #endif 4456 adev->in_suspend = false; 4457 4458 if (adev->enable_mes) 4459 amdgpu_mes_self_test(adev); 4460 4461 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4462 DRM_WARN("smart shift update failed\n"); 4463 4464 return 0; 4465 } 4466 4467 /** 4468 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4469 * 4470 * @adev: amdgpu_device pointer 4471 * 4472 * The list of all the hardware IPs that make up the asic is walked and 4473 * the check_soft_reset callbacks are run. check_soft_reset determines 4474 * if the asic is still hung or not. 4475 * Returns true if any of the IPs are still in a hung state, false if not. 4476 */ 4477 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4478 { 4479 int i; 4480 bool asic_hang = false; 4481 4482 if (amdgpu_sriov_vf(adev)) 4483 return true; 4484 4485 if (amdgpu_asic_need_full_reset(adev)) 4486 return true; 4487 4488 for (i = 0; i < adev->num_ip_blocks; i++) { 4489 if (!adev->ip_blocks[i].status.valid) 4490 continue; 4491 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4492 adev->ip_blocks[i].status.hang = 4493 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4494 if (adev->ip_blocks[i].status.hang) { 4495 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4496 asic_hang = true; 4497 } 4498 } 4499 return asic_hang; 4500 } 4501 4502 /** 4503 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4504 * 4505 * @adev: amdgpu_device pointer 4506 * 4507 * The list of all the hardware IPs that make up the asic is walked and the 4508 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4509 * handles any IP specific hardware or software state changes that are 4510 * necessary for a soft reset to succeed. 4511 * Returns 0 on success, negative error code on failure. 
4512 */ 4513 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4514 { 4515 int i, r = 0; 4516 4517 for (i = 0; i < adev->num_ip_blocks; i++) { 4518 if (!adev->ip_blocks[i].status.valid) 4519 continue; 4520 if (adev->ip_blocks[i].status.hang && 4521 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4522 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4523 if (r) 4524 return r; 4525 } 4526 } 4527 4528 return 0; 4529 } 4530 4531 /** 4532 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4533 * 4534 * @adev: amdgpu_device pointer 4535 * 4536 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4537 * reset is necessary to recover. 4538 * Returns true if a full asic reset is required, false if not. 4539 */ 4540 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4541 { 4542 int i; 4543 4544 if (amdgpu_asic_need_full_reset(adev)) 4545 return true; 4546 4547 for (i = 0; i < adev->num_ip_blocks; i++) { 4548 if (!adev->ip_blocks[i].status.valid) 4549 continue; 4550 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4551 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4552 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4553 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4554 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4555 if (adev->ip_blocks[i].status.hang) { 4556 dev_info(adev->dev, "Some block need full reset!\n"); 4557 return true; 4558 } 4559 } 4560 } 4561 return false; 4562 } 4563 4564 /** 4565 * amdgpu_device_ip_soft_reset - do a soft reset 4566 * 4567 * @adev: amdgpu_device pointer 4568 * 4569 * The list of all the hardware IPs that make up the asic is walked and the 4570 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4571 * IP specific hardware or software state changes that are necessary to soft 4572 * reset the IP. 4573 * Returns 0 on success, negative error code on failure. 4574 */ 4575 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4576 { 4577 int i, r = 0; 4578 4579 for (i = 0; i < adev->num_ip_blocks; i++) { 4580 if (!adev->ip_blocks[i].status.valid) 4581 continue; 4582 if (adev->ip_blocks[i].status.hang && 4583 adev->ip_blocks[i].version->funcs->soft_reset) { 4584 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4585 if (r) 4586 return r; 4587 } 4588 } 4589 4590 return 0; 4591 } 4592 4593 /** 4594 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4595 * 4596 * @adev: amdgpu_device pointer 4597 * 4598 * The list of all the hardware IPs that make up the asic is walked and the 4599 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4600 * handles any IP specific hardware or software state changes that are 4601 * necessary after the IP has been soft reset. 4602 * Returns 0 on success, negative error code on failure. 
4603 */ 4604 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4605 { 4606 int i, r = 0; 4607 4608 for (i = 0; i < adev->num_ip_blocks; i++) { 4609 if (!adev->ip_blocks[i].status.valid) 4610 continue; 4611 if (adev->ip_blocks[i].status.hang && 4612 adev->ip_blocks[i].version->funcs->post_soft_reset) 4613 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4614 if (r) 4615 return r; 4616 } 4617 4618 return 0; 4619 } 4620 4621 /** 4622 * amdgpu_device_recover_vram - Recover some VRAM contents 4623 * 4624 * @adev: amdgpu_device pointer 4625 * 4626 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4627 * restore things like GPUVM page tables after a GPU reset where 4628 * the contents of VRAM might be lost. 4629 * 4630 * Returns: 4631 * 0 on success, negative error code on failure. 4632 */ 4633 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4634 { 4635 struct dma_fence *fence = NULL, *next = NULL; 4636 struct amdgpu_bo *shadow; 4637 struct amdgpu_bo_vm *vmbo; 4638 long r = 1, tmo; 4639 4640 if (amdgpu_sriov_runtime(adev)) 4641 tmo = msecs_to_jiffies(8000); 4642 else 4643 tmo = msecs_to_jiffies(100); 4644 4645 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4646 mutex_lock(&adev->shadow_list_lock); 4647 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4648 /* If vm is compute context or adev is APU, shadow will be NULL */ 4649 if (!vmbo->shadow) 4650 continue; 4651 shadow = vmbo->shadow; 4652 4653 /* No need to recover an evicted BO */ 4654 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4655 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4656 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4657 continue; 4658 4659 r = amdgpu_bo_restore_shadow(shadow, &next); 4660 if (r) 4661 break; 4662 4663 if (fence) { 4664 tmo = dma_fence_wait_timeout(fence, false, tmo); 4665 dma_fence_put(fence); 4666 fence = next; 4667 if (tmo == 0) { 4668 r = -ETIMEDOUT; 4669 break; 4670 } else if (tmo < 0) { 4671 r = tmo; 4672 break; 4673 } 4674 } else { 4675 fence = next; 4676 } 4677 } 4678 mutex_unlock(&adev->shadow_list_lock); 4679 4680 if (fence) 4681 tmo = dma_fence_wait_timeout(fence, false, tmo); 4682 dma_fence_put(fence); 4683 4684 if (r < 0 || tmo <= 0) { 4685 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4686 return -EIO; 4687 } 4688 4689 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4690 return 0; 4691 } 4692 4693 4694 /** 4695 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4696 * 4697 * @adev: amdgpu_device pointer 4698 * @from_hypervisor: request from hypervisor 4699 * 4700 * do VF FLR and reinitialize Asic 4701 * return 0 means succeeded otherwise failed 4702 */ 4703 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4704 bool from_hypervisor) 4705 { 4706 int r; 4707 struct amdgpu_hive_info *hive = NULL; 4708 int retry_limit = 0; 4709 4710 retry: 4711 amdgpu_amdkfd_pre_reset(adev); 4712 4713 if (from_hypervisor) 4714 r = amdgpu_virt_request_full_gpu(adev, true); 4715 else 4716 r = amdgpu_virt_reset_gpu(adev); 4717 if (r) 4718 return r; 4719 4720 /* Resume IP prior to SMC */ 4721 r = amdgpu_device_ip_reinit_early_sriov(adev); 4722 if (r) 4723 goto error; 4724 4725 amdgpu_virt_init_data_exchange(adev); 4726 4727 r = amdgpu_device_fw_loading(adev); 4728 if (r) 4729 return r; 4730 4731 /* now we are okay to resume SMC/CP/SDMA */ 4732 r = amdgpu_device_ip_reinit_late_sriov(adev); 4733 if (r) 4734 goto error; 4735 
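/* amdgpu_get_xgmi_hive() takes a reference on the hive; it is dropped with amdgpu_put_xgmi_hive() once the PSP topology has been updated */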
4736 hive = amdgpu_get_xgmi_hive(adev); 4737 /* Update PSP FW topology after reset */ 4738 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4739 r = amdgpu_xgmi_update_topology(hive, adev); 4740 4741 if (hive) 4742 amdgpu_put_xgmi_hive(hive); 4743 4744 if (!r) { 4745 amdgpu_irq_gpu_reset_resume_helper(adev); 4746 r = amdgpu_ib_ring_tests(adev); 4747 4748 amdgpu_amdkfd_post_reset(adev); 4749 } 4750 4751 error: 4752 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4753 amdgpu_inc_vram_lost(adev); 4754 r = amdgpu_device_recover_vram(adev); 4755 } 4756 amdgpu_virt_release_full_gpu(adev, true); 4757 4758 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4759 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4760 retry_limit++; 4761 goto retry; 4762 } else 4763 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4764 } 4765 4766 return r; 4767 } 4768 4769 /** 4770 * amdgpu_device_has_job_running - check if there is any job in mirror list 4771 * 4772 * @adev: amdgpu_device pointer 4773 * 4774 * check if there is any job in mirror list 4775 */ 4776 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4777 { 4778 int i; 4779 struct drm_sched_job *job; 4780 4781 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4782 struct amdgpu_ring *ring = adev->rings[i]; 4783 4784 if (!ring || !ring->sched.thread) 4785 continue; 4786 4787 spin_lock(&ring->sched.job_list_lock); 4788 job = list_first_entry_or_null(&ring->sched.pending_list, 4789 struct drm_sched_job, list); 4790 spin_unlock(&ring->sched.job_list_lock); 4791 if (job) 4792 return true; 4793 } 4794 return false; 4795 } 4796 4797 /** 4798 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4799 * 4800 * @adev: amdgpu_device pointer 4801 * 4802 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4803 * a hung GPU. 
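 * (amdgpu_gpu_recovery: 0 disables recovery, -1 selects a per-ASIC default; SRIOV VFs are always recovered.)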
4804 */ 4805 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4806 { 4807 4808 if (amdgpu_gpu_recovery == 0) 4809 goto disabled; 4810 4811 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4812 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4813 return false; 4814 } 4815 4816 if (amdgpu_sriov_vf(adev)) 4817 return true; 4818 4819 if (amdgpu_gpu_recovery == -1) { 4820 switch (adev->asic_type) { 4821 #ifdef CONFIG_DRM_AMDGPU_SI 4822 case CHIP_VERDE: 4823 case CHIP_TAHITI: 4824 case CHIP_PITCAIRN: 4825 case CHIP_OLAND: 4826 case CHIP_HAINAN: 4827 #endif 4828 #ifdef CONFIG_DRM_AMDGPU_CIK 4829 case CHIP_KAVERI: 4830 case CHIP_KABINI: 4831 case CHIP_MULLINS: 4832 #endif 4833 case CHIP_CARRIZO: 4834 case CHIP_STONEY: 4835 case CHIP_CYAN_SKILLFISH: 4836 goto disabled; 4837 default: 4838 break; 4839 } 4840 } 4841 4842 return true; 4843 4844 disabled: 4845 dev_info(adev->dev, "GPU recovery disabled.\n"); 4846 return false; 4847 } 4848 4849 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4850 { 4851 u32 i; 4852 int ret = 0; 4853 4854 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4855 4856 dev_info(adev->dev, "GPU mode1 reset\n"); 4857 4858 /* disable BM */ 4859 pci_clear_master(adev->pdev); 4860 4861 amdgpu_device_cache_pci_state(adev->pdev); 4862 4863 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4864 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4865 ret = amdgpu_dpm_mode1_reset(adev); 4866 } else { 4867 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4868 ret = psp_gpu_reset(adev); 4869 } 4870 4871 if (ret) 4872 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4873 4874 amdgpu_device_load_pci_state(adev->pdev); 4875 4876 /* wait for asic to come out of reset */ 4877 for (i = 0; i < adev->usec_timeout; i++) { 4878 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4879 4880 if (memsize != 0xffffffff) 4881 break; 4882 udelay(1); 4883 } 4884 4885 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4886 return ret; 4887 } 4888 4889 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4890 struct amdgpu_reset_context *reset_context) 4891 { 4892 int i, r = 0; 4893 struct amdgpu_job *job = NULL; 4894 bool need_full_reset = 4895 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4896 4897 if (reset_context->reset_req_dev == adev) 4898 job = reset_context->job; 4899 4900 if (amdgpu_sriov_vf(adev)) { 4901 /* stop the data exchange thread */ 4902 amdgpu_virt_fini_data_exchange(adev); 4903 } 4904 4905 amdgpu_fence_driver_isr_toggle(adev, true); 4906 4907 /* block all schedulers and reset given job's ring */ 4908 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4909 struct amdgpu_ring *ring = adev->rings[i]; 4910 4911 if (!ring || !ring->sched.thread) 4912 continue; 4913 4914 /*clear job fence from fence drv to avoid force_completion 4915 *leave NULL and vm flush fence in fence drv */ 4916 amdgpu_fence_driver_clear_job_fences(ring); 4917 4918 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4919 amdgpu_fence_driver_force_completion(ring); 4920 } 4921 4922 amdgpu_fence_driver_isr_toggle(adev, false); 4923 4924 if (job && job->vm) 4925 drm_sched_increase_karma(&job->base); 4926 4927 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4928 /* If reset handler not implemented, continue; otherwise return */ 4929 if (r == -ENOSYS) 4930 r = 0; 4931 else 4932 return r; 4933 4934 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4935 if (!amdgpu_sriov_vf(adev)) { 4936 4937 if (!need_full_reset) 4938 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4939 4940 if (!need_full_reset && amdgpu_gpu_recovery) { 4941 amdgpu_device_ip_pre_soft_reset(adev); 4942 r = amdgpu_device_ip_soft_reset(adev); 4943 amdgpu_device_ip_post_soft_reset(adev); 4944 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4945 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4946 need_full_reset = true; 4947 } 4948 } 4949 4950 if (need_full_reset) 4951 r = amdgpu_device_ip_suspend(adev); 4952 if (need_full_reset) 4953 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4954 else 4955 clear_bit(AMDGPU_NEED_FULL_RESET, 4956 &reset_context->flags); 4957 } 4958 4959 return r; 4960 } 4961 4962 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4963 { 4964 int i; 4965 4966 lockdep_assert_held(&adev->reset_domain->sem); 4967 4968 for (i = 0; i < adev->num_regs; i++) { 4969 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4970 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4971 adev->reset_dump_reg_value[i]); 4972 } 4973 4974 return 0; 4975 } 4976 4977 #ifdef CONFIG_DEV_COREDUMP 4978 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4979 size_t count, void *data, size_t datalen) 4980 { 4981 struct drm_printer p; 4982 struct amdgpu_device *adev = data; 4983 struct drm_print_iterator iter; 4984 int i; 4985 4986 iter.data = buffer; 4987 iter.offset = 0; 4988 iter.start = offset; 4989 iter.remain = count; 4990 4991 p = drm_coredump_printer(&iter); 4992 4993 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4994 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4995 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4996 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4997 if (adev->reset_task_info.pid) 4998 drm_printf(&p, "process_name: %s PID: %d\n", 4999 adev->reset_task_info.process_name, 5000 adev->reset_task_info.pid); 5001 5002 if (adev->reset_vram_lost) 5003 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 5004 if (adev->num_regs) { 5005 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 5006 5007 for (i = 0; i < adev->num_regs; i++) 5008 drm_printf(&p, "0x%08x: 0x%08x\n", 5009 adev->reset_dump_reg_list[i], 5010 adev->reset_dump_reg_value[i]); 5011 } 5012 5013 return count - iter.remain; 5014 } 5015 5016 static void amdgpu_devcoredump_free(void *data) 5017 { 5018 } 5019 5020 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 5021 { 5022 struct drm_device *dev = adev_to_drm(adev); 5023 5024 ktime_get_ts64(&adev->reset_time); 5025 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 5026 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5027 } 5028 #endif 5029 5030 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5031 struct amdgpu_reset_context *reset_context) 5032 { 5033 struct amdgpu_device *tmp_adev = NULL; 5034 bool need_full_reset, skip_hw_reset, vram_lost = false; 5035 int r = 0; 5036 bool gpu_reset_for_dev_remove = 0; 5037 5038 /* Try reset handler method first */ 5039 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5040 reset_list); 5041 amdgpu_reset_reg_dumps(tmp_adev); 5042 5043 reset_context->reset_device_list = device_list_handle; 5044 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5045 /* If reset handler not implemented, continue; otherwise return */ 5046 if (r == -ENOSYS) 5047 r = 0; 5048 else 5049 return r; 5050 5051 /* Reset handler not implemented, use the default method */ 5052 
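/* Default flow: read the reset flags cached in reset_context and, unless the HW reset is to be skipped, perform the ASIC reset (run in parallel across an XGMI hive). */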
need_full_reset = 5053 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5054 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5055 5056 gpu_reset_for_dev_remove = 5057 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5058 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5059 5060 /* 5061 * ASIC reset has to be done on all XGMI hive nodes ASAP 5062 * to allow proper links negotiation in FW (within 1 sec) 5063 */ 5064 if (!skip_hw_reset && need_full_reset) { 5065 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5066 /* For XGMI run all resets in parallel to speed up the process */ 5067 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5068 tmp_adev->gmc.xgmi.pending_reset = false; 5069 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5070 r = -EALREADY; 5071 } else 5072 r = amdgpu_asic_reset(tmp_adev); 5073 5074 if (r) { 5075 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5076 r, adev_to_drm(tmp_adev)->unique); 5077 break; 5078 } 5079 } 5080 5081 /* For XGMI wait for all resets to complete before proceed */ 5082 if (!r) { 5083 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5084 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5085 flush_work(&tmp_adev->xgmi_reset_work); 5086 r = tmp_adev->asic_reset_res; 5087 if (r) 5088 break; 5089 } 5090 } 5091 } 5092 } 5093 5094 if (!r && amdgpu_ras_intr_triggered()) { 5095 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5096 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5097 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5098 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5099 } 5100 5101 amdgpu_ras_intr_cleared(); 5102 } 5103 5104 /* Since the mode1 reset affects base ip blocks, the 5105 * phase1 ip blocks need to be resumed. Otherwise there 5106 * will be a BIOS signature error and the psp bootloader 5107 * can't load kdb on the next amdgpu install. 5108 */ 5109 if (gpu_reset_for_dev_remove) { 5110 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5111 amdgpu_device_ip_resume_phase1(tmp_adev); 5112 5113 goto end; 5114 } 5115 5116 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5117 if (need_full_reset) { 5118 /* post card */ 5119 r = amdgpu_device_asic_init(tmp_adev); 5120 if (r) { 5121 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5122 } else { 5123 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5124 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5125 if (r) 5126 goto out; 5127 5128 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5129 if (r) 5130 goto out; 5131 5132 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5133 #ifdef CONFIG_DEV_COREDUMP 5134 tmp_adev->reset_vram_lost = vram_lost; 5135 memset(&tmp_adev->reset_task_info, 0, 5136 sizeof(tmp_adev->reset_task_info)); 5137 if (reset_context->job && reset_context->job->vm) 5138 tmp_adev->reset_task_info = 5139 reset_context->job->vm->task_info; 5140 amdgpu_reset_capture_coredumpm(tmp_adev); 5141 #endif 5142 if (vram_lost) { 5143 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5144 amdgpu_inc_vram_lost(tmp_adev); 5145 } 5146 5147 r = amdgpu_device_fw_loading(tmp_adev); 5148 if (r) 5149 return r; 5150 5151 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5152 if (r) 5153 goto out; 5154 5155 if (vram_lost) 5156 amdgpu_device_fill_reset_magic(tmp_adev); 5157 5158 /* 5159 * Add this ASIC as tracked as reset was already 5160 * complete successfully. 
5161 */ 5162 amdgpu_register_gpu_instance(tmp_adev); 5163 5164 if (!reset_context->hive && 5165 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5166 amdgpu_xgmi_add_device(tmp_adev); 5167 5168 r = amdgpu_device_ip_late_init(tmp_adev); 5169 if (r) 5170 goto out; 5171 5172 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5173 5174 /* 5175 * The GPU enters bad state once faulty pages 5176 * by ECC has reached the threshold, and ras 5177 * recovery is scheduled next. So add one check 5178 * here to break recovery if it indeed exceeds 5179 * bad page threshold, and remind user to 5180 * retire this GPU or setting one bigger 5181 * bad_page_threshold value to fix this once 5182 * probing driver again. 5183 */ 5184 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5185 /* must succeed. */ 5186 amdgpu_ras_resume(tmp_adev); 5187 } else { 5188 r = -EINVAL; 5189 goto out; 5190 } 5191 5192 /* Update PSP FW topology after reset */ 5193 if (reset_context->hive && 5194 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5195 r = amdgpu_xgmi_update_topology( 5196 reset_context->hive, tmp_adev); 5197 } 5198 } 5199 5200 out: 5201 if (!r) { 5202 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5203 r = amdgpu_ib_ring_tests(tmp_adev); 5204 if (r) { 5205 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5206 need_full_reset = true; 5207 r = -EAGAIN; 5208 goto end; 5209 } 5210 } 5211 5212 if (!r) 5213 r = amdgpu_device_recover_vram(tmp_adev); 5214 else 5215 tmp_adev->asic_reset_res = r; 5216 } 5217 5218 end: 5219 if (need_full_reset) 5220 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5221 else 5222 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5223 return r; 5224 } 5225 5226 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5227 { 5228 5229 switch (amdgpu_asic_reset_method(adev)) { 5230 case AMD_RESET_METHOD_MODE1: 5231 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5232 break; 5233 case AMD_RESET_METHOD_MODE2: 5234 adev->mp1_state = PP_MP1_STATE_RESET; 5235 break; 5236 default: 5237 adev->mp1_state = PP_MP1_STATE_NONE; 5238 break; 5239 } 5240 5241 pci_dev_put(p); 5242 } 5243 5244 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5245 { 5246 amdgpu_vf_error_trans_all(adev); 5247 adev->mp1_state = PP_MP1_STATE_NONE; 5248 } 5249 5250 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5251 { 5252 STUB(); 5253 #ifdef notyet 5254 struct pci_dev *p = NULL; 5255 5256 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5257 adev->pdev->bus->number, 1); 5258 if (p) { 5259 pm_runtime_enable(&(p->dev)); 5260 pm_runtime_resume(&(p->dev)); 5261 } 5262 #endif 5263 } 5264 5265 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5266 { 5267 enum amd_reset_method reset_method; 5268 struct pci_dev *p = NULL; 5269 u64 expires; 5270 5271 /* 5272 * For now, only BACO and mode1 reset are confirmed 5273 * to suffer the audio issue without proper suspended. 
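 * For other reset methods no audio suspend is attempted and we return early.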
5274 */ 5275 reset_method = amdgpu_asic_reset_method(adev); 5276 if ((reset_method != AMD_RESET_METHOD_BACO) && 5277 (reset_method != AMD_RESET_METHOD_MODE1)) 5278 return -EINVAL; 5279 5280 STUB(); 5281 return -ENOSYS; 5282 #ifdef notyet 5283 5284 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5285 adev->pdev->bus->number, 1); 5286 if (!p) 5287 return -ENODEV; 5288 5289 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5290 if (!expires) 5291 /* 5292 * If we cannot get the audio device autosuspend delay, 5293 * a fixed 4S interval will be used. Considering 3S is 5294 * the audio controller default autosuspend delay setting. 5295 * 4S used here is guaranteed to cover that. 5296 */ 5297 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5298 5299 while (!pm_runtime_status_suspended(&(p->dev))) { 5300 if (!pm_runtime_suspend(&(p->dev))) 5301 break; 5302 5303 if (expires < ktime_get_mono_fast_ns()) { 5304 dev_warn(adev->dev, "failed to suspend display audio\n"); 5305 pci_dev_put(p); 5306 /* TODO: abort the succeeding gpu reset? */ 5307 return -ETIMEDOUT; 5308 } 5309 } 5310 5311 pm_runtime_disable(&(p->dev)); 5312 5313 pci_dev_put(p); 5314 return 0; 5315 #endif 5316 } 5317 5318 static void amdgpu_device_recheck_guilty_jobs( 5319 struct amdgpu_device *adev, struct list_head *device_list_handle, 5320 struct amdgpu_reset_context *reset_context) 5321 { 5322 int i, r = 0; 5323 5324 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5325 struct amdgpu_ring *ring = adev->rings[i]; 5326 int ret = 0; 5327 struct drm_sched_job *s_job; 5328 5329 if (!ring || !ring->sched.thread) 5330 continue; 5331 5332 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5333 struct drm_sched_job, list); 5334 if (s_job == NULL) 5335 continue; 5336 5337 /* clear job's guilty and depend the folowing step to decide the real one */ 5338 drm_sched_reset_karma(s_job); 5339 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5340 5341 if (!s_job->s_fence->parent) { 5342 DRM_WARN("Failed to get a HW fence for job!"); 5343 continue; 5344 } 5345 5346 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5347 if (ret == 0) { /* timeout */ 5348 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5349 ring->sched.name, s_job->id); 5350 5351 5352 amdgpu_fence_driver_isr_toggle(adev, true); 5353 5354 /* Clear this failed job from fence array */ 5355 amdgpu_fence_driver_clear_job_fences(ring); 5356 5357 amdgpu_fence_driver_isr_toggle(adev, false); 5358 5359 /* Since the job won't signal and we go for 5360 * another resubmit drop this parent pointer 5361 */ 5362 dma_fence_put(s_job->s_fence->parent); 5363 s_job->s_fence->parent = NULL; 5364 5365 /* set guilty */ 5366 drm_sched_increase_karma(s_job); 5367 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5368 retry: 5369 /* do hw reset */ 5370 if (amdgpu_sriov_vf(adev)) { 5371 amdgpu_virt_fini_data_exchange(adev); 5372 r = amdgpu_device_reset_sriov(adev, false); 5373 if (r) 5374 adev->asic_reset_res = r; 5375 } else { 5376 clear_bit(AMDGPU_SKIP_HW_RESET, 5377 &reset_context->flags); 5378 r = amdgpu_do_asic_reset(device_list_handle, 5379 reset_context); 5380 if (r && r == -EAGAIN) 5381 goto retry; 5382 } 5383 5384 /* 5385 * add reset counter so that the following 5386 * resubmitted job could flush vmid 5387 */ 5388 atomic_inc(&adev->gpu_reset_counter); 5389 continue; 5390 } 5391 5392 /* got the hw fence, signal finished fence */ 5393 atomic_dec(ring->sched.score); 5394 dma_fence_get(&s_job->s_fence->finished); 5395 dma_fence_signal(&s_job->s_fence->finished); 5396 dma_fence_put(&s_job->s_fence->finished); 5397 5398 /* remove node from list and free the job */ 5399 spin_lock(&ring->sched.job_list_lock); 5400 list_del_init(&s_job->list); 5401 spin_unlock(&ring->sched.job_list_lock); 5402 ring->sched.ops->free_job(s_job); 5403 } 5404 } 5405 5406 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5407 { 5408 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5409 5410 #if defined(CONFIG_DEBUG_FS) 5411 if (!amdgpu_sriov_vf(adev)) 5412 cancel_work(&adev->reset_work); 5413 #endif 5414 5415 if (adev->kfd.dev) 5416 cancel_work(&adev->kfd.reset_work); 5417 5418 if (amdgpu_sriov_vf(adev)) 5419 cancel_work(&adev->virt.flr_work); 5420 5421 if (con && adev->ras_enabled) 5422 cancel_work(&con->recovery_work); 5423 5424 } 5425 5426 5427 /** 5428 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5429 * 5430 * @adev: amdgpu_device pointer 5431 * @job: which job trigger hang 5432 * 5433 * Attempt to reset the GPU if it has hung (all asics). 5434 * Attempt to do soft-reset or full-reset and reinitialize Asic 5435 * Returns 0 for success or an error on failure. 5436 */ 5437 5438 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5439 struct amdgpu_job *job, 5440 struct amdgpu_reset_context *reset_context) 5441 { 5442 struct list_head device_list, *device_list_handle = NULL; 5443 bool job_signaled = false; 5444 struct amdgpu_hive_info *hive = NULL; 5445 struct amdgpu_device *tmp_adev = NULL; 5446 int i, r = 0; 5447 bool need_emergency_restart = false; 5448 bool audio_suspended = false; 5449 int tmp_vram_lost_counter; 5450 bool gpu_reset_for_dev_remove = false; 5451 5452 gpu_reset_for_dev_remove = 5453 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5454 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5455 5456 /* 5457 * Special case: RAS triggered and full reset isn't supported 5458 */ 5459 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5460 5461 /* 5462 * Flush RAM to disk so that after reboot 5463 * the user can read log and see why the system rebooted. 
5464 */ 5465 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5466 DRM_WARN("Emergency reboot."); 5467 5468 #ifdef notyet 5469 ksys_sync_helper(); 5470 emergency_restart(); 5471 #else 5472 panic("emergency_restart"); 5473 #endif 5474 } 5475 5476 dev_info(adev->dev, "GPU %s begin!\n", 5477 need_emergency_restart ? "jobs stop":"reset"); 5478 5479 if (!amdgpu_sriov_vf(adev)) 5480 hive = amdgpu_get_xgmi_hive(adev); 5481 if (hive) 5482 mutex_lock(&hive->hive_lock); 5483 5484 reset_context->job = job; 5485 reset_context->hive = hive; 5486 /* 5487 * Build list of devices to reset. 5488 * In case we are in XGMI hive mode, resort the device list 5489 * to put adev in the 1st position. 5490 */ 5491 INIT_LIST_HEAD(&device_list); 5492 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5493 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5494 list_add_tail(&tmp_adev->reset_list, &device_list); 5495 if (gpu_reset_for_dev_remove && adev->shutdown) 5496 tmp_adev->shutdown = true; 5497 } 5498 if (!list_is_first(&adev->reset_list, &device_list)) 5499 list_rotate_to_front(&adev->reset_list, &device_list); 5500 device_list_handle = &device_list; 5501 } else { 5502 list_add_tail(&adev->reset_list, &device_list); 5503 device_list_handle = &device_list; 5504 } 5505 5506 /* We need to lock reset domain only once both for XGMI and single device */ 5507 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5508 reset_list); 5509 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5510 5511 /* block all schedulers and reset given job's ring */ 5512 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5513 5514 amdgpu_device_set_mp1_state(tmp_adev); 5515 5516 /* 5517 * Try to put the audio codec into suspend state 5518 * before gpu reset started. 5519 * 5520 * Due to the power domain of the graphics device 5521 * is shared with AZ power domain. Without this, 5522 * we may change the audio hardware from behind 5523 * the audio driver's back. That will trigger 5524 * some audio codec errors. 5525 */ 5526 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5527 audio_suspended = true; 5528 5529 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5530 5531 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5532 5533 if (!amdgpu_sriov_vf(tmp_adev)) 5534 amdgpu_amdkfd_pre_reset(tmp_adev); 5535 5536 /* 5537 * Mark these ASICs to be reseted as untracked first 5538 * And add them back after reset completed 5539 */ 5540 amdgpu_unregister_gpu_instance(tmp_adev); 5541 5542 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5543 5544 /* disable ras on ALL IPs */ 5545 if (!need_emergency_restart && 5546 amdgpu_device_ip_need_full_reset(tmp_adev)) 5547 amdgpu_ras_suspend(tmp_adev); 5548 5549 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5550 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5551 5552 if (!ring || !ring->sched.thread) 5553 continue; 5554 5555 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5556 5557 if (need_emergency_restart) 5558 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5559 } 5560 atomic_inc(&tmp_adev->gpu_reset_counter); 5561 } 5562 5563 if (need_emergency_restart) 5564 goto skip_sched_resume; 5565 5566 /* 5567 * Must check guilty signal here since after this point all old 5568 * HW fences are force signaled. 
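 * If the guilty job has already signaled, the hang resolved on its own and the HW reset below is skipped.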
5569 * 5570 * job->base holds a reference to parent fence 5571 */ 5572 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5573 job_signaled = true; 5574 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5575 goto skip_hw_reset; 5576 } 5577 5578 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5579 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5580 if (gpu_reset_for_dev_remove) { 5581 /* Workaroud for ASICs need to disable SMC first */ 5582 amdgpu_device_smu_fini_early(tmp_adev); 5583 } 5584 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5585 /*TODO Should we stop ?*/ 5586 if (r) { 5587 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5588 r, adev_to_drm(tmp_adev)->unique); 5589 tmp_adev->asic_reset_res = r; 5590 } 5591 5592 /* 5593 * Drop all pending non scheduler resets. Scheduler resets 5594 * were already dropped during drm_sched_stop 5595 */ 5596 amdgpu_device_stop_pending_resets(tmp_adev); 5597 } 5598 5599 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5600 /* Actual ASIC resets if needed.*/ 5601 /* Host driver will handle XGMI hive reset for SRIOV */ 5602 if (amdgpu_sriov_vf(adev)) { 5603 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5604 if (r) 5605 adev->asic_reset_res = r; 5606 5607 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5608 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5609 amdgpu_ras_resume(adev); 5610 } else { 5611 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5612 if (r && r == -EAGAIN) 5613 goto retry; 5614 5615 if (!r && gpu_reset_for_dev_remove) 5616 goto recover_end; 5617 } 5618 5619 skip_hw_reset: 5620 5621 /* Post ASIC reset for all devs .*/ 5622 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5623 5624 /* 5625 * Sometimes a later bad compute job can block a good gfx job as gfx 5626 * and compute ring share internal GC HW mutually. We add an additional 5627 * guilty jobs recheck step to find the real guilty job, it synchronously 5628 * submits and pends for the first job being signaled. If it gets timeout, 5629 * we identify it as a real guilty job. 5630 */ 5631 if (amdgpu_gpu_recovery == 2 && 5632 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5633 amdgpu_device_recheck_guilty_jobs( 5634 tmp_adev, device_list_handle, reset_context); 5635 5636 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5637 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5638 5639 if (!ring || !ring->sched.thread) 5640 continue; 5641 5642 /* No point to resubmit jobs if we didn't HW reset*/ 5643 if (!tmp_adev->asic_reset_res && !job_signaled) 5644 drm_sched_resubmit_jobs(&ring->sched); 5645 5646 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5647 } 5648 5649 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5650 amdgpu_mes_self_test(tmp_adev); 5651 5652 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5653 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5654 } 5655 5656 if (tmp_adev->asic_reset_res) 5657 r = tmp_adev->asic_reset_res; 5658 5659 tmp_adev->asic_reset_res = 0; 5660 5661 if (r) { 5662 /* bad news, how to tell it to userspace ? 
			 */
5663 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5664 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5665 		} else {
5666 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5667 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5668 				DRM_WARN("smart shift update failed\n");
5669 		}
5670 	}
5671 
5672 skip_sched_resume:
5673 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5674 		/* unlock kfd: SRIOV would do it separately */
5675 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5676 			amdgpu_amdkfd_post_reset(tmp_adev);
5677 
5678 		/* kfd_post_reset will do nothing if the kfd device is not initialized,
5679 		 * so bring up kfd here if it was not initialized before.
5680 		 */
5681 		if (!adev->kfd.init_complete)
5682 			amdgpu_amdkfd_device_init(adev);
5683 
5684 		if (audio_suspended)
5685 			amdgpu_device_resume_display_audio(tmp_adev);
5686 
5687 		amdgpu_device_unset_mp1_state(tmp_adev);
5688 	}
5689 
5690 recover_end:
5691 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5692 				    reset_list);
5693 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5694 
5695 	if (hive) {
5696 		mutex_unlock(&hive->hive_lock);
5697 		amdgpu_put_xgmi_hive(hive);
5698 	}
5699 
5700 	if (r)
5701 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5702 
5703 	atomic_set(&adev->reset_domain->reset_res, r);
5704 	return r;
5705 }
5706 
5707 /**
5708  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5709  *
5710  * @adev: amdgpu_device pointer
5711  *
5712  * Fetches and stores in the driver the PCIe capabilities (gen speed
5713  * and lanes) of the slot the device is in. Handles APUs and
5714  * virtualized environments where PCIe config space may not be available.
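 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters,
 * when set, override the detected values (see below).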
5715 */ 5716 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5717 { 5718 struct pci_dev *pdev; 5719 enum pci_bus_speed speed_cap, platform_speed_cap; 5720 enum pcie_link_width platform_link_width; 5721 5722 if (amdgpu_pcie_gen_cap) 5723 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5724 5725 if (amdgpu_pcie_lane_cap) 5726 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5727 5728 /* covers APUs as well */ 5729 if (pci_is_root_bus(adev->pdev->bus)) { 5730 if (adev->pm.pcie_gen_mask == 0) 5731 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5732 if (adev->pm.pcie_mlw_mask == 0) 5733 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5734 return; 5735 } 5736 5737 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5738 return; 5739 5740 pcie_bandwidth_available(adev->pdev, NULL, 5741 &platform_speed_cap, &platform_link_width); 5742 5743 if (adev->pm.pcie_gen_mask == 0) { 5744 /* asic caps */ 5745 pdev = adev->pdev; 5746 speed_cap = pcie_get_speed_cap(pdev); 5747 if (speed_cap == PCI_SPEED_UNKNOWN) { 5748 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5749 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5750 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5751 } else { 5752 if (speed_cap == PCIE_SPEED_32_0GT) 5753 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5754 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5755 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5756 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5757 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5758 else if (speed_cap == PCIE_SPEED_16_0GT) 5759 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5760 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5761 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5762 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5763 else if (speed_cap == PCIE_SPEED_8_0GT) 5764 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5765 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5766 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5767 else if (speed_cap == PCIE_SPEED_5_0GT) 5768 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5769 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5770 else 5771 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5772 } 5773 /* platform caps */ 5774 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5775 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5776 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5777 } else { 5778 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5779 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5780 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5781 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5782 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5783 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5784 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5785 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5786 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5787 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5788 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5789 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5790 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5791 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5792 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5793 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5794 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5795 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5796 else 5797 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5798 5799 } 5800 } 5801 if (adev->pm.pcie_mlw_mask == 0) { 5802 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5803 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5804 } else { 5805 switch (platform_link_width) { 5806 case PCIE_LNK_X32: 5807 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5808 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5809 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5810 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5811 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5812 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5813 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5814 break; 5815 case PCIE_LNK_X16: 5816 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5822 break; 5823 case PCIE_LNK_X12: 5824 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5826 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5829 break; 5830 case PCIE_LNK_X8: 5831 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5834 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5835 break; 5836 case PCIE_LNK_X4: 5837 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5838 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5840 break; 5841 case PCIE_LNK_X2: 5842 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5843 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5844 break; 5845 case PCIE_LNK_X1: 5846 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5847 break; 5848 default: 5849 break; 5850 } 5851 } 5852 } 5853 } 5854 5855 /** 5856 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5857 * 5858 * @adev: amdgpu_device pointer 5859 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5860 * 5861 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5862 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5863 * @peer_adev. 5864 */ 5865 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5866 struct amdgpu_device *peer_adev) 5867 { 5868 #ifdef CONFIG_HSA_AMD_P2P 5869 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5870 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5871 resource_size_t aper_limit = 5872 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5873 bool p2p_access = 5874 !adev->gmc.xgmi.connected_to_cpu && 5875 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5876 5877 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5878 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5879 !(adev->gmc.aper_base & address_mask || 5880 aper_limit & address_mask)); 5881 #else 5882 return false; 5883 #endif 5884 } 5885 5886 int amdgpu_device_baco_enter(struct drm_device *dev) 5887 { 5888 struct amdgpu_device *adev = drm_to_adev(dev); 5889 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5890 5891 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5892 return -ENOTSUPP; 5893 5894 if (ras && adev->ras_enabled && 5895 adev->nbio.funcs->enable_doorbell_interrupt) 5896 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5897 5898 return amdgpu_dpm_baco_enter(adev); 5899 } 5900 5901 int amdgpu_device_baco_exit(struct drm_device *dev) 5902 { 5903 struct amdgpu_device *adev = drm_to_adev(dev); 5904 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5905 int ret = 0; 5906 5907 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5908 return -ENOTSUPP; 5909 5910 ret = amdgpu_dpm_baco_exit(adev); 5911 if (ret) 5912 return ret; 5913 5914 if (ras && adev->ras_enabled && 5915 adev->nbio.funcs->enable_doorbell_interrupt) 5916 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5917 5918 if (amdgpu_passthrough(adev) && 5919 adev->nbio.funcs->clear_doorbell_interrupt) 5920 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5921 5922 return 0; 5923 } 5924 5925 /** 5926 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5927 * @pdev: PCI device struct 5928 * @state: PCI channel state 5929 * 5930 * Description: Called when a PCI error is detected. 5931 * 5932 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
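 * On this port the callback is currently stubbed out; the Linux recovery
 * path below is kept under "#ifdef notyet" for reference.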
5933  */
5934 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5935 {
5936 	STUB();
5937 	return 0;
5938 #ifdef notyet
5939 	struct drm_device *dev = pci_get_drvdata(pdev);
5940 	struct amdgpu_device *adev = drm_to_adev(dev);
5941 	int i;
5942 
5943 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5944 
5945 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5946 		DRM_WARN("No support for XGMI hive yet...");
5947 		return PCI_ERS_RESULT_DISCONNECT;
5948 	}
5949 
5950 	adev->pci_channel_state = state;
5951 
5952 	switch (state) {
5953 	case pci_channel_io_normal:
5954 		return PCI_ERS_RESULT_CAN_RECOVER;
5955 	/* Fatal error, prepare for slot reset */
5956 	case pci_channel_io_frozen:
5957 		/*
5958 		 * Locking adev->reset_domain->sem will prevent any external access
5959 		 * to GPU during PCI error recovery
5960 		 */
5961 		amdgpu_device_lock_reset_domain(adev->reset_domain);
5962 		amdgpu_device_set_mp1_state(adev);
5963 
5964 		/*
5965 		 * Block any work scheduling as we do for regular GPU reset
5966 		 * for the duration of the recovery
5967 		 */
5968 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5969 			struct amdgpu_ring *ring = adev->rings[i];
5970 
5971 			if (!ring || !ring->sched.thread)
5972 				continue;
5973 
5974 			drm_sched_stop(&ring->sched, NULL);
5975 		}
5976 		atomic_inc(&adev->gpu_reset_counter);
5977 		return PCI_ERS_RESULT_NEED_RESET;
5978 	case pci_channel_io_perm_failure:
5979 		/* Permanent error, prepare for device removal */
5980 		return PCI_ERS_RESULT_DISCONNECT;
5981 	}
5982 
5983 	return PCI_ERS_RESULT_NEED_RESET;
5984 #endif
5985 }
5986 
5987 /**
5988  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5989  * @pdev: pointer to PCI device
5990  */
5991 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5992 {
5993 
5994 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5995 
5996 	/* TODO - dump whatever for debugging purposes */
5997 
5998 	/* This is called only if amdgpu_pci_error_detected returns
5999 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6000 	 * works, so there is no need to reset the slot.
6001 	 */
6002 
6003 	return PCI_ERS_RESULT_RECOVERED;
6004 }
6005 
6006 /**
6007  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6008  * @pdev: PCI device struct
6009  *
6010  * Description: This routine is called by the pci error recovery
6011  * code after the PCI slot has been reset, just before we
6012  * should resume normal operations.
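 * Currently stubbed out to return PCI_ERS_RESULT_RECOVERED on this port;
 * the full recovery sequence below is kept under "#ifdef notyet".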
6013  */
6014 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6015 {
6016 	STUB();
6017 	return PCI_ERS_RESULT_RECOVERED;
6018 #ifdef notyet
6019 	struct drm_device *dev = pci_get_drvdata(pdev);
6020 	struct amdgpu_device *adev = drm_to_adev(dev);
6021 	int r, i;
6022 	struct amdgpu_reset_context reset_context;
6023 	u32 memsize;
6024 	struct list_head device_list;
6025 
6026 	DRM_INFO("PCI error: slot reset callback!!\n");
6027 
6028 	memset(&reset_context, 0, sizeof(reset_context));
6029 
6030 	INIT_LIST_HEAD(&device_list);
6031 	list_add_tail(&adev->reset_list, &device_list);
6032 
6033 	/* wait for asic to come out of reset */
6034 	drm_msleep(500);
6035 
6036 	/* Restore PCI config space */
6037 	amdgpu_device_load_pci_state(pdev);
6038 
6039 	/* confirm ASIC came out of reset */
6040 	for (i = 0; i < adev->usec_timeout; i++) {
6041 		memsize = amdgpu_asic_get_config_memsize(adev);
6042 
6043 		if (memsize != 0xffffffff)
6044 			break;
6045 		udelay(1);
6046 	}
6047 	if (memsize == 0xffffffff) {
6048 		r = -ETIME;
6049 		goto out;
6050 	}
6051 
6052 	reset_context.method = AMD_RESET_METHOD_NONE;
6053 	reset_context.reset_req_dev = adev;
6054 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6055 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6056 
6057 	adev->no_hw_access = true;
6058 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6059 	adev->no_hw_access = false;
6060 	if (r)
6061 		goto out;
6062 
6063 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
6064 
6065 out:
6066 	if (!r) {
6067 		if (amdgpu_device_cache_pci_state(adev->pdev))
6068 			pci_restore_state(adev->pdev);
6069 
6070 		DRM_INFO("PCIe error recovery succeeded\n");
6071 	} else {
6072 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
6073 		amdgpu_device_unset_mp1_state(adev);
6074 		amdgpu_device_unlock_reset_domain(adev->reset_domain);
6075 	}
6076 
6077 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6078 #endif
6079 }
6080 
6081 /**
6082  * amdgpu_pci_resume() - resume normal ops after PCI reset
6083  * @pdev: pointer to PCI device
6084  *
6085  * Called when the error recovery driver tells us that it's
6086  * OK to resume normal operation.
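 * Only restarts the schedulers when the channel state was
 * pci_channel_io_frozen; currently stubbed out on this port.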
6087  */
6088 void amdgpu_pci_resume(struct pci_dev *pdev)
6089 {
6090 	STUB();
6091 #ifdef notyet
6092 	struct drm_device *dev = pci_get_drvdata(pdev);
6093 	struct amdgpu_device *adev = drm_to_adev(dev);
6094 	int i;
6095 
6096 
6097 	DRM_INFO("PCI error: resume callback!!\n");
6098 
6099 	/* Only continue execution for the case of pci_channel_io_frozen */
6100 	if (adev->pci_channel_state != pci_channel_io_frozen)
6101 		return;
6102 
6103 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6104 		struct amdgpu_ring *ring = adev->rings[i];
6105 
6106 		if (!ring || !ring->sched.thread)
6107 			continue;
6108 
6109 
6110 		drm_sched_resubmit_jobs(&ring->sched);
6111 		drm_sched_start(&ring->sched, true);
6112 	}
6113 
6114 	amdgpu_device_unset_mp1_state(adev);
6115 	amdgpu_device_unlock_reset_domain(adev->reset_domain);
6116 #endif
6117 }
6118 
6119 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6120 {
6121 	return false;
6122 #ifdef notyet
6123 	struct drm_device *dev = pci_get_drvdata(pdev);
6124 	struct amdgpu_device *adev = drm_to_adev(dev);
6125 	int r;
6126 
6127 	r = pci_save_state(pdev);
6128 	if (!r) {
6129 		kfree(adev->pci_state);
6130 
6131 		adev->pci_state = pci_store_saved_state(pdev);
6132 
6133 		if (!adev->pci_state) {
6134 			DRM_ERROR("Failed to store PCI saved state");
6135 			return false;
6136 		}
6137 	} else {
6138 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
6139 		return false;
6140 	}
6141 
6142 	return true;
6143 #endif
6144 }
6145 
6146 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6147 {
6148 	STUB();
6149 	return false;
6150 #ifdef notyet
6151 	struct drm_device *dev = pci_get_drvdata(pdev);
6152 	struct amdgpu_device *adev = drm_to_adev(dev);
6153 	int r;
6154 
6155 	if (!adev->pci_state)
6156 		return false;
6157 
6158 	r = pci_load_saved_state(pdev, adev->pci_state);
6159 
6160 	if (!r) {
6161 		pci_restore_state(pdev);
6162 	} else {
6163 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
6164 		return false;
6165 	}
6166 
6167 	return true;
6168 #endif
6169 }
6170 
6171 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6172 		struct amdgpu_ring *ring)
6173 {
6174 #ifdef CONFIG_X86_64
6175 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6176 		return;
6177 #endif
6178 	if (adev->gmc.xgmi.connected_to_cpu)
6179 		return;
6180 
6181 	if (ring && ring->funcs->emit_hdp_flush)
6182 		amdgpu_ring_emit_hdp_flush(ring);
6183 	else
6184 		amdgpu_asic_flush_hdp(adev, ring);
6185 }
6186 
6187 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6188 		struct amdgpu_ring *ring)
6189 {
6190 #ifdef CONFIG_X86_64
6191 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6192 		return;
6193 #endif
6194 	if (adev->gmc.xgmi.connected_to_cpu)
6195 		return;
6196 
6197 	amdgpu_asic_invalidate_hdp(adev, ring);
6198 }
6199 
6200 int amdgpu_in_reset(struct amdgpu_device *adev)
6201 {
6202 	return atomic_read(&adev->reset_domain->in_gpu_reset);
6203 }
6204 
6205 /**
6206  * amdgpu_device_halt() - bring hardware to some kind of halt state
6207  *
6208  * @adev: amdgpu_device pointer
6209  *
6210  * Bring hardware to some kind of halt state so that no one can touch it
6211  * any more. It helps preserve the error context when an error occurs.
6212  * Compared to a simple hang, the system stays stable at least for SSH
6213  * access. It should then be trivial to inspect the hardware state and
6214  * see what's going on. Implemented as follows:
6215  *
6216  * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.),
6217  * clears all CPU mappings to device, disallows remappings through page faults
6218  * 2.
amdgpu_irq_disable_all() disables all interrupts
6219  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6220  * 4. set adev->no_hw_access to avoid potential crashes after step 5
6221  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6222  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6223  *    flush any in-flight DMA operations
6224  */
6225 void amdgpu_device_halt(struct amdgpu_device *adev)
6226 {
6227 	struct pci_dev *pdev = adev->pdev;
6228 	struct drm_device *ddev = adev_to_drm(adev);
6229 
6230 	drm_dev_unplug(ddev);
6231 
6232 	amdgpu_irq_disable_all(adev);
6233 
6234 	amdgpu_fence_driver_hw_fini(adev);
6235 
6236 	adev->no_hw_access = true;
6237 
6238 	amdgpu_device_unmap_mmio(adev);
6239 
6240 	pci_disable_device(pdev);
6241 	pci_wait_for_pending_transaction(pdev);
6242 }
6243 
6244 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6245 				u32 reg)
6246 {
6247 	unsigned long flags, address, data;
6248 	u32 r;
6249 
6250 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6251 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6252 
6253 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6254 	WREG32(address, reg * 4);
6255 	(void)RREG32(address);
6256 	r = RREG32(data);
6257 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6258 	return r;
6259 }
6260 
6261 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6262 				u32 reg, u32 v)
6263 {
6264 	unsigned long flags, address, data;
6265 
6266 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6267 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6268 
6269 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6270 	WREG32(address, reg * 4);
6271 	(void)RREG32(address);
6272 	WREG32(data, v);
6273 	(void)RREG32(data);
6274 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6275 }
6276 
6277 /**
6278  * amdgpu_device_switch_gang - switch to a new gang
6279  * @adev: amdgpu_device pointer
6280  * @gang: the gang to switch to
6281  *
6282  * Try to switch to a new gang.
6283  * Returns: NULL if we switched to the new gang or a reference to the current
6284  * gang leader.
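 *
 * Note: the cmpxchg loop below installs the new gang only once the previous
 * leader has signaled; until then the old, still-running leader is returned
 * with a reference the caller must eventually drop via dma_fence_put().
 * A minimal caller sketch (hypothetical, for illustration only):
 *
 *	fence = amdgpu_device_switch_gang(adev, gang);
 *	if (fence)
 *		return fence;	(wait on the old leader before retrying)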
6285 */ 6286 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6287 struct dma_fence *gang) 6288 { 6289 struct dma_fence *old = NULL; 6290 6291 do { 6292 dma_fence_put(old); 6293 rcu_read_lock(); 6294 old = dma_fence_get_rcu_safe(&adev->gang_submit); 6295 rcu_read_unlock(); 6296 6297 if (old == gang) 6298 break; 6299 6300 if (!dma_fence_is_signaled(old)) 6301 return old; 6302 6303 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6304 old, gang) != old); 6305 6306 dma_fence_put(old); 6307 return NULL; 6308 } 6309 6310 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6311 { 6312 switch (adev->asic_type) { 6313 #ifdef CONFIG_DRM_AMDGPU_SI 6314 case CHIP_HAINAN: 6315 #endif 6316 case CHIP_TOPAZ: 6317 /* chips with no display hardware */ 6318 return false; 6319 #ifdef CONFIG_DRM_AMDGPU_SI 6320 case CHIP_TAHITI: 6321 case CHIP_PITCAIRN: 6322 case CHIP_VERDE: 6323 case CHIP_OLAND: 6324 #endif 6325 #ifdef CONFIG_DRM_AMDGPU_CIK 6326 case CHIP_BONAIRE: 6327 case CHIP_HAWAII: 6328 case CHIP_KAVERI: 6329 case CHIP_KABINI: 6330 case CHIP_MULLINS: 6331 #endif 6332 case CHIP_TONGA: 6333 case CHIP_FIJI: 6334 case CHIP_POLARIS10: 6335 case CHIP_POLARIS11: 6336 case CHIP_POLARIS12: 6337 case CHIP_VEGAM: 6338 case CHIP_CARRIZO: 6339 case CHIP_STONEY: 6340 /* chips with display hardware */ 6341 return true; 6342 default: 6343 /* IP discovery */ 6344 if (!adev->ip_versions[DCE_HWIP][0] || 6345 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6346 return false; 6347 return true; 6348 } 6349 } 6350