1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/vgaarb.h> 45 #include <linux/vga_switcheroo.h> 46 #include <linux/efi.h> 47 #include "amdgpu.h" 48 #include "amdgpu_trace.h" 49 #include "amdgpu_i2c.h" 50 #include "atom.h" 51 #include "amdgpu_atombios.h" 52 #include "amdgpu_atomfirmware.h" 53 #include "amd_pcie.h" 54 #ifdef CONFIG_DRM_AMDGPU_SI 55 #include "si.h" 56 #endif 57 #ifdef CONFIG_DRM_AMDGPU_CIK 58 #include "cik.h" 59 #endif 60 #include "vi.h" 61 #include "soc15.h" 62 #include "nv.h" 63 #include "bif/bif_4_1_d.h" 64 #include <linux/firmware.h> 65 #include "amdgpu_vf_error.h" 66 67 #include "amdgpu_amdkfd.h" 68 #include "amdgpu_pm.h" 69 70 #include "amdgpu_xgmi.h" 71 #include "amdgpu_ras.h" 72 #include "amdgpu_pmu.h" 73 #include "amdgpu_fru_eeprom.h" 74 #include "amdgpu_reset.h" 75 76 #include <linux/suspend.h> 77 #include <drm/task_barrier.h> 78 #include <linux/pm_runtime.h> 79 80 #include <drm/drm_drv.h> 81 82 #if IS_ENABLED(CONFIG_X86) && defined(__linux__) 83 #include <asm/intel-family.h> 84 #endif 85 86 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 93 94 #define AMDGPU_RESUME_MS 2000 95 #define AMDGPU_MAX_RETRY_LIMIT 2 96 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 97 98 static const struct drm_driver amdgpu_kms_driver; 99 100 const char *amdgpu_asic_name[] = { 101 "TAHITI", 102 "PITCAIRN", 103 "VERDE", 104 "OLAND", 105 "HAINAN", 106 "BONAIRE", 107 "KAVERI", 108 "KABINI", 109 "HAWAII", 110 "MULLINS", 111 
"TOPAZ", 112 "TONGA", 113 "FIJI", 114 "CARRIZO", 115 "STONEY", 116 "POLARIS10", 117 "POLARIS11", 118 "POLARIS12", 119 "VEGAM", 120 "VEGA10", 121 "VEGA12", 122 "VEGA20", 123 "RAVEN", 124 "ARCTURUS", 125 "RENOIR", 126 "ALDEBARAN", 127 "NAVI10", 128 "CYAN_SKILLFISH", 129 "NAVI14", 130 "NAVI12", 131 "SIENNA_CICHLID", 132 "NAVY_FLOUNDER", 133 "VANGOGH", 134 "DIMGREY_CAVEFISH", 135 "BEIGE_GOBY", 136 "YELLOW_CARP", 137 "IP DISCOVERY", 138 "LAST", 139 }; 140 141 /** 142 * DOC: pcie_replay_count 143 * 144 * The amdgpu driver provides a sysfs API for reporting the total number 145 * of PCIe replays (NAKs) 146 * The file pcie_replay_count is used for this and returns the total 147 * number of replays as a sum of the NAKs generated and NAKs received 148 */ 149 150 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 151 struct device_attribute *attr, char *buf) 152 { 153 struct drm_device *ddev = dev_get_drvdata(dev); 154 struct amdgpu_device *adev = drm_to_adev(ddev); 155 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 156 157 return sysfs_emit(buf, "%llu\n", cnt); 158 } 159 160 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 161 amdgpu_device_get_pcie_replay_count, NULL); 162 163 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 164 165 /** 166 * DOC: product_name 167 * 168 * The amdgpu driver provides a sysfs API for reporting the product name 169 * for the device 170 * The file serial_number is used for this and returns the product name 171 * as returned from the FRU. 172 * NOTE: This is only available for certain server cards 173 */ 174 175 static ssize_t amdgpu_device_get_product_name(struct device *dev, 176 struct device_attribute *attr, char *buf) 177 { 178 struct drm_device *ddev = dev_get_drvdata(dev); 179 struct amdgpu_device *adev = drm_to_adev(ddev); 180 181 return sysfs_emit(buf, "%s\n", adev->product_name); 182 } 183 184 static DEVICE_ATTR(product_name, S_IRUGO, 185 amdgpu_device_get_product_name, NULL); 186 187 /** 188 * DOC: product_number 189 * 190 * The amdgpu driver provides a sysfs API for reporting the part number 191 * for the device 192 * The file serial_number is used for this and returns the part number 193 * as returned from the FRU. 194 * NOTE: This is only available for certain server cards 195 */ 196 197 static ssize_t amdgpu_device_get_product_number(struct device *dev, 198 struct device_attribute *attr, char *buf) 199 { 200 struct drm_device *ddev = dev_get_drvdata(dev); 201 struct amdgpu_device *adev = drm_to_adev(ddev); 202 203 return sysfs_emit(buf, "%s\n", adev->product_number); 204 } 205 206 static DEVICE_ATTR(product_number, S_IRUGO, 207 amdgpu_device_get_product_number, NULL); 208 209 /** 210 * DOC: serial_number 211 * 212 * The amdgpu driver provides a sysfs API for reporting the serial number 213 * for the device 214 * The file serial_number is used for this and returns the serial number 215 * as returned from the FRU. 
/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}
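/*
 * Illustrative only: the helpers above distinguish the possible runtime
 * power-off mechanisms for a dGPU.  A hypothetical caller choosing a path
 * might branch roughly like this (a sketch, not the driver's actual
 * runtime-PM policy):
 *
 *	if (amdgpu_device_supports_px(ddev)) {
 *		// legacy ATPX power control, handled via vga_switcheroo
 *	} else if (amdgpu_device_supports_boco(ddev)) {
 *		// ACPI power resources (PR3 / hybrid ATPX) power the card off
 *	} else if (amdgpu_device_supports_baco(ddev)) {
 *		// keep the bus alive and put the chip into BACO
 *	}
 */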
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
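/*
 * Illustrative only: a hypothetical debug helper built on the accessor
 * above.  It reads a handful of dwords from a dword-aligned VRAM offset
 * into a caller-supplied buffer; the aperture fast path is used for the
 * CPU-visible part and the MM_INDEX/MM_DATA path covers the rest.
 *
 *	static void example_dump_vram(struct amdgpu_device *adev,
 *				      uint64_t offset, uint32_t *out,
 *				      unsigned int ndw)
 *	{
 *		// offset and size must be dword aligned for the MM path
 *		amdgpu_device_vram_access(adev, offset, out,
 *					  ndw * sizeof(uint32_t), false);
 *	}
 */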
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
538 */ 539 void amdgpu_device_wreg(struct amdgpu_device *adev, 540 uint32_t reg, uint32_t v, 541 uint32_t acc_flags) 542 { 543 if (amdgpu_device_skip_hw_access(adev)) 544 return; 545 546 if ((reg * 4) < adev->rmmio_size) { 547 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 548 amdgpu_sriov_runtime(adev) && 549 down_read_trylock(&adev->reset_domain->sem)) { 550 amdgpu_kiq_wreg(adev, reg, v); 551 up_read(&adev->reset_domain->sem); 552 } else { 553 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 554 } 555 } else { 556 adev->pcie_wreg(adev, reg * 4, v); 557 } 558 559 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 560 } 561 562 /** 563 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 564 * 565 * @adev: amdgpu_device pointer 566 * @reg: mmio/rlc register 567 * @v: value to write 568 * 569 * this function is invoked only for the debugfs register access 570 */ 571 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 572 uint32_t reg, uint32_t v) 573 { 574 if (amdgpu_device_skip_hw_access(adev)) 575 return; 576 577 if (amdgpu_sriov_fullaccess(adev) && 578 adev->gfx.rlc.funcs && 579 adev->gfx.rlc.funcs->is_rlcg_access_range) { 580 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 581 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 582 } else if ((reg * 4) >= adev->rmmio_size) { 583 adev->pcie_wreg(adev, reg * 4, v); 584 } else { 585 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 586 } 587 } 588 589 /** 590 * amdgpu_mm_rdoorbell - read a doorbell dword 591 * 592 * @adev: amdgpu_device pointer 593 * @index: doorbell index 594 * 595 * Returns the value in the doorbell aperture at the 596 * requested doorbell index (CIK). 597 */ 598 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 599 { 600 if (amdgpu_device_skip_hw_access(adev)) 601 return 0; 602 603 if (index < adev->doorbell.num_doorbells) { 604 return readl(adev->doorbell.ptr + index); 605 } else { 606 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 607 return 0; 608 } 609 } 610 611 /** 612 * amdgpu_mm_wdoorbell - write a doorbell dword 613 * 614 * @adev: amdgpu_device pointer 615 * @index: doorbell index 616 * @v: value to write 617 * 618 * Writes @v to the doorbell aperture at the 619 * requested doorbell index (CIK). 620 */ 621 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 622 { 623 if (amdgpu_device_skip_hw_access(adev)) 624 return; 625 626 if (index < adev->doorbell.num_doorbells) { 627 writel(v, adev->doorbell.ptr + index); 628 } else { 629 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 630 } 631 } 632 633 /** 634 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 635 * 636 * @adev: amdgpu_device pointer 637 * @index: doorbell index 638 * 639 * Returns the value in the doorbell aperture at the 640 * requested doorbell index (VEGA10+). 641 */ 642 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 643 { 644 if (amdgpu_device_skip_hw_access(adev)) 645 return 0; 646 647 if (index < adev->doorbell.num_doorbells) { 648 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 649 } else { 650 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 651 return 0; 652 } 653 } 654 655 /** 656 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 657 * 658 * @adev: amdgpu_device pointer 659 * @index: doorbell index 660 * @v: value to write 661 * 662 * Writes @v to the doorbell aperture at the 663 * requested doorbell index (VEGA10+). 
664 */ 665 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 666 { 667 if (amdgpu_device_skip_hw_access(adev)) 668 return; 669 670 if (index < adev->doorbell.num_doorbells) { 671 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 672 } else { 673 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 674 } 675 } 676 677 /** 678 * amdgpu_device_indirect_rreg - read an indirect register 679 * 680 * @adev: amdgpu_device pointer 681 * @pcie_index: mmio register offset 682 * @pcie_data: mmio register offset 683 * @reg_addr: indirect register address to read from 684 * 685 * Returns the value of indirect register @reg_addr 686 */ 687 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 688 u32 pcie_index, u32 pcie_data, 689 u32 reg_addr) 690 { 691 unsigned long flags; 692 u32 r; 693 void __iomem *pcie_index_offset; 694 void __iomem *pcie_data_offset; 695 696 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 697 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 698 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 699 700 writel(reg_addr, pcie_index_offset); 701 readl(pcie_index_offset); 702 r = readl(pcie_data_offset); 703 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 704 705 return r; 706 } 707 708 /** 709 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 710 * 711 * @adev: amdgpu_device pointer 712 * @pcie_index: mmio register offset 713 * @pcie_data: mmio register offset 714 * @reg_addr: indirect register address to read from 715 * 716 * Returns the value of indirect register @reg_addr 717 */ 718 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 719 u32 pcie_index, u32 pcie_data, 720 u32 reg_addr) 721 { 722 unsigned long flags; 723 u64 r; 724 void __iomem *pcie_index_offset; 725 void __iomem *pcie_data_offset; 726 727 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 728 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 729 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 730 731 /* read low 32 bits */ 732 writel(reg_addr, pcie_index_offset); 733 readl(pcie_index_offset); 734 r = readl(pcie_data_offset); 735 /* read high 32 bits */ 736 writel(reg_addr + 4, pcie_index_offset); 737 readl(pcie_index_offset); 738 r |= ((u64)readl(pcie_data_offset) << 32); 739 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 740 741 return r; 742 } 743 744 /** 745 * amdgpu_device_indirect_wreg - write an indirect register address 746 * 747 * @adev: amdgpu_device pointer 748 * @pcie_index: mmio register offset 749 * @pcie_data: mmio register offset 750 * @reg_addr: indirect register offset 751 * @reg_data: indirect register data 752 * 753 */ 754 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 755 u32 pcie_index, u32 pcie_data, 756 u32 reg_addr, u32 reg_data) 757 { 758 unsigned long flags; 759 void __iomem *pcie_index_offset; 760 void __iomem *pcie_data_offset; 761 762 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 763 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 764 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 765 766 writel(reg_addr, pcie_index_offset); 767 readl(pcie_index_offset); 768 writel(reg_data, pcie_data_offset); 769 readl(pcie_data_offset); 770 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 771 } 772 773 /** 774 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 775 * 776 * @adev: amdgpu_device pointer 777 * @pcie_index: mmio register offset 778 * @pcie_data: mmio register 
offset 779 * @reg_addr: indirect register offset 780 * @reg_data: indirect register data 781 * 782 */ 783 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 784 u32 pcie_index, u32 pcie_data, 785 u32 reg_addr, u64 reg_data) 786 { 787 unsigned long flags; 788 void __iomem *pcie_index_offset; 789 void __iomem *pcie_data_offset; 790 791 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 792 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 793 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 794 795 /* write low 32 bits */ 796 writel(reg_addr, pcie_index_offset); 797 readl(pcie_index_offset); 798 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 799 readl(pcie_data_offset); 800 /* write high 32 bits */ 801 writel(reg_addr + 4, pcie_index_offset); 802 readl(pcie_index_offset); 803 writel((u32)(reg_data >> 32), pcie_data_offset); 804 readl(pcie_data_offset); 805 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 806 } 807 808 /** 809 * amdgpu_invalid_rreg - dummy reg read function 810 * 811 * @adev: amdgpu_device pointer 812 * @reg: offset of register 813 * 814 * Dummy register read function. Used for register blocks 815 * that certain asics don't have (all asics). 816 * Returns the value in the register. 817 */ 818 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 819 { 820 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 821 BUG(); 822 return 0; 823 } 824 825 /** 826 * amdgpu_invalid_wreg - dummy reg write function 827 * 828 * @adev: amdgpu_device pointer 829 * @reg: offset of register 830 * @v: value to write to the register 831 * 832 * Dummy register read function. Used for register blocks 833 * that certain asics don't have (all asics). 834 */ 835 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 836 { 837 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 838 reg, v); 839 BUG(); 840 } 841 842 /** 843 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: offset of register 847 * 848 * Dummy register read function. Used for register blocks 849 * that certain asics don't have (all asics). 850 * Returns the value in the register. 851 */ 852 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 853 { 854 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 855 BUG(); 856 return 0; 857 } 858 859 /** 860 * amdgpu_invalid_wreg64 - dummy reg write function 861 * 862 * @adev: amdgpu_device pointer 863 * @reg: offset of register 864 * @v: value to write to the register 865 * 866 * Dummy register read function. Used for register blocks 867 * that certain asics don't have (all asics). 868 */ 869 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 870 { 871 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 872 reg, v); 873 BUG(); 874 } 875 876 /** 877 * amdgpu_block_invalid_rreg - dummy reg read function 878 * 879 * @adev: amdgpu_device pointer 880 * @block: offset of instance 881 * @reg: offset of register 882 * 883 * Dummy register read function. Used for register blocks 884 * that certain asics don't have (all asics). 885 * Returns the value in the register. 
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
1008 */ 1009 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1010 { 1011 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1012 } 1013 1014 /** 1015 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1016 * 1017 * @adev: amdgpu_device pointer 1018 * 1019 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1020 */ 1021 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1022 { 1023 STUB(); 1024 return -ENOSYS; 1025 #ifdef notyet 1026 return pci_reset_function(adev->pdev); 1027 #endif 1028 } 1029 1030 /* 1031 * GPU doorbell aperture helpers function. 1032 */ 1033 /** 1034 * amdgpu_device_doorbell_init - Init doorbell driver information. 1035 * 1036 * @adev: amdgpu_device pointer 1037 * 1038 * Init doorbell driver information (CIK) 1039 * Returns 0 on success, error on failure. 1040 */ 1041 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1042 { 1043 1044 /* No doorbell on SI hardware generation */ 1045 if (adev->asic_type < CHIP_BONAIRE) { 1046 adev->doorbell.base = 0; 1047 adev->doorbell.size = 0; 1048 adev->doorbell.num_doorbells = 0; 1049 adev->doorbell.ptr = NULL; 1050 return 0; 1051 } 1052 1053 #ifdef __linux__ 1054 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1055 return -EINVAL; 1056 #endif 1057 1058 amdgpu_asic_init_doorbell_index(adev); 1059 1060 /* doorbell bar mapping */ 1061 #ifdef __linux__ 1062 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1063 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1064 #endif 1065 1066 if (adev->enable_mes) { 1067 adev->doorbell.num_doorbells = 1068 adev->doorbell.size / sizeof(u32); 1069 } else { 1070 adev->doorbell.num_doorbells = 1071 min_t(u32, adev->doorbell.size / sizeof(u32), 1072 adev->doorbell_index.max_assignment+1); 1073 if (adev->doorbell.num_doorbells == 0) 1074 return -EINVAL; 1075 1076 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1077 * paging queue doorbell use the second page. The 1078 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1079 * doorbells are in the first page. So with paging queue enabled, 1080 * the max num_doorbells should + 1 page (0x400 in dword) 1081 */ 1082 if (adev->asic_type >= CHIP_VEGA10) 1083 adev->doorbell.num_doorbells += 0x400; 1084 } 1085 1086 #ifdef __linux__ 1087 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1088 adev->doorbell.num_doorbells * 1089 sizeof(u32)); 1090 if (adev->doorbell.ptr == NULL) 1091 return -ENOMEM; 1092 #endif 1093 1094 return 0; 1095 } 1096 1097 /** 1098 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1099 * 1100 * @adev: amdgpu_device pointer 1101 * 1102 * Tear down doorbell driver information (CIK) 1103 */ 1104 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1105 { 1106 #ifdef __linux__ 1107 iounmap(adev->doorbell.ptr); 1108 #else 1109 if (adev->doorbell.size > 0) 1110 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1111 adev->doorbell.size); 1112 #endif 1113 adev->doorbell.ptr = NULL; 1114 } 1115 1116 1117 1118 /* 1119 * amdgpu_device_wb_*() 1120 * Writeback is the method by which the GPU updates special pages in memory 1121 * with the status of certain GPU events (fences, ring pointers,etc.). 1122 */ 1123 1124 /** 1125 * amdgpu_device_wb_fini - Disable Writeback and free memory 1126 * 1127 * @adev: amdgpu_device pointer 1128 * 1129 * Disables Writeback and frees the Writeback memory (all asics). 1130 * Used at driver shutdown. 
1131 */ 1132 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1133 { 1134 if (adev->wb.wb_obj) { 1135 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1136 &adev->wb.gpu_addr, 1137 (void **)&adev->wb.wb); 1138 adev->wb.wb_obj = NULL; 1139 } 1140 } 1141 1142 /** 1143 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1144 * 1145 * @adev: amdgpu_device pointer 1146 * 1147 * Initializes writeback and allocates writeback memory (all asics). 1148 * Used at driver startup. 1149 * Returns 0 on success or an -error on failure. 1150 */ 1151 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1152 { 1153 int r; 1154 1155 if (adev->wb.wb_obj == NULL) { 1156 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1157 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1158 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1159 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1160 (void **)&adev->wb.wb); 1161 if (r) { 1162 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1163 return r; 1164 } 1165 1166 adev->wb.num_wb = AMDGPU_MAX_WB; 1167 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1168 1169 /* clear wb memory */ 1170 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1171 } 1172 1173 return 0; 1174 } 1175 1176 /** 1177 * amdgpu_device_wb_get - Allocate a wb entry 1178 * 1179 * @adev: amdgpu_device pointer 1180 * @wb: wb index 1181 * 1182 * Allocate a wb slot for use by the driver (all asics). 1183 * Returns 0 on success or -EINVAL on failure. 1184 */ 1185 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1186 { 1187 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1188 1189 if (offset < adev->wb.num_wb) { 1190 __set_bit(offset, adev->wb.used); 1191 *wb = offset << 3; /* convert to dw offset */ 1192 return 0; 1193 } else { 1194 return -EINVAL; 1195 } 1196 } 1197 1198 /** 1199 * amdgpu_device_wb_free - Free a wb entry 1200 * 1201 * @adev: amdgpu_device pointer 1202 * @wb: wb index 1203 * 1204 * Free a wb slot allocated for use by the driver (all asics) 1205 */ 1206 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1207 { 1208 wb >>= 3; 1209 if (wb < adev->wb.num_wb) 1210 __clear_bit(wb, adev->wb.used); 1211 } 1212 1213 /** 1214 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1215 * 1216 * @adev: amdgpu_device pointer 1217 * 1218 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1219 * to fail, but if any of the BARs is not accessible after the size we abort 1220 * driver loading by returning -ENODEV. 
1221 */ 1222 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1223 { 1224 #ifdef __linux__ 1225 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1226 struct pci_bus *root; 1227 struct resource *res; 1228 unsigned i; 1229 u16 cmd; 1230 int r; 1231 1232 /* Bypass for VF */ 1233 if (amdgpu_sriov_vf(adev)) 1234 return 0; 1235 1236 /* skip if the bios has already enabled large BAR */ 1237 if (adev->gmc.real_vram_size && 1238 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1239 return 0; 1240 1241 /* Check if the root BUS has 64bit memory resources */ 1242 root = adev->pdev->bus; 1243 while (root->parent) 1244 root = root->parent; 1245 1246 pci_bus_for_each_resource(root, res, i) { 1247 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1248 res->start > 0x100000000ull) 1249 break; 1250 } 1251 1252 /* Trying to resize is pointless without a root hub window above 4GB */ 1253 if (!res) 1254 return 0; 1255 1256 /* Limit the BAR size to what is available */ 1257 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1258 rbar_size); 1259 1260 /* Disable memory decoding while we change the BAR addresses and size */ 1261 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1262 pci_write_config_word(adev->pdev, PCI_COMMAND, 1263 cmd & ~PCI_COMMAND_MEMORY); 1264 1265 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1266 amdgpu_device_doorbell_fini(adev); 1267 if (adev->asic_type >= CHIP_BONAIRE) 1268 pci_release_resource(adev->pdev, 2); 1269 1270 pci_release_resource(adev->pdev, 0); 1271 1272 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1273 if (r == -ENOSPC) 1274 DRM_INFO("Not enough PCI address space for a large BAR."); 1275 else if (r && r != -ENOTSUPP) 1276 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1277 1278 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1279 1280 /* When the doorbell or fb BAR isn't available we have no chance of 1281 * using the device. 1282 */ 1283 r = amdgpu_device_doorbell_init(adev); 1284 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1285 return -ENODEV; 1286 1287 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1288 #endif /* __linux__ */ 1289 1290 return 0; 1291 } 1292 1293 /* 1294 * GPU helpers function. 1295 */ 1296 /** 1297 * amdgpu_device_need_post - check if the hw need post or not 1298 * 1299 * @adev: amdgpu_device pointer 1300 * 1301 * Check if the asic has been initialized (all asics) at driver startup 1302 * or post is needed if hw reset is performed. 1303 * Returns true if need or false if not. 
1304 */ 1305 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1306 { 1307 uint32_t reg; 1308 1309 if (amdgpu_sriov_vf(adev)) 1310 return false; 1311 1312 if (amdgpu_passthrough(adev)) { 1313 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1314 * some old smc fw still need driver do vPost otherwise gpu hang, while 1315 * those smc fw version above 22.15 doesn't have this flaw, so we force 1316 * vpost executed for smc version below 22.15 1317 */ 1318 if (adev->asic_type == CHIP_FIJI) { 1319 int err; 1320 uint32_t fw_ver; 1321 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1322 /* force vPost if error occured */ 1323 if (err) 1324 return true; 1325 1326 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1327 if (fw_ver < 0x00160e00) 1328 return true; 1329 } 1330 } 1331 1332 /* Don't post if we need to reset whole hive on init */ 1333 if (adev->gmc.xgmi.pending_reset) 1334 return false; 1335 1336 if (adev->has_hw_reset) { 1337 adev->has_hw_reset = false; 1338 return true; 1339 } 1340 1341 /* bios scratch used on CIK+ */ 1342 if (adev->asic_type >= CHIP_BONAIRE) 1343 return amdgpu_atombios_scratch_need_asic_init(adev); 1344 1345 /* check MEM_SIZE for older asics */ 1346 reg = amdgpu_asic_get_config_memsize(adev); 1347 1348 if ((reg != 0) && (reg != 0xffffffff)) 1349 return false; 1350 1351 return true; 1352 } 1353 1354 /* 1355 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1356 * speed switching. Until we have confirmation from Intel that a specific host 1357 * supports it, it's safer that we keep it disabled for all. 1358 * 1359 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1360 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1361 */ 1362 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1363 { 1364 #if IS_ENABLED(CONFIG_X86) 1365 #ifdef __linux__ 1366 struct cpuinfo_x86 *c = &cpu_data(0); 1367 1368 if (c->x86_vendor == X86_VENDOR_INTEL) 1369 #else 1370 if (strcmp(cpu_vendor, "GenuineIntel") == 0) 1371 #endif 1372 return false; 1373 #endif 1374 return true; 1375 } 1376 1377 /** 1378 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1379 * 1380 * @adev: amdgpu_device pointer 1381 * 1382 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1383 * be set for this device. 1384 * 1385 * Returns true if it should be used or false if not. 1386 */ 1387 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1388 { 1389 switch (amdgpu_aspm) { 1390 case -1: 1391 break; 1392 case 0: 1393 return false; 1394 case 1: 1395 return true; 1396 default: 1397 return false; 1398 } 1399 return pcie_aspm_enabled(adev->pdev); 1400 } 1401 1402 bool amdgpu_device_aspm_support_quirk(void) 1403 { 1404 #if IS_ENABLED(CONFIG_X86) 1405 struct cpu_info *ci = curcpu(); 1406 1407 return !(ci->ci_family == 6 && ci->ci_model == 0x97); 1408 #else 1409 return true; 1410 #endif 1411 } 1412 1413 /* if we get transitioned to only one device, take VGA back */ 1414 /** 1415 * amdgpu_device_vga_set_decode - enable/disable vga decode 1416 * 1417 * @pdev: PCI device pointer 1418 * @state: enable/disable vga decode 1419 * 1420 * Enable/disable vga decode (all asics). 1421 * Returns VGA resource flags. 
1422 */ 1423 #ifdef notyet 1424 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1425 bool state) 1426 { 1427 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1428 amdgpu_asic_set_vga_state(adev, state); 1429 if (state) 1430 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1431 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1432 else 1433 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1434 } 1435 #endif 1436 1437 /** 1438 * amdgpu_device_check_block_size - validate the vm block size 1439 * 1440 * @adev: amdgpu_device pointer 1441 * 1442 * Validates the vm block size specified via module parameter. 1443 * The vm block size defines number of bits in page table versus page directory, 1444 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1445 * page table and the remaining bits are in the page directory. 1446 */ 1447 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1448 { 1449 /* defines number of bits in page table versus page directory, 1450 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1451 * page table and the remaining bits are in the page directory */ 1452 if (amdgpu_vm_block_size == -1) 1453 return; 1454 1455 if (amdgpu_vm_block_size < 9) { 1456 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1457 amdgpu_vm_block_size); 1458 amdgpu_vm_block_size = -1; 1459 } 1460 } 1461 1462 /** 1463 * amdgpu_device_check_vm_size - validate the vm size 1464 * 1465 * @adev: amdgpu_device pointer 1466 * 1467 * Validates the vm size in GB specified via module parameter. 1468 * The VM size is the size of the GPU virtual memory space in GB. 1469 */ 1470 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1471 { 1472 /* no need to check the default value */ 1473 if (amdgpu_vm_size == -1) 1474 return; 1475 1476 if (amdgpu_vm_size < 1) { 1477 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1478 amdgpu_vm_size); 1479 amdgpu_vm_size = -1; 1480 } 1481 } 1482 1483 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1484 { 1485 #ifdef __linux__ 1486 struct sysinfo si; 1487 #endif 1488 bool is_os_64 = (sizeof(void *) == 8); 1489 uint64_t total_memory; 1490 uint64_t dram_size_seven_GB = 0x1B8000000; 1491 uint64_t dram_size_three_GB = 0xB8000000; 1492 1493 if (amdgpu_smu_memory_pool_size == 0) 1494 return; 1495 1496 if (!is_os_64) { 1497 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1498 goto def_value; 1499 } 1500 #ifdef __linux__ 1501 si_meminfo(&si); 1502 total_memory = (uint64_t)si.totalram * si.mem_unit; 1503 #else 1504 total_memory = ptoa(physmem); 1505 #endif 1506 1507 if ((amdgpu_smu_memory_pool_size == 1) || 1508 (amdgpu_smu_memory_pool_size == 2)) { 1509 if (total_memory < dram_size_three_GB) 1510 goto def_value1; 1511 } else if ((amdgpu_smu_memory_pool_size == 4) || 1512 (amdgpu_smu_memory_pool_size == 8)) { 1513 if (total_memory < dram_size_seven_GB) 1514 goto def_value1; 1515 } else { 1516 DRM_WARN("Smu memory pool size not supported\n"); 1517 goto def_value; 1518 } 1519 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1520 1521 return; 1522 1523 def_value1: 1524 DRM_WARN("No enough system memory\n"); 1525 def_value: 1526 adev->pm.smu_prv_buffer_size = 0; 1527 } 1528 1529 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1530 { 1531 if (!(adev->flags & AMD_IS_APU) || 1532 adev->asic_type < CHIP_RAVEN) 1533 return 0; 1534 1535 switch (adev->asic_type) { 1536 case CHIP_RAVEN: 1537 if (adev->pdev->device == 
0x15dd) 1538 adev->apu_flags |= AMD_APU_IS_RAVEN; 1539 if (adev->pdev->device == 0x15d8) 1540 adev->apu_flags |= AMD_APU_IS_PICASSO; 1541 break; 1542 case CHIP_RENOIR: 1543 if ((adev->pdev->device == 0x1636) || 1544 (adev->pdev->device == 0x164c)) 1545 adev->apu_flags |= AMD_APU_IS_RENOIR; 1546 else 1547 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1548 break; 1549 case CHIP_VANGOGH: 1550 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1551 break; 1552 case CHIP_YELLOW_CARP: 1553 break; 1554 case CHIP_CYAN_SKILLFISH: 1555 if ((adev->pdev->device == 0x13FE) || 1556 (adev->pdev->device == 0x143F)) 1557 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1558 break; 1559 default: 1560 break; 1561 } 1562 1563 return 0; 1564 } 1565 1566 /** 1567 * amdgpu_device_check_arguments - validate module params 1568 * 1569 * @adev: amdgpu_device pointer 1570 * 1571 * Validates certain module parameters and updates 1572 * the associated values used by the driver (all asics). 1573 */ 1574 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1575 { 1576 if (amdgpu_sched_jobs < 4) { 1577 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1578 amdgpu_sched_jobs); 1579 amdgpu_sched_jobs = 4; 1580 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1581 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1582 amdgpu_sched_jobs); 1583 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1584 } 1585 1586 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1587 /* gart size must be greater or equal to 32M */ 1588 dev_warn(adev->dev, "gart size (%d) too small\n", 1589 amdgpu_gart_size); 1590 amdgpu_gart_size = -1; 1591 } 1592 1593 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1594 /* gtt size must be greater or equal to 32M */ 1595 dev_warn(adev->dev, "gtt size (%d) too small\n", 1596 amdgpu_gtt_size); 1597 amdgpu_gtt_size = -1; 1598 } 1599 1600 /* valid range is between 4 and 9 inclusive */ 1601 if (amdgpu_vm_fragment_size != -1 && 1602 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1603 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1604 amdgpu_vm_fragment_size = -1; 1605 } 1606 1607 if (amdgpu_sched_hw_submission < 2) { 1608 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1609 amdgpu_sched_hw_submission); 1610 amdgpu_sched_hw_submission = 2; 1611 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1612 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1613 amdgpu_sched_hw_submission); 1614 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1615 } 1616 1617 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1618 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1619 amdgpu_reset_method = -1; 1620 } 1621 1622 amdgpu_device_check_smu_prv_buffer_size(adev); 1623 1624 amdgpu_device_check_vm_size(adev); 1625 1626 amdgpu_device_check_block_size(adev); 1627 1628 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1629 1630 return 0; 1631 } 1632 1633 #ifdef __linux__ 1634 /** 1635 * amdgpu_switcheroo_set_state - set switcheroo state 1636 * 1637 * @pdev: pci dev pointer 1638 * @state: vga_switcheroo state 1639 * 1640 * Callback for the switcheroo driver. Suspends or resumes the 1641 * the asics before or after it is powered up using ACPI methods. 
1642 */ 1643 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1644 enum vga_switcheroo_state state) 1645 { 1646 struct drm_device *dev = pci_get_drvdata(pdev); 1647 int r; 1648 1649 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1650 return; 1651 1652 if (state == VGA_SWITCHEROO_ON) { 1653 pr_info("switched on\n"); 1654 /* don't suspend or resume card normally */ 1655 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1656 1657 pci_set_power_state(pdev, PCI_D0); 1658 amdgpu_device_load_pci_state(pdev); 1659 r = pci_enable_device(pdev); 1660 if (r) 1661 DRM_WARN("pci_enable_device failed (%d)\n", r); 1662 amdgpu_device_resume(dev, true); 1663 1664 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1665 } else { 1666 pr_info("switched off\n"); 1667 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1668 amdgpu_device_suspend(dev, true); 1669 amdgpu_device_cache_pci_state(pdev); 1670 /* Shut down the device */ 1671 pci_disable_device(pdev); 1672 pci_set_power_state(pdev, PCI_D3cold); 1673 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1674 } 1675 } 1676 1677 /** 1678 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1679 * 1680 * @pdev: pci dev pointer 1681 * 1682 * Callback for the switcheroo driver. Check of the switcheroo 1683 * state can be changed. 1684 * Returns true if the state can be changed, false if not. 1685 */ 1686 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1687 { 1688 struct drm_device *dev = pci_get_drvdata(pdev); 1689 1690 /* 1691 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1692 * locking inversion with the driver load path. And the access here is 1693 * completely racy anyway. So don't bother with locking for now. 1694 */ 1695 return atomic_read(&dev->open_count) == 0; 1696 } 1697 #endif /* __linux__ */ 1698 1699 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1700 #ifdef notyet 1701 .set_gpu_state = amdgpu_switcheroo_set_state, 1702 .reprobe = NULL, 1703 .can_switch = amdgpu_switcheroo_can_switch, 1704 #endif 1705 }; 1706 1707 /** 1708 * amdgpu_device_ip_set_clockgating_state - set the CG state 1709 * 1710 * @dev: amdgpu_device pointer 1711 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1712 * @state: clockgating state (gate or ungate) 1713 * 1714 * Sets the requested clockgating state for all instances of 1715 * the hardware IP specified. 1716 * Returns the error code from the last instance. 1717 */ 1718 int amdgpu_device_ip_set_clockgating_state(void *dev, 1719 enum amd_ip_block_type block_type, 1720 enum amd_clockgating_state state) 1721 { 1722 struct amdgpu_device *adev = dev; 1723 int i, r = 0; 1724 1725 for (i = 0; i < adev->num_ip_blocks; i++) { 1726 if (!adev->ip_blocks[i].status.valid) 1727 continue; 1728 if (adev->ip_blocks[i].version->type != block_type) 1729 continue; 1730 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1731 continue; 1732 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1733 (void *)adev, state); 1734 if (r) 1735 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1736 adev->ip_blocks[i].version->funcs->name, r); 1737 } 1738 return r; 1739 } 1740 1741 /** 1742 * amdgpu_device_ip_set_powergating_state - set the PG state 1743 * 1744 * @dev: amdgpu_device pointer 1745 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1746 * @state: powergating state (gate or ungate) 1747 * 1748 * Sets the requested powergating state for all instances of 1749 * the hardware IP specified. 1750 * Returns the error code from the last instance. 1751 */ 1752 int amdgpu_device_ip_set_powergating_state(void *dev, 1753 enum amd_ip_block_type block_type, 1754 enum amd_powergating_state state) 1755 { 1756 struct amdgpu_device *adev = dev; 1757 int i, r = 0; 1758 1759 for (i = 0; i < adev->num_ip_blocks; i++) { 1760 if (!adev->ip_blocks[i].status.valid) 1761 continue; 1762 if (adev->ip_blocks[i].version->type != block_type) 1763 continue; 1764 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1765 continue; 1766 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1767 (void *)adev, state); 1768 if (r) 1769 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1770 adev->ip_blocks[i].version->funcs->name, r); 1771 } 1772 return r; 1773 } 1774 1775 /** 1776 * amdgpu_device_ip_get_clockgating_state - get the CG state 1777 * 1778 * @adev: amdgpu_device pointer 1779 * @flags: clockgating feature flags 1780 * 1781 * Walks the list of IPs on the device and updates the clockgating 1782 * flags for each IP. 1783 * Updates @flags with the feature flags for each hardware IP where 1784 * clockgating is enabled. 1785 */ 1786 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1787 u64 *flags) 1788 { 1789 int i; 1790 1791 for (i = 0; i < adev->num_ip_blocks; i++) { 1792 if (!adev->ip_blocks[i].status.valid) 1793 continue; 1794 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1795 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1796 } 1797 } 1798 1799 /** 1800 * amdgpu_device_ip_wait_for_idle - wait for idle 1801 * 1802 * @adev: amdgpu_device pointer 1803 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1804 * 1805 * Waits for the request hardware IP to be idle. 1806 * Returns 0 for success or a negative error code on failure. 1807 */ 1808 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1809 enum amd_ip_block_type block_type) 1810 { 1811 int i, r; 1812 1813 for (i = 0; i < adev->num_ip_blocks; i++) { 1814 if (!adev->ip_blocks[i].status.valid) 1815 continue; 1816 if (adev->ip_blocks[i].version->type == block_type) { 1817 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1818 if (r) 1819 return r; 1820 break; 1821 } 1822 } 1823 return 0; 1824 1825 } 1826 1827 /** 1828 * amdgpu_device_ip_is_idle - is the hardware IP idle 1829 * 1830 * @adev: amdgpu_device pointer 1831 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1832 * 1833 * Check if the hardware IP is idle or not. 1834 * Returns true if it the IP is idle, false if not. 1835 */ 1836 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1837 enum amd_ip_block_type block_type) 1838 { 1839 int i; 1840 1841 for (i = 0; i < adev->num_ip_blocks; i++) { 1842 if (!adev->ip_blocks[i].status.valid) 1843 continue; 1844 if (adev->ip_blocks[i].version->type == block_type) 1845 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1846 } 1847 return true; 1848 1849 } 1850 1851 /** 1852 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1853 * 1854 * @adev: amdgpu_device pointer 1855 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1856 * 1857 * Returns a pointer to the hardware IP block structure 1858 * if it exists for the asic, otherwise NULL. 
1859 */ 1860 struct amdgpu_ip_block * 1861 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1862 enum amd_ip_block_type type) 1863 { 1864 int i; 1865 1866 for (i = 0; i < adev->num_ip_blocks; i++) 1867 if (adev->ip_blocks[i].version->type == type) 1868 return &adev->ip_blocks[i]; 1869 1870 return NULL; 1871 } 1872 1873 /** 1874 * amdgpu_device_ip_block_version_cmp 1875 * 1876 * @adev: amdgpu_device pointer 1877 * @type: enum amd_ip_block_type 1878 * @major: major version 1879 * @minor: minor version 1880 * 1881 * return 0 if equal or greater 1882 * return 1 if smaller or the ip_block doesn't exist 1883 */ 1884 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1885 enum amd_ip_block_type type, 1886 u32 major, u32 minor) 1887 { 1888 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1889 1890 if (ip_block && ((ip_block->version->major > major) || 1891 ((ip_block->version->major == major) && 1892 (ip_block->version->minor >= minor)))) 1893 return 0; 1894 1895 return 1; 1896 } 1897 1898 /** 1899 * amdgpu_device_ip_block_add 1900 * 1901 * @adev: amdgpu_device pointer 1902 * @ip_block_version: pointer to the IP to add 1903 * 1904 * Adds the IP block driver information to the collection of IPs 1905 * on the asic. 1906 */ 1907 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1908 const struct amdgpu_ip_block_version *ip_block_version) 1909 { 1910 if (!ip_block_version) 1911 return -EINVAL; 1912 1913 switch (ip_block_version->type) { 1914 case AMD_IP_BLOCK_TYPE_VCN: 1915 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1916 return 0; 1917 break; 1918 case AMD_IP_BLOCK_TYPE_JPEG: 1919 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1920 return 0; 1921 break; 1922 default: 1923 break; 1924 } 1925 1926 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1927 ip_block_version->funcs->name); 1928 1929 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1930 1931 return 0; 1932 } 1933 1934 /** 1935 * amdgpu_device_enable_virtual_display - enable virtual display feature 1936 * 1937 * @adev: amdgpu_device pointer 1938 * 1939 * Enabled the virtual display feature if the user has enabled it via 1940 * the module parameter virtual_display. This feature provides a virtual 1941 * display hardware on headless boards or in virtualized environments. 1942 * This function parses and validates the configuration string specified by 1943 * the user and configues the virtual display configuration (number of 1944 * virtual connectors, crtcs, etc.) specified. 
1945 */ 1946 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1947 { 1948 adev->enable_virtual_display = false; 1949 1950 #ifdef notyet 1951 if (amdgpu_virtual_display) { 1952 const char *pci_address_name = pci_name(adev->pdev); 1953 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1954 1955 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1956 pciaddstr_tmp = pciaddstr; 1957 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1958 pciaddname = strsep(&pciaddname_tmp, ","); 1959 if (!strcmp("all", pciaddname) 1960 || !strcmp(pci_address_name, pciaddname)) { 1961 long num_crtc; 1962 int res = -1; 1963 1964 adev->enable_virtual_display = true; 1965 1966 if (pciaddname_tmp) 1967 res = kstrtol(pciaddname_tmp, 10, 1968 &num_crtc); 1969 1970 if (!res) { 1971 if (num_crtc < 1) 1972 num_crtc = 1; 1973 if (num_crtc > 6) 1974 num_crtc = 6; 1975 adev->mode_info.num_crtc = num_crtc; 1976 } else { 1977 adev->mode_info.num_crtc = 1; 1978 } 1979 break; 1980 } 1981 } 1982 1983 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1984 amdgpu_virtual_display, pci_address_name, 1985 adev->enable_virtual_display, adev->mode_info.num_crtc); 1986 1987 kfree(pciaddstr); 1988 } 1989 #endif 1990 } 1991 1992 /** 1993 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1994 * 1995 * @adev: amdgpu_device pointer 1996 * 1997 * Parses the asic configuration parameters specified in the gpu info 1998 * firmware and makes them availale to the driver for use in configuring 1999 * the asic. 2000 * Returns 0 on success, -EINVAL on failure. 2001 */ 2002 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2003 { 2004 const char *chip_name; 2005 char fw_name[40]; 2006 int err; 2007 const struct gpu_info_firmware_header_v1_0 *hdr; 2008 2009 adev->firmware.gpu_info_fw = NULL; 2010 2011 if (adev->mman.discovery_bin) { 2012 /* 2013 * FIXME: The bounding box is still needed by Navi12, so 2014 * temporarily read it from gpu_info firmware. Should be dropped 2015 * when DAL no longer needs it. 
2016 */ 2017 if (adev->asic_type != CHIP_NAVI12) 2018 return 0; 2019 } 2020 2021 switch (adev->asic_type) { 2022 default: 2023 return 0; 2024 case CHIP_VEGA10: 2025 chip_name = "vega10"; 2026 break; 2027 case CHIP_VEGA12: 2028 chip_name = "vega12"; 2029 break; 2030 case CHIP_RAVEN: 2031 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2032 chip_name = "raven2"; 2033 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2034 chip_name = "picasso"; 2035 else 2036 chip_name = "raven"; 2037 break; 2038 case CHIP_ARCTURUS: 2039 chip_name = "arcturus"; 2040 break; 2041 case CHIP_NAVI12: 2042 chip_name = "navi12"; 2043 break; 2044 } 2045 2046 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2047 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2048 if (err) { 2049 dev_err(adev->dev, 2050 "Failed to load gpu_info firmware \"%s\"\n", 2051 fw_name); 2052 goto out; 2053 } 2054 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2055 if (err) { 2056 dev_err(adev->dev, 2057 "Failed to validate gpu_info firmware \"%s\"\n", 2058 fw_name); 2059 goto out; 2060 } 2061 2062 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2063 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2064 2065 switch (hdr->version_major) { 2066 case 1: 2067 { 2068 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2069 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2070 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2071 2072 /* 2073 * Should be droped when DAL no longer needs it. 2074 */ 2075 if (adev->asic_type == CHIP_NAVI12) 2076 goto parse_soc_bounding_box; 2077 2078 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2079 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2080 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2081 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2082 adev->gfx.config.max_texture_channel_caches = 2083 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2084 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2085 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2086 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2087 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2088 adev->gfx.config.double_offchip_lds_buf = 2089 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2090 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2091 adev->gfx.cu_info.max_waves_per_simd = 2092 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2093 adev->gfx.cu_info.max_scratch_slots_per_cu = 2094 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2095 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2096 if (hdr->version_minor >= 1) { 2097 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2098 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2099 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2100 adev->gfx.config.num_sc_per_sh = 2101 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2102 adev->gfx.config.num_packer_per_sc = 2103 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2104 } 2105 2106 parse_soc_bounding_box: 2107 /* 2108 * soc bounding box info is not integrated in disocovery table, 2109 * we always need to parse it from gpu info firmware if needed. 
2110 */ 2111 if (hdr->version_minor == 2) { 2112 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2113 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2114 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2115 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2116 } 2117 break; 2118 } 2119 default: 2120 dev_err(adev->dev, 2121 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2122 err = -EINVAL; 2123 goto out; 2124 } 2125 out: 2126 return err; 2127 } 2128 2129 /** 2130 * amdgpu_device_ip_early_init - run early init for hardware IPs 2131 * 2132 * @adev: amdgpu_device pointer 2133 * 2134 * Early initialization pass for hardware IPs. The hardware IPs that make 2135 * up each asic are discovered each IP's early_init callback is run. This 2136 * is the first stage in initializing the asic. 2137 * Returns 0 on success, negative error code on failure. 2138 */ 2139 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2140 { 2141 struct drm_device *dev = adev_to_drm(adev); 2142 struct pci_dev *parent; 2143 int i, r; 2144 2145 amdgpu_device_enable_virtual_display(adev); 2146 2147 if (amdgpu_sriov_vf(adev)) { 2148 r = amdgpu_virt_request_full_gpu(adev, true); 2149 if (r) 2150 return r; 2151 } 2152 2153 switch (adev->asic_type) { 2154 #ifdef CONFIG_DRM_AMDGPU_SI 2155 case CHIP_VERDE: 2156 case CHIP_TAHITI: 2157 case CHIP_PITCAIRN: 2158 case CHIP_OLAND: 2159 case CHIP_HAINAN: 2160 adev->family = AMDGPU_FAMILY_SI; 2161 r = si_set_ip_blocks(adev); 2162 if (r) 2163 return r; 2164 break; 2165 #endif 2166 #ifdef CONFIG_DRM_AMDGPU_CIK 2167 case CHIP_BONAIRE: 2168 case CHIP_HAWAII: 2169 case CHIP_KAVERI: 2170 case CHIP_KABINI: 2171 case CHIP_MULLINS: 2172 if (adev->flags & AMD_IS_APU) 2173 adev->family = AMDGPU_FAMILY_KV; 2174 else 2175 adev->family = AMDGPU_FAMILY_CI; 2176 2177 r = cik_set_ip_blocks(adev); 2178 if (r) 2179 return r; 2180 break; 2181 #endif 2182 case CHIP_TOPAZ: 2183 case CHIP_TONGA: 2184 case CHIP_FIJI: 2185 case CHIP_POLARIS10: 2186 case CHIP_POLARIS11: 2187 case CHIP_POLARIS12: 2188 case CHIP_VEGAM: 2189 case CHIP_CARRIZO: 2190 case CHIP_STONEY: 2191 if (adev->flags & AMD_IS_APU) 2192 adev->family = AMDGPU_FAMILY_CZ; 2193 else 2194 adev->family = AMDGPU_FAMILY_VI; 2195 2196 r = vi_set_ip_blocks(adev); 2197 if (r) 2198 return r; 2199 break; 2200 default: 2201 r = amdgpu_discovery_set_ip_blocks(adev); 2202 if (r) 2203 return r; 2204 break; 2205 } 2206 2207 if (amdgpu_has_atpx() && 2208 (amdgpu_is_atpx_hybrid() || 2209 amdgpu_has_atpx_dgpu_power_cntl()) && 2210 ((adev->flags & AMD_IS_APU) == 0) && 2211 !pci_is_thunderbolt_attached(dev->pdev)) 2212 adev->flags |= AMD_IS_PX; 2213 2214 if (!(adev->flags & AMD_IS_APU)) { 2215 parent = pci_upstream_bridge(adev->pdev); 2216 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2217 } 2218 2219 amdgpu_amdkfd_device_probe(adev); 2220 2221 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2222 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2223 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2224 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2225 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2226 2227 for (i = 0; i < adev->num_ip_blocks; i++) { 2228 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2229 DRM_ERROR("disabled ip block: %d <%s>\n", 2230 i, adev->ip_blocks[i].version->funcs->name); 2231 adev->ip_blocks[i].status.valid = false; 2232 } else { 2233 if (adev->ip_blocks[i].version->funcs->early_init) { 2234 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2235 if (r == -ENOENT) { 2236 adev->ip_blocks[i].status.valid = false; 2237 } else if (r) { 2238 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2239 adev->ip_blocks[i].version->funcs->name, r); 2240 return r; 2241 } else { 2242 adev->ip_blocks[i].status.valid = true; 2243 } 2244 } else { 2245 adev->ip_blocks[i].status.valid = true; 2246 } 2247 } 2248 /* get the vbios after the asic_funcs are set up */ 2249 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2250 r = amdgpu_device_parse_gpu_info_fw(adev); 2251 if (r) 2252 return r; 2253 2254 /* Read BIOS */ 2255 if (!amdgpu_get_bios(adev)) 2256 return -EINVAL; 2257 2258 r = amdgpu_atombios_init(adev); 2259 if (r) { 2260 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2261 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2262 return r; 2263 } 2264 2265 /*get pf2vf msg info at it's earliest time*/ 2266 if (amdgpu_sriov_vf(adev)) 2267 amdgpu_virt_init_data_exchange(adev); 2268 2269 } 2270 } 2271 2272 adev->cg_flags &= amdgpu_cg_mask; 2273 adev->pg_flags &= amdgpu_pg_mask; 2274 2275 return 0; 2276 } 2277 2278 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2279 { 2280 int i, r; 2281 2282 for (i = 0; i < adev->num_ip_blocks; i++) { 2283 if (!adev->ip_blocks[i].status.sw) 2284 continue; 2285 if (adev->ip_blocks[i].status.hw) 2286 continue; 2287 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2288 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2289 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2290 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2291 if (r) { 2292 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2293 adev->ip_blocks[i].version->funcs->name, r); 2294 return r; 2295 } 2296 adev->ip_blocks[i].status.hw = true; 2297 } 2298 } 2299 2300 return 0; 2301 } 2302 2303 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2304 { 2305 int i, r; 2306 2307 for (i = 0; i < adev->num_ip_blocks; i++) { 2308 if (!adev->ip_blocks[i].status.sw) 2309 continue; 2310 if (adev->ip_blocks[i].status.hw) 2311 continue; 2312 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2313 if (r) { 2314 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2315 adev->ip_blocks[i].version->funcs->name, r); 2316 return r; 2317 } 2318 adev->ip_blocks[i].status.hw = true; 2319 } 2320 2321 return 0; 2322 } 2323 2324 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2325 { 2326 int r = 0; 2327 int i; 2328 uint32_t smu_version; 2329 2330 if (adev->asic_type >= CHIP_VEGA10) { 2331 for (i = 0; i < adev->num_ip_blocks; i++) { 2332 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2333 continue; 2334 2335 if 
(!adev->ip_blocks[i].status.sw) 2336 continue; 2337 2338 /* no need to do the fw loading again if already done*/ 2339 if (adev->ip_blocks[i].status.hw == true) 2340 break; 2341 2342 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2343 r = adev->ip_blocks[i].version->funcs->resume(adev); 2344 if (r) { 2345 DRM_ERROR("resume of IP block <%s> failed %d\n", 2346 adev->ip_blocks[i].version->funcs->name, r); 2347 return r; 2348 } 2349 } else { 2350 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2351 if (r) { 2352 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2353 adev->ip_blocks[i].version->funcs->name, r); 2354 return r; 2355 } 2356 } 2357 2358 adev->ip_blocks[i].status.hw = true; 2359 break; 2360 } 2361 } 2362 2363 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2364 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2365 2366 return r; 2367 } 2368 2369 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2370 { 2371 long timeout; 2372 int r, i; 2373 2374 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2375 struct amdgpu_ring *ring = adev->rings[i]; 2376 2377 /* No need to setup the GPU scheduler for rings that don't need it */ 2378 if (!ring || ring->no_scheduler) 2379 continue; 2380 2381 switch (ring->funcs->type) { 2382 case AMDGPU_RING_TYPE_GFX: 2383 timeout = adev->gfx_timeout; 2384 break; 2385 case AMDGPU_RING_TYPE_COMPUTE: 2386 timeout = adev->compute_timeout; 2387 break; 2388 case AMDGPU_RING_TYPE_SDMA: 2389 timeout = adev->sdma_timeout; 2390 break; 2391 default: 2392 timeout = adev->video_timeout; 2393 break; 2394 } 2395 2396 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2397 ring->num_hw_submission, amdgpu_job_hang_limit, 2398 timeout, adev->reset_domain->wq, 2399 ring->sched_score, ring->name, 2400 adev->dev); 2401 if (r) { 2402 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2403 ring->name); 2404 return r; 2405 } 2406 } 2407 2408 return 0; 2409 } 2410 2411 2412 /** 2413 * amdgpu_device_ip_init - run init for hardware IPs 2414 * 2415 * @adev: amdgpu_device pointer 2416 * 2417 * Main initialization pass for hardware IPs. The list of all the hardware 2418 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2419 * are run. sw_init initializes the software state associated with each IP 2420 * and hw_init initializes the hardware associated with each IP. 2421 * Returns 0 on success, negative error code on failure. 
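 *
 * Rough ordering implemented below (a summary of this function, not a
 * contract): sw_init runs for every valid IP, with COMMON and GMC also
 * getting an early hw_init so GPU memory (VRAM scratch, writeback, CSA)
 * can be set up; then the IB pool and ucode BO are created, followed by
 * hw_init phase 1, firmware loading, and hw_init phase 2 for the rest.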
2422 */ 2423 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2424 { 2425 int i, r; 2426 2427 r = amdgpu_ras_init(adev); 2428 if (r) 2429 return r; 2430 2431 for (i = 0; i < adev->num_ip_blocks; i++) { 2432 if (!adev->ip_blocks[i].status.valid) 2433 continue; 2434 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2435 if (r) { 2436 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2437 adev->ip_blocks[i].version->funcs->name, r); 2438 goto init_failed; 2439 } 2440 adev->ip_blocks[i].status.sw = true; 2441 2442 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2443 /* need to do common hw init early so everything is set up for gmc */ 2444 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2445 if (r) { 2446 DRM_ERROR("hw_init %d failed %d\n", i, r); 2447 goto init_failed; 2448 } 2449 adev->ip_blocks[i].status.hw = true; 2450 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2451 /* need to do gmc hw init early so we can allocate gpu mem */ 2452 /* Try to reserve bad pages early */ 2453 if (amdgpu_sriov_vf(adev)) 2454 amdgpu_virt_exchange_data(adev); 2455 2456 r = amdgpu_device_vram_scratch_init(adev); 2457 if (r) { 2458 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2459 goto init_failed; 2460 } 2461 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2462 if (r) { 2463 DRM_ERROR("hw_init %d failed %d\n", i, r); 2464 goto init_failed; 2465 } 2466 r = amdgpu_device_wb_init(adev); 2467 if (r) { 2468 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2469 goto init_failed; 2470 } 2471 adev->ip_blocks[i].status.hw = true; 2472 2473 /* right after GMC hw init, we create CSA */ 2474 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2475 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2476 AMDGPU_GEM_DOMAIN_VRAM, 2477 AMDGPU_CSA_SIZE); 2478 if (r) { 2479 DRM_ERROR("allocate CSA failed %d\n", r); 2480 goto init_failed; 2481 } 2482 } 2483 } 2484 } 2485 2486 if (amdgpu_sriov_vf(adev)) 2487 amdgpu_virt_init_data_exchange(adev); 2488 2489 r = amdgpu_ib_pool_init(adev); 2490 if (r) { 2491 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2492 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2493 goto init_failed; 2494 } 2495 2496 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2497 if (r) 2498 goto init_failed; 2499 2500 r = amdgpu_device_ip_hw_init_phase1(adev); 2501 if (r) 2502 goto init_failed; 2503 2504 r = amdgpu_device_fw_loading(adev); 2505 if (r) 2506 goto init_failed; 2507 2508 r = amdgpu_device_ip_hw_init_phase2(adev); 2509 if (r) 2510 goto init_failed; 2511 2512 /* 2513 * retired pages will be loaded from eeprom and reserved here, 2514 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2515 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2516 * for I2C communication which only true at this point. 2517 * 2518 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2519 * failure from bad gpu situation and stop amdgpu init process 2520 * accordingly. For other failed cases, it will still release all 2521 * the resource and print error message, rather than returning one 2522 * negative value to upper level. 
2523 *
2524 * Note: theoretically, this should be called before all vram allocations
2525 * to protect the retired pages from being reused.
2526 */
2527 r = amdgpu_ras_recovery_init(adev);
2528 if (r)
2529 goto init_failed;
2530
2531 /*
2532 * In case of XGMI, grab an extra reference on the reset domain for this device.
2533 */
2534 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2535 if (amdgpu_xgmi_add_device(adev) == 0) {
2536 if (!amdgpu_sriov_vf(adev)) {
2537 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2538
2539 if (WARN_ON(!hive)) {
2540 r = -ENOENT;
2541 goto init_failed;
2542 }
2543
2544 if (!hive->reset_domain ||
2545 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2546 r = -ENOENT;
2547 amdgpu_put_xgmi_hive(hive);
2548 goto init_failed;
2549 }
2550
2551 /* Drop the early temporary reset domain we created for device */
2552 amdgpu_reset_put_reset_domain(adev->reset_domain);
2553 adev->reset_domain = hive->reset_domain;
2554 amdgpu_put_xgmi_hive(hive);
2555 }
2556 }
2557 }
2558
2559 r = amdgpu_device_init_schedulers(adev);
2560 if (r)
2561 goto init_failed;
2562
2563 /* Don't init kfd if the whole hive needs to be reset during init */
2564 if (!adev->gmc.xgmi.pending_reset)
2565 amdgpu_amdkfd_device_init(adev);
2566
2567 amdgpu_fru_get_product_info(adev);
2568
2569 init_failed:
2570
2571 return r;
2572 }
2573
2574 /**
2575 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2576 *
2577 * @adev: amdgpu_device pointer
2578 *
2579 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2580 * this function before a GPU reset. If the value is retained after a
2581 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2582 */
2583 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2584 {
2585 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2586 }
2587
2588 /**
2589 * amdgpu_device_check_vram_lost - check if vram is valid
2590 *
2591 * @adev: amdgpu_device pointer
2592 *
2593 * Checks the reset magic value written to the gart pointer in VRAM.
2594 * The driver calls this after a GPU reset to see if the contents of
2595 * VRAM have been lost or not.
2596 * Returns true if vram is lost, false if not.
2597 */
2598 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2599 {
2600 if (memcmp(adev->gart.ptr, adev->reset_magic,
2601 AMDGPU_RESET_MAGIC_NUM))
2602 return true;
2603
2604 if (!amdgpu_in_reset(adev))
2605 return false;
2606
2607 /*
2608 * For all ASICs with baco/mode1 reset, the VRAM is
2609 * always assumed to be lost.
2610 */
2611 switch (amdgpu_asic_reset_method(adev)) {
2612 case AMD_RESET_METHOD_BACO:
2613 case AMD_RESET_METHOD_MODE1:
2614 return true;
2615 default:
2616 return false;
2617 }
2618 }
2619
2620 /**
2621 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2622 *
2623 * @adev: amdgpu_device pointer
2624 * @state: clockgating state (gate or ungate)
2625 *
2626 * The list of all the hardware IPs that make up the asic is walked and the
2627 * set_clockgating_state callbacks are run.
2628 * During the late initialization pass this enables clockgating for the
2629 * hardware IPs; during fini or suspend it disables clockgating.
2630 * Returns 0 on success, negative error code on failure.
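 *
 * Typical call sites in this file: amdgpu_device_ip_late_init() gates with
 * AMD_CG_STATE_GATE, while the early-fini and suspend paths ungate with
 * AMD_CG_STATE_UNGATE before the hardware is torn down.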
2631 */ 2632 2633 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2634 enum amd_clockgating_state state) 2635 { 2636 int i, j, r; 2637 2638 if (amdgpu_emu_mode == 1) 2639 return 0; 2640 2641 for (j = 0; j < adev->num_ip_blocks; j++) { 2642 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2643 if (!adev->ip_blocks[i].status.late_initialized) 2644 continue; 2645 /* skip CG for GFX on S0ix */ 2646 if (adev->in_s0ix && 2647 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2648 continue; 2649 /* skip CG for VCE/UVD, it's handled specially */ 2650 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2651 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2652 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2653 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2654 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2655 /* enable clockgating to save power */ 2656 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2657 state); 2658 if (r) { 2659 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2660 adev->ip_blocks[i].version->funcs->name, r); 2661 return r; 2662 } 2663 } 2664 } 2665 2666 return 0; 2667 } 2668 2669 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2670 enum amd_powergating_state state) 2671 { 2672 int i, j, r; 2673 2674 if (amdgpu_emu_mode == 1) 2675 return 0; 2676 2677 for (j = 0; j < adev->num_ip_blocks; j++) { 2678 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2679 if (!adev->ip_blocks[i].status.late_initialized) 2680 continue; 2681 /* skip PG for GFX on S0ix */ 2682 if (adev->in_s0ix && 2683 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2684 continue; 2685 /* skip CG for VCE/UVD, it's handled specially */ 2686 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2687 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2688 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2689 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2690 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2691 /* enable powergating to save power */ 2692 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2693 state); 2694 if (r) { 2695 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2696 adev->ip_blocks[i].version->funcs->name, r); 2697 return r; 2698 } 2699 } 2700 } 2701 return 0; 2702 } 2703 2704 static int amdgpu_device_enable_mgpu_fan_boost(void) 2705 { 2706 struct amdgpu_gpu_instance *gpu_ins; 2707 struct amdgpu_device *adev; 2708 int i, ret = 0; 2709 2710 mutex_lock(&mgpu_info.mutex); 2711 2712 /* 2713 * MGPU fan boost feature should be enabled 2714 * only when there are two or more dGPUs in 2715 * the system 2716 */ 2717 if (mgpu_info.num_dgpu < 2) 2718 goto out; 2719 2720 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2721 gpu_ins = &(mgpu_info.gpu_ins[i]); 2722 adev = gpu_ins->adev; 2723 if (!(adev->flags & AMD_IS_APU) && 2724 !gpu_ins->mgpu_fan_enabled) { 2725 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2726 if (ret) 2727 break; 2728 2729 gpu_ins->mgpu_fan_enabled = 1; 2730 } 2731 } 2732 2733 out: 2734 mutex_unlock(&mgpu_info.mutex); 2735 2736 return ret; 2737 } 2738 2739 /** 2740 * amdgpu_device_ip_late_init - run late init for hardware IPs 2741 * 2742 * @adev: amdgpu_device pointer 2743 * 2744 * Late initialization pass for hardware IPs. 
The list of all the hardware
2745 * IPs that make up the asic is walked and the late_init callbacks are run.
2746 * late_init covers any special initialization that an IP requires
2747 * after all of the IPs have been initialized or something that needs to happen
2748 * late in the init process.
2749 * Returns 0 on success, negative error code on failure.
2750 */
2751 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2752 {
2753 struct amdgpu_gpu_instance *gpu_instance;
2754 int i = 0, r;
2755
2756 for (i = 0; i < adev->num_ip_blocks; i++) {
2757 if (!adev->ip_blocks[i].status.hw)
2758 continue;
2759 if (adev->ip_blocks[i].version->funcs->late_init) {
2760 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2761 if (r) {
2762 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2763 adev->ip_blocks[i].version->funcs->name, r);
2764 return r;
2765 }
2766 }
2767 adev->ip_blocks[i].status.late_initialized = true;
2768 }
2769
2770 r = amdgpu_ras_late_init(adev);
2771 if (r) {
2772 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2773 return r;
2774 }
2775
2776 amdgpu_ras_set_error_query_ready(adev, true);
2777
2778 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2779 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2780
2781 amdgpu_device_fill_reset_magic(adev);
2782
2783 r = amdgpu_device_enable_mgpu_fan_boost();
2784 if (r)
2785 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2786
2787 /* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
2788 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2789 adev->asic_type == CHIP_ALDEBARAN))
2790 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2791
2792 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2793 mutex_lock(&mgpu_info.mutex);
2794
2795 /*
2796 * Reset the device p-state to low, as this was booted with high.
2797 *
2798 * This should be performed only after all devices from the same
2799 * hive have been initialized.
2800 *
2801 * However, the number of devices in the hive is not known in
2802 * advance; it is counted one by one as the devices initialize.
2803 *
2804 * So we wait until all XGMI interlinked devices are initialized.
2805 * This may introduce some delay, as those devices may come from
2806 * different hives, but that should be OK.
2807 */
2808 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2809 for (i = 0; i < mgpu_info.num_gpu; i++) {
2810 gpu_instance = &(mgpu_info.gpu_ins[i]);
2811 if (gpu_instance->adev->flags & AMD_IS_APU)
2812 continue;
2813
2814 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2815 AMDGPU_XGMI_PSTATE_MIN);
2816 if (r) {
2817 DRM_ERROR("pstate setting failed (%d).\n", r);
2818 break;
2819 }
2820 }
2821 }
2822
2823 mutex_unlock(&mgpu_info.mutex);
2824 }
2825
2826 return 0;
2827 }
2828
2829 /**
2830 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2831 *
2832 * @adev: amdgpu_device pointer
2833 *
2834 * For ASICs that need to disable the SMC first.
2835 */
2836 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2837 {
2838 int i, r;
2839
2840 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2841 return;
2842
2843 for (i = 0; i < adev->num_ip_blocks; i++) {
2844 if (!adev->ip_blocks[i].status.hw)
2845 continue;
2846 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2847 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2848 /* XXX handle errors */
2849 if (r) {
2850 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2851 adev->ip_blocks[i].version->funcs->name, r);
2852 }
2853 adev->ip_blocks[i].status.hw = false;
2854 break;
2855 }
2856 }
2857 }
2858
2859 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2860 {
2861 int i, r;
2862
2863 for (i = 0; i < adev->num_ip_blocks; i++) {
2864 if (!adev->ip_blocks[i].version->funcs->early_fini)
2865 continue;
2866
2867 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2868 if (r) {
2869 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2870 adev->ip_blocks[i].version->funcs->name, r);
2871 }
2872 }
2873
2874 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2875 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2876
2877 amdgpu_amdkfd_suspend(adev, false);
2878
2879 /* Workaround for ASICs that need to disable the SMC first */
2880 amdgpu_device_smu_fini_early(adev);
2881
2882 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2883 if (!adev->ip_blocks[i].status.hw)
2884 continue;
2885
2886 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2887 /* XXX handle errors */
2888 if (r) {
2889 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2890 adev->ip_blocks[i].version->funcs->name, r);
2891 }
2892
2893 adev->ip_blocks[i].status.hw = false;
2894 }
2895
2896 if (amdgpu_sriov_vf(adev)) {
2897 if (amdgpu_virt_release_full_gpu(adev, false))
2898 DRM_ERROR("failed to release exclusive mode on fini\n");
2899 }
2900
2901 return 0;
2902 }
2903
2904 /**
2905 * amdgpu_device_ip_fini - run fini for hardware IPs
2906 *
2907 * @adev: amdgpu_device pointer
2908 *
2909 * Main teardown pass for hardware IPs. The list of all the hardware
2910 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2911 * are run. hw_fini tears down the hardware associated with each IP
2912 * and sw_fini tears down any software state associated with each IP.
2913 * Returns 0 on success, negative error code on failure.
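 *
 * Note (derived from the loops below): teardown walks the IP list in
 * reverse so blocks are finalized in the opposite order of their
 * initialization, and the GMC-owned buffers (ucode BO, static CSA,
 * writeback, VRAM scratch, IB pool) are released just before the GMC's
 * own sw_fini.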
2914 */ 2915 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2916 { 2917 int i, r; 2918 2919 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2920 amdgpu_virt_release_ras_err_handler_data(adev); 2921 2922 if (adev->gmc.xgmi.num_physical_nodes > 1) 2923 amdgpu_xgmi_remove_device(adev); 2924 2925 amdgpu_amdkfd_device_fini_sw(adev); 2926 2927 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2928 if (!adev->ip_blocks[i].status.sw) 2929 continue; 2930 2931 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2932 amdgpu_ucode_free_bo(adev); 2933 amdgpu_free_static_csa(&adev->virt.csa_obj); 2934 amdgpu_device_wb_fini(adev); 2935 amdgpu_device_vram_scratch_fini(adev); 2936 amdgpu_ib_pool_fini(adev); 2937 } 2938 2939 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2940 /* XXX handle errors */ 2941 if (r) { 2942 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2943 adev->ip_blocks[i].version->funcs->name, r); 2944 } 2945 adev->ip_blocks[i].status.sw = false; 2946 adev->ip_blocks[i].status.valid = false; 2947 } 2948 2949 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2950 if (!adev->ip_blocks[i].status.late_initialized) 2951 continue; 2952 if (adev->ip_blocks[i].version->funcs->late_fini) 2953 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2954 adev->ip_blocks[i].status.late_initialized = false; 2955 } 2956 2957 amdgpu_ras_fini(adev); 2958 2959 return 0; 2960 } 2961 2962 /** 2963 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2964 * 2965 * @work: work_struct. 2966 */ 2967 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2968 { 2969 struct amdgpu_device *adev = 2970 container_of(work, struct amdgpu_device, delayed_init_work.work); 2971 int r; 2972 2973 r = amdgpu_ib_ring_tests(adev); 2974 if (r) 2975 DRM_ERROR("ib ring test failed (%d).\n", r); 2976 } 2977 2978 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2979 { 2980 struct amdgpu_device *adev = 2981 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2982 2983 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2984 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2985 2986 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2987 adev->gfx.gfx_off_state = true; 2988 } 2989 2990 /** 2991 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2992 * 2993 * @adev: amdgpu_device pointer 2994 * 2995 * Main suspend function for hardware IPs. The list of all the hardware 2996 * IPs that make up the asic is walked, clockgating is disabled and the 2997 * suspend callbacks are run. suspend puts the hardware and software state 2998 * in each IP into a state suitable for suspend. 2999 * Returns 0 on success, negative error code on failure. 3000 */ 3001 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3002 { 3003 int i, r; 3004 3005 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3006 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3007 3008 /* 3009 * Per PMFW team's suggestion, driver needs to handle gfxoff 3010 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3011 * scenario. Add the missing df cstate disablement here. 
3012 */ 3013 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3014 dev_warn(adev->dev, "Failed to disallow df cstate"); 3015 3016 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3017 if (!adev->ip_blocks[i].status.valid) 3018 continue; 3019 3020 /* displays are handled separately */ 3021 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3022 continue; 3023 3024 /* XXX handle errors */ 3025 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3026 /* XXX handle errors */ 3027 if (r) { 3028 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3029 adev->ip_blocks[i].version->funcs->name, r); 3030 return r; 3031 } 3032 3033 adev->ip_blocks[i].status.hw = false; 3034 } 3035 3036 return 0; 3037 } 3038 3039 /** 3040 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3041 * 3042 * @adev: amdgpu_device pointer 3043 * 3044 * Main suspend function for hardware IPs. The list of all the hardware 3045 * IPs that make up the asic is walked, clockgating is disabled and the 3046 * suspend callbacks are run. suspend puts the hardware and software state 3047 * in each IP into a state suitable for suspend. 3048 * Returns 0 on success, negative error code on failure. 3049 */ 3050 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3051 { 3052 int i, r; 3053 3054 if (adev->in_s0ix) 3055 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3056 3057 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3058 if (!adev->ip_blocks[i].status.valid) 3059 continue; 3060 /* displays are handled in phase1 */ 3061 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3062 continue; 3063 /* PSP lost connection when err_event_athub occurs */ 3064 if (amdgpu_ras_intr_triggered() && 3065 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3066 adev->ip_blocks[i].status.hw = false; 3067 continue; 3068 } 3069 3070 /* skip unnecessary suspend if we do not initialize them yet */ 3071 if (adev->gmc.xgmi.pending_reset && 3072 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3073 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3074 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3075 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3076 adev->ip_blocks[i].status.hw = false; 3077 continue; 3078 } 3079 3080 /* skip suspend of gfx/mes and psp for S0ix 3081 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3082 * like at runtime. PSP is also part of the always on hardware 3083 * so no need to suspend it. 3084 */ 3085 if (adev->in_s0ix && 3086 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3087 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3088 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3089 continue; 3090 3091 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3092 if (adev->in_s0ix && 3093 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3094 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3095 continue; 3096 3097 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3098 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3099 * from this location and RLC Autoload automatically also gets loaded 3100 * from here based on PMFW -> PSP message during re-init sequence. 3101 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3102 * the TMR and reload FWs again for IMU enabled APU ASICs. 
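 *
 * (In the check below, "IMU enabled" simply means adev->gfx.imu.funcs is
 * non-NULL, and the skip only applies while a reset is in progress.)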
3103 */ 3104 if (amdgpu_in_reset(adev) && 3105 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3106 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3107 continue; 3108 3109 /* XXX handle errors */ 3110 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3111 /* XXX handle errors */ 3112 if (r) { 3113 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3114 adev->ip_blocks[i].version->funcs->name, r); 3115 } 3116 adev->ip_blocks[i].status.hw = false; 3117 /* handle putting the SMC in the appropriate state */ 3118 if(!amdgpu_sriov_vf(adev)){ 3119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3120 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3121 if (r) { 3122 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3123 adev->mp1_state, r); 3124 return r; 3125 } 3126 } 3127 } 3128 } 3129 3130 return 0; 3131 } 3132 3133 /** 3134 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3135 * 3136 * @adev: amdgpu_device pointer 3137 * 3138 * Main suspend function for hardware IPs. The list of all the hardware 3139 * IPs that make up the asic is walked, clockgating is disabled and the 3140 * suspend callbacks are run. suspend puts the hardware and software state 3141 * in each IP into a state suitable for suspend. 3142 * Returns 0 on success, negative error code on failure. 3143 */ 3144 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3145 { 3146 int r; 3147 3148 if (amdgpu_sriov_vf(adev)) { 3149 amdgpu_virt_fini_data_exchange(adev); 3150 amdgpu_virt_request_full_gpu(adev, false); 3151 } 3152 3153 r = amdgpu_device_ip_suspend_phase1(adev); 3154 if (r) 3155 return r; 3156 r = amdgpu_device_ip_suspend_phase2(adev); 3157 3158 if (amdgpu_sriov_vf(adev)) 3159 amdgpu_virt_release_full_gpu(adev, false); 3160 3161 return r; 3162 } 3163 3164 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3165 { 3166 int i, r; 3167 3168 static enum amd_ip_block_type ip_order[] = { 3169 AMD_IP_BLOCK_TYPE_COMMON, 3170 AMD_IP_BLOCK_TYPE_GMC, 3171 AMD_IP_BLOCK_TYPE_PSP, 3172 AMD_IP_BLOCK_TYPE_IH, 3173 }; 3174 3175 for (i = 0; i < adev->num_ip_blocks; i++) { 3176 int j; 3177 struct amdgpu_ip_block *block; 3178 3179 block = &adev->ip_blocks[i]; 3180 block->status.hw = false; 3181 3182 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3183 3184 if (block->version->type != ip_order[j] || 3185 !block->status.valid) 3186 continue; 3187 3188 r = block->version->funcs->hw_init(adev); 3189 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3190 if (r) 3191 return r; 3192 block->status.hw = true; 3193 } 3194 } 3195 3196 return 0; 3197 } 3198 3199 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3200 { 3201 int i, r; 3202 3203 static enum amd_ip_block_type ip_order[] = { 3204 AMD_IP_BLOCK_TYPE_SMC, 3205 AMD_IP_BLOCK_TYPE_DCE, 3206 AMD_IP_BLOCK_TYPE_GFX, 3207 AMD_IP_BLOCK_TYPE_SDMA, 3208 AMD_IP_BLOCK_TYPE_UVD, 3209 AMD_IP_BLOCK_TYPE_VCE, 3210 AMD_IP_BLOCK_TYPE_VCN 3211 }; 3212 3213 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3214 int j; 3215 struct amdgpu_ip_block *block; 3216 3217 for (j = 0; j < adev->num_ip_blocks; j++) { 3218 block = &adev->ip_blocks[j]; 3219 3220 if (block->version->type != ip_order[i] || 3221 !block->status.valid || 3222 block->status.hw) 3223 continue; 3224 3225 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3226 r = block->version->funcs->resume(adev); 3227 else 3228 r = block->version->funcs->hw_init(adev); 3229 3230 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3231 if (r) 3232 return r; 3233 block->status.hw = true; 3234 } 3235 } 3236 3237 return 0; 3238 } 3239 3240 /** 3241 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3242 * 3243 * @adev: amdgpu_device pointer 3244 * 3245 * First resume function for hardware IPs. The list of all the hardware 3246 * IPs that make up the asic is walked and the resume callbacks are run for 3247 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3248 * after a suspend and updates the software state as necessary. This 3249 * function is also used for restoring the GPU after a GPU reset. 3250 * Returns 0 on success, negative error code on failure. 3251 */ 3252 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3253 { 3254 int i, r; 3255 3256 for (i = 0; i < adev->num_ip_blocks; i++) { 3257 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3258 continue; 3259 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3260 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3261 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3262 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3263 3264 r = adev->ip_blocks[i].version->funcs->resume(adev); 3265 if (r) { 3266 DRM_ERROR("resume of IP block <%s> failed %d\n", 3267 adev->ip_blocks[i].version->funcs->name, r); 3268 return r; 3269 } 3270 adev->ip_blocks[i].status.hw = true; 3271 } 3272 } 3273 3274 return 0; 3275 } 3276 3277 /** 3278 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3279 * 3280 * @adev: amdgpu_device pointer 3281 * 3282 * First resume function for hardware IPs. The list of all the hardware 3283 * IPs that make up the asic is walked and the resume callbacks are run for 3284 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3285 * functional state after a suspend and updates the software state as 3286 * necessary. This function is also used for restoring the GPU after a GPU 3287 * reset. 3288 * Returns 0 on success, negative error code on failure. 3289 */ 3290 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3291 { 3292 int i, r; 3293 3294 for (i = 0; i < adev->num_ip_blocks; i++) { 3295 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3296 continue; 3297 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3298 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3299 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3300 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3301 continue; 3302 r = adev->ip_blocks[i].version->funcs->resume(adev); 3303 if (r) { 3304 DRM_ERROR("resume of IP block <%s> failed %d\n", 3305 adev->ip_blocks[i].version->funcs->name, r); 3306 return r; 3307 } 3308 adev->ip_blocks[i].status.hw = true; 3309 3310 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3311 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3312 * amdgpu_device_resume() after IP resume. 3313 */ 3314 amdgpu_gfx_off_ctrl(adev, false); 3315 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3316 } 3317 3318 } 3319 3320 return 0; 3321 } 3322 3323 /** 3324 * amdgpu_device_ip_resume - run resume for hardware IPs 3325 * 3326 * @adev: amdgpu_device pointer 3327 * 3328 * Main resume function for hardware IPs. 
The hardware IPs
3329 * are split into two resume functions because they are
3330 * also used in recovering from a GPU reset, and some additional
3331 * steps need to be taken between them. In this case (S3/S4) they are
3332 * run sequentially.
3333 * Returns 0 on success, negative error code on failure.
3334 */
3335 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3336 {
3337 int r;
3338
3339 r = amdgpu_amdkfd_resume_iommu(adev);
3340 if (r)
3341 return r;
3342
3343 r = amdgpu_device_ip_resume_phase1(adev);
3344 if (r)
3345 return r;
3346
3347 r = amdgpu_device_fw_loading(adev);
3348 if (r)
3349 return r;
3350
3351 r = amdgpu_device_ip_resume_phase2(adev);
3352
3353 return r;
3354 }
3355
3356 /**
3357 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3358 *
3359 * @adev: amdgpu_device pointer
3360 *
3361 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3362 */
3363 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3364 {
3365 if (amdgpu_sriov_vf(adev)) {
3366 if (adev->is_atom_fw) {
3367 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3368 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3369 } else {
3370 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3371 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3372 }
3373
3374 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3375 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3376 }
3377 }
3378
3379 /**
3380 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3381 *
3382 * @asic_type: AMD asic type
3383 *
3384 * Check if there is DC (new modesetting infrastructure) support for an asic.
3385 * Returns true if DC has support, false if not.
3386 */
3387 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3388 {
3389 switch (asic_type) {
3390 #ifdef CONFIG_DRM_AMDGPU_SI
3391 case CHIP_HAINAN:
3392 #endif
3393 case CHIP_TOPAZ:
3394 /* chips with no display hardware */
3395 return false;
3396 #if defined(CONFIG_DRM_AMD_DC)
3397 case CHIP_TAHITI:
3398 case CHIP_PITCAIRN:
3399 case CHIP_VERDE:
3400 case CHIP_OLAND:
3401 /*
3402 * We have systems in the wild with these ASICs that require
3403 * LVDS and VGA support which is not supported with DC.
3404 *
3405 * Fall back to the non-DC driver here by default so as not to
3406 * cause regressions.
3407 */
3408 #if defined(CONFIG_DRM_AMD_DC_SI)
3409 return amdgpu_dc > 0;
3410 #else
3411 return false;
3412 #endif
3413 case CHIP_BONAIRE:
3414 case CHIP_KAVERI:
3415 case CHIP_KABINI:
3416 case CHIP_MULLINS:
3417 /*
3418 * We have systems in the wild with these ASICs that require
3419 * VGA support which is not supported with DC.
3420 *
3421 * Fall back to the non-DC driver here by default so as not to
3422 * cause regressions.
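 *
 * (Users can still opt in explicitly, e.g. with the amdgpu.dc=1
 * module parameter; that is what the "amdgpu_dc > 0" check below
 * honours.)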
3423 */ 3424 return amdgpu_dc > 0; 3425 default: 3426 return amdgpu_dc != 0; 3427 #else 3428 default: 3429 if (amdgpu_dc > 0) 3430 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3431 "but isn't supported by ASIC, ignoring\n"); 3432 return false; 3433 #endif 3434 } 3435 } 3436 3437 /** 3438 * amdgpu_device_has_dc_support - check if dc is supported 3439 * 3440 * @adev: amdgpu_device pointer 3441 * 3442 * Returns true for supported, false for not supported 3443 */ 3444 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3445 { 3446 if (amdgpu_sriov_vf(adev) || 3447 adev->enable_virtual_display || 3448 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3449 return false; 3450 3451 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3452 } 3453 3454 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3455 { 3456 struct amdgpu_device *adev = 3457 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3458 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3459 3460 /* It's a bug to not have a hive within this function */ 3461 if (WARN_ON(!hive)) 3462 return; 3463 3464 /* 3465 * Use task barrier to synchronize all xgmi reset works across the 3466 * hive. task_barrier_enter and task_barrier_exit will block 3467 * until all the threads running the xgmi reset works reach 3468 * those points. task_barrier_full will do both blocks. 3469 */ 3470 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3471 3472 task_barrier_enter(&hive->tb); 3473 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3474 3475 if (adev->asic_reset_res) 3476 goto fail; 3477 3478 task_barrier_exit(&hive->tb); 3479 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3480 3481 if (adev->asic_reset_res) 3482 goto fail; 3483 3484 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3485 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3486 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3487 } else { 3488 3489 task_barrier_full(&hive->tb); 3490 adev->asic_reset_res = amdgpu_asic_reset(adev); 3491 } 3492 3493 fail: 3494 if (adev->asic_reset_res) 3495 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3496 adev->asic_reset_res, adev_to_drm(adev)->unique); 3497 amdgpu_put_xgmi_hive(hive); 3498 } 3499 3500 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3501 { 3502 char *input = amdgpu_lockup_timeout; 3503 char *timeout_setting = NULL; 3504 int index = 0; 3505 long timeout; 3506 int ret = 0; 3507 3508 /* 3509 * By default timeout for non compute jobs is 10000 3510 * and 60000 for compute jobs. 3511 * In SR-IOV or passthrough mode, timeout for compute 3512 * jobs are 60000 by default. 3513 */ 3514 adev->gfx_timeout = msecs_to_jiffies(10000); 3515 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3516 if (amdgpu_sriov_vf(adev)) 3517 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3518 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3519 else 3520 adev->compute_timeout = msecs_to_jiffies(60000); 3521 3522 #ifdef notyet 3523 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3524 while ((timeout_setting = strsep(&input, ",")) && 3525 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3526 ret = kstrtol(timeout_setting, 0, &timeout); 3527 if (ret) 3528 return ret; 3529 3530 if (timeout == 0) { 3531 index++; 3532 continue; 3533 } else if (timeout < 0) { 3534 timeout = MAX_SCHEDULE_TIMEOUT; 3535 dev_warn(adev->dev, "lockup timeout disabled"); 3536 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3537 } else { 3538 timeout = msecs_to_jiffies(timeout); 3539 } 3540 3541 switch (index++) { 3542 case 0: 3543 adev->gfx_timeout = timeout; 3544 break; 3545 case 1: 3546 adev->compute_timeout = timeout; 3547 break; 3548 case 2: 3549 adev->sdma_timeout = timeout; 3550 break; 3551 case 3: 3552 adev->video_timeout = timeout; 3553 break; 3554 default: 3555 break; 3556 } 3557 } 3558 /* 3559 * There is only one value specified and 3560 * it should apply to all non-compute jobs. 3561 */ 3562 if (index == 1) { 3563 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3564 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3565 adev->compute_timeout = adev->gfx_timeout; 3566 } 3567 } 3568 #endif 3569 3570 return ret; 3571 } 3572 3573 /** 3574 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3575 * 3576 * @adev: amdgpu_device pointer 3577 * 3578 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3579 */ 3580 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3581 { 3582 #ifdef notyet 3583 struct iommu_domain *domain; 3584 3585 domain = iommu_get_domain_for_dev(adev->dev); 3586 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3587 #endif 3588 adev->ram_is_direct_mapped = true; 3589 } 3590 3591 static const struct attribute *amdgpu_dev_attributes[] = { 3592 &dev_attr_product_name.attr, 3593 &dev_attr_product_number.attr, 3594 &dev_attr_serial_number.attr, 3595 &dev_attr_pcie_replay_count.attr, 3596 NULL 3597 }; 3598 3599 /** 3600 * amdgpu_device_init - initialize the driver 3601 * 3602 * @adev: amdgpu_device pointer 3603 * @flags: driver flags 3604 * 3605 * Initializes the driver info and hw (all asics). 3606 * Returns 0 for success or an error on failure. 3607 * Called at driver startup. 
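 *
 * As the function body shows, the low bits of @flags carry the ASIC type
 * (extracted with AMD_ASIC_MASK) and may be combined with feature bits
 * such as AMD_IS_APU.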
3608 */ 3609 int amdgpu_device_init(struct amdgpu_device *adev, 3610 uint32_t flags) 3611 { 3612 struct drm_device *ddev = adev_to_drm(adev); 3613 struct pci_dev *pdev = adev->pdev; 3614 int r, i; 3615 bool px = false; 3616 u32 max_MBps; 3617 int tmp; 3618 3619 adev->shutdown = false; 3620 adev->flags = flags; 3621 3622 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3623 adev->asic_type = amdgpu_force_asic_type; 3624 else 3625 adev->asic_type = flags & AMD_ASIC_MASK; 3626 3627 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3628 if (amdgpu_emu_mode == 1) 3629 adev->usec_timeout *= 10; 3630 adev->gmc.gart_size = 512 * 1024 * 1024; 3631 adev->accel_working = false; 3632 adev->num_rings = 0; 3633 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3634 adev->mman.buffer_funcs = NULL; 3635 adev->mman.buffer_funcs_ring = NULL; 3636 adev->vm_manager.vm_pte_funcs = NULL; 3637 adev->vm_manager.vm_pte_num_scheds = 0; 3638 adev->gmc.gmc_funcs = NULL; 3639 adev->harvest_ip_mask = 0x0; 3640 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3641 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3642 3643 adev->smc_rreg = &amdgpu_invalid_rreg; 3644 adev->smc_wreg = &amdgpu_invalid_wreg; 3645 adev->pcie_rreg = &amdgpu_invalid_rreg; 3646 adev->pcie_wreg = &amdgpu_invalid_wreg; 3647 adev->pciep_rreg = &amdgpu_invalid_rreg; 3648 adev->pciep_wreg = &amdgpu_invalid_wreg; 3649 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3650 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3651 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3652 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3653 adev->didt_rreg = &amdgpu_invalid_rreg; 3654 adev->didt_wreg = &amdgpu_invalid_wreg; 3655 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3656 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3657 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3658 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3659 3660 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3661 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3662 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3663 3664 /* mutex initialization are all done here so we 3665 * can recall function without having locking issues */ 3666 rw_init(&adev->firmware.mutex, "agfw"); 3667 rw_init(&adev->pm.mutex, "agpm"); 3668 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3669 rw_init(&adev->srbm_mutex, "srbm"); 3670 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3671 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3672 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3673 rw_init(&adev->mn_lock, "agpumn"); 3674 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3675 hash_init(adev->mn_hash); 3676 rw_init(&adev->psp.mutex, "agpsp"); 3677 rw_init(&adev->notifier_lock, "agnf"); 3678 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3679 rw_init(&adev->benchmark_mutex, "agbm"); 3680 3681 amdgpu_device_init_apu_flags(adev); 3682 3683 r = amdgpu_device_check_arguments(adev); 3684 if (r) 3685 return r; 3686 3687 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3688 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3689 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3690 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3691 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3692 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3693 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3694 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3695 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3696 3697 INIT_LIST_HEAD(&adev->shadow_list); 3698 
rw_init(&adev->shadow_list_lock, "sdwlst");
3699
3700 INIT_LIST_HEAD(&adev->reset_list);
3701
3702 INIT_LIST_HEAD(&adev->ras_list);
3703
3704 INIT_DELAYED_WORK(&adev->delayed_init_work,
3705 amdgpu_device_delayed_init_work_handler);
3706 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3707 amdgpu_device_delay_enable_gfx_off);
3708
3709 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3710
3711 adev->gfx.gfx_off_req_count = 1;
3712 adev->gfx.gfx_off_residency = 0;
3713 adev->gfx.gfx_off_entrycount = 0;
3714 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3715
3716 atomic_set(&adev->throttling_logging_enabled, 1);
3717 /*
3718 * If throttling continues, logging will be performed every minute
3719 * to avoid log flooding. "-1" is subtracted since the thermal
3720 * throttling interrupt comes every second. Thus, the total logging
3721 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3722 * for throttling interrupt) = 60 seconds.
3723 */
3724 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3725 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3726
3727 #ifdef __linux__
3728 /* Registers mapping */
3729 /* TODO: block userspace mapping of io registers */
3730 if (adev->asic_type >= CHIP_BONAIRE) {
3731 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3732 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3733 } else {
3734 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3735 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3736 }
3737
3738 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3739 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3740
3741 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3742 if (adev->rmmio == NULL) {
3743 return -ENOMEM;
3744 }
3745 #endif
3746 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3747 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3748
3749 amdgpu_device_get_pcie_info(adev);
3750
3751 if (amdgpu_mcbp)
3752 DRM_INFO("MCBP is enabled\n");
3753
3754 /*
3755 * The reset domain needs to be present early, before the XGMI hive is
3756 * discovered (if any) and initialized, so that the reset sem and the
3757 * in_gpu_reset flag can be used early during init and before calling RREG32.
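 * For XGMI configurations this per-device domain is only temporary: it is
 * dropped in amdgpu_device_ip_init() and replaced by the hive-wide reset
 * domain once the hive has been discovered.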
3758 */ 3759 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3760 if (!adev->reset_domain) 3761 return -ENOMEM; 3762 3763 /* detect hw virtualization here */ 3764 amdgpu_detect_virtualization(adev); 3765 3766 r = amdgpu_device_get_job_timeout_settings(adev); 3767 if (r) { 3768 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3769 return r; 3770 } 3771 3772 /* early init functions */ 3773 r = amdgpu_device_ip_early_init(adev); 3774 if (r) 3775 return r; 3776 3777 /* Get rid of things like offb */ 3778 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3779 if (r) 3780 return r; 3781 3782 /* Enable TMZ based on IP_VERSION */ 3783 amdgpu_gmc_tmz_set(adev); 3784 3785 amdgpu_gmc_noretry_set(adev); 3786 /* Need to get xgmi info early to decide the reset behavior*/ 3787 if (adev->gmc.xgmi.supported) { 3788 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3789 if (r) 3790 return r; 3791 } 3792 3793 /* enable PCIE atomic ops */ 3794 #ifdef notyet 3795 if (amdgpu_sriov_vf(adev)) 3796 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3797 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3798 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3799 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3800 * internal path natively support atomics, set have_atomics_support to true. 3801 */ 3802 else if ((adev->flags & AMD_IS_APU) && 3803 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3804 adev->have_atomics_support = true; 3805 else 3806 adev->have_atomics_support = 3807 !pci_enable_atomic_ops_to_root(adev->pdev, 3808 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3809 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3810 if (!adev->have_atomics_support) 3811 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3812 #else 3813 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3814 * internal path natively support atomics, set have_atomics_support to true. 3815 */ 3816 if ((adev->flags & AMD_IS_APU) && 3817 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3818 adev->have_atomics_support = true; 3819 else 3820 adev->have_atomics_support = false; 3821 #endif 3822 3823 /* doorbell bar mapping and doorbell index init*/ 3824 amdgpu_device_doorbell_init(adev); 3825 3826 if (amdgpu_emu_mode == 1) { 3827 /* post the asic on emulation mode */ 3828 emu_soc_asic_init(adev); 3829 goto fence_driver_init; 3830 } 3831 3832 amdgpu_reset_init(adev); 3833 3834 /* detect if we are with an SRIOV vbios */ 3835 amdgpu_device_detect_sriov_bios(adev); 3836 3837 /* check if we need to reset the asic 3838 * E.g., driver was not cleanly unloaded previously, etc. 
3839 */ 3840 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3841 if (adev->gmc.xgmi.num_physical_nodes) { 3842 dev_info(adev->dev, "Pending hive reset.\n"); 3843 adev->gmc.xgmi.pending_reset = true; 3844 /* Only need to init necessary block for SMU to handle the reset */ 3845 for (i = 0; i < adev->num_ip_blocks; i++) { 3846 if (!adev->ip_blocks[i].status.valid) 3847 continue; 3848 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3849 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3850 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3851 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3852 DRM_DEBUG("IP %s disabled for hw_init.\n", 3853 adev->ip_blocks[i].version->funcs->name); 3854 adev->ip_blocks[i].status.hw = true; 3855 } 3856 } 3857 } else { 3858 tmp = amdgpu_reset_method; 3859 /* It should do a default reset when loading or reloading the driver, 3860 * regardless of the module parameter reset_method. 3861 */ 3862 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3863 r = amdgpu_asic_reset(adev); 3864 amdgpu_reset_method = tmp; 3865 if (r) { 3866 dev_err(adev->dev, "asic reset on init failed\n"); 3867 goto failed; 3868 } 3869 } 3870 } 3871 3872 pci_enable_pcie_error_reporting(adev->pdev); 3873 3874 /* Post card if necessary */ 3875 if (amdgpu_device_need_post(adev)) { 3876 if (!adev->bios) { 3877 dev_err(adev->dev, "no vBIOS found\n"); 3878 r = -EINVAL; 3879 goto failed; 3880 } 3881 DRM_INFO("GPU posting now...\n"); 3882 r = amdgpu_device_asic_init(adev); 3883 if (r) { 3884 dev_err(adev->dev, "gpu post error!\n"); 3885 goto failed; 3886 } 3887 } 3888 3889 if (adev->is_atom_fw) { 3890 /* Initialize clocks */ 3891 r = amdgpu_atomfirmware_get_clock_info(adev); 3892 if (r) { 3893 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3894 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3895 goto failed; 3896 } 3897 } else { 3898 /* Initialize clocks */ 3899 r = amdgpu_atombios_get_clock_info(adev); 3900 if (r) { 3901 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3902 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3903 goto failed; 3904 } 3905 /* init i2c buses */ 3906 if (!amdgpu_device_has_dc_support(adev)) 3907 amdgpu_atombios_i2c_init(adev); 3908 } 3909 3910 fence_driver_init: 3911 /* Fence driver */ 3912 r = amdgpu_fence_driver_sw_init(adev); 3913 if (r) { 3914 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3915 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3916 goto failed; 3917 } 3918 3919 /* init the mode config */ 3920 drm_mode_config_init(adev_to_drm(adev)); 3921 3922 r = amdgpu_device_ip_init(adev); 3923 if (r) { 3924 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3925 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3926 goto release_ras_con; 3927 } 3928 3929 amdgpu_fence_driver_hw_init(adev); 3930 3931 dev_info(adev->dev, 3932 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3933 adev->gfx.config.max_shader_engines, 3934 adev->gfx.config.max_sh_per_se, 3935 adev->gfx.config.max_cu_per_sh, 3936 adev->gfx.cu_info.number); 3937 3938 #ifdef __OpenBSD__ 3939 { 3940 const char *chip_name; 3941 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3942 int maj, min, rev; 3943 3944 switch (adev->asic_type) { 3945 case CHIP_RAVEN: 3946 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3947 chip_name = "RAVEN2"; 3948 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3949 
chip_name = "PICASSO"; 3950 else 3951 chip_name = "RAVEN"; 3952 break; 3953 case CHIP_RENOIR: 3954 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3955 chip_name = "RENOIR"; 3956 else 3957 chip_name = "GREEN_SARDINE"; 3958 break; 3959 default: 3960 chip_name = amdgpu_asic_name[adev->asic_type]; 3961 } 3962 3963 printf("%s: %s", adev->self.dv_xname, chip_name); 3964 /* show graphics/compute ip block version, not set on < GFX9 */ 3965 if (version) { 3966 maj = IP_VERSION_MAJ(version); 3967 min = IP_VERSION_MIN(version); 3968 rev = IP_VERSION_REV(version); 3969 printf(" GC %d.%d.%d", maj, min, rev); 3970 } 3971 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3972 } 3973 #endif 3974 3975 adev->accel_working = true; 3976 3977 amdgpu_vm_check_compute_bug(adev); 3978 3979 /* Initialize the buffer migration limit. */ 3980 if (amdgpu_moverate >= 0) 3981 max_MBps = amdgpu_moverate; 3982 else 3983 max_MBps = 8; /* Allow 8 MB/s. */ 3984 /* Get a log2 for easy divisions. */ 3985 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3986 3987 r = amdgpu_pm_sysfs_init(adev); 3988 if (r) { 3989 adev->pm_sysfs_en = false; 3990 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3991 } else 3992 adev->pm_sysfs_en = true; 3993 3994 r = amdgpu_ucode_sysfs_init(adev); 3995 if (r) { 3996 adev->ucode_sysfs_en = false; 3997 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3998 } else 3999 adev->ucode_sysfs_en = true; 4000 4001 r = amdgpu_psp_sysfs_init(adev); 4002 if (r) { 4003 adev->psp_sysfs_en = false; 4004 if (!amdgpu_sriov_vf(adev)) 4005 DRM_ERROR("Creating psp sysfs failed\n"); 4006 } else 4007 adev->psp_sysfs_en = true; 4008 4009 /* 4010 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4011 * Otherwise the mgpu fan boost feature will be skipped due to the 4012 * gpu instance is counted less. 4013 */ 4014 amdgpu_register_gpu_instance(adev); 4015 4016 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4017 * explicit gating rather than handling it automatically. 4018 */ 4019 if (!adev->gmc.xgmi.pending_reset) { 4020 r = amdgpu_device_ip_late_init(adev); 4021 if (r) { 4022 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4023 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4024 goto release_ras_con; 4025 } 4026 /* must succeed. 
*/ 4027 amdgpu_ras_resume(adev); 4028 queue_delayed_work(system_wq, &adev->delayed_init_work, 4029 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4030 } 4031 4032 if (amdgpu_sriov_vf(adev)) { 4033 amdgpu_virt_release_full_gpu(adev, true); 4034 flush_delayed_work(&adev->delayed_init_work); 4035 } 4036 4037 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4038 if (r) 4039 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4040 4041 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4042 r = amdgpu_pmu_init(adev); 4043 if (r) 4044 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4045 4046 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4047 if (amdgpu_device_cache_pci_state(adev->pdev)) 4048 pci_restore_state(pdev); 4049 4050 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4051 /* this will fail for cards that aren't VGA class devices, just 4052 * ignore it */ 4053 #ifdef notyet 4054 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4055 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4056 #endif 4057 4058 px = amdgpu_device_supports_px(ddev); 4059 4060 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4061 apple_gmux_detect(NULL, NULL))) 4062 vga_switcheroo_register_client(adev->pdev, 4063 &amdgpu_switcheroo_ops, px); 4064 4065 if (px) 4066 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4067 4068 if (adev->gmc.xgmi.pending_reset) 4069 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4070 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4071 4072 amdgpu_device_check_iommu_direct_map(adev); 4073 4074 return 0; 4075 4076 release_ras_con: 4077 if (amdgpu_sriov_vf(adev)) 4078 amdgpu_virt_release_full_gpu(adev, true); 4079 4080 /* failed in exclusive mode due to timeout */ 4081 if (amdgpu_sriov_vf(adev) && 4082 !amdgpu_sriov_runtime(adev) && 4083 amdgpu_virt_mmio_blocked(adev) && 4084 !amdgpu_virt_wait_reset(adev)) { 4085 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4086 /* Don't send request since VF is inactive. 
*/ 4087 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4088 adev->virt.ops = NULL; 4089 r = -EAGAIN; 4090 } 4091 amdgpu_release_ras_context(adev); 4092 4093 failed: 4094 amdgpu_vf_error_trans_all(adev); 4095 4096 return r; 4097 } 4098 4099 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4100 { 4101 STUB(); 4102 #ifdef notyet 4103 /* Clear all CPU mappings pointing to this device */ 4104 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4105 #endif 4106 4107 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4108 amdgpu_device_doorbell_fini(adev); 4109 4110 #ifdef __linux__ 4111 iounmap(adev->rmmio); 4112 adev->rmmio = NULL; 4113 if (adev->mman.aper_base_kaddr) 4114 iounmap(adev->mman.aper_base_kaddr); 4115 adev->mman.aper_base_kaddr = NULL; 4116 #else 4117 if (adev->rmmio_size > 0) 4118 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4119 adev->rmmio_size); 4120 adev->rmmio_size = 0; 4121 adev->rmmio = NULL; 4122 if (adev->mman.aper_base_kaddr) 4123 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4124 adev->gmc.visible_vram_size); 4125 adev->mman.aper_base_kaddr = NULL; 4126 #endif 4127 4128 /* Memory manager related */ 4129 if (!adev->gmc.xgmi.connected_to_cpu) { 4130 #ifdef __linux__ 4131 arch_phys_wc_del(adev->gmc.vram_mtrr); 4132 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4133 #else 4134 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4135 #endif 4136 } 4137 } 4138 4139 /** 4140 * amdgpu_device_fini_hw - tear down the driver 4141 * 4142 * @adev: amdgpu_device pointer 4143 * 4144 * Tear down the driver info (all asics). 4145 * Called at driver shutdown. 4146 */ 4147 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4148 { 4149 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4150 flush_delayed_work(&adev->delayed_init_work); 4151 adev->shutdown = true; 4152 4153 /* make sure IB test finished before entering exclusive mode 4154 * to avoid preemption on IB test 4155 * */ 4156 if (amdgpu_sriov_vf(adev)) { 4157 amdgpu_virt_request_full_gpu(adev, false); 4158 amdgpu_virt_fini_data_exchange(adev); 4159 } 4160 4161 /* disable all interrupts */ 4162 amdgpu_irq_disable_all(adev); 4163 if (adev->mode_info.mode_config_initialized){ 4164 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4165 drm_helper_force_disable_all(adev_to_drm(adev)); 4166 else 4167 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4168 } 4169 amdgpu_fence_driver_hw_fini(adev); 4170 4171 if (adev->mman.initialized) { 4172 flush_delayed_work(&adev->mman.bdev.wq); 4173 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4174 } 4175 4176 if (adev->pm_sysfs_en) 4177 amdgpu_pm_sysfs_fini(adev); 4178 if (adev->ucode_sysfs_en) 4179 amdgpu_ucode_sysfs_fini(adev); 4180 if (adev->psp_sysfs_en) 4181 amdgpu_psp_sysfs_fini(adev); 4182 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4183 4184 /* disable ras feature must before hw fini */ 4185 amdgpu_ras_pre_fini(adev); 4186 4187 amdgpu_device_ip_fini_early(adev); 4188 4189 amdgpu_irq_fini_hw(adev); 4190 4191 if (adev->mman.initialized) 4192 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4193 4194 amdgpu_gart_dummy_page_fini(adev); 4195 4196 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4197 amdgpu_device_unmap_mmio(adev); 4198 4199 } 4200 4201 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4202 { 4203 int idx; 4204 bool px; 4205 4206 amdgpu_fence_driver_sw_fini(adev); 4207 amdgpu_device_ip_fini(adev); 4208 release_firmware(adev->firmware.gpu_info_fw); 4209 
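/*
 * The cached gpu_info firmware is no longer needed once the IP blocks have
 * been torn down; release it and clear the pointer below so nothing can
 * dereference or double-release it later in teardown.
 */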
adev->firmware.gpu_info_fw = NULL; 4210 adev->accel_working = false; 4211 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4212 4213 amdgpu_reset_fini(adev); 4214 4215 /* free i2c buses */ 4216 if (!amdgpu_device_has_dc_support(adev)) 4217 amdgpu_i2c_fini(adev); 4218 4219 if (amdgpu_emu_mode != 1) 4220 amdgpu_atombios_fini(adev); 4221 4222 kfree(adev->bios); 4223 adev->bios = NULL; 4224 4225 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4226 4227 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4228 apple_gmux_detect(NULL, NULL))) 4229 vga_switcheroo_unregister_client(adev->pdev); 4230 4231 if (px) 4232 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4233 4234 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4235 vga_client_unregister(adev->pdev); 4236 4237 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4238 #ifdef __linux__ 4239 iounmap(adev->rmmio); 4240 adev->rmmio = NULL; 4241 #else 4242 if (adev->rmmio_size > 0) 4243 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4244 adev->rmmio_size); 4245 adev->rmmio_size = 0; 4246 adev->rmmio = NULL; 4247 #endif 4248 amdgpu_device_doorbell_fini(adev); 4249 drm_dev_exit(idx); 4250 } 4251 4252 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4253 amdgpu_pmu_fini(adev); 4254 if (adev->mman.discovery_bin) 4255 amdgpu_discovery_fini(adev); 4256 4257 amdgpu_reset_put_reset_domain(adev->reset_domain); 4258 adev->reset_domain = NULL; 4259 4260 kfree(adev->pci_state); 4261 4262 } 4263 4264 /** 4265 * amdgpu_device_evict_resources - evict device resources 4266 * @adev: amdgpu device object 4267 * 4268 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4269 * of the vram memory type. Mainly used for evicting device resources 4270 * at suspend time. 4271 * 4272 */ 4273 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4274 { 4275 int ret; 4276 4277 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4278 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4279 return 0; 4280 4281 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4282 if (ret) 4283 DRM_WARN("evicting device resources failed\n"); 4284 return ret; 4285 } 4286 4287 /* 4288 * Suspend & resume. 4289 */ 4290 /** 4291 * amdgpu_device_suspend - initiate device suspend 4292 * 4293 * @dev: drm dev pointer 4294 * @fbcon : notify the fbdev of suspend 4295 * 4296 * Puts the hw in the suspend state (all asics). 4297 * Returns 0 for success or an error on failure. 4298 * Called at driver suspend. 
4299 */ 4300 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4301 { 4302 struct amdgpu_device *adev = drm_to_adev(dev); 4303 int r = 0; 4304 4305 if (adev->shutdown) 4306 return 0; 4307 4308 #ifdef notyet 4309 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4310 return 0; 4311 #endif 4312 4313 adev->in_suspend = true; 4314 4315 if (amdgpu_sriov_vf(adev)) { 4316 amdgpu_virt_fini_data_exchange(adev); 4317 r = amdgpu_virt_request_full_gpu(adev, false); 4318 if (r) 4319 return r; 4320 } 4321 4322 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4323 DRM_WARN("smart shift update failed\n"); 4324 4325 drm_kms_helper_poll_disable(dev); 4326 4327 if (fbcon) 4328 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4329 4330 cancel_delayed_work_sync(&adev->delayed_init_work); 4331 4332 amdgpu_ras_suspend(adev); 4333 4334 amdgpu_device_ip_suspend_phase1(adev); 4335 4336 if (!adev->in_s0ix) 4337 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4338 4339 r = amdgpu_device_evict_resources(adev); 4340 if (r) 4341 return r; 4342 4343 amdgpu_fence_driver_hw_fini(adev); 4344 4345 amdgpu_device_ip_suspend_phase2(adev); 4346 4347 if (amdgpu_sriov_vf(adev)) 4348 amdgpu_virt_release_full_gpu(adev, false); 4349 4350 return 0; 4351 } 4352 4353 /** 4354 * amdgpu_device_resume - initiate device resume 4355 * 4356 * @dev: drm dev pointer 4357 * @fbcon : notify the fbdev of resume 4358 * 4359 * Bring the hw back to operating state (all asics). 4360 * Returns 0 for success or an error on failure. 4361 * Called at driver resume. 4362 */ 4363 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4364 { 4365 struct amdgpu_device *adev = drm_to_adev(dev); 4366 int r = 0; 4367 4368 if (amdgpu_sriov_vf(adev)) { 4369 r = amdgpu_virt_request_full_gpu(adev, true); 4370 if (r) 4371 return r; 4372 } 4373 4374 #ifdef notyet 4375 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4376 return 0; 4377 #endif 4378 4379 if (adev->in_s0ix) 4380 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4381 4382 /* post card */ 4383 if (amdgpu_device_need_post(adev)) { 4384 r = amdgpu_device_asic_init(adev); 4385 if (r) 4386 dev_err(adev->dev, "amdgpu asic init failed\n"); 4387 } 4388 4389 r = amdgpu_device_ip_resume(adev); 4390 4391 /* no matter what r is, always need to properly release full GPU */ 4392 if (amdgpu_sriov_vf(adev)) { 4393 amdgpu_virt_init_data_exchange(adev); 4394 amdgpu_virt_release_full_gpu(adev, true); 4395 } 4396 4397 if (r) { 4398 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4399 return r; 4400 } 4401 amdgpu_fence_driver_hw_init(adev); 4402 4403 r = amdgpu_device_ip_late_init(adev); 4404 if (r) 4405 return r; 4406 4407 queue_delayed_work(system_wq, &adev->delayed_init_work, 4408 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4409 4410 if (!adev->in_s0ix) { 4411 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4412 if (r) 4413 return r; 4414 } 4415 4416 /* Make sure IB tests flushed */ 4417 flush_delayed_work(&adev->delayed_init_work); 4418 4419 if (adev->in_s0ix) { 4420 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4421 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 
4422 */ 4423 amdgpu_gfx_off_ctrl(adev, true); 4424 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4425 } 4426 if (fbcon) 4427 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4428 4429 drm_kms_helper_poll_enable(dev); 4430 4431 amdgpu_ras_resume(adev); 4432 4433 /* 4434 * Most of the connector probing functions try to acquire runtime pm 4435 * refs to ensure that the GPU is powered on when connector polling is 4436 * performed. Since we're calling this from a runtime PM callback, 4437 * trying to acquire rpm refs will cause us to deadlock. 4438 * 4439 * Since we're guaranteed to be holding the rpm lock, it's safe to 4440 * temporarily disable the rpm helpers so this doesn't deadlock us. 4441 */ 4442 #if defined(CONFIG_PM) && defined(__linux__) 4443 dev->dev->power.disable_depth++; 4444 #endif 4445 if (!amdgpu_device_has_dc_support(adev)) 4446 drm_helper_hpd_irq_event(dev); 4447 else 4448 drm_kms_helper_hotplug_event(dev); 4449 #if defined(CONFIG_PM) && defined(__linux__) 4450 dev->dev->power.disable_depth--; 4451 #endif 4452 adev->in_suspend = false; 4453 4454 if (adev->enable_mes) 4455 amdgpu_mes_self_test(adev); 4456 4457 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4458 DRM_WARN("smart shift update failed\n"); 4459 4460 return 0; 4461 } 4462 4463 /** 4464 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4465 * 4466 * @adev: amdgpu_device pointer 4467 * 4468 * The list of all the hardware IPs that make up the asic is walked and 4469 * the check_soft_reset callbacks are run. check_soft_reset determines 4470 * if the asic is still hung or not. 4471 * Returns true if any of the IPs are still in a hung state, false if not. 4472 */ 4473 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4474 { 4475 int i; 4476 bool asic_hang = false; 4477 4478 if (amdgpu_sriov_vf(adev)) 4479 return true; 4480 4481 if (amdgpu_asic_need_full_reset(adev)) 4482 return true; 4483 4484 for (i = 0; i < adev->num_ip_blocks; i++) { 4485 if (!adev->ip_blocks[i].status.valid) 4486 continue; 4487 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4488 adev->ip_blocks[i].status.hang = 4489 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4490 if (adev->ip_blocks[i].status.hang) { 4491 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4492 asic_hang = true; 4493 } 4494 } 4495 return asic_hang; 4496 } 4497 4498 /** 4499 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4500 * 4501 * @adev: amdgpu_device pointer 4502 * 4503 * The list of all the hardware IPs that make up the asic is walked and the 4504 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4505 * handles any IP specific hardware or software state changes that are 4506 * necessary for a soft reset to succeed. 4507 * Returns 0 on success, negative error code on failure. 
4508 */ 4509 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4510 { 4511 int i, r = 0; 4512 4513 for (i = 0; i < adev->num_ip_blocks; i++) { 4514 if (!adev->ip_blocks[i].status.valid) 4515 continue; 4516 if (adev->ip_blocks[i].status.hang && 4517 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4518 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4519 if (r) 4520 return r; 4521 } 4522 } 4523 4524 return 0; 4525 } 4526 4527 /** 4528 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4529 * 4530 * @adev: amdgpu_device pointer 4531 * 4532 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4533 * reset is necessary to recover. 4534 * Returns true if a full asic reset is required, false if not. 4535 */ 4536 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4537 { 4538 int i; 4539 4540 if (amdgpu_asic_need_full_reset(adev)) 4541 return true; 4542 4543 for (i = 0; i < adev->num_ip_blocks; i++) { 4544 if (!adev->ip_blocks[i].status.valid) 4545 continue; 4546 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4547 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4548 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4549 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4550 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4551 if (adev->ip_blocks[i].status.hang) { 4552 dev_info(adev->dev, "Some block need full reset!\n"); 4553 return true; 4554 } 4555 } 4556 } 4557 return false; 4558 } 4559 4560 /** 4561 * amdgpu_device_ip_soft_reset - do a soft reset 4562 * 4563 * @adev: amdgpu_device pointer 4564 * 4565 * The list of all the hardware IPs that make up the asic is walked and the 4566 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4567 * IP specific hardware or software state changes that are necessary to soft 4568 * reset the IP. 4569 * Returns 0 on success, negative error code on failure. 4570 */ 4571 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4572 { 4573 int i, r = 0; 4574 4575 for (i = 0; i < adev->num_ip_blocks; i++) { 4576 if (!adev->ip_blocks[i].status.valid) 4577 continue; 4578 if (adev->ip_blocks[i].status.hang && 4579 adev->ip_blocks[i].version->funcs->soft_reset) { 4580 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4581 if (r) 4582 return r; 4583 } 4584 } 4585 4586 return 0; 4587 } 4588 4589 /** 4590 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4591 * 4592 * @adev: amdgpu_device pointer 4593 * 4594 * The list of all the hardware IPs that make up the asic is walked and the 4595 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4596 * handles any IP specific hardware or software state changes that are 4597 * necessary after the IP has been soft reset. 4598 * Returns 0 on success, negative error code on failure. 
4599 */ 4600 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4601 { 4602 int i, r = 0; 4603 4604 for (i = 0; i < adev->num_ip_blocks; i++) { 4605 if (!adev->ip_blocks[i].status.valid) 4606 continue; 4607 if (adev->ip_blocks[i].status.hang && 4608 adev->ip_blocks[i].version->funcs->post_soft_reset) 4609 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4610 if (r) 4611 return r; 4612 } 4613 4614 return 0; 4615 } 4616 4617 /** 4618 * amdgpu_device_recover_vram - Recover some VRAM contents 4619 * 4620 * @adev: amdgpu_device pointer 4621 * 4622 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4623 * restore things like GPUVM page tables after a GPU reset where 4624 * the contents of VRAM might be lost. 4625 * 4626 * Returns: 4627 * 0 on success, negative error code on failure. 4628 */ 4629 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4630 { 4631 struct dma_fence *fence = NULL, *next = NULL; 4632 struct amdgpu_bo *shadow; 4633 struct amdgpu_bo_vm *vmbo; 4634 long r = 1, tmo; 4635 4636 if (amdgpu_sriov_runtime(adev)) 4637 tmo = msecs_to_jiffies(8000); 4638 else 4639 tmo = msecs_to_jiffies(100); 4640 4641 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4642 mutex_lock(&adev->shadow_list_lock); 4643 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4644 /* If vm is compute context or adev is APU, shadow will be NULL */ 4645 if (!vmbo->shadow) 4646 continue; 4647 shadow = vmbo->shadow; 4648 4649 /* No need to recover an evicted BO */ 4650 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4651 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4652 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4653 continue; 4654 4655 r = amdgpu_bo_restore_shadow(shadow, &next); 4656 if (r) 4657 break; 4658 4659 if (fence) { 4660 tmo = dma_fence_wait_timeout(fence, false, tmo); 4661 dma_fence_put(fence); 4662 fence = next; 4663 if (tmo == 0) { 4664 r = -ETIMEDOUT; 4665 break; 4666 } else if (tmo < 0) { 4667 r = tmo; 4668 break; 4669 } 4670 } else { 4671 fence = next; 4672 } 4673 } 4674 mutex_unlock(&adev->shadow_list_lock); 4675 4676 if (fence) 4677 tmo = dma_fence_wait_timeout(fence, false, tmo); 4678 dma_fence_put(fence); 4679 4680 if (r < 0 || tmo <= 0) { 4681 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4682 return -EIO; 4683 } 4684 4685 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4686 return 0; 4687 } 4688 4689 4690 /** 4691 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4692 * 4693 * @adev: amdgpu_device pointer 4694 * @from_hypervisor: request from hypervisor 4695 * 4696 * do VF FLR and reinitialize Asic 4697 * return 0 means succeeded otherwise failed 4698 */ 4699 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4700 bool from_hypervisor) 4701 { 4702 int r; 4703 struct amdgpu_hive_info *hive = NULL; 4704 int retry_limit = 0; 4705 4706 retry: 4707 amdgpu_amdkfd_pre_reset(adev); 4708 4709 if (from_hypervisor) 4710 r = amdgpu_virt_request_full_gpu(adev, true); 4711 else 4712 r = amdgpu_virt_reset_gpu(adev); 4713 if (r) 4714 return r; 4715 4716 /* Resume IP prior to SMC */ 4717 r = amdgpu_device_ip_reinit_early_sriov(adev); 4718 if (r) 4719 goto error; 4720 4721 amdgpu_virt_init_data_exchange(adev); 4722 4723 r = amdgpu_device_fw_loading(adev); 4724 if (r) 4725 return r; 4726 4727 /* now we are okay to resume SMC/CP/SDMA */ 4728 r = amdgpu_device_ip_reinit_late_sriov(adev); 4729 if (r) 4730 goto error; 4731 
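/*
 * SMC/CP/SDMA are running again at this point. What remains is to refresh
 * the XGMI topology in PSP (if this VF is part of a hive) and to re-run
 * the IB ring tests before the full GPU is released back to the host.
 */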
4732 hive = amdgpu_get_xgmi_hive(adev); 4733 /* Update PSP FW topology after reset */ 4734 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4735 r = amdgpu_xgmi_update_topology(hive, adev); 4736 4737 if (hive) 4738 amdgpu_put_xgmi_hive(hive); 4739 4740 if (!r) { 4741 amdgpu_irq_gpu_reset_resume_helper(adev); 4742 r = amdgpu_ib_ring_tests(adev); 4743 4744 amdgpu_amdkfd_post_reset(adev); 4745 } 4746 4747 error: 4748 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4749 amdgpu_inc_vram_lost(adev); 4750 r = amdgpu_device_recover_vram(adev); 4751 } 4752 amdgpu_virt_release_full_gpu(adev, true); 4753 4754 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4755 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4756 retry_limit++; 4757 goto retry; 4758 } else 4759 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4760 } 4761 4762 return r; 4763 } 4764 4765 /** 4766 * amdgpu_device_has_job_running - check if there is any job in mirror list 4767 * 4768 * @adev: amdgpu_device pointer 4769 * 4770 * check if there is any job in mirror list 4771 */ 4772 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4773 { 4774 int i; 4775 struct drm_sched_job *job; 4776 4777 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4778 struct amdgpu_ring *ring = adev->rings[i]; 4779 4780 if (!ring || !ring->sched.thread) 4781 continue; 4782 4783 spin_lock(&ring->sched.job_list_lock); 4784 job = list_first_entry_or_null(&ring->sched.pending_list, 4785 struct drm_sched_job, list); 4786 spin_unlock(&ring->sched.job_list_lock); 4787 if (job) 4788 return true; 4789 } 4790 return false; 4791 } 4792 4793 /** 4794 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4795 * 4796 * @adev: amdgpu_device pointer 4797 * 4798 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4799 * a hung GPU. 
4800 */ 4801 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4802 { 4803 4804 if (amdgpu_gpu_recovery == 0) 4805 goto disabled; 4806 4807 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4808 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4809 return false; 4810 } 4811 4812 if (amdgpu_sriov_vf(adev)) 4813 return true; 4814 4815 if (amdgpu_gpu_recovery == -1) { 4816 switch (adev->asic_type) { 4817 #ifdef CONFIG_DRM_AMDGPU_SI 4818 case CHIP_VERDE: 4819 case CHIP_TAHITI: 4820 case CHIP_PITCAIRN: 4821 case CHIP_OLAND: 4822 case CHIP_HAINAN: 4823 #endif 4824 #ifdef CONFIG_DRM_AMDGPU_CIK 4825 case CHIP_KAVERI: 4826 case CHIP_KABINI: 4827 case CHIP_MULLINS: 4828 #endif 4829 case CHIP_CARRIZO: 4830 case CHIP_STONEY: 4831 case CHIP_CYAN_SKILLFISH: 4832 goto disabled; 4833 default: 4834 break; 4835 } 4836 } 4837 4838 return true; 4839 4840 disabled: 4841 dev_info(adev->dev, "GPU recovery disabled.\n"); 4842 return false; 4843 } 4844 4845 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4846 { 4847 u32 i; 4848 int ret = 0; 4849 4850 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4851 4852 dev_info(adev->dev, "GPU mode1 reset\n"); 4853 4854 /* disable BM */ 4855 pci_clear_master(adev->pdev); 4856 4857 amdgpu_device_cache_pci_state(adev->pdev); 4858 4859 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4860 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4861 ret = amdgpu_dpm_mode1_reset(adev); 4862 } else { 4863 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4864 ret = psp_gpu_reset(adev); 4865 } 4866 4867 if (ret) 4868 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4869 4870 amdgpu_device_load_pci_state(adev->pdev); 4871 4872 /* wait for asic to come out of reset */ 4873 for (i = 0; i < adev->usec_timeout; i++) { 4874 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4875 4876 if (memsize != 0xffffffff) 4877 break; 4878 udelay(1); 4879 } 4880 4881 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4882 return ret; 4883 } 4884 4885 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4886 struct amdgpu_reset_context *reset_context) 4887 { 4888 int i, r = 0; 4889 struct amdgpu_job *job = NULL; 4890 bool need_full_reset = 4891 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4892 4893 if (reset_context->reset_req_dev == adev) 4894 job = reset_context->job; 4895 4896 if (amdgpu_sriov_vf(adev)) { 4897 /* stop the data exchange thread */ 4898 amdgpu_virt_fini_data_exchange(adev); 4899 } 4900 4901 amdgpu_fence_driver_isr_toggle(adev, true); 4902 4903 /* block all schedulers and reset given job's ring */ 4904 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4905 struct amdgpu_ring *ring = adev->rings[i]; 4906 4907 if (!ring || !ring->sched.thread) 4908 continue; 4909 4910 /*clear job fence from fence drv to avoid force_completion 4911 *leave NULL and vm flush fence in fence drv */ 4912 amdgpu_fence_driver_clear_job_fences(ring); 4913 4914 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4915 amdgpu_fence_driver_force_completion(ring); 4916 } 4917 4918 amdgpu_fence_driver_isr_toggle(adev, false); 4919 4920 if (job && job->vm) 4921 drm_sched_increase_karma(&job->base); 4922 4923 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4924 /* If reset handler not implemented, continue; otherwise return */ 4925 if (r == -ENOSYS) 4926 r = 0; 4927 else 4928 return r; 4929 4930 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4931 if (!amdgpu_sriov_vf(adev)) { 4932 4933 if (!need_full_reset) 4934 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4935 4936 if (!need_full_reset && amdgpu_gpu_recovery) { 4937 amdgpu_device_ip_pre_soft_reset(adev); 4938 r = amdgpu_device_ip_soft_reset(adev); 4939 amdgpu_device_ip_post_soft_reset(adev); 4940 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4941 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4942 need_full_reset = true; 4943 } 4944 } 4945 4946 if (need_full_reset) 4947 r = amdgpu_device_ip_suspend(adev); 4948 if (need_full_reset) 4949 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4950 else 4951 clear_bit(AMDGPU_NEED_FULL_RESET, 4952 &reset_context->flags); 4953 } 4954 4955 return r; 4956 } 4957 4958 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4959 { 4960 int i; 4961 4962 lockdep_assert_held(&adev->reset_domain->sem); 4963 4964 for (i = 0; i < adev->num_regs; i++) { 4965 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4966 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4967 adev->reset_dump_reg_value[i]); 4968 } 4969 4970 return 0; 4971 } 4972 4973 #ifdef CONFIG_DEV_COREDUMP 4974 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4975 size_t count, void *data, size_t datalen) 4976 { 4977 struct drm_printer p; 4978 struct amdgpu_device *adev = data; 4979 struct drm_print_iterator iter; 4980 int i; 4981 4982 iter.data = buffer; 4983 iter.offset = 0; 4984 iter.start = offset; 4985 iter.remain = count; 4986 4987 p = drm_coredump_printer(&iter); 4988 4989 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4990 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4991 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4992 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4993 if (adev->reset_task_info.pid) 4994 drm_printf(&p, "process_name: %s PID: %d\n", 4995 adev->reset_task_info.process_name, 4996 adev->reset_task_info.pid); 4997 4998 if (adev->reset_vram_lost) 4999 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 5000 if (adev->num_regs) { 5001 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 5002 5003 for (i = 0; i < adev->num_regs; i++) 5004 drm_printf(&p, "0x%08x: 0x%08x\n", 5005 adev->reset_dump_reg_list[i], 5006 adev->reset_dump_reg_value[i]); 5007 } 5008 5009 return count - iter.remain; 5010 } 5011 5012 static void amdgpu_devcoredump_free(void *data) 5013 { 5014 } 5015 5016 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 5017 { 5018 struct drm_device *dev = adev_to_drm(adev); 5019 5020 ktime_get_ts64(&adev->reset_time); 5021 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 5022 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5023 } 5024 #endif 5025 5026 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5027 struct amdgpu_reset_context *reset_context) 5028 { 5029 struct amdgpu_device *tmp_adev = NULL; 5030 bool need_full_reset, skip_hw_reset, vram_lost = false; 5031 int r = 0; 5032 bool gpu_reset_for_dev_remove = 0; 5033 5034 /* Try reset handler method first */ 5035 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5036 reset_list); 5037 amdgpu_reset_reg_dumps(tmp_adev); 5038 5039 reset_context->reset_device_list = device_list_handle; 5040 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5041 /* If reset handler not implemented, continue; otherwise return */ 5042 if (r == -ENOSYS) 5043 r = 0; 5044 else 5045 return r; 5046 5047 /* Reset handler not implemented, use the default method */ 5048 
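/*
 * Default path: decode the request flags first. A full reset takes the
 * whole ASIC down (all XGMI hive nodes are reset in parallel further
 * below), while AMDGPU_SKIP_HW_RESET leaves the hardware untouched and
 * only the re-init/resume steps that follow are executed.
 */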
need_full_reset = 5049 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5050 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5051 5052 gpu_reset_for_dev_remove = 5053 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5054 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5055 5056 /* 5057 * ASIC reset has to be done on all XGMI hive nodes ASAP 5058 * to allow proper links negotiation in FW (within 1 sec) 5059 */ 5060 if (!skip_hw_reset && need_full_reset) { 5061 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5062 /* For XGMI run all resets in parallel to speed up the process */ 5063 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5064 tmp_adev->gmc.xgmi.pending_reset = false; 5065 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5066 r = -EALREADY; 5067 } else 5068 r = amdgpu_asic_reset(tmp_adev); 5069 5070 if (r) { 5071 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5072 r, adev_to_drm(tmp_adev)->unique); 5073 break; 5074 } 5075 } 5076 5077 /* For XGMI wait for all resets to complete before proceed */ 5078 if (!r) { 5079 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5080 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5081 flush_work(&tmp_adev->xgmi_reset_work); 5082 r = tmp_adev->asic_reset_res; 5083 if (r) 5084 break; 5085 } 5086 } 5087 } 5088 } 5089 5090 if (!r && amdgpu_ras_intr_triggered()) { 5091 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5092 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5093 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5094 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5095 } 5096 5097 amdgpu_ras_intr_cleared(); 5098 } 5099 5100 /* Since the mode1 reset affects base ip blocks, the 5101 * phase1 ip blocks need to be resumed. Otherwise there 5102 * will be a BIOS signature error and the psp bootloader 5103 * can't load kdb on the next amdgpu install. 5104 */ 5105 if (gpu_reset_for_dev_remove) { 5106 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5107 amdgpu_device_ip_resume_phase1(tmp_adev); 5108 5109 goto end; 5110 } 5111 5112 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5113 if (need_full_reset) { 5114 /* post card */ 5115 r = amdgpu_device_asic_init(tmp_adev); 5116 if (r) { 5117 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5118 } else { 5119 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5120 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5121 if (r) 5122 goto out; 5123 5124 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5125 if (r) 5126 goto out; 5127 5128 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5129 #ifdef CONFIG_DEV_COREDUMP 5130 tmp_adev->reset_vram_lost = vram_lost; 5131 memset(&tmp_adev->reset_task_info, 0, 5132 sizeof(tmp_adev->reset_task_info)); 5133 if (reset_context->job && reset_context->job->vm) 5134 tmp_adev->reset_task_info = 5135 reset_context->job->vm->task_info; 5136 amdgpu_reset_capture_coredumpm(tmp_adev); 5137 #endif 5138 if (vram_lost) { 5139 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5140 amdgpu_inc_vram_lost(tmp_adev); 5141 } 5142 5143 r = amdgpu_device_fw_loading(tmp_adev); 5144 if (r) 5145 return r; 5146 5147 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5148 if (r) 5149 goto out; 5150 5151 if (vram_lost) 5152 amdgpu_device_fill_reset_magic(tmp_adev); 5153 5154 /* 5155 * Add this ASIC as tracked as reset was already 5156 * complete successfully. 
5157 */ 5158 amdgpu_register_gpu_instance(tmp_adev); 5159 5160 if (!reset_context->hive && 5161 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5162 amdgpu_xgmi_add_device(tmp_adev); 5163 5164 r = amdgpu_device_ip_late_init(tmp_adev); 5165 if (r) 5166 goto out; 5167 5168 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5169 5170 /* 5171 * The GPU enters bad state once faulty pages 5172 * by ECC has reached the threshold, and ras 5173 * recovery is scheduled next. So add one check 5174 * here to break recovery if it indeed exceeds 5175 * bad page threshold, and remind user to 5176 * retire this GPU or setting one bigger 5177 * bad_page_threshold value to fix this once 5178 * probing driver again. 5179 */ 5180 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5181 /* must succeed. */ 5182 amdgpu_ras_resume(tmp_adev); 5183 } else { 5184 r = -EINVAL; 5185 goto out; 5186 } 5187 5188 /* Update PSP FW topology after reset */ 5189 if (reset_context->hive && 5190 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5191 r = amdgpu_xgmi_update_topology( 5192 reset_context->hive, tmp_adev); 5193 } 5194 } 5195 5196 out: 5197 if (!r) { 5198 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5199 r = amdgpu_ib_ring_tests(tmp_adev); 5200 if (r) { 5201 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5202 need_full_reset = true; 5203 r = -EAGAIN; 5204 goto end; 5205 } 5206 } 5207 5208 if (!r) 5209 r = amdgpu_device_recover_vram(tmp_adev); 5210 else 5211 tmp_adev->asic_reset_res = r; 5212 } 5213 5214 end: 5215 if (need_full_reset) 5216 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5217 else 5218 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5219 return r; 5220 } 5221 5222 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5223 { 5224 5225 switch (amdgpu_asic_reset_method(adev)) { 5226 case AMD_RESET_METHOD_MODE1: 5227 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5228 break; 5229 case AMD_RESET_METHOD_MODE2: 5230 adev->mp1_state = PP_MP1_STATE_RESET; 5231 break; 5232 default: 5233 adev->mp1_state = PP_MP1_STATE_NONE; 5234 break; 5235 } 5236 5237 pci_dev_put(p); 5238 } 5239 5240 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5241 { 5242 amdgpu_vf_error_trans_all(adev); 5243 adev->mp1_state = PP_MP1_STATE_NONE; 5244 } 5245 5246 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5247 { 5248 STUB(); 5249 #ifdef notyet 5250 struct pci_dev *p = NULL; 5251 5252 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5253 adev->pdev->bus->number, 1); 5254 if (p) { 5255 pm_runtime_enable(&(p->dev)); 5256 pm_runtime_resume(&(p->dev)); 5257 } 5258 #endif 5259 } 5260 5261 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5262 { 5263 enum amd_reset_method reset_method; 5264 struct pci_dev *p = NULL; 5265 u64 expires; 5266 5267 /* 5268 * For now, only BACO and mode1 reset are confirmed 5269 * to suffer the audio issue without proper suspended. 
5270 */ 5271 reset_method = amdgpu_asic_reset_method(adev); 5272 if ((reset_method != AMD_RESET_METHOD_BACO) && 5273 (reset_method != AMD_RESET_METHOD_MODE1)) 5274 return -EINVAL; 5275 5276 STUB(); 5277 return -ENOSYS; 5278 #ifdef notyet 5279 5280 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5281 adev->pdev->bus->number, 1); 5282 if (!p) 5283 return -ENODEV; 5284 5285 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5286 if (!expires) 5287 /* 5288 * If we cannot get the audio device autosuspend delay, 5289 * a fixed 4S interval will be used. Considering 3S is 5290 * the audio controller default autosuspend delay setting. 5291 * 4S used here is guaranteed to cover that. 5292 */ 5293 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5294 5295 while (!pm_runtime_status_suspended(&(p->dev))) { 5296 if (!pm_runtime_suspend(&(p->dev))) 5297 break; 5298 5299 if (expires < ktime_get_mono_fast_ns()) { 5300 dev_warn(adev->dev, "failed to suspend display audio\n"); 5301 pci_dev_put(p); 5302 /* TODO: abort the succeeding gpu reset? */ 5303 return -ETIMEDOUT; 5304 } 5305 } 5306 5307 pm_runtime_disable(&(p->dev)); 5308 5309 pci_dev_put(p); 5310 return 0; 5311 #endif 5312 } 5313 5314 static void amdgpu_device_recheck_guilty_jobs( 5315 struct amdgpu_device *adev, struct list_head *device_list_handle, 5316 struct amdgpu_reset_context *reset_context) 5317 { 5318 int i, r = 0; 5319 5320 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5321 struct amdgpu_ring *ring = adev->rings[i]; 5322 int ret = 0; 5323 struct drm_sched_job *s_job; 5324 5325 if (!ring || !ring->sched.thread) 5326 continue; 5327 5328 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5329 struct drm_sched_job, list); 5330 if (s_job == NULL) 5331 continue; 5332 5333 /* clear job's guilty and depend the folowing step to decide the real one */ 5334 drm_sched_reset_karma(s_job); 5335 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5336 5337 if (!s_job->s_fence->parent) { 5338 DRM_WARN("Failed to get a HW fence for job!"); 5339 continue; 5340 } 5341 5342 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5343 if (ret == 0) { /* timeout */ 5344 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5345 ring->sched.name, s_job->id); 5346 5347 5348 amdgpu_fence_driver_isr_toggle(adev, true); 5349 5350 /* Clear this failed job from fence array */ 5351 amdgpu_fence_driver_clear_job_fences(ring); 5352 5353 amdgpu_fence_driver_isr_toggle(adev, false); 5354 5355 /* Since the job won't signal and we go for 5356 * another resubmit drop this parent pointer 5357 */ 5358 dma_fence_put(s_job->s_fence->parent); 5359 s_job->s_fence->parent = NULL; 5360 5361 /* set guilty */ 5362 drm_sched_increase_karma(s_job); 5363 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5364 retry: 5365 /* do hw reset */ 5366 if (amdgpu_sriov_vf(adev)) { 5367 amdgpu_virt_fini_data_exchange(adev); 5368 r = amdgpu_device_reset_sriov(adev, false); 5369 if (r) 5370 adev->asic_reset_res = r; 5371 } else { 5372 clear_bit(AMDGPU_SKIP_HW_RESET, 5373 &reset_context->flags); 5374 r = amdgpu_do_asic_reset(device_list_handle, 5375 reset_context); 5376 if (r && r == -EAGAIN) 5377 goto retry; 5378 } 5379 5380 /* 5381 * add reset counter so that the following 5382 * resubmitted job could flush vmid 5383 */ 5384 atomic_inc(&adev->gpu_reset_counter); 5385 continue; 5386 } 5387 5388 /* got the hw fence, signal finished fence */ 5389 atomic_dec(ring->sched.score); 5390 dma_fence_get(&s_job->s_fence->finished); 5391 dma_fence_signal(&s_job->s_fence->finished); 5392 dma_fence_put(&s_job->s_fence->finished); 5393 5394 /* remove node from list and free the job */ 5395 spin_lock(&ring->sched.job_list_lock); 5396 list_del_init(&s_job->list); 5397 spin_unlock(&ring->sched.job_list_lock); 5398 ring->sched.ops->free_job(s_job); 5399 } 5400 } 5401 5402 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5403 { 5404 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5405 5406 #if defined(CONFIG_DEBUG_FS) 5407 if (!amdgpu_sriov_vf(adev)) 5408 cancel_work(&adev->reset_work); 5409 #endif 5410 5411 if (adev->kfd.dev) 5412 cancel_work(&adev->kfd.reset_work); 5413 5414 if (amdgpu_sriov_vf(adev)) 5415 cancel_work(&adev->virt.flr_work); 5416 5417 if (con && adev->ras_enabled) 5418 cancel_work(&con->recovery_work); 5419 5420 } 5421 5422 5423 /** 5424 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5425 * 5426 * @adev: amdgpu_device pointer 5427 * @job: which job trigger hang 5428 * 5429 * Attempt to reset the GPU if it has hung (all asics). 5430 * Attempt to do soft-reset or full-reset and reinitialize Asic 5431 * Returns 0 for success or an error on failure. 5432 */ 5433 5434 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5435 struct amdgpu_job *job, 5436 struct amdgpu_reset_context *reset_context) 5437 { 5438 struct list_head device_list, *device_list_handle = NULL; 5439 bool job_signaled = false; 5440 struct amdgpu_hive_info *hive = NULL; 5441 struct amdgpu_device *tmp_adev = NULL; 5442 int i, r = 0; 5443 bool need_emergency_restart = false; 5444 bool audio_suspended = false; 5445 int tmp_vram_lost_counter; 5446 bool gpu_reset_for_dev_remove = false; 5447 5448 gpu_reset_for_dev_remove = 5449 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5450 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5451 5452 /* 5453 * Special case: RAS triggered and full reset isn't supported 5454 */ 5455 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5456 5457 /* 5458 * Flush RAM to disk so that after reboot 5459 * the user can read log and see why the system rebooted. 
5460 */ 5461 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5462 DRM_WARN("Emergency reboot."); 5463 5464 #ifdef notyet 5465 ksys_sync_helper(); 5466 emergency_restart(); 5467 #else 5468 panic("emergency_restart"); 5469 #endif 5470 } 5471 5472 dev_info(adev->dev, "GPU %s begin!\n", 5473 need_emergency_restart ? "jobs stop":"reset"); 5474 5475 if (!amdgpu_sriov_vf(adev)) 5476 hive = amdgpu_get_xgmi_hive(adev); 5477 if (hive) 5478 mutex_lock(&hive->hive_lock); 5479 5480 reset_context->job = job; 5481 reset_context->hive = hive; 5482 /* 5483 * Build list of devices to reset. 5484 * In case we are in XGMI hive mode, resort the device list 5485 * to put adev in the 1st position. 5486 */ 5487 INIT_LIST_HEAD(&device_list); 5488 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5489 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5490 list_add_tail(&tmp_adev->reset_list, &device_list); 5491 if (gpu_reset_for_dev_remove && adev->shutdown) 5492 tmp_adev->shutdown = true; 5493 } 5494 if (!list_is_first(&adev->reset_list, &device_list)) 5495 list_rotate_to_front(&adev->reset_list, &device_list); 5496 device_list_handle = &device_list; 5497 } else { 5498 list_add_tail(&adev->reset_list, &device_list); 5499 device_list_handle = &device_list; 5500 } 5501 5502 /* We need to lock reset domain only once both for XGMI and single device */ 5503 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5504 reset_list); 5505 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5506 5507 /* block all schedulers and reset given job's ring */ 5508 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5509 5510 amdgpu_device_set_mp1_state(tmp_adev); 5511 5512 /* 5513 * Try to put the audio codec into suspend state 5514 * before gpu reset started. 5515 * 5516 * Due to the power domain of the graphics device 5517 * is shared with AZ power domain. Without this, 5518 * we may change the audio hardware from behind 5519 * the audio driver's back. That will trigger 5520 * some audio codec errors. 5521 */ 5522 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5523 audio_suspended = true; 5524 5525 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5526 5527 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5528 5529 if (!amdgpu_sriov_vf(tmp_adev)) 5530 amdgpu_amdkfd_pre_reset(tmp_adev); 5531 5532 /* 5533 * Mark these ASICs to be reseted as untracked first 5534 * And add them back after reset completed 5535 */ 5536 amdgpu_unregister_gpu_instance(tmp_adev); 5537 5538 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5539 5540 /* disable ras on ALL IPs */ 5541 if (!need_emergency_restart && 5542 amdgpu_device_ip_need_full_reset(tmp_adev)) 5543 amdgpu_ras_suspend(tmp_adev); 5544 5545 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5546 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5547 5548 if (!ring || !ring->sched.thread) 5549 continue; 5550 5551 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5552 5553 if (need_emergency_restart) 5554 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5555 } 5556 atomic_inc(&tmp_adev->gpu_reset_counter); 5557 } 5558 5559 if (need_emergency_restart) 5560 goto skip_sched_resume; 5561 5562 /* 5563 * Must check guilty signal here since after this point all old 5564 * HW fences are force signaled. 
5565 * 5566 * job->base holds a reference to parent fence 5567 */ 5568 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5569 job_signaled = true; 5570 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5571 goto skip_hw_reset; 5572 } 5573 5574 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5575 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5576 if (gpu_reset_for_dev_remove) { 5577 /* Workaroud for ASICs need to disable SMC first */ 5578 amdgpu_device_smu_fini_early(tmp_adev); 5579 } 5580 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5581 /*TODO Should we stop ?*/ 5582 if (r) { 5583 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5584 r, adev_to_drm(tmp_adev)->unique); 5585 tmp_adev->asic_reset_res = r; 5586 } 5587 5588 /* 5589 * Drop all pending non scheduler resets. Scheduler resets 5590 * were already dropped during drm_sched_stop 5591 */ 5592 amdgpu_device_stop_pending_resets(tmp_adev); 5593 } 5594 5595 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5596 /* Actual ASIC resets if needed.*/ 5597 /* Host driver will handle XGMI hive reset for SRIOV */ 5598 if (amdgpu_sriov_vf(adev)) { 5599 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5600 if (r) 5601 adev->asic_reset_res = r; 5602 5603 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5604 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5605 amdgpu_ras_resume(adev); 5606 } else { 5607 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5608 if (r && r == -EAGAIN) 5609 goto retry; 5610 5611 if (!r && gpu_reset_for_dev_remove) 5612 goto recover_end; 5613 } 5614 5615 skip_hw_reset: 5616 5617 /* Post ASIC reset for all devs .*/ 5618 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5619 5620 /* 5621 * Sometimes a later bad compute job can block a good gfx job as gfx 5622 * and compute ring share internal GC HW mutually. We add an additional 5623 * guilty jobs recheck step to find the real guilty job, it synchronously 5624 * submits and pends for the first job being signaled. If it gets timeout, 5625 * we identify it as a real guilty job. 5626 */ 5627 if (amdgpu_gpu_recovery == 2 && 5628 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5629 amdgpu_device_recheck_guilty_jobs( 5630 tmp_adev, device_list_handle, reset_context); 5631 5632 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5633 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5634 5635 if (!ring || !ring->sched.thread) 5636 continue; 5637 5638 /* No point to resubmit jobs if we didn't HW reset*/ 5639 if (!tmp_adev->asic_reset_res && !job_signaled) 5640 drm_sched_resubmit_jobs(&ring->sched); 5641 5642 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5643 } 5644 5645 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5646 amdgpu_mes_self_test(tmp_adev); 5647 5648 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5649 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5650 } 5651 5652 if (tmp_adev->asic_reset_res) 5653 r = tmp_adev->asic_reset_res; 5654 5655 tmp_adev->asic_reset_res = 0; 5656 5657 if (r) { 5658 /* bad news, how to tell it to userspace ? 
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it was not initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
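
/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Disables the RAS doorbell interrupt (where applicable) and asks the
 * DPM code to put the device into BACO.  Returns -ENOTSUPP if the
 * device does not support BACO, otherwise the result of the DPM call.
 */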
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

/* Bring the device back out of BACO and restore the doorbell interrupt state. */
int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to the GPU during PCI error recovery.
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for a regular GPU reset
		 * for the duration of the recovery.
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}
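
/**
 * amdgpu_device_cache_pci_state - save and cache the PCI config state
 *
 * @pdev: PCI device struct
 *
 * Saves the PCI configuration space of the device so that it can be
 * restored after a GPU reset.  Stubbed out on this platform and always
 * returns false; the Linux implementation is kept under "notyet".
 */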
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

/* Flush the HDP write cache. Skipped on bare-metal APUs and on devices
 * whose memory is connected to the CPU (XGMI), where it is not needed. */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

/* Invalidate the HDP read cache, with the same exceptions as the flush above. */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least enough for
 * SSH access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}
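
/**
 * amdgpu_device_pcie_port_rreg - read a PCIE port register
 *
 * @adev: amdgpu_device pointer
 * @reg: PCIE port register offset (in dwords)
 *
 * Reads a PCIE port register through the NBIO index/data pair,
 * serialized by the pcie_idx_lock spinlock.
 */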
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/* Write a PCIE port register through the same NBIO index/data pair. */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}
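
/**
 * amdgpu_device_has_display_hardware - check for display hardware
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has (non-harvested) display hardware,
 * false for compute-only parts such as HAINAN and TOPAZ.
 */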
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}