1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 77 #include <linux/suspend.h> 78 #include <drm/task_barrier.h> 79 #include <linux/pm_runtime.h> 80 81 #include <drm/drm_drv.h> 82 83 #if IS_ENABLED(CONFIG_X86) && defined(__linux__) 84 #include <asm/intel-family.h> 85 #endif 86 87 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 94 95 #define AMDGPU_RESUME_MS 2000 96 #define AMDGPU_MAX_RETRY_LIMIT 2 97 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 98 99 static const struct drm_driver amdgpu_kms_driver; 100 101 const char *amdgpu_asic_name[] = { 102 "TAHITI", 103 "PITCAIRN", 104 "VERDE", 105 "OLAND", 106 "HAINAN", 107 "BONAIRE", 108 "KAVERI", 109 "KABINI", 110 
"HAWAII", 111 "MULLINS", 112 "TOPAZ", 113 "TONGA", 114 "FIJI", 115 "CARRIZO", 116 "STONEY", 117 "POLARIS10", 118 "POLARIS11", 119 "POLARIS12", 120 "VEGAM", 121 "VEGA10", 122 "VEGA12", 123 "VEGA20", 124 "RAVEN", 125 "ARCTURUS", 126 "RENOIR", 127 "ALDEBARAN", 128 "NAVI10", 129 "CYAN_SKILLFISH", 130 "NAVI14", 131 "NAVI12", 132 "SIENNA_CICHLID", 133 "NAVY_FLOUNDER", 134 "VANGOGH", 135 "DIMGREY_CAVEFISH", 136 "BEIGE_GOBY", 137 "YELLOW_CARP", 138 "IP DISCOVERY", 139 "LAST", 140 }; 141 142 /** 143 * DOC: pcie_replay_count 144 * 145 * The amdgpu driver provides a sysfs API for reporting the total number 146 * of PCIe replays (NAKs) 147 * The file pcie_replay_count is used for this and returns the total 148 * number of replays as a sum of the NAKs generated and NAKs received 149 */ 150 151 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 152 struct device_attribute *attr, char *buf) 153 { 154 struct drm_device *ddev = dev_get_drvdata(dev); 155 struct amdgpu_device *adev = drm_to_adev(ddev); 156 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 157 158 return sysfs_emit(buf, "%llu\n", cnt); 159 } 160 161 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 162 amdgpu_device_get_pcie_replay_count, NULL); 163 164 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 165 166 /** 167 * DOC: product_name 168 * 169 * The amdgpu driver provides a sysfs API for reporting the product name 170 * for the device 171 * The file serial_number is used for this and returns the product name 172 * as returned from the FRU. 173 * NOTE: This is only available for certain server cards 174 */ 175 176 static ssize_t amdgpu_device_get_product_name(struct device *dev, 177 struct device_attribute *attr, char *buf) 178 { 179 struct drm_device *ddev = dev_get_drvdata(dev); 180 struct amdgpu_device *adev = drm_to_adev(ddev); 181 182 return sysfs_emit(buf, "%s\n", adev->product_name); 183 } 184 185 static DEVICE_ATTR(product_name, S_IRUGO, 186 amdgpu_device_get_product_name, NULL); 187 188 /** 189 * DOC: product_number 190 * 191 * The amdgpu driver provides a sysfs API for reporting the part number 192 * for the device 193 * The file serial_number is used for this and returns the part number 194 * as returned from the FRU. 195 * NOTE: This is only available for certain server cards 196 */ 197 198 static ssize_t amdgpu_device_get_product_number(struct device *dev, 199 struct device_attribute *attr, char *buf) 200 { 201 struct drm_device *ddev = dev_get_drvdata(dev); 202 struct amdgpu_device *adev = drm_to_adev(ddev); 203 204 return sysfs_emit(buf, "%s\n", adev->product_number); 205 } 206 207 static DEVICE_ATTR(product_number, S_IRUGO, 208 amdgpu_device_get_product_number, NULL); 209 210 /** 211 * DOC: serial_number 212 * 213 * The amdgpu driver provides a sysfs API for reporting the serial number 214 * for the device 215 * The file serial_number is used for this and returns the serial number 216 * as returned from the FRU. 
217 * NOTE: This is only available for certain server cards 218 */ 219 220 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 221 struct device_attribute *attr, char *buf) 222 { 223 struct drm_device *ddev = dev_get_drvdata(dev); 224 struct amdgpu_device *adev = drm_to_adev(ddev); 225 226 return sysfs_emit(buf, "%s\n", adev->serial); 227 } 228 229 static DEVICE_ATTR(serial_number, S_IRUGO, 230 amdgpu_device_get_serial_number, NULL); 231 232 /** 233 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 234 * 235 * @dev: drm_device pointer 236 * 237 * Returns true if the device is a dGPU with ATPX power control, 238 * otherwise return false. 239 */ 240 bool amdgpu_device_supports_px(struct drm_device *dev) 241 { 242 struct amdgpu_device *adev = drm_to_adev(dev); 243 244 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 245 return true; 246 return false; 247 } 248 249 /** 250 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 251 * 252 * @dev: drm_device pointer 253 * 254 * Returns true if the device is a dGPU with ACPI power control, 255 * otherwise return false. 256 */ 257 bool amdgpu_device_supports_boco(struct drm_device *dev) 258 { 259 struct amdgpu_device *adev = drm_to_adev(dev); 260 261 if (adev->has_pr3 || 262 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 263 return true; 264 return false; 265 } 266 267 /** 268 * amdgpu_device_supports_baco - Does the device support BACO 269 * 270 * @dev: drm_device pointer 271 * 272 * Returns true if the device supporte BACO, 273 * otherwise return false. 274 */ 275 bool amdgpu_device_supports_baco(struct drm_device *dev) 276 { 277 struct amdgpu_device *adev = drm_to_adev(dev); 278 279 return amdgpu_asic_supports_baco(adev); 280 } 281 282 /** 283 * amdgpu_device_supports_smart_shift - Is the device dGPU with 284 * smart shift support 285 * 286 * @dev: drm_device pointer 287 * 288 * Returns true if the device is a dGPU with Smart Shift support, 289 * otherwise returns false. 
290 */ 291 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 292 { 293 return (amdgpu_device_supports_boco(dev) && 294 amdgpu_acpi_is_power_shift_control_supported()); 295 } 296 297 /* 298 * VRAM access helper functions 299 */ 300 301 /** 302 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 303 * 304 * @adev: amdgpu_device pointer 305 * @pos: offset of the buffer in vram 306 * @buf: virtual address of the buffer in system memory 307 * @size: read/write size, sizeof(@buf) must > @size 308 * @write: true - write to vram, otherwise - read from vram 309 */ 310 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 311 void *buf, size_t size, bool write) 312 { 313 unsigned long flags; 314 uint32_t hi = ~0, tmp = 0; 315 uint32_t *data = buf; 316 uint64_t last; 317 int idx; 318 319 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 320 return; 321 322 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 323 324 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 325 for (last = pos + size; pos < last; pos += 4) { 326 tmp = pos >> 31; 327 328 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 329 if (tmp != hi) { 330 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 331 hi = tmp; 332 } 333 if (write) 334 WREG32_NO_KIQ(mmMM_DATA, *data++); 335 else 336 *data++ = RREG32_NO_KIQ(mmMM_DATA); 337 } 338 339 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 340 drm_dev_exit(idx); 341 } 342 343 /** 344 * amdgpu_device_aper_access - access vram by vram aperature 345 * 346 * @adev: amdgpu_device pointer 347 * @pos: offset of the buffer in vram 348 * @buf: virtual address of the buffer in system memory 349 * @size: read/write size, sizeof(@buf) must > @size 350 * @write: true - write to vram, otherwise - read from vram 351 * 352 * The return value means how many bytes have been transferred. 353 */ 354 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 355 void *buf, size_t size, bool write) 356 { 357 #ifdef CONFIG_64BIT 358 void __iomem *addr; 359 size_t count = 0; 360 uint64_t last; 361 362 if (!adev->mman.aper_base_kaddr) 363 return 0; 364 365 last = min(pos + size, adev->gmc.visible_vram_size); 366 if (last > pos) { 367 addr = adev->mman.aper_base_kaddr + pos; 368 count = last - pos; 369 370 if (write) { 371 memcpy_toio(addr, buf, count); 372 mb(); 373 amdgpu_device_flush_hdp(adev, NULL); 374 } else { 375 amdgpu_device_invalidate_hdp(adev, NULL); 376 mb(); 377 memcpy_fromio(buf, addr, count); 378 } 379 380 } 381 382 return count; 383 #else 384 return 0; 385 #endif 386 } 387 388 /** 389 * amdgpu_device_vram_access - read/write a buffer in vram 390 * 391 * @adev: amdgpu_device pointer 392 * @pos: offset of the buffer in vram 393 * @buf: virtual address of the buffer in system memory 394 * @size: read/write size, sizeof(@buf) must > @size 395 * @write: true - write to vram, otherwise - read from vram 396 */ 397 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 398 void *buf, size_t size, bool write) 399 { 400 size_t count; 401 402 /* try to using vram apreature to access vram first */ 403 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 404 size -= count; 405 if (size) { 406 /* using MM to access rest vram */ 407 pos += count; 408 buf += count; 409 amdgpu_device_mm_access(adev, pos, buf, size, write); 410 } 411 } 412 413 /* 414 * register access helper functions. 
415 */ 416 417 /* Check if hw access should be skipped because of hotplug or device error */ 418 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 419 { 420 if (adev->no_hw_access) 421 return true; 422 423 #ifdef CONFIG_LOCKDEP 424 /* 425 * This is a bit complicated to understand, so worth a comment. What we assert 426 * here is that the GPU reset is not running on another thread in parallel. 427 * 428 * For this we trylock the read side of the reset semaphore, if that succeeds 429 * we know that the reset is not running in paralell. 430 * 431 * If the trylock fails we assert that we are either already holding the read 432 * side of the lock or are the reset thread itself and hold the write side of 433 * the lock. 434 */ 435 if (in_task()) { 436 if (down_read_trylock(&adev->reset_domain->sem)) 437 up_read(&adev->reset_domain->sem); 438 else 439 lockdep_assert_held(&adev->reset_domain->sem); 440 } 441 #endif 442 return false; 443 } 444 445 /** 446 * amdgpu_device_rreg - read a memory mapped IO or indirect register 447 * 448 * @adev: amdgpu_device pointer 449 * @reg: dword aligned register offset 450 * @acc_flags: access flags which require special behavior 451 * 452 * Returns the 32 bit value from the offset specified. 453 */ 454 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 455 uint32_t reg, uint32_t acc_flags) 456 { 457 uint32_t ret; 458 459 if (amdgpu_device_skip_hw_access(adev)) 460 return 0; 461 462 if ((reg * 4) < adev->rmmio_size) { 463 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 464 amdgpu_sriov_runtime(adev) && 465 down_read_trylock(&adev->reset_domain->sem)) { 466 ret = amdgpu_kiq_rreg(adev, reg); 467 up_read(&adev->reset_domain->sem); 468 } else { 469 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 470 } 471 } else { 472 ret = adev->pcie_rreg(adev, reg * 4); 473 } 474 475 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 476 477 return ret; 478 } 479 480 /* 481 * MMIO register read with bytes helper functions 482 * @offset:bytes offset from MMIO start 483 * 484 */ 485 486 /** 487 * amdgpu_mm_rreg8 - read a memory mapped IO register 488 * 489 * @adev: amdgpu_device pointer 490 * @offset: byte aligned register offset 491 * 492 * Returns the 8 bit value from the offset specified. 493 */ 494 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 495 { 496 if (amdgpu_device_skip_hw_access(adev)) 497 return 0; 498 499 if (offset < adev->rmmio_size) 500 return (readb(adev->rmmio + offset)); 501 BUG(); 502 } 503 504 /* 505 * MMIO register write with bytes helper functions 506 * @offset:bytes offset from MMIO start 507 * @value: the value want to be written to the register 508 * 509 */ 510 /** 511 * amdgpu_mm_wreg8 - read a memory mapped IO register 512 * 513 * @adev: amdgpu_device pointer 514 * @offset: byte aligned register offset 515 * @value: 8 bit value to write 516 * 517 * Writes the value specified to the offset specified. 518 */ 519 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 520 { 521 if (amdgpu_device_skip_hw_access(adev)) 522 return; 523 524 if (offset < adev->rmmio_size) 525 writeb(value, adev->rmmio + offset); 526 else 527 BUG(); 528 } 529 530 /** 531 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 532 * 533 * @adev: amdgpu_device pointer 534 * @reg: dword aligned register offset 535 * @v: 32 bit value to write to the register 536 * @acc_flags: access flags which require special behavior 537 * 538 * Writes the value specified to the offset specified. 
539 */ 540 void amdgpu_device_wreg(struct amdgpu_device *adev, 541 uint32_t reg, uint32_t v, 542 uint32_t acc_flags) 543 { 544 if (amdgpu_device_skip_hw_access(adev)) 545 return; 546 547 if ((reg * 4) < adev->rmmio_size) { 548 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 549 amdgpu_sriov_runtime(adev) && 550 down_read_trylock(&adev->reset_domain->sem)) { 551 amdgpu_kiq_wreg(adev, reg, v); 552 up_read(&adev->reset_domain->sem); 553 } else { 554 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 555 } 556 } else { 557 adev->pcie_wreg(adev, reg * 4, v); 558 } 559 560 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 561 } 562 563 /** 564 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 565 * 566 * @adev: amdgpu_device pointer 567 * @reg: mmio/rlc register 568 * @v: value to write 569 * 570 * this function is invoked only for the debugfs register access 571 */ 572 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 573 uint32_t reg, uint32_t v) 574 { 575 if (amdgpu_device_skip_hw_access(adev)) 576 return; 577 578 if (amdgpu_sriov_fullaccess(adev) && 579 adev->gfx.rlc.funcs && 580 adev->gfx.rlc.funcs->is_rlcg_access_range) { 581 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 582 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 583 } else if ((reg * 4) >= adev->rmmio_size) { 584 adev->pcie_wreg(adev, reg * 4, v); 585 } else { 586 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 587 } 588 } 589 590 /** 591 * amdgpu_mm_rdoorbell - read a doorbell dword 592 * 593 * @adev: amdgpu_device pointer 594 * @index: doorbell index 595 * 596 * Returns the value in the doorbell aperture at the 597 * requested doorbell index (CIK). 598 */ 599 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 600 { 601 if (amdgpu_device_skip_hw_access(adev)) 602 return 0; 603 604 if (index < adev->doorbell.num_doorbells) { 605 return readl(adev->doorbell.ptr + index); 606 } else { 607 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 608 return 0; 609 } 610 } 611 612 /** 613 * amdgpu_mm_wdoorbell - write a doorbell dword 614 * 615 * @adev: amdgpu_device pointer 616 * @index: doorbell index 617 * @v: value to write 618 * 619 * Writes @v to the doorbell aperture at the 620 * requested doorbell index (CIK). 621 */ 622 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 623 { 624 if (amdgpu_device_skip_hw_access(adev)) 625 return; 626 627 if (index < adev->doorbell.num_doorbells) { 628 writel(v, adev->doorbell.ptr + index); 629 } else { 630 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 631 } 632 } 633 634 /** 635 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 636 * 637 * @adev: amdgpu_device pointer 638 * @index: doorbell index 639 * 640 * Returns the value in the doorbell aperture at the 641 * requested doorbell index (VEGA10+). 642 */ 643 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 644 { 645 if (amdgpu_device_skip_hw_access(adev)) 646 return 0; 647 648 if (index < adev->doorbell.num_doorbells) { 649 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 650 } else { 651 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 652 return 0; 653 } 654 } 655 656 /** 657 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 658 * 659 * @adev: amdgpu_device pointer 660 * @index: doorbell index 661 * @v: value to write 662 * 663 * Writes @v to the doorbell aperture at the 664 * requested doorbell index (VEGA10+). 
665 */ 666 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 667 { 668 if (amdgpu_device_skip_hw_access(adev)) 669 return; 670 671 if (index < adev->doorbell.num_doorbells) { 672 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 673 } else { 674 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 675 } 676 } 677 678 /** 679 * amdgpu_device_indirect_rreg - read an indirect register 680 * 681 * @adev: amdgpu_device pointer 682 * @pcie_index: mmio register offset 683 * @pcie_data: mmio register offset 684 * @reg_addr: indirect register address to read from 685 * 686 * Returns the value of indirect register @reg_addr 687 */ 688 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 689 u32 pcie_index, u32 pcie_data, 690 u32 reg_addr) 691 { 692 unsigned long flags; 693 u32 r; 694 void __iomem *pcie_index_offset; 695 void __iomem *pcie_data_offset; 696 697 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 698 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 699 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 700 701 writel(reg_addr, pcie_index_offset); 702 readl(pcie_index_offset); 703 r = readl(pcie_data_offset); 704 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 705 706 return r; 707 } 708 709 /** 710 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 711 * 712 * @adev: amdgpu_device pointer 713 * @pcie_index: mmio register offset 714 * @pcie_data: mmio register offset 715 * @reg_addr: indirect register address to read from 716 * 717 * Returns the value of indirect register @reg_addr 718 */ 719 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 720 u32 pcie_index, u32 pcie_data, 721 u32 reg_addr) 722 { 723 unsigned long flags; 724 u64 r; 725 void __iomem *pcie_index_offset; 726 void __iomem *pcie_data_offset; 727 728 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 731 732 /* read low 32 bits */ 733 writel(reg_addr, pcie_index_offset); 734 readl(pcie_index_offset); 735 r = readl(pcie_data_offset); 736 /* read high 32 bits */ 737 writel(reg_addr + 4, pcie_index_offset); 738 readl(pcie_index_offset); 739 r |= ((u64)readl(pcie_data_offset) << 32); 740 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 741 742 return r; 743 } 744 745 /** 746 * amdgpu_device_indirect_wreg - write an indirect register address 747 * 748 * @adev: amdgpu_device pointer 749 * @pcie_index: mmio register offset 750 * @pcie_data: mmio register offset 751 * @reg_addr: indirect register offset 752 * @reg_data: indirect register data 753 * 754 */ 755 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 756 u32 pcie_index, u32 pcie_data, 757 u32 reg_addr, u32 reg_data) 758 { 759 unsigned long flags; 760 void __iomem *pcie_index_offset; 761 void __iomem *pcie_data_offset; 762 763 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 764 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 765 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 766 767 writel(reg_addr, pcie_index_offset); 768 readl(pcie_index_offset); 769 writel(reg_data, pcie_data_offset); 770 readl(pcie_data_offset); 771 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 772 } 773 774 /** 775 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 776 * 777 * @adev: amdgpu_device pointer 778 * @pcie_index: mmio register offset 779 * @pcie_data: mmio register 
offset 780 * @reg_addr: indirect register offset 781 * @reg_data: indirect register data 782 * 783 */ 784 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 785 u32 pcie_index, u32 pcie_data, 786 u32 reg_addr, u64 reg_data) 787 { 788 unsigned long flags; 789 void __iomem *pcie_index_offset; 790 void __iomem *pcie_data_offset; 791 792 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 793 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 794 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 795 796 /* write low 32 bits */ 797 writel(reg_addr, pcie_index_offset); 798 readl(pcie_index_offset); 799 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 800 readl(pcie_data_offset); 801 /* write high 32 bits */ 802 writel(reg_addr + 4, pcie_index_offset); 803 readl(pcie_index_offset); 804 writel((u32)(reg_data >> 32), pcie_data_offset); 805 readl(pcie_data_offset); 806 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 807 } 808 809 /** 810 * amdgpu_invalid_rreg - dummy reg read function 811 * 812 * @adev: amdgpu_device pointer 813 * @reg: offset of register 814 * 815 * Dummy register read function. Used for register blocks 816 * that certain asics don't have (all asics). 817 * Returns the value in the register. 818 */ 819 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 820 { 821 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 822 BUG(); 823 return 0; 824 } 825 826 /** 827 * amdgpu_invalid_wreg - dummy reg write function 828 * 829 * @adev: amdgpu_device pointer 830 * @reg: offset of register 831 * @v: value to write to the register 832 * 833 * Dummy register read function. Used for register blocks 834 * that certain asics don't have (all asics). 835 */ 836 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 837 { 838 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 839 reg, v); 840 BUG(); 841 } 842 843 /** 844 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 845 * 846 * @adev: amdgpu_device pointer 847 * @reg: offset of register 848 * 849 * Dummy register read function. Used for register blocks 850 * that certain asics don't have (all asics). 851 * Returns the value in the register. 852 */ 853 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 854 { 855 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 856 BUG(); 857 return 0; 858 } 859 860 /** 861 * amdgpu_invalid_wreg64 - dummy reg write function 862 * 863 * @adev: amdgpu_device pointer 864 * @reg: offset of register 865 * @v: value to write to the register 866 * 867 * Dummy register read function. Used for register blocks 868 * that certain asics don't have (all asics). 869 */ 870 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 871 { 872 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 873 reg, v); 874 BUG(); 875 } 876 877 /** 878 * amdgpu_block_invalid_rreg - dummy reg read function 879 * 880 * @adev: amdgpu_device pointer 881 * @block: offset of instance 882 * @reg: offset of register 883 * 884 * Dummy register read function. Used for register blocks 885 * that certain asics don't have (all asics). 886 * Returns the value in the register. 
887 */ 888 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 889 uint32_t block, uint32_t reg) 890 { 891 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 892 reg, block); 893 BUG(); 894 return 0; 895 } 896 897 /** 898 * amdgpu_block_invalid_wreg - dummy reg write function 899 * 900 * @adev: amdgpu_device pointer 901 * @block: offset of instance 902 * @reg: offset of register 903 * @v: value to write to the register 904 * 905 * Dummy register read function. Used for register blocks 906 * that certain asics don't have (all asics). 907 */ 908 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 909 uint32_t block, 910 uint32_t reg, uint32_t v) 911 { 912 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 913 reg, block, v); 914 BUG(); 915 } 916 917 /** 918 * amdgpu_device_asic_init - Wrapper for atom asic_init 919 * 920 * @adev: amdgpu_device pointer 921 * 922 * Does any asic specific work and then calls atom asic init. 923 */ 924 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 925 { 926 amdgpu_asic_pre_asic_init(adev); 927 928 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) 929 return amdgpu_atomfirmware_asic_init(adev, true); 930 else 931 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 932 } 933 934 /** 935 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 936 * 937 * @adev: amdgpu_device pointer 938 * 939 * Allocates a scratch page of VRAM for use by various things in the 940 * driver. 941 */ 942 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 943 { 944 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 945 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 946 &adev->vram_scratch.robj, 947 &adev->vram_scratch.gpu_addr, 948 (void **)&adev->vram_scratch.ptr); 949 } 950 951 /** 952 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 953 * 954 * @adev: amdgpu_device pointer 955 * 956 * Frees the VRAM scratch page. 957 */ 958 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 959 { 960 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 961 } 962 963 /** 964 * amdgpu_device_program_register_sequence - program an array of registers. 965 * 966 * @adev: amdgpu_device pointer 967 * @registers: pointer to the register array 968 * @array_size: size of the register array 969 * 970 * Programs an array or registers with and and or masks. 971 * This is a helper for setting golden registers. 972 */ 973 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 974 const u32 *registers, 975 const u32 array_size) 976 { 977 u32 tmp, reg, and_mask, or_mask; 978 int i; 979 980 if (array_size % 3) 981 return; 982 983 for (i = 0; i < array_size; i +=3) { 984 reg = registers[i + 0]; 985 and_mask = registers[i + 1]; 986 or_mask = registers[i + 2]; 987 988 if (and_mask == 0xffffffff) { 989 tmp = or_mask; 990 } else { 991 tmp = RREG32(reg); 992 tmp &= ~and_mask; 993 if (adev->family >= AMDGPU_FAMILY_AI) 994 tmp |= (or_mask & and_mask); 995 else 996 tmp |= or_mask; 997 } 998 WREG32(reg, tmp); 999 } 1000 } 1001 1002 /** 1003 * amdgpu_device_pci_config_reset - reset the GPU 1004 * 1005 * @adev: amdgpu_device pointer 1006 * 1007 * Resets the GPU using the pci config reset sequence. 1008 * Only applicable to asics prior to vega10. 
1009 */ 1010 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1011 { 1012 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1013 } 1014 1015 /** 1016 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1021 */ 1022 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1023 { 1024 STUB(); 1025 return -ENOSYS; 1026 #ifdef notyet 1027 return pci_reset_function(adev->pdev); 1028 #endif 1029 } 1030 1031 /* 1032 * GPU doorbell aperture helpers function. 1033 */ 1034 /** 1035 * amdgpu_device_doorbell_init - Init doorbell driver information. 1036 * 1037 * @adev: amdgpu_device pointer 1038 * 1039 * Init doorbell driver information (CIK) 1040 * Returns 0 on success, error on failure. 1041 */ 1042 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1043 { 1044 1045 /* No doorbell on SI hardware generation */ 1046 if (adev->asic_type < CHIP_BONAIRE) { 1047 adev->doorbell.base = 0; 1048 adev->doorbell.size = 0; 1049 adev->doorbell.num_doorbells = 0; 1050 adev->doorbell.ptr = NULL; 1051 return 0; 1052 } 1053 1054 #ifdef __linux__ 1055 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1056 return -EINVAL; 1057 #endif 1058 1059 amdgpu_asic_init_doorbell_index(adev); 1060 1061 /* doorbell bar mapping */ 1062 #ifdef __linux__ 1063 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1064 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1065 #endif 1066 1067 if (adev->enable_mes) { 1068 adev->doorbell.num_doorbells = 1069 adev->doorbell.size / sizeof(u32); 1070 } else { 1071 adev->doorbell.num_doorbells = 1072 min_t(u32, adev->doorbell.size / sizeof(u32), 1073 adev->doorbell_index.max_assignment+1); 1074 if (adev->doorbell.num_doorbells == 0) 1075 return -EINVAL; 1076 1077 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1078 * paging queue doorbell use the second page. The 1079 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1080 * doorbells are in the first page. So with paging queue enabled, 1081 * the max num_doorbells should + 1 page (0x400 in dword) 1082 */ 1083 if (adev->asic_type >= CHIP_VEGA10) 1084 adev->doorbell.num_doorbells += 0x400; 1085 } 1086 1087 #ifdef __linux__ 1088 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1089 adev->doorbell.num_doorbells * 1090 sizeof(u32)); 1091 if (adev->doorbell.ptr == NULL) 1092 return -ENOMEM; 1093 #endif 1094 1095 return 0; 1096 } 1097 1098 /** 1099 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1100 * 1101 * @adev: amdgpu_device pointer 1102 * 1103 * Tear down doorbell driver information (CIK) 1104 */ 1105 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1106 { 1107 #ifdef __linux__ 1108 iounmap(adev->doorbell.ptr); 1109 #else 1110 if (adev->doorbell.size > 0) 1111 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1112 adev->doorbell.size); 1113 #endif 1114 adev->doorbell.ptr = NULL; 1115 } 1116 1117 1118 1119 /* 1120 * amdgpu_device_wb_*() 1121 * Writeback is the method by which the GPU updates special pages in memory 1122 * with the status of certain GPU events (fences, ring pointers,etc.). 1123 */ 1124 1125 /** 1126 * amdgpu_device_wb_fini - Disable Writeback and free memory 1127 * 1128 * @adev: amdgpu_device pointer 1129 * 1130 * Disables Writeback and frees the Writeback memory (all asics). 1131 * Used at driver shutdown. 
1132 */ 1133 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1134 { 1135 if (adev->wb.wb_obj) { 1136 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1137 &adev->wb.gpu_addr, 1138 (void **)&adev->wb.wb); 1139 adev->wb.wb_obj = NULL; 1140 } 1141 } 1142 1143 /** 1144 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1145 * 1146 * @adev: amdgpu_device pointer 1147 * 1148 * Initializes writeback and allocates writeback memory (all asics). 1149 * Used at driver startup. 1150 * Returns 0 on success or an -error on failure. 1151 */ 1152 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1153 { 1154 int r; 1155 1156 if (adev->wb.wb_obj == NULL) { 1157 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1158 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1159 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1160 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1161 (void **)&adev->wb.wb); 1162 if (r) { 1163 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1164 return r; 1165 } 1166 1167 adev->wb.num_wb = AMDGPU_MAX_WB; 1168 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1169 1170 /* clear wb memory */ 1171 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1172 } 1173 1174 return 0; 1175 } 1176 1177 /** 1178 * amdgpu_device_wb_get - Allocate a wb entry 1179 * 1180 * @adev: amdgpu_device pointer 1181 * @wb: wb index 1182 * 1183 * Allocate a wb slot for use by the driver (all asics). 1184 * Returns 0 on success or -EINVAL on failure. 1185 */ 1186 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1187 { 1188 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1189 1190 if (offset < adev->wb.num_wb) { 1191 __set_bit(offset, adev->wb.used); 1192 *wb = offset << 3; /* convert to dw offset */ 1193 return 0; 1194 } else { 1195 return -EINVAL; 1196 } 1197 } 1198 1199 /** 1200 * amdgpu_device_wb_free - Free a wb entry 1201 * 1202 * @adev: amdgpu_device pointer 1203 * @wb: wb index 1204 * 1205 * Free a wb slot allocated for use by the driver (all asics) 1206 */ 1207 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1208 { 1209 wb >>= 3; 1210 if (wb < adev->wb.num_wb) 1211 __clear_bit(wb, adev->wb.used); 1212 } 1213 1214 /** 1215 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1216 * 1217 * @adev: amdgpu_device pointer 1218 * 1219 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1220 * to fail, but if any of the BARs is not accessible after the size we abort 1221 * driver loading by returning -ENODEV. 
1222 */ 1223 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1224 { 1225 #ifdef __linux__ 1226 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1227 struct pci_bus *root; 1228 struct resource *res; 1229 unsigned i; 1230 u16 cmd; 1231 int r; 1232 1233 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1234 return 0; 1235 1236 /* Bypass for VF */ 1237 if (amdgpu_sriov_vf(adev)) 1238 return 0; 1239 1240 /* skip if the bios has already enabled large BAR */ 1241 if (adev->gmc.real_vram_size && 1242 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1243 return 0; 1244 1245 /* Check if the root BUS has 64bit memory resources */ 1246 root = adev->pdev->bus; 1247 while (root->parent) 1248 root = root->parent; 1249 1250 pci_bus_for_each_resource(root, res, i) { 1251 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1252 res->start > 0x100000000ull) 1253 break; 1254 } 1255 1256 /* Trying to resize is pointless without a root hub window above 4GB */ 1257 if (!res) 1258 return 0; 1259 1260 /* Limit the BAR size to what is available */ 1261 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1262 rbar_size); 1263 1264 /* Disable memory decoding while we change the BAR addresses and size */ 1265 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1266 pci_write_config_word(adev->pdev, PCI_COMMAND, 1267 cmd & ~PCI_COMMAND_MEMORY); 1268 1269 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1270 amdgpu_device_doorbell_fini(adev); 1271 if (adev->asic_type >= CHIP_BONAIRE) 1272 pci_release_resource(adev->pdev, 2); 1273 1274 pci_release_resource(adev->pdev, 0); 1275 1276 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1277 if (r == -ENOSPC) 1278 DRM_INFO("Not enough PCI address space for a large BAR."); 1279 else if (r && r != -ENOTSUPP) 1280 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1281 1282 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1283 1284 /* When the doorbell or fb BAR isn't available we have no chance of 1285 * using the device. 1286 */ 1287 r = amdgpu_device_doorbell_init(adev); 1288 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1289 return -ENODEV; 1290 1291 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1292 #endif /* __linux__ */ 1293 1294 return 0; 1295 } 1296 1297 /* 1298 * GPU helpers function. 1299 */ 1300 /** 1301 * amdgpu_device_need_post - check if the hw need post or not 1302 * 1303 * @adev: amdgpu_device pointer 1304 * 1305 * Check if the asic has been initialized (all asics) at driver startup 1306 * or post is needed if hw reset is performed. 1307 * Returns true if need or false if not. 
1308 */ 1309 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1310 { 1311 uint32_t reg; 1312 1313 if (amdgpu_sriov_vf(adev)) 1314 return false; 1315 1316 if (amdgpu_passthrough(adev)) { 1317 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1318 * some old smc fw still need driver do vPost otherwise gpu hang, while 1319 * those smc fw version above 22.15 doesn't have this flaw, so we force 1320 * vpost executed for smc version below 22.15 1321 */ 1322 if (adev->asic_type == CHIP_FIJI) { 1323 int err; 1324 uint32_t fw_ver; 1325 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1326 /* force vPost if error occured */ 1327 if (err) 1328 return true; 1329 1330 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1331 if (fw_ver < 0x00160e00) 1332 return true; 1333 } 1334 } 1335 1336 /* Don't post if we need to reset whole hive on init */ 1337 if (adev->gmc.xgmi.pending_reset) 1338 return false; 1339 1340 if (adev->has_hw_reset) { 1341 adev->has_hw_reset = false; 1342 return true; 1343 } 1344 1345 /* bios scratch used on CIK+ */ 1346 if (adev->asic_type >= CHIP_BONAIRE) 1347 return amdgpu_atombios_scratch_need_asic_init(adev); 1348 1349 /* check MEM_SIZE for older asics */ 1350 reg = amdgpu_asic_get_config_memsize(adev); 1351 1352 if ((reg != 0) && (reg != 0xffffffff)) 1353 return false; 1354 1355 return true; 1356 } 1357 1358 /* 1359 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1360 * speed switching. Until we have confirmation from Intel that a specific host 1361 * supports it, it's safer that we keep it disabled for all. 1362 * 1363 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1364 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1365 */ 1366 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1367 { 1368 #if IS_ENABLED(CONFIG_X86) 1369 #ifdef __linux__ 1370 struct cpuinfo_x86 *c = &cpu_data(0); 1371 1372 if (c->x86_vendor == X86_VENDOR_INTEL) 1373 #else 1374 if (strcmp(cpu_vendor, "GenuineIntel") == 0) 1375 #endif 1376 return false; 1377 #endif 1378 return true; 1379 } 1380 1381 /** 1382 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1383 * 1384 * @adev: amdgpu_device pointer 1385 * 1386 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1387 * be set for this device. 1388 * 1389 * Returns true if it should be used or false if not. 1390 */ 1391 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1392 { 1393 switch (amdgpu_aspm) { 1394 case -1: 1395 break; 1396 case 0: 1397 return false; 1398 case 1: 1399 return true; 1400 default: 1401 return false; 1402 } 1403 return pcie_aspm_enabled(adev->pdev); 1404 } 1405 1406 bool amdgpu_device_aspm_support_quirk(void) 1407 { 1408 #if IS_ENABLED(CONFIG_X86) 1409 struct cpu_info *ci = curcpu(); 1410 1411 return !(ci->ci_family == 6 && ci->ci_model == 0x97); 1412 #else 1413 return true; 1414 #endif 1415 } 1416 1417 /* if we get transitioned to only one device, take VGA back */ 1418 /** 1419 * amdgpu_device_vga_set_decode - enable/disable vga decode 1420 * 1421 * @pdev: PCI device pointer 1422 * @state: enable/disable vga decode 1423 * 1424 * Enable/disable vga decode (all asics). 1425 * Returns VGA resource flags. 
1426 */ 1427 #ifdef notyet 1428 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1429 bool state) 1430 { 1431 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1432 amdgpu_asic_set_vga_state(adev, state); 1433 if (state) 1434 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1435 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1436 else 1437 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1438 } 1439 #endif 1440 1441 /** 1442 * amdgpu_device_check_block_size - validate the vm block size 1443 * 1444 * @adev: amdgpu_device pointer 1445 * 1446 * Validates the vm block size specified via module parameter. 1447 * The vm block size defines number of bits in page table versus page directory, 1448 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1449 * page table and the remaining bits are in the page directory. 1450 */ 1451 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1452 { 1453 /* defines number of bits in page table versus page directory, 1454 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1455 * page table and the remaining bits are in the page directory */ 1456 if (amdgpu_vm_block_size == -1) 1457 return; 1458 1459 if (amdgpu_vm_block_size < 9) { 1460 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1461 amdgpu_vm_block_size); 1462 amdgpu_vm_block_size = -1; 1463 } 1464 } 1465 1466 /** 1467 * amdgpu_device_check_vm_size - validate the vm size 1468 * 1469 * @adev: amdgpu_device pointer 1470 * 1471 * Validates the vm size in GB specified via module parameter. 1472 * The VM size is the size of the GPU virtual memory space in GB. 1473 */ 1474 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1475 { 1476 /* no need to check the default value */ 1477 if (amdgpu_vm_size == -1) 1478 return; 1479 1480 if (amdgpu_vm_size < 1) { 1481 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1482 amdgpu_vm_size); 1483 amdgpu_vm_size = -1; 1484 } 1485 } 1486 1487 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1488 { 1489 #ifdef __linux__ 1490 struct sysinfo si; 1491 #endif 1492 bool is_os_64 = (sizeof(void *) == 8); 1493 uint64_t total_memory; 1494 uint64_t dram_size_seven_GB = 0x1B8000000; 1495 uint64_t dram_size_three_GB = 0xB8000000; 1496 1497 if (amdgpu_smu_memory_pool_size == 0) 1498 return; 1499 1500 if (!is_os_64) { 1501 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1502 goto def_value; 1503 } 1504 #ifdef __linux__ 1505 si_meminfo(&si); 1506 total_memory = (uint64_t)si.totalram * si.mem_unit; 1507 #else 1508 total_memory = ptoa(physmem); 1509 #endif 1510 1511 if ((amdgpu_smu_memory_pool_size == 1) || 1512 (amdgpu_smu_memory_pool_size == 2)) { 1513 if (total_memory < dram_size_three_GB) 1514 goto def_value1; 1515 } else if ((amdgpu_smu_memory_pool_size == 4) || 1516 (amdgpu_smu_memory_pool_size == 8)) { 1517 if (total_memory < dram_size_seven_GB) 1518 goto def_value1; 1519 } else { 1520 DRM_WARN("Smu memory pool size not supported\n"); 1521 goto def_value; 1522 } 1523 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1524 1525 return; 1526 1527 def_value1: 1528 DRM_WARN("No enough system memory\n"); 1529 def_value: 1530 adev->pm.smu_prv_buffer_size = 0; 1531 } 1532 1533 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1534 { 1535 if (!(adev->flags & AMD_IS_APU) || 1536 adev->asic_type < CHIP_RAVEN) 1537 return 0; 1538 1539 switch (adev->asic_type) { 1540 case CHIP_RAVEN: 1541 if (adev->pdev->device == 
0x15dd) 1542 adev->apu_flags |= AMD_APU_IS_RAVEN; 1543 if (adev->pdev->device == 0x15d8) 1544 adev->apu_flags |= AMD_APU_IS_PICASSO; 1545 break; 1546 case CHIP_RENOIR: 1547 if ((adev->pdev->device == 0x1636) || 1548 (adev->pdev->device == 0x164c)) 1549 adev->apu_flags |= AMD_APU_IS_RENOIR; 1550 else 1551 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1552 break; 1553 case CHIP_VANGOGH: 1554 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1555 break; 1556 case CHIP_YELLOW_CARP: 1557 break; 1558 case CHIP_CYAN_SKILLFISH: 1559 if ((adev->pdev->device == 0x13FE) || 1560 (adev->pdev->device == 0x143F)) 1561 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1562 break; 1563 default: 1564 break; 1565 } 1566 1567 return 0; 1568 } 1569 1570 /** 1571 * amdgpu_device_check_arguments - validate module params 1572 * 1573 * @adev: amdgpu_device pointer 1574 * 1575 * Validates certain module parameters and updates 1576 * the associated values used by the driver (all asics). 1577 */ 1578 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1579 { 1580 if (amdgpu_sched_jobs < 4) { 1581 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1582 amdgpu_sched_jobs); 1583 amdgpu_sched_jobs = 4; 1584 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1585 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1586 amdgpu_sched_jobs); 1587 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1588 } 1589 1590 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1591 /* gart size must be greater or equal to 32M */ 1592 dev_warn(adev->dev, "gart size (%d) too small\n", 1593 amdgpu_gart_size); 1594 amdgpu_gart_size = -1; 1595 } 1596 1597 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1598 /* gtt size must be greater or equal to 32M */ 1599 dev_warn(adev->dev, "gtt size (%d) too small\n", 1600 amdgpu_gtt_size); 1601 amdgpu_gtt_size = -1; 1602 } 1603 1604 /* valid range is between 4 and 9 inclusive */ 1605 if (amdgpu_vm_fragment_size != -1 && 1606 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1607 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1608 amdgpu_vm_fragment_size = -1; 1609 } 1610 1611 if (amdgpu_sched_hw_submission < 2) { 1612 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1613 amdgpu_sched_hw_submission); 1614 amdgpu_sched_hw_submission = 2; 1615 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1616 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1617 amdgpu_sched_hw_submission); 1618 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1619 } 1620 1621 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1622 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1623 amdgpu_reset_method = -1; 1624 } 1625 1626 amdgpu_device_check_smu_prv_buffer_size(adev); 1627 1628 amdgpu_device_check_vm_size(adev); 1629 1630 amdgpu_device_check_block_size(adev); 1631 1632 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1633 1634 return 0; 1635 } 1636 1637 #ifdef __linux__ 1638 /** 1639 * amdgpu_switcheroo_set_state - set switcheroo state 1640 * 1641 * @pdev: pci dev pointer 1642 * @state: vga_switcheroo state 1643 * 1644 * Callback for the switcheroo driver. Suspends or resumes the 1645 * the asics before or after it is powered up using ACPI methods. 
1646 */ 1647 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1648 enum vga_switcheroo_state state) 1649 { 1650 struct drm_device *dev = pci_get_drvdata(pdev); 1651 int r; 1652 1653 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1654 return; 1655 1656 if (state == VGA_SWITCHEROO_ON) { 1657 pr_info("switched on\n"); 1658 /* don't suspend or resume card normally */ 1659 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1660 1661 pci_set_power_state(pdev, PCI_D0); 1662 amdgpu_device_load_pci_state(pdev); 1663 r = pci_enable_device(pdev); 1664 if (r) 1665 DRM_WARN("pci_enable_device failed (%d)\n", r); 1666 amdgpu_device_resume(dev, true); 1667 1668 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1669 } else { 1670 pr_info("switched off\n"); 1671 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1672 amdgpu_device_suspend(dev, true); 1673 amdgpu_device_cache_pci_state(pdev); 1674 /* Shut down the device */ 1675 pci_disable_device(pdev); 1676 pci_set_power_state(pdev, PCI_D3cold); 1677 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1678 } 1679 } 1680 1681 /** 1682 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1683 * 1684 * @pdev: pci dev pointer 1685 * 1686 * Callback for the switcheroo driver. Check of the switcheroo 1687 * state can be changed. 1688 * Returns true if the state can be changed, false if not. 1689 */ 1690 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1691 { 1692 struct drm_device *dev = pci_get_drvdata(pdev); 1693 1694 /* 1695 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1696 * locking inversion with the driver load path. And the access here is 1697 * completely racy anyway. So don't bother with locking for now. 1698 */ 1699 return atomic_read(&dev->open_count) == 0; 1700 } 1701 #endif /* __linux__ */ 1702 1703 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1704 #ifdef notyet 1705 .set_gpu_state = amdgpu_switcheroo_set_state, 1706 .reprobe = NULL, 1707 .can_switch = amdgpu_switcheroo_can_switch, 1708 #endif 1709 }; 1710 1711 /** 1712 * amdgpu_device_ip_set_clockgating_state - set the CG state 1713 * 1714 * @dev: amdgpu_device pointer 1715 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1716 * @state: clockgating state (gate or ungate) 1717 * 1718 * Sets the requested clockgating state for all instances of 1719 * the hardware IP specified. 1720 * Returns the error code from the last instance. 1721 */ 1722 int amdgpu_device_ip_set_clockgating_state(void *dev, 1723 enum amd_ip_block_type block_type, 1724 enum amd_clockgating_state state) 1725 { 1726 struct amdgpu_device *adev = dev; 1727 int i, r = 0; 1728 1729 for (i = 0; i < adev->num_ip_blocks; i++) { 1730 if (!adev->ip_blocks[i].status.valid) 1731 continue; 1732 if (adev->ip_blocks[i].version->type != block_type) 1733 continue; 1734 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1735 continue; 1736 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1737 (void *)adev, state); 1738 if (r) 1739 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1740 adev->ip_blocks[i].version->funcs->name, r); 1741 } 1742 return r; 1743 } 1744 1745 /** 1746 * amdgpu_device_ip_set_powergating_state - set the PG state 1747 * 1748 * @dev: amdgpu_device pointer 1749 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1750 * @state: powergating state (gate or ungate) 1751 * 1752 * Sets the requested powergating state for all instances of 1753 * the hardware IP specified. 1754 * Returns the error code from the last instance. 1755 */ 1756 int amdgpu_device_ip_set_powergating_state(void *dev, 1757 enum amd_ip_block_type block_type, 1758 enum amd_powergating_state state) 1759 { 1760 struct amdgpu_device *adev = dev; 1761 int i, r = 0; 1762 1763 for (i = 0; i < adev->num_ip_blocks; i++) { 1764 if (!adev->ip_blocks[i].status.valid) 1765 continue; 1766 if (adev->ip_blocks[i].version->type != block_type) 1767 continue; 1768 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1769 continue; 1770 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1771 (void *)adev, state); 1772 if (r) 1773 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1774 adev->ip_blocks[i].version->funcs->name, r); 1775 } 1776 return r; 1777 } 1778 1779 /** 1780 * amdgpu_device_ip_get_clockgating_state - get the CG state 1781 * 1782 * @adev: amdgpu_device pointer 1783 * @flags: clockgating feature flags 1784 * 1785 * Walks the list of IPs on the device and updates the clockgating 1786 * flags for each IP. 1787 * Updates @flags with the feature flags for each hardware IP where 1788 * clockgating is enabled. 1789 */ 1790 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1791 u64 *flags) 1792 { 1793 int i; 1794 1795 for (i = 0; i < adev->num_ip_blocks; i++) { 1796 if (!adev->ip_blocks[i].status.valid) 1797 continue; 1798 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1799 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1800 } 1801 } 1802 1803 /** 1804 * amdgpu_device_ip_wait_for_idle - wait for idle 1805 * 1806 * @adev: amdgpu_device pointer 1807 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1808 * 1809 * Waits for the request hardware IP to be idle. 1810 * Returns 0 for success or a negative error code on failure. 1811 */ 1812 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1813 enum amd_ip_block_type block_type) 1814 { 1815 int i, r; 1816 1817 for (i = 0; i < adev->num_ip_blocks; i++) { 1818 if (!adev->ip_blocks[i].status.valid) 1819 continue; 1820 if (adev->ip_blocks[i].version->type == block_type) { 1821 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1822 if (r) 1823 return r; 1824 break; 1825 } 1826 } 1827 return 0; 1828 1829 } 1830 1831 /** 1832 * amdgpu_device_ip_is_idle - is the hardware IP idle 1833 * 1834 * @adev: amdgpu_device pointer 1835 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1836 * 1837 * Check if the hardware IP is idle or not. 1838 * Returns true if it the IP is idle, false if not. 1839 */ 1840 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1841 enum amd_ip_block_type block_type) 1842 { 1843 int i; 1844 1845 for (i = 0; i < adev->num_ip_blocks; i++) { 1846 if (!adev->ip_blocks[i].status.valid) 1847 continue; 1848 if (adev->ip_blocks[i].version->type == block_type) 1849 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1850 } 1851 return true; 1852 1853 } 1854 1855 /** 1856 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1857 * 1858 * @adev: amdgpu_device pointer 1859 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1860 * 1861 * Returns a pointer to the hardware IP block structure 1862 * if it exists for the asic, otherwise NULL. 
1863 */ 1864 struct amdgpu_ip_block * 1865 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1866 enum amd_ip_block_type type) 1867 { 1868 int i; 1869 1870 for (i = 0; i < adev->num_ip_blocks; i++) 1871 if (adev->ip_blocks[i].version->type == type) 1872 return &adev->ip_blocks[i]; 1873 1874 return NULL; 1875 } 1876 1877 /** 1878 * amdgpu_device_ip_block_version_cmp 1879 * 1880 * @adev: amdgpu_device pointer 1881 * @type: enum amd_ip_block_type 1882 * @major: major version 1883 * @minor: minor version 1884 * 1885 * return 0 if equal or greater 1886 * return 1 if smaller or the ip_block doesn't exist 1887 */ 1888 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1889 enum amd_ip_block_type type, 1890 u32 major, u32 minor) 1891 { 1892 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1893 1894 if (ip_block && ((ip_block->version->major > major) || 1895 ((ip_block->version->major == major) && 1896 (ip_block->version->minor >= minor)))) 1897 return 0; 1898 1899 return 1; 1900 } 1901 1902 /** 1903 * amdgpu_device_ip_block_add 1904 * 1905 * @adev: amdgpu_device pointer 1906 * @ip_block_version: pointer to the IP to add 1907 * 1908 * Adds the IP block driver information to the collection of IPs 1909 * on the asic. 1910 */ 1911 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1912 const struct amdgpu_ip_block_version *ip_block_version) 1913 { 1914 if (!ip_block_version) 1915 return -EINVAL; 1916 1917 switch (ip_block_version->type) { 1918 case AMD_IP_BLOCK_TYPE_VCN: 1919 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1920 return 0; 1921 break; 1922 case AMD_IP_BLOCK_TYPE_JPEG: 1923 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1924 return 0; 1925 break; 1926 default: 1927 break; 1928 } 1929 1930 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1931 ip_block_version->funcs->name); 1932 1933 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1934 1935 return 0; 1936 } 1937 1938 /** 1939 * amdgpu_device_enable_virtual_display - enable virtual display feature 1940 * 1941 * @adev: amdgpu_device pointer 1942 * 1943 * Enabled the virtual display feature if the user has enabled it via 1944 * the module parameter virtual_display. This feature provides a virtual 1945 * display hardware on headless boards or in virtualized environments. 1946 * This function parses and validates the configuration string specified by 1947 * the user and configues the virtual display configuration (number of 1948 * virtual connectors, crtcs, etc.) specified. 
1949 */ 1950 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1951 { 1952 adev->enable_virtual_display = false; 1953 1954 #ifdef notyet 1955 if (amdgpu_virtual_display) { 1956 const char *pci_address_name = pci_name(adev->pdev); 1957 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1958 1959 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1960 pciaddstr_tmp = pciaddstr; 1961 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1962 pciaddname = strsep(&pciaddname_tmp, ","); 1963 if (!strcmp("all", pciaddname) 1964 || !strcmp(pci_address_name, pciaddname)) { 1965 long num_crtc; 1966 int res = -1; 1967 1968 adev->enable_virtual_display = true; 1969 1970 if (pciaddname_tmp) 1971 res = kstrtol(pciaddname_tmp, 10, 1972 &num_crtc); 1973 1974 if (!res) { 1975 if (num_crtc < 1) 1976 num_crtc = 1; 1977 if (num_crtc > 6) 1978 num_crtc = 6; 1979 adev->mode_info.num_crtc = num_crtc; 1980 } else { 1981 adev->mode_info.num_crtc = 1; 1982 } 1983 break; 1984 } 1985 } 1986 1987 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1988 amdgpu_virtual_display, pci_address_name, 1989 adev->enable_virtual_display, adev->mode_info.num_crtc); 1990 1991 kfree(pciaddstr); 1992 } 1993 #endif 1994 } 1995 1996 /** 1997 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1998 * 1999 * @adev: amdgpu_device pointer 2000 * 2001 * Parses the asic configuration parameters specified in the gpu info 2002 * firmware and makes them availale to the driver for use in configuring 2003 * the asic. 2004 * Returns 0 on success, -EINVAL on failure. 2005 */ 2006 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2007 { 2008 const char *chip_name; 2009 char fw_name[40]; 2010 int err; 2011 const struct gpu_info_firmware_header_v1_0 *hdr; 2012 2013 adev->firmware.gpu_info_fw = NULL; 2014 2015 if (adev->mman.discovery_bin) { 2016 /* 2017 * FIXME: The bounding box is still needed by Navi12, so 2018 * temporarily read it from gpu_info firmware. Should be dropped 2019 * when DAL no longer needs it. 
2020 */ 2021 if (adev->asic_type != CHIP_NAVI12) 2022 return 0; 2023 } 2024 2025 switch (adev->asic_type) { 2026 default: 2027 return 0; 2028 case CHIP_VEGA10: 2029 chip_name = "vega10"; 2030 break; 2031 case CHIP_VEGA12: 2032 chip_name = "vega12"; 2033 break; 2034 case CHIP_RAVEN: 2035 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2036 chip_name = "raven2"; 2037 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2038 chip_name = "picasso"; 2039 else 2040 chip_name = "raven"; 2041 break; 2042 case CHIP_ARCTURUS: 2043 chip_name = "arcturus"; 2044 break; 2045 case CHIP_NAVI12: 2046 chip_name = "navi12"; 2047 break; 2048 } 2049 2050 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2051 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2052 if (err) { 2053 dev_err(adev->dev, 2054 "Failed to load gpu_info firmware \"%s\"\n", 2055 fw_name); 2056 goto out; 2057 } 2058 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2059 if (err) { 2060 dev_err(adev->dev, 2061 "Failed to validate gpu_info firmware \"%s\"\n", 2062 fw_name); 2063 goto out; 2064 } 2065 2066 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2067 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2068 2069 switch (hdr->version_major) { 2070 case 1: 2071 { 2072 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2073 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2074 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2075 2076 /* 2077 * Should be droped when DAL no longer needs it. 2078 */ 2079 if (adev->asic_type == CHIP_NAVI12) 2080 goto parse_soc_bounding_box; 2081 2082 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2083 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2084 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2085 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2086 adev->gfx.config.max_texture_channel_caches = 2087 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2088 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2089 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2090 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2091 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2092 adev->gfx.config.double_offchip_lds_buf = 2093 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2094 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2095 adev->gfx.cu_info.max_waves_per_simd = 2096 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2097 adev->gfx.cu_info.max_scratch_slots_per_cu = 2098 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2099 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2100 if (hdr->version_minor >= 1) { 2101 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2102 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2103 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2104 adev->gfx.config.num_sc_per_sh = 2105 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2106 adev->gfx.config.num_packer_per_sc = 2107 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2108 } 2109 2110 parse_soc_bounding_box: 2111 /* 2112 * soc bounding box info is not integrated in disocovery table, 2113 * we always need to parse it from gpu info firmware if needed. 
2114 */ 2115 if (hdr->version_minor == 2) { 2116 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2117 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2118 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2119 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2120 } 2121 break; 2122 } 2123 default: 2124 dev_err(adev->dev, 2125 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2126 err = -EINVAL; 2127 goto out; 2128 } 2129 out: 2130 return err; 2131 } 2132 2133 /** 2134 * amdgpu_device_ip_early_init - run early init for hardware IPs 2135 * 2136 * @adev: amdgpu_device pointer 2137 * 2138 * Early initialization pass for hardware IPs. The hardware IPs that make 2139 * up each asic are discovered each IP's early_init callback is run. This 2140 * is the first stage in initializing the asic. 2141 * Returns 0 on success, negative error code on failure. 2142 */ 2143 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2144 { 2145 struct pci_dev *parent; 2146 int i, r; 2147 2148 amdgpu_device_enable_virtual_display(adev); 2149 2150 if (amdgpu_sriov_vf(adev)) { 2151 r = amdgpu_virt_request_full_gpu(adev, true); 2152 if (r) 2153 return r; 2154 } 2155 2156 switch (adev->asic_type) { 2157 #ifdef CONFIG_DRM_AMDGPU_SI 2158 case CHIP_VERDE: 2159 case CHIP_TAHITI: 2160 case CHIP_PITCAIRN: 2161 case CHIP_OLAND: 2162 case CHIP_HAINAN: 2163 adev->family = AMDGPU_FAMILY_SI; 2164 r = si_set_ip_blocks(adev); 2165 if (r) 2166 return r; 2167 break; 2168 #endif 2169 #ifdef CONFIG_DRM_AMDGPU_CIK 2170 case CHIP_BONAIRE: 2171 case CHIP_HAWAII: 2172 case CHIP_KAVERI: 2173 case CHIP_KABINI: 2174 case CHIP_MULLINS: 2175 if (adev->flags & AMD_IS_APU) 2176 adev->family = AMDGPU_FAMILY_KV; 2177 else 2178 adev->family = AMDGPU_FAMILY_CI; 2179 2180 r = cik_set_ip_blocks(adev); 2181 if (r) 2182 return r; 2183 break; 2184 #endif 2185 case CHIP_TOPAZ: 2186 case CHIP_TONGA: 2187 case CHIP_FIJI: 2188 case CHIP_POLARIS10: 2189 case CHIP_POLARIS11: 2190 case CHIP_POLARIS12: 2191 case CHIP_VEGAM: 2192 case CHIP_CARRIZO: 2193 case CHIP_STONEY: 2194 if (adev->flags & AMD_IS_APU) 2195 adev->family = AMDGPU_FAMILY_CZ; 2196 else 2197 adev->family = AMDGPU_FAMILY_VI; 2198 2199 r = vi_set_ip_blocks(adev); 2200 if (r) 2201 return r; 2202 break; 2203 default: 2204 r = amdgpu_discovery_set_ip_blocks(adev); 2205 if (r) 2206 return r; 2207 break; 2208 } 2209 2210 if (amdgpu_has_atpx() && 2211 (amdgpu_is_atpx_hybrid() || 2212 amdgpu_has_atpx_dgpu_power_cntl()) && 2213 ((adev->flags & AMD_IS_APU) == 0) && 2214 !dev_is_removable(&adev->pdev->dev)) 2215 adev->flags |= AMD_IS_PX; 2216 2217 if (!(adev->flags & AMD_IS_APU)) { 2218 #ifdef notyet 2219 parent = pcie_find_root_port(adev->pdev); 2220 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2221 #else 2222 adev->has_pr3 = false; 2223 #endif 2224 } 2225 2226 amdgpu_amdkfd_device_probe(adev); 2227 2228 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2229 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2230 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2231 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2232 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2233 2234 for (i = 0; i < adev->num_ip_blocks; i++) { 2235 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2236 DRM_ERROR("disabled ip block: %d <%s>\n", 2237 i, adev->ip_blocks[i].version->funcs->name); 2238 adev->ip_blocks[i].status.valid = false; 2239 } else { 2240 if (adev->ip_blocks[i].version->funcs->early_init) { 2241 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2242 if (r == -ENOENT) { 2243 adev->ip_blocks[i].status.valid = false; 2244 } else if (r) { 2245 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2246 adev->ip_blocks[i].version->funcs->name, r); 2247 return r; 2248 } else { 2249 adev->ip_blocks[i].status.valid = true; 2250 } 2251 } else { 2252 adev->ip_blocks[i].status.valid = true; 2253 } 2254 } 2255 /* get the vbios after the asic_funcs are set up */ 2256 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2257 r = amdgpu_device_parse_gpu_info_fw(adev); 2258 if (r) 2259 return r; 2260 2261 /* Read BIOS */ 2262 if (!amdgpu_get_bios(adev)) 2263 return -EINVAL; 2264 2265 r = amdgpu_atombios_init(adev); 2266 if (r) { 2267 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2268 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2269 return r; 2270 } 2271 2272 /*get pf2vf msg info at it's earliest time*/ 2273 if (amdgpu_sriov_vf(adev)) 2274 amdgpu_virt_init_data_exchange(adev); 2275 2276 } 2277 } 2278 2279 adev->cg_flags &= amdgpu_cg_mask; 2280 adev->pg_flags &= amdgpu_pg_mask; 2281 2282 return 0; 2283 } 2284 2285 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2286 { 2287 int i, r; 2288 2289 for (i = 0; i < adev->num_ip_blocks; i++) { 2290 if (!adev->ip_blocks[i].status.sw) 2291 continue; 2292 if (adev->ip_blocks[i].status.hw) 2293 continue; 2294 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2295 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2296 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2297 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2298 if (r) { 2299 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2300 adev->ip_blocks[i].version->funcs->name, r); 2301 return r; 2302 } 2303 adev->ip_blocks[i].status.hw = true; 2304 } 2305 } 2306 2307 return 0; 2308 } 2309 2310 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2311 { 2312 int i, r; 2313 2314 for (i = 0; i < adev->num_ip_blocks; i++) { 2315 if (!adev->ip_blocks[i].status.sw) 2316 continue; 2317 if (adev->ip_blocks[i].status.hw) 2318 continue; 2319 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2320 if (r) { 2321 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2322 adev->ip_blocks[i].version->funcs->name, r); 2323 return r; 2324 } 2325 adev->ip_blocks[i].status.hw = true; 2326 } 2327 2328 return 0; 2329 } 2330 2331 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2332 { 2333 int r = 0; 2334 int i; 2335 uint32_t smu_version; 2336 2337 if (adev->asic_type >= CHIP_VEGA10) { 2338 for (i = 0; i < adev->num_ip_blocks; i++) { 2339 if (adev->ip_blocks[i].version->type != 
AMD_IP_BLOCK_TYPE_PSP) 2340 continue; 2341 2342 if (!adev->ip_blocks[i].status.sw) 2343 continue; 2344 2345 /* no need to do the fw loading again if already done*/ 2346 if (adev->ip_blocks[i].status.hw == true) 2347 break; 2348 2349 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2350 r = adev->ip_blocks[i].version->funcs->resume(adev); 2351 if (r) { 2352 DRM_ERROR("resume of IP block <%s> failed %d\n", 2353 adev->ip_blocks[i].version->funcs->name, r); 2354 return r; 2355 } 2356 } else { 2357 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2358 if (r) { 2359 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2360 adev->ip_blocks[i].version->funcs->name, r); 2361 return r; 2362 } 2363 } 2364 2365 adev->ip_blocks[i].status.hw = true; 2366 break; 2367 } 2368 } 2369 2370 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2371 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2372 2373 return r; 2374 } 2375 2376 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2377 { 2378 long timeout; 2379 int r, i; 2380 2381 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2382 struct amdgpu_ring *ring = adev->rings[i]; 2383 2384 /* No need to setup the GPU scheduler for rings that don't need it */ 2385 if (!ring || ring->no_scheduler) 2386 continue; 2387 2388 switch (ring->funcs->type) { 2389 case AMDGPU_RING_TYPE_GFX: 2390 timeout = adev->gfx_timeout; 2391 break; 2392 case AMDGPU_RING_TYPE_COMPUTE: 2393 timeout = adev->compute_timeout; 2394 break; 2395 case AMDGPU_RING_TYPE_SDMA: 2396 timeout = adev->sdma_timeout; 2397 break; 2398 default: 2399 timeout = adev->video_timeout; 2400 break; 2401 } 2402 2403 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2404 ring->num_hw_submission, amdgpu_job_hang_limit, 2405 timeout, adev->reset_domain->wq, 2406 ring->sched_score, ring->name, 2407 adev->dev); 2408 if (r) { 2409 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2410 ring->name); 2411 return r; 2412 } 2413 } 2414 2415 return 0; 2416 } 2417 2418 2419 /** 2420 * amdgpu_device_ip_init - run init for hardware IPs 2421 * 2422 * @adev: amdgpu_device pointer 2423 * 2424 * Main initialization pass for hardware IPs. The list of all the hardware 2425 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2426 * are run. sw_init initializes the software state associated with each IP 2427 * and hw_init initializes the hardware associated with each IP. 2428 * Returns 0 on success, negative error code on failure. 
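 *
 * As a rough sketch of the ordering implemented below: sw_init runs first
 * for every valid block, hw_init runs early for the COMMON and GMC blocks
 * (so GPU memory can be allocated), and the remaining blocks are brought
 * up afterwards through the phase1, firmware-loading and phase2 helpers.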
2429 */ 2430 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2431 { 2432 int i, r; 2433 2434 r = amdgpu_ras_init(adev); 2435 if (r) 2436 return r; 2437 2438 for (i = 0; i < adev->num_ip_blocks; i++) { 2439 if (!adev->ip_blocks[i].status.valid) 2440 continue; 2441 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2442 if (r) { 2443 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2444 adev->ip_blocks[i].version->funcs->name, r); 2445 goto init_failed; 2446 } 2447 adev->ip_blocks[i].status.sw = true; 2448 2449 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2450 /* need to do common hw init early so everything is set up for gmc */ 2451 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2452 if (r) { 2453 DRM_ERROR("hw_init %d failed %d\n", i, r); 2454 goto init_failed; 2455 } 2456 adev->ip_blocks[i].status.hw = true; 2457 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2458 /* need to do gmc hw init early so we can allocate gpu mem */ 2459 /* Try to reserve bad pages early */ 2460 if (amdgpu_sriov_vf(adev)) 2461 amdgpu_virt_exchange_data(adev); 2462 2463 r = amdgpu_device_vram_scratch_init(adev); 2464 if (r) { 2465 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2466 goto init_failed; 2467 } 2468 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2469 if (r) { 2470 DRM_ERROR("hw_init %d failed %d\n", i, r); 2471 goto init_failed; 2472 } 2473 r = amdgpu_device_wb_init(adev); 2474 if (r) { 2475 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2476 goto init_failed; 2477 } 2478 adev->ip_blocks[i].status.hw = true; 2479 2480 /* right after GMC hw init, we create CSA */ 2481 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2482 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2483 AMDGPU_GEM_DOMAIN_VRAM, 2484 AMDGPU_CSA_SIZE); 2485 if (r) { 2486 DRM_ERROR("allocate CSA failed %d\n", r); 2487 goto init_failed; 2488 } 2489 } 2490 } 2491 } 2492 2493 if (amdgpu_sriov_vf(adev)) 2494 amdgpu_virt_init_data_exchange(adev); 2495 2496 r = amdgpu_ib_pool_init(adev); 2497 if (r) { 2498 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2499 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2500 goto init_failed; 2501 } 2502 2503 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2504 if (r) 2505 goto init_failed; 2506 2507 r = amdgpu_device_ip_hw_init_phase1(adev); 2508 if (r) 2509 goto init_failed; 2510 2511 r = amdgpu_device_fw_loading(adev); 2512 if (r) 2513 goto init_failed; 2514 2515 r = amdgpu_device_ip_hw_init_phase2(adev); 2516 if (r) 2517 goto init_failed; 2518 2519 /* 2520 * retired pages will be loaded from eeprom and reserved here, 2521 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2522 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2523 * for I2C communication which only true at this point. 2524 * 2525 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2526 * failure from bad gpu situation and stop amdgpu init process 2527 * accordingly. For other failed cases, it will still release all 2528 * the resource and print error message, rather than returning one 2529 * negative value to upper level. 
 *
 * Note: theoretically, this should be called before all vram allocations
 * to protect retired pages from abuse.
 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset)
		amdgpu_amdkfd_device_init(adev);

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. On the late-init pass this
 * enables clockgating for the hardware IPs; on the fini or suspend pass it
 * disables clockgating.
 * Returns 0 on success, negative error code on failure.
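 *
 * A small sketch of the traversal order used below, assuming four IP
 * blocks: gating visits indices 0, 1, 2, 3 while ungating visits them in
 * reverse, 3, 2, 1, 0, via
 *
 *   i = (state == AMD_CG_STATE_GATE) ? j : adev->num_ip_blocks - j - 1;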
2638 */ 2639 2640 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2641 enum amd_clockgating_state state) 2642 { 2643 int i, j, r; 2644 2645 if (amdgpu_emu_mode == 1) 2646 return 0; 2647 2648 for (j = 0; j < adev->num_ip_blocks; j++) { 2649 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2650 if (!adev->ip_blocks[i].status.late_initialized) 2651 continue; 2652 /* skip CG for GFX on S0ix */ 2653 if (adev->in_s0ix && 2654 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2655 continue; 2656 /* skip CG for VCE/UVD, it's handled specially */ 2657 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2658 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2659 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2660 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2661 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2662 /* enable clockgating to save power */ 2663 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2664 state); 2665 if (r) { 2666 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2667 adev->ip_blocks[i].version->funcs->name, r); 2668 return r; 2669 } 2670 } 2671 } 2672 2673 return 0; 2674 } 2675 2676 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2677 enum amd_powergating_state state) 2678 { 2679 int i, j, r; 2680 2681 if (amdgpu_emu_mode == 1) 2682 return 0; 2683 2684 for (j = 0; j < adev->num_ip_blocks; j++) { 2685 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2686 if (!adev->ip_blocks[i].status.late_initialized) 2687 continue; 2688 /* skip PG for GFX on S0ix */ 2689 if (adev->in_s0ix && 2690 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2691 continue; 2692 /* skip CG for VCE/UVD, it's handled specially */ 2693 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2694 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2695 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2696 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2697 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2698 /* enable powergating to save power */ 2699 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2700 state); 2701 if (r) { 2702 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2703 adev->ip_blocks[i].version->funcs->name, r); 2704 return r; 2705 } 2706 } 2707 } 2708 return 0; 2709 } 2710 2711 static int amdgpu_device_enable_mgpu_fan_boost(void) 2712 { 2713 struct amdgpu_gpu_instance *gpu_ins; 2714 struct amdgpu_device *adev; 2715 int i, ret = 0; 2716 2717 mutex_lock(&mgpu_info.mutex); 2718 2719 /* 2720 * MGPU fan boost feature should be enabled 2721 * only when there are two or more dGPUs in 2722 * the system 2723 */ 2724 if (mgpu_info.num_dgpu < 2) 2725 goto out; 2726 2727 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2728 gpu_ins = &(mgpu_info.gpu_ins[i]); 2729 adev = gpu_ins->adev; 2730 if (!(adev->flags & AMD_IS_APU) && 2731 !gpu_ins->mgpu_fan_enabled) { 2732 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2733 if (ret) 2734 break; 2735 2736 gpu_ins->mgpu_fan_enabled = 1; 2737 } 2738 } 2739 2740 out: 2741 mutex_unlock(&mgpu_info.mutex); 2742 2743 return ret; 2744 } 2745 2746 /** 2747 * amdgpu_device_ip_late_init - run late init for hardware IPs 2748 * 2749 * @adev: amdgpu_device pointer 2750 * 2751 * Late initialization pass for hardware IPs. 
The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one by one as devices initialize.
		 *
		 * So, we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
						AMDGPU_XGMI_PSTATE_MIN);
				if (r) {
					DRM_ERROR("pstate setting failed (%d).\n", r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}

/**
 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
 *
 * @adev: amdgpu_device pointer
 *
 * For ASICs that need to disable the SMC first
 */
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
		return;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
			/* XXX handle errors */
			if (r) {
				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
			}
			adev->ip_blocks[i].status.hw = false;
			break;
		}
	}
}

static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].version->funcs->early_fini)
			continue;

		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
		if (r) {
			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}
	}

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_amdkfd_suspend(adev, false);

	/* Workaround for ASICs that need to disable the SMC first */
	amdgpu_device_smu_fini_early(adev);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;

		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
		/* XXX handle errors */
		if (r) {
			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
		}

		adev->ip_blocks[i].status.hw = false;
	}

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_release_full_gpu(adev, false))
			DRM_ERROR("failed to release exclusive mode on fini\n");
	}

	return 0;
}

/**
 * amdgpu_device_ip_fini - run fini for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main teardown pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
 * are run. hw_fini tears down the hardware associated with each IP
 * and sw_fini tears down any software state associated with each IP.
 * Returns 0 on success, negative error code on failure.
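 *
 * As with the other fini paths in this file, the walk below runs in
 * reverse registration order, roughly
 *
 *   for (i = adev->num_ip_blocks - 1; i >= 0; i--)
 *           ...->sw_fini((void *)adev);
 *
 * so the blocks added last are torn down first.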
2921 */ 2922 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2923 { 2924 int i, r; 2925 2926 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2927 amdgpu_virt_release_ras_err_handler_data(adev); 2928 2929 if (adev->gmc.xgmi.num_physical_nodes > 1) 2930 amdgpu_xgmi_remove_device(adev); 2931 2932 amdgpu_amdkfd_device_fini_sw(adev); 2933 2934 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2935 if (!adev->ip_blocks[i].status.sw) 2936 continue; 2937 2938 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2939 amdgpu_ucode_free_bo(adev); 2940 amdgpu_free_static_csa(&adev->virt.csa_obj); 2941 amdgpu_device_wb_fini(adev); 2942 amdgpu_device_vram_scratch_fini(adev); 2943 amdgpu_ib_pool_fini(adev); 2944 } 2945 2946 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2947 /* XXX handle errors */ 2948 if (r) { 2949 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2950 adev->ip_blocks[i].version->funcs->name, r); 2951 } 2952 adev->ip_blocks[i].status.sw = false; 2953 adev->ip_blocks[i].status.valid = false; 2954 } 2955 2956 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2957 if (!adev->ip_blocks[i].status.late_initialized) 2958 continue; 2959 if (adev->ip_blocks[i].version->funcs->late_fini) 2960 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2961 adev->ip_blocks[i].status.late_initialized = false; 2962 } 2963 2964 amdgpu_ras_fini(adev); 2965 2966 return 0; 2967 } 2968 2969 /** 2970 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2971 * 2972 * @work: work_struct. 2973 */ 2974 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2975 { 2976 struct amdgpu_device *adev = 2977 container_of(work, struct amdgpu_device, delayed_init_work.work); 2978 int r; 2979 2980 r = amdgpu_ib_ring_tests(adev); 2981 if (r) 2982 DRM_ERROR("ib ring test failed (%d).\n", r); 2983 } 2984 2985 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2986 { 2987 struct amdgpu_device *adev = 2988 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2989 2990 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2991 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2992 2993 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2994 adev->gfx.gfx_off_state = true; 2995 } 2996 2997 /** 2998 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2999 * 3000 * @adev: amdgpu_device pointer 3001 * 3002 * Main suspend function for hardware IPs. The list of all the hardware 3003 * IPs that make up the asic is walked, clockgating is disabled and the 3004 * suspend callbacks are run. suspend puts the hardware and software state 3005 * in each IP into a state suitable for suspend. 3006 * Returns 0 on success, negative error code on failure. 3007 */ 3008 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3009 { 3010 int i, r; 3011 3012 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3013 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3014 3015 /* 3016 * Per PMFW team's suggestion, driver needs to handle gfxoff 3017 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3018 * scenario. Add the missing df cstate disablement here. 
3019 */ 3020 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3021 dev_warn(adev->dev, "Failed to disallow df cstate"); 3022 3023 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3024 if (!adev->ip_blocks[i].status.valid) 3025 continue; 3026 3027 /* displays are handled separately */ 3028 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3029 continue; 3030 3031 /* XXX handle errors */ 3032 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3033 /* XXX handle errors */ 3034 if (r) { 3035 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3036 adev->ip_blocks[i].version->funcs->name, r); 3037 return r; 3038 } 3039 3040 adev->ip_blocks[i].status.hw = false; 3041 } 3042 3043 return 0; 3044 } 3045 3046 /** 3047 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3048 * 3049 * @adev: amdgpu_device pointer 3050 * 3051 * Main suspend function for hardware IPs. The list of all the hardware 3052 * IPs that make up the asic is walked, clockgating is disabled and the 3053 * suspend callbacks are run. suspend puts the hardware and software state 3054 * in each IP into a state suitable for suspend. 3055 * Returns 0 on success, negative error code on failure. 3056 */ 3057 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3058 { 3059 int i, r; 3060 3061 if (adev->in_s0ix) 3062 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3063 3064 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3065 if (!adev->ip_blocks[i].status.valid) 3066 continue; 3067 /* displays are handled in phase1 */ 3068 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3069 continue; 3070 /* PSP lost connection when err_event_athub occurs */ 3071 if (amdgpu_ras_intr_triggered() && 3072 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3073 adev->ip_blocks[i].status.hw = false; 3074 continue; 3075 } 3076 3077 /* skip unnecessary suspend if we do not initialize them yet */ 3078 if (adev->gmc.xgmi.pending_reset && 3079 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3080 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3081 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3082 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3083 adev->ip_blocks[i].status.hw = false; 3084 continue; 3085 } 3086 3087 /* skip suspend of gfx/mes and psp for S0ix 3088 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3089 * like at runtime. PSP is also part of the always on hardware 3090 * so no need to suspend it. 3091 */ 3092 if (adev->in_s0ix && 3093 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3094 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3095 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3096 continue; 3097 3098 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3099 if (adev->in_s0ix && 3100 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3101 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3102 continue; 3103 3104 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3105 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3106 * from this location and RLC Autoload automatically also gets loaded 3107 * from here based on PMFW -> PSP message during re-init sequence. 3108 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3109 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3110 */ 3111 if (amdgpu_in_reset(adev) && 3112 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3113 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3114 continue; 3115 3116 /* XXX handle errors */ 3117 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3118 /* XXX handle errors */ 3119 if (r) { 3120 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3121 adev->ip_blocks[i].version->funcs->name, r); 3122 } 3123 adev->ip_blocks[i].status.hw = false; 3124 /* handle putting the SMC in the appropriate state */ 3125 if(!amdgpu_sriov_vf(adev)){ 3126 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3127 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3128 if (r) { 3129 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3130 adev->mp1_state, r); 3131 return r; 3132 } 3133 } 3134 } 3135 } 3136 3137 return 0; 3138 } 3139 3140 /** 3141 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3142 * 3143 * @adev: amdgpu_device pointer 3144 * 3145 * Main suspend function for hardware IPs. The list of all the hardware 3146 * IPs that make up the asic is walked, clockgating is disabled and the 3147 * suspend callbacks are run. suspend puts the hardware and software state 3148 * in each IP into a state suitable for suspend. 3149 * Returns 0 on success, negative error code on failure. 3150 */ 3151 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3152 { 3153 int r; 3154 3155 if (amdgpu_sriov_vf(adev)) { 3156 amdgpu_virt_fini_data_exchange(adev); 3157 amdgpu_virt_request_full_gpu(adev, false); 3158 } 3159 3160 r = amdgpu_device_ip_suspend_phase1(adev); 3161 if (r) 3162 return r; 3163 r = amdgpu_device_ip_suspend_phase2(adev); 3164 3165 if (amdgpu_sriov_vf(adev)) 3166 amdgpu_virt_release_full_gpu(adev, false); 3167 3168 return r; 3169 } 3170 3171 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3172 { 3173 int i, r; 3174 3175 static enum amd_ip_block_type ip_order[] = { 3176 AMD_IP_BLOCK_TYPE_COMMON, 3177 AMD_IP_BLOCK_TYPE_GMC, 3178 AMD_IP_BLOCK_TYPE_PSP, 3179 AMD_IP_BLOCK_TYPE_IH, 3180 }; 3181 3182 for (i = 0; i < adev->num_ip_blocks; i++) { 3183 int j; 3184 struct amdgpu_ip_block *block; 3185 3186 block = &adev->ip_blocks[i]; 3187 block->status.hw = false; 3188 3189 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3190 3191 if (block->version->type != ip_order[j] || 3192 !block->status.valid) 3193 continue; 3194 3195 r = block->version->funcs->hw_init(adev); 3196 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3197 if (r) 3198 return r; 3199 block->status.hw = true; 3200 } 3201 } 3202 3203 return 0; 3204 } 3205 3206 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3207 { 3208 int i, r; 3209 3210 static enum amd_ip_block_type ip_order[] = { 3211 AMD_IP_BLOCK_TYPE_SMC, 3212 AMD_IP_BLOCK_TYPE_DCE, 3213 AMD_IP_BLOCK_TYPE_GFX, 3214 AMD_IP_BLOCK_TYPE_SDMA, 3215 AMD_IP_BLOCK_TYPE_UVD, 3216 AMD_IP_BLOCK_TYPE_VCE, 3217 AMD_IP_BLOCK_TYPE_VCN 3218 }; 3219 3220 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3221 int j; 3222 struct amdgpu_ip_block *block; 3223 3224 for (j = 0; j < adev->num_ip_blocks; j++) { 3225 block = &adev->ip_blocks[j]; 3226 3227 if (block->version->type != ip_order[i] || 3228 !block->status.valid || 3229 block->status.hw) 3230 continue; 3231 3232 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3233 r = block->version->funcs->resume(adev); 3234 else 3235 r = block->version->funcs->hw_init(adev); 3236 3237 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3238 if (r) 3239 return r; 3240 block->status.hw = true; 3241 } 3242 } 3243 3244 return 0; 3245 } 3246 3247 /** 3248 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3249 * 3250 * @adev: amdgpu_device pointer 3251 * 3252 * First resume function for hardware IPs. The list of all the hardware 3253 * IPs that make up the asic is walked and the resume callbacks are run for 3254 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3255 * after a suspend and updates the software state as necessary. This 3256 * function is also used for restoring the GPU after a GPU reset. 3257 * Returns 0 on success, negative error code on failure. 3258 */ 3259 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3260 { 3261 int i, r; 3262 3263 for (i = 0; i < adev->num_ip_blocks; i++) { 3264 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3265 continue; 3266 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3267 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3268 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3269 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3270 3271 r = adev->ip_blocks[i].version->funcs->resume(adev); 3272 if (r) { 3273 DRM_ERROR("resume of IP block <%s> failed %d\n", 3274 adev->ip_blocks[i].version->funcs->name, r); 3275 return r; 3276 } 3277 adev->ip_blocks[i].status.hw = true; 3278 } 3279 } 3280 3281 return 0; 3282 } 3283 3284 /** 3285 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3286 * 3287 * @adev: amdgpu_device pointer 3288 * 3289 * First resume function for hardware IPs. The list of all the hardware 3290 * IPs that make up the asic is walked and the resume callbacks are run for 3291 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3292 * functional state after a suspend and updates the software state as 3293 * necessary. This function is also used for restoring the GPU after a GPU 3294 * reset. 3295 * Returns 0 on success, negative error code on failure. 3296 */ 3297 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3298 { 3299 int i, r; 3300 3301 for (i = 0; i < adev->num_ip_blocks; i++) { 3302 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3303 continue; 3304 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3305 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3306 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3307 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3308 continue; 3309 r = adev->ip_blocks[i].version->funcs->resume(adev); 3310 if (r) { 3311 DRM_ERROR("resume of IP block <%s> failed %d\n", 3312 adev->ip_blocks[i].version->funcs->name, r); 3313 return r; 3314 } 3315 adev->ip_blocks[i].status.hw = true; 3316 3317 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3318 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3319 * amdgpu_device_resume() after IP resume. 3320 */ 3321 amdgpu_gfx_off_ctrl(adev, false); 3322 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3323 } 3324 3325 } 3326 3327 return 0; 3328 } 3329 3330 /** 3331 * amdgpu_device_ip_resume - run resume for hardware IPs 3332 * 3333 * @adev: amdgpu_device pointer 3334 * 3335 * Main resume function for hardware IPs. 
The hardware IPs
 * are split into two resume functions because they are also used in
 * recovering from a GPU reset and some additional steps need to be taken
 * between them. In this case (S3/S4) they are run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_amdkfd_resume_iommu(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
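		 *
		 * In practice this appears to mean DC is only used on these
		 * chips when it is explicitly requested (amdgpu_dc set to a
		 * positive value via the dc= module parameter); the default
		 * auto setting keeps the legacy display path.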
3430 */ 3431 return amdgpu_dc > 0; 3432 default: 3433 return amdgpu_dc != 0; 3434 #else 3435 default: 3436 if (amdgpu_dc > 0) 3437 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3438 "but isn't supported by ASIC, ignoring\n"); 3439 return false; 3440 #endif 3441 } 3442 } 3443 3444 /** 3445 * amdgpu_device_has_dc_support - check if dc is supported 3446 * 3447 * @adev: amdgpu_device pointer 3448 * 3449 * Returns true for supported, false for not supported 3450 */ 3451 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3452 { 3453 if (amdgpu_sriov_vf(adev) || 3454 adev->enable_virtual_display || 3455 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3456 return false; 3457 3458 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3459 } 3460 3461 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3462 { 3463 struct amdgpu_device *adev = 3464 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3465 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3466 3467 /* It's a bug to not have a hive within this function */ 3468 if (WARN_ON(!hive)) 3469 return; 3470 3471 /* 3472 * Use task barrier to synchronize all xgmi reset works across the 3473 * hive. task_barrier_enter and task_barrier_exit will block 3474 * until all the threads running the xgmi reset works reach 3475 * those points. task_barrier_full will do both blocks. 3476 */ 3477 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3478 3479 task_barrier_enter(&hive->tb); 3480 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3481 3482 if (adev->asic_reset_res) 3483 goto fail; 3484 3485 task_barrier_exit(&hive->tb); 3486 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3487 3488 if (adev->asic_reset_res) 3489 goto fail; 3490 3491 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3492 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3493 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3494 } else { 3495 3496 task_barrier_full(&hive->tb); 3497 adev->asic_reset_res = amdgpu_asic_reset(adev); 3498 } 3499 3500 fail: 3501 if (adev->asic_reset_res) 3502 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3503 adev->asic_reset_res, adev_to_drm(adev)->unique); 3504 amdgpu_put_xgmi_hive(hive); 3505 } 3506 3507 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3508 { 3509 char *input = amdgpu_lockup_timeout; 3510 char *timeout_setting = NULL; 3511 int index = 0; 3512 long timeout; 3513 int ret = 0; 3514 3515 /* 3516 * By default timeout for non compute jobs is 10000 3517 * and 60000 for compute jobs. 3518 * In SR-IOV or passthrough mode, timeout for compute 3519 * jobs are 60000 by default. 3520 */ 3521 adev->gfx_timeout = msecs_to_jiffies(10000); 3522 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3523 if (amdgpu_sriov_vf(adev)) 3524 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3525 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3526 else 3527 adev->compute_timeout = msecs_to_jiffies(60000); 3528 3529 #ifdef notyet 3530 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3531 while ((timeout_setting = strsep(&input, ",")) && 3532 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3533 ret = kstrtol(timeout_setting, 0, &timeout); 3534 if (ret) 3535 return ret; 3536 3537 if (timeout == 0) { 3538 index++; 3539 continue; 3540 } else if (timeout < 0) { 3541 timeout = MAX_SCHEDULE_TIMEOUT; 3542 dev_warn(adev->dev, "lockup timeout disabled"); 3543 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3544 } else { 3545 timeout = msecs_to_jiffies(timeout); 3546 } 3547 3548 switch (index++) { 3549 case 0: 3550 adev->gfx_timeout = timeout; 3551 break; 3552 case 1: 3553 adev->compute_timeout = timeout; 3554 break; 3555 case 2: 3556 adev->sdma_timeout = timeout; 3557 break; 3558 case 3: 3559 adev->video_timeout = timeout; 3560 break; 3561 default: 3562 break; 3563 } 3564 } 3565 /* 3566 * There is only one value specified and 3567 * it should apply to all non-compute jobs. 3568 */ 3569 if (index == 1) { 3570 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3571 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3572 adev->compute_timeout = adev->gfx_timeout; 3573 } 3574 } 3575 #endif 3576 3577 return ret; 3578 } 3579 3580 /** 3581 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3582 * 3583 * @adev: amdgpu_device pointer 3584 * 3585 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3586 */ 3587 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3588 { 3589 #ifdef notyet 3590 struct iommu_domain *domain; 3591 3592 domain = iommu_get_domain_for_dev(adev->dev); 3593 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3594 #endif 3595 adev->ram_is_direct_mapped = true; 3596 } 3597 3598 static const struct attribute *amdgpu_dev_attributes[] = { 3599 &dev_attr_product_name.attr, 3600 &dev_attr_product_number.attr, 3601 &dev_attr_serial_number.attr, 3602 &dev_attr_pcie_replay_count.attr, 3603 NULL 3604 }; 3605 3606 /** 3607 * amdgpu_device_init - initialize the driver 3608 * 3609 * @adev: amdgpu_device pointer 3610 * @flags: driver flags 3611 * 3612 * Initializes the driver info and hw (all asics). 3613 * Returns 0 for success or an error on failure. 3614 * Called at driver startup. 
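 *
 * A hedged sketch of the expected call pattern from the probe path (the
 * surrounding error handling is illustrative, not copied from the real
 * call site):
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;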
3615 */ 3616 int amdgpu_device_init(struct amdgpu_device *adev, 3617 uint32_t flags) 3618 { 3619 struct drm_device *ddev = adev_to_drm(adev); 3620 struct pci_dev *pdev = adev->pdev; 3621 int r, i; 3622 bool px = false; 3623 u32 max_MBps; 3624 int tmp; 3625 3626 adev->shutdown = false; 3627 adev->flags = flags; 3628 3629 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3630 adev->asic_type = amdgpu_force_asic_type; 3631 else 3632 adev->asic_type = flags & AMD_ASIC_MASK; 3633 3634 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3635 if (amdgpu_emu_mode == 1) 3636 adev->usec_timeout *= 10; 3637 adev->gmc.gart_size = 512 * 1024 * 1024; 3638 adev->accel_working = false; 3639 adev->num_rings = 0; 3640 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3641 adev->mman.buffer_funcs = NULL; 3642 adev->mman.buffer_funcs_ring = NULL; 3643 adev->vm_manager.vm_pte_funcs = NULL; 3644 adev->vm_manager.vm_pte_num_scheds = 0; 3645 adev->gmc.gmc_funcs = NULL; 3646 adev->harvest_ip_mask = 0x0; 3647 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3648 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3649 3650 adev->smc_rreg = &amdgpu_invalid_rreg; 3651 adev->smc_wreg = &amdgpu_invalid_wreg; 3652 adev->pcie_rreg = &amdgpu_invalid_rreg; 3653 adev->pcie_wreg = &amdgpu_invalid_wreg; 3654 adev->pciep_rreg = &amdgpu_invalid_rreg; 3655 adev->pciep_wreg = &amdgpu_invalid_wreg; 3656 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3657 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3658 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3659 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3660 adev->didt_rreg = &amdgpu_invalid_rreg; 3661 adev->didt_wreg = &amdgpu_invalid_wreg; 3662 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3663 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3664 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3665 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3666 3667 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3668 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3669 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3670 3671 /* mutex initialization are all done here so we 3672 * can recall function without having locking issues */ 3673 rw_init(&adev->firmware.mutex, "agfw"); 3674 rw_init(&adev->pm.mutex, "agpm"); 3675 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3676 rw_init(&adev->srbm_mutex, "srbm"); 3677 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3678 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3679 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3680 rw_init(&adev->mn_lock, "agpumn"); 3681 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3682 hash_init(adev->mn_hash); 3683 rw_init(&adev->psp.mutex, "agpsp"); 3684 rw_init(&adev->notifier_lock, "agnf"); 3685 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3686 rw_init(&adev->benchmark_mutex, "agbm"); 3687 3688 amdgpu_device_init_apu_flags(adev); 3689 3690 r = amdgpu_device_check_arguments(adev); 3691 if (r) 3692 return r; 3693 3694 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3695 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3696 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3697 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3698 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3699 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3700 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3701 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3702 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3703 3704 INIT_LIST_HEAD(&adev->shadow_list); 3705 
	rw_init(&adev->shadow_list_lock, "sdwlst");

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

#ifdef __linux__
	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (adev->rmmio == NULL)
		return -ENOMEM;
#endif
	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

	amdgpu_device_get_pcie_info(adev);

	if (amdgpu_mcbp)
		DRM_INFO("MCBP is enabled\n");

	/*
	 * Reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, so that the reset sem and the
	 * in_gpu_reset flag can be used early on during init and before any
	 * call to RREG32.
3765 */ 3766 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3767 if (!adev->reset_domain) 3768 return -ENOMEM; 3769 3770 /* detect hw virtualization here */ 3771 amdgpu_detect_virtualization(adev); 3772 3773 r = amdgpu_device_get_job_timeout_settings(adev); 3774 if (r) { 3775 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3776 return r; 3777 } 3778 3779 /* early init functions */ 3780 r = amdgpu_device_ip_early_init(adev); 3781 if (r) 3782 return r; 3783 3784 /* Get rid of things like offb */ 3785 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3786 if (r) 3787 return r; 3788 3789 /* Enable TMZ based on IP_VERSION */ 3790 amdgpu_gmc_tmz_set(adev); 3791 3792 amdgpu_gmc_noretry_set(adev); 3793 /* Need to get xgmi info early to decide the reset behavior */ 3794 if (adev->gmc.xgmi.supported) { 3795 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3796 if (r) 3797 return r; 3798 } 3799 3800 /* enable PCIE atomic ops */ 3801 #ifdef notyet 3802 if (amdgpu_sriov_vf(adev)) 3803 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3804 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3805 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3806 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal path 3807 * natively supports atomics, so set have_atomics_support to true. 3808 */ 3809 else if ((adev->flags & AMD_IS_APU) && 3810 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3811 adev->have_atomics_support = true; 3812 else 3813 adev->have_atomics_support = 3814 !pci_enable_atomic_ops_to_root(adev->pdev, 3815 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3816 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3817 if (!adev->have_atomics_support) 3818 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3819 #else 3820 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal path 3821 * natively supports atomics, so set have_atomics_support to true. 3822 */ 3823 if ((adev->flags & AMD_IS_APU) && 3824 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) 3825 adev->have_atomics_support = true; 3826 else 3827 adev->have_atomics_support = false; 3828 #endif 3829 3830 /* doorbell bar mapping and doorbell index init */ 3831 amdgpu_device_doorbell_init(adev); 3832 3833 if (amdgpu_emu_mode == 1) { 3834 /* post the asic on emulation mode */ 3835 emu_soc_asic_init(adev); 3836 goto fence_driver_init; 3837 } 3838 3839 amdgpu_reset_init(adev); 3840 3841 /* detect if we are with an SRIOV vbios */ 3842 amdgpu_device_detect_sriov_bios(adev); 3843 3844 /* check if we need to reset the asic 3845 * E.g., driver was not cleanly unloaded previously, etc.
3846 */ 3847 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3848 if (adev->gmc.xgmi.num_physical_nodes) { 3849 dev_info(adev->dev, "Pending hive reset.\n"); 3850 adev->gmc.xgmi.pending_reset = true; 3851 /* Only need to init necessary block for SMU to handle the reset */ 3852 for (i = 0; i < adev->num_ip_blocks; i++) { 3853 if (!adev->ip_blocks[i].status.valid) 3854 continue; 3855 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3856 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3857 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3858 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3859 DRM_DEBUG("IP %s disabled for hw_init.\n", 3860 adev->ip_blocks[i].version->funcs->name); 3861 adev->ip_blocks[i].status.hw = true; 3862 } 3863 } 3864 } else { 3865 tmp = amdgpu_reset_method; 3866 /* It should do a default reset when loading or reloading the driver, 3867 * regardless of the module parameter reset_method. 3868 */ 3869 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3870 r = amdgpu_asic_reset(adev); 3871 amdgpu_reset_method = tmp; 3872 if (r) { 3873 dev_err(adev->dev, "asic reset on init failed\n"); 3874 goto failed; 3875 } 3876 } 3877 } 3878 3879 pci_enable_pcie_error_reporting(adev->pdev); 3880 3881 /* Post card if necessary */ 3882 if (amdgpu_device_need_post(adev)) { 3883 if (!adev->bios) { 3884 dev_err(adev->dev, "no vBIOS found\n"); 3885 r = -EINVAL; 3886 goto failed; 3887 } 3888 DRM_INFO("GPU posting now...\n"); 3889 r = amdgpu_device_asic_init(adev); 3890 if (r) { 3891 dev_err(adev->dev, "gpu post error!\n"); 3892 goto failed; 3893 } 3894 } 3895 3896 if (adev->is_atom_fw) { 3897 /* Initialize clocks */ 3898 r = amdgpu_atomfirmware_get_clock_info(adev); 3899 if (r) { 3900 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3901 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3902 goto failed; 3903 } 3904 } else { 3905 /* Initialize clocks */ 3906 r = amdgpu_atombios_get_clock_info(adev); 3907 if (r) { 3908 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3909 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3910 goto failed; 3911 } 3912 /* init i2c buses */ 3913 if (!amdgpu_device_has_dc_support(adev)) 3914 amdgpu_atombios_i2c_init(adev); 3915 } 3916 3917 fence_driver_init: 3918 /* Fence driver */ 3919 r = amdgpu_fence_driver_sw_init(adev); 3920 if (r) { 3921 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3922 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3923 goto failed; 3924 } 3925 3926 /* init the mode config */ 3927 drm_mode_config_init(adev_to_drm(adev)); 3928 3929 r = amdgpu_device_ip_init(adev); 3930 if (r) { 3931 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3932 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3933 goto release_ras_con; 3934 } 3935 3936 amdgpu_fence_driver_hw_init(adev); 3937 3938 dev_info(adev->dev, 3939 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3940 adev->gfx.config.max_shader_engines, 3941 adev->gfx.config.max_sh_per_se, 3942 adev->gfx.config.max_cu_per_sh, 3943 adev->gfx.cu_info.number); 3944 3945 #ifdef __OpenBSD__ 3946 { 3947 const char *chip_name; 3948 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3949 int maj, min, rev; 3950 3951 switch (adev->asic_type) { 3952 case CHIP_RAVEN: 3953 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3954 chip_name = "RAVEN2"; 3955 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3956 
chip_name = "PICASSO"; 3957 else 3958 chip_name = "RAVEN"; 3959 break; 3960 case CHIP_RENOIR: 3961 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3962 chip_name = "RENOIR"; 3963 else 3964 chip_name = "GREEN_SARDINE"; 3965 break; 3966 default: 3967 chip_name = amdgpu_asic_name[adev->asic_type]; 3968 } 3969 3970 printf("%s: %s", adev->self.dv_xname, chip_name); 3971 /* show graphics/compute ip block version, not set on < GFX9 */ 3972 if (version) { 3973 maj = IP_VERSION_MAJ(version); 3974 min = IP_VERSION_MIN(version); 3975 rev = IP_VERSION_REV(version); 3976 printf(" GC %d.%d.%d", maj, min, rev); 3977 } 3978 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3979 } 3980 #endif 3981 3982 adev->accel_working = true; 3983 3984 amdgpu_vm_check_compute_bug(adev); 3985 3986 /* Initialize the buffer migration limit. */ 3987 if (amdgpu_moverate >= 0) 3988 max_MBps = amdgpu_moverate; 3989 else 3990 max_MBps = 8; /* Allow 8 MB/s. */ 3991 /* Get a log2 for easy divisions. */ 3992 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3993 3994 r = amdgpu_pm_sysfs_init(adev); 3995 if (r) { 3996 adev->pm_sysfs_en = false; 3997 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3998 } else 3999 adev->pm_sysfs_en = true; 4000 4001 r = amdgpu_ucode_sysfs_init(adev); 4002 if (r) { 4003 adev->ucode_sysfs_en = false; 4004 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4005 } else 4006 adev->ucode_sysfs_en = true; 4007 4008 r = amdgpu_psp_sysfs_init(adev); 4009 if (r) { 4010 adev->psp_sysfs_en = false; 4011 if (!amdgpu_sriov_vf(adev)) 4012 DRM_ERROR("Creating psp sysfs failed\n"); 4013 } else 4014 adev->psp_sysfs_en = true; 4015 4016 /* 4017 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4018 * Otherwise the mgpu fan boost feature will be skipped due to the 4019 * gpu instance is counted less. 4020 */ 4021 amdgpu_register_gpu_instance(adev); 4022 4023 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4024 * explicit gating rather than handling it automatically. 4025 */ 4026 if (!adev->gmc.xgmi.pending_reset) { 4027 r = amdgpu_device_ip_late_init(adev); 4028 if (r) { 4029 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4030 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4031 goto release_ras_con; 4032 } 4033 /* must succeed. 
*/ 4034 amdgpu_ras_resume(adev); 4035 queue_delayed_work(system_wq, &adev->delayed_init_work, 4036 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4037 } 4038 4039 if (amdgpu_sriov_vf(adev)) { 4040 amdgpu_virt_release_full_gpu(adev, true); 4041 flush_delayed_work(&adev->delayed_init_work); 4042 } 4043 4044 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4045 if (r) 4046 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4047 4048 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4049 r = amdgpu_pmu_init(adev); 4050 if (r) 4051 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4052 4053 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4054 if (amdgpu_device_cache_pci_state(adev->pdev)) 4055 pci_restore_state(pdev); 4056 4057 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4058 /* this will fail for cards that aren't VGA class devices, just 4059 * ignore it */ 4060 #ifdef notyet 4061 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4062 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4063 #endif 4064 4065 px = amdgpu_device_supports_px(ddev); 4066 4067 if (px || (!dev_is_removable(&adev->pdev->dev) && 4068 apple_gmux_detect(NULL, NULL))) 4069 vga_switcheroo_register_client(adev->pdev, 4070 &amdgpu_switcheroo_ops, px); 4071 4072 if (px) 4073 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4074 4075 if (adev->gmc.xgmi.pending_reset) 4076 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4077 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4078 4079 amdgpu_device_check_iommu_direct_map(adev); 4080 4081 return 0; 4082 4083 release_ras_con: 4084 if (amdgpu_sriov_vf(adev)) 4085 amdgpu_virt_release_full_gpu(adev, true); 4086 4087 /* failed in exclusive mode due to timeout */ 4088 if (amdgpu_sriov_vf(adev) && 4089 !amdgpu_sriov_runtime(adev) && 4090 amdgpu_virt_mmio_blocked(adev) && 4091 !amdgpu_virt_wait_reset(adev)) { 4092 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4093 /* Don't send request since VF is inactive. 
*/ 4094 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4095 adev->virt.ops = NULL; 4096 r = -EAGAIN; 4097 } 4098 amdgpu_release_ras_context(adev); 4099 4100 failed: 4101 amdgpu_vf_error_trans_all(adev); 4102 4103 return r; 4104 } 4105 4106 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4107 { 4108 STUB(); 4109 #ifdef notyet 4110 /* Clear all CPU mappings pointing to this device */ 4111 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4112 #endif 4113 4114 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4115 amdgpu_device_doorbell_fini(adev); 4116 4117 #ifdef __linux__ 4118 iounmap(adev->rmmio); 4119 adev->rmmio = NULL; 4120 if (adev->mman.aper_base_kaddr) 4121 iounmap(adev->mman.aper_base_kaddr); 4122 adev->mman.aper_base_kaddr = NULL; 4123 #else 4124 if (adev->rmmio_size > 0) 4125 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4126 adev->rmmio_size); 4127 adev->rmmio_size = 0; 4128 adev->rmmio = NULL; 4129 if (adev->mman.aper_base_kaddr) 4130 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4131 adev->gmc.visible_vram_size); 4132 adev->mman.aper_base_kaddr = NULL; 4133 #endif 4134 4135 /* Memory manager related */ 4136 if (!adev->gmc.xgmi.connected_to_cpu) { 4137 #ifdef __linux__ 4138 arch_phys_wc_del(adev->gmc.vram_mtrr); 4139 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4140 #else 4141 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4142 #endif 4143 } 4144 } 4145 4146 /** 4147 * amdgpu_device_fini_hw - tear down the driver 4148 * 4149 * @adev: amdgpu_device pointer 4150 * 4151 * Tear down the driver info (all asics). 4152 * Called at driver shutdown. 4153 */ 4154 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4155 { 4156 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4157 flush_delayed_work(&adev->delayed_init_work); 4158 adev->shutdown = true; 4159 4160 /* make sure IB test finished before entering exclusive mode 4161 * to avoid preemption on IB test 4162 * */ 4163 if (amdgpu_sriov_vf(adev)) { 4164 amdgpu_virt_request_full_gpu(adev, false); 4165 amdgpu_virt_fini_data_exchange(adev); 4166 } 4167 4168 /* disable all interrupts */ 4169 amdgpu_irq_disable_all(adev); 4170 if (adev->mode_info.mode_config_initialized){ 4171 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4172 drm_helper_force_disable_all(adev_to_drm(adev)); 4173 else 4174 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4175 } 4176 amdgpu_fence_driver_hw_fini(adev); 4177 4178 if (adev->mman.initialized) { 4179 flush_delayed_work(&adev->mman.bdev.wq); 4180 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4181 } 4182 4183 if (adev->pm_sysfs_en) 4184 amdgpu_pm_sysfs_fini(adev); 4185 if (adev->ucode_sysfs_en) 4186 amdgpu_ucode_sysfs_fini(adev); 4187 if (adev->psp_sysfs_en) 4188 amdgpu_psp_sysfs_fini(adev); 4189 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4190 4191 /* disable ras feature must before hw fini */ 4192 amdgpu_ras_pre_fini(adev); 4193 4194 amdgpu_device_ip_fini_early(adev); 4195 4196 amdgpu_irq_fini_hw(adev); 4197 4198 if (adev->mman.initialized) 4199 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4200 4201 amdgpu_gart_dummy_page_fini(adev); 4202 4203 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4204 amdgpu_device_unmap_mmio(adev); 4205 4206 } 4207 4208 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4209 { 4210 int idx; 4211 bool px; 4212 4213 amdgpu_fence_driver_sw_fini(adev); 4214 amdgpu_device_ip_fini(adev); 4215 release_firmware(adev->firmware.gpu_info_fw); 4216 
adev->firmware.gpu_info_fw = NULL; 4217 adev->accel_working = false; 4218 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4219 4220 amdgpu_reset_fini(adev); 4221 4222 /* free i2c buses */ 4223 if (!amdgpu_device_has_dc_support(adev)) 4224 amdgpu_i2c_fini(adev); 4225 4226 if (amdgpu_emu_mode != 1) 4227 amdgpu_atombios_fini(adev); 4228 4229 kfree(adev->bios); 4230 adev->bios = NULL; 4231 4232 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4233 4234 if (px || (!dev_is_removable(&adev->pdev->dev) && 4235 apple_gmux_detect(NULL, NULL))) 4236 vga_switcheroo_unregister_client(adev->pdev); 4237 4238 if (px) 4239 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4240 4241 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4242 vga_client_unregister(adev->pdev); 4243 4244 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4245 #ifdef __linux__ 4246 iounmap(adev->rmmio); 4247 adev->rmmio = NULL; 4248 #else 4249 if (adev->rmmio_size > 0) 4250 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4251 adev->rmmio_size); 4252 adev->rmmio_size = 0; 4253 adev->rmmio = NULL; 4254 #endif 4255 amdgpu_device_doorbell_fini(adev); 4256 drm_dev_exit(idx); 4257 } 4258 4259 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4260 amdgpu_pmu_fini(adev); 4261 if (adev->mman.discovery_bin) 4262 amdgpu_discovery_fini(adev); 4263 4264 amdgpu_reset_put_reset_domain(adev->reset_domain); 4265 adev->reset_domain = NULL; 4266 4267 kfree(adev->pci_state); 4268 4269 } 4270 4271 /** 4272 * amdgpu_device_evict_resources - evict device resources 4273 * @adev: amdgpu device object 4274 * 4275 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4276 * of the vram memory type. Mainly used for evicting device resources 4277 * at suspend time. 4278 * 4279 */ 4280 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4281 { 4282 int ret; 4283 4284 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4285 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4286 return 0; 4287 4288 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4289 if (ret) 4290 DRM_WARN("evicting device resources failed\n"); 4291 return ret; 4292 } 4293 4294 /* 4295 * Suspend & resume. 4296 */ 4297 /** 4298 * amdgpu_device_suspend - initiate device suspend 4299 * 4300 * @dev: drm dev pointer 4301 * @fbcon : notify the fbdev of suspend 4302 * 4303 * Puts the hw in the suspend state (all asics). 4304 * Returns 0 for success or an error on failure. 4305 * Called at driver suspend. 
4306 */ 4307 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4308 { 4309 struct amdgpu_device *adev = drm_to_adev(dev); 4310 int r = 0; 4311 4312 if (adev->shutdown) 4313 return 0; 4314 4315 #ifdef notyet 4316 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4317 return 0; 4318 #endif 4319 4320 adev->in_suspend = true; 4321 4322 if (amdgpu_sriov_vf(adev)) { 4323 amdgpu_virt_fini_data_exchange(adev); 4324 r = amdgpu_virt_request_full_gpu(adev, false); 4325 if (r) 4326 return r; 4327 } 4328 4329 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4330 DRM_WARN("smart shift update failed\n"); 4331 4332 drm_kms_helper_poll_disable(dev); 4333 4334 if (fbcon) 4335 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4336 4337 cancel_delayed_work_sync(&adev->delayed_init_work); 4338 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4339 4340 amdgpu_ras_suspend(adev); 4341 4342 amdgpu_device_ip_suspend_phase1(adev); 4343 4344 if (!adev->in_s0ix) 4345 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4346 4347 r = amdgpu_device_evict_resources(adev); 4348 if (r) 4349 return r; 4350 4351 amdgpu_fence_driver_hw_fini(adev); 4352 4353 amdgpu_device_ip_suspend_phase2(adev); 4354 4355 if (amdgpu_sriov_vf(adev)) 4356 amdgpu_virt_release_full_gpu(adev, false); 4357 4358 return 0; 4359 } 4360 4361 /** 4362 * amdgpu_device_resume - initiate device resume 4363 * 4364 * @dev: drm dev pointer 4365 * @fbcon : notify the fbdev of resume 4366 * 4367 * Bring the hw back to operating state (all asics). 4368 * Returns 0 for success or an error on failure. 4369 * Called at driver resume. 4370 */ 4371 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4372 { 4373 struct amdgpu_device *adev = drm_to_adev(dev); 4374 int r = 0; 4375 4376 if (amdgpu_sriov_vf(adev)) { 4377 r = amdgpu_virt_request_full_gpu(adev, true); 4378 if (r) 4379 return r; 4380 } 4381 4382 #ifdef notyet 4383 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4384 return 0; 4385 #endif 4386 4387 if (adev->in_s0ix) 4388 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4389 4390 /* post card */ 4391 if (amdgpu_device_need_post(adev)) { 4392 r = amdgpu_device_asic_init(adev); 4393 if (r) 4394 dev_err(adev->dev, "amdgpu asic init failed\n"); 4395 } 4396 4397 r = amdgpu_device_ip_resume(adev); 4398 4399 /* no matter what r is, always need to properly release full GPU */ 4400 if (amdgpu_sriov_vf(adev)) { 4401 amdgpu_virt_init_data_exchange(adev); 4402 amdgpu_virt_release_full_gpu(adev, true); 4403 } 4404 4405 if (r) { 4406 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4407 return r; 4408 } 4409 amdgpu_fence_driver_hw_init(adev); 4410 4411 r = amdgpu_device_ip_late_init(adev); 4412 if (r) 4413 return r; 4414 4415 queue_delayed_work(system_wq, &adev->delayed_init_work, 4416 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4417 4418 if (!adev->in_s0ix) { 4419 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4420 if (r) 4421 return r; 4422 } 4423 4424 /* Make sure IB tests flushed */ 4425 flush_delayed_work(&adev->delayed_init_work); 4426 4427 if (adev->in_s0ix) { 4428 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4429 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 
4430 */ 4431 amdgpu_gfx_off_ctrl(adev, true); 4432 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4433 } 4434 if (fbcon) 4435 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4436 4437 drm_kms_helper_poll_enable(dev); 4438 4439 amdgpu_ras_resume(adev); 4440 4441 /* 4442 * Most of the connector probing functions try to acquire runtime pm 4443 * refs to ensure that the GPU is powered on when connector polling is 4444 * performed. Since we're calling this from a runtime PM callback, 4445 * trying to acquire rpm refs will cause us to deadlock. 4446 * 4447 * Since we're guaranteed to be holding the rpm lock, it's safe to 4448 * temporarily disable the rpm helpers so this doesn't deadlock us. 4449 */ 4450 #if defined(CONFIG_PM) && defined(__linux__) 4451 dev->dev->power.disable_depth++; 4452 #endif 4453 if (!amdgpu_device_has_dc_support(adev)) 4454 drm_helper_hpd_irq_event(dev); 4455 else 4456 drm_kms_helper_hotplug_event(dev); 4457 #if defined(CONFIG_PM) && defined(__linux__) 4458 dev->dev->power.disable_depth--; 4459 #endif 4460 adev->in_suspend = false; 4461 4462 if (adev->enable_mes) 4463 amdgpu_mes_self_test(adev); 4464 4465 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4466 DRM_WARN("smart shift update failed\n"); 4467 4468 return 0; 4469 } 4470 4471 /** 4472 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4473 * 4474 * @adev: amdgpu_device pointer 4475 * 4476 * The list of all the hardware IPs that make up the asic is walked and 4477 * the check_soft_reset callbacks are run. check_soft_reset determines 4478 * if the asic is still hung or not. 4479 * Returns true if any of the IPs are still in a hung state, false if not. 4480 */ 4481 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4482 { 4483 int i; 4484 bool asic_hang = false; 4485 4486 if (amdgpu_sriov_vf(adev)) 4487 return true; 4488 4489 if (amdgpu_asic_need_full_reset(adev)) 4490 return true; 4491 4492 for (i = 0; i < adev->num_ip_blocks; i++) { 4493 if (!adev->ip_blocks[i].status.valid) 4494 continue; 4495 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4496 adev->ip_blocks[i].status.hang = 4497 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4498 if (adev->ip_blocks[i].status.hang) { 4499 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4500 asic_hang = true; 4501 } 4502 } 4503 return asic_hang; 4504 } 4505 4506 /** 4507 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4508 * 4509 * @adev: amdgpu_device pointer 4510 * 4511 * The list of all the hardware IPs that make up the asic is walked and the 4512 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4513 * handles any IP specific hardware or software state changes that are 4514 * necessary for a soft reset to succeed. 4515 * Returns 0 on success, negative error code on failure. 
4516 */ 4517 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4518 { 4519 int i, r = 0; 4520 4521 for (i = 0; i < adev->num_ip_blocks; i++) { 4522 if (!adev->ip_blocks[i].status.valid) 4523 continue; 4524 if (adev->ip_blocks[i].status.hang && 4525 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4526 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4527 if (r) 4528 return r; 4529 } 4530 } 4531 4532 return 0; 4533 } 4534 4535 /** 4536 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4537 * 4538 * @adev: amdgpu_device pointer 4539 * 4540 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4541 * reset is necessary to recover. 4542 * Returns true if a full asic reset is required, false if not. 4543 */ 4544 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4545 { 4546 int i; 4547 4548 if (amdgpu_asic_need_full_reset(adev)) 4549 return true; 4550 4551 for (i = 0; i < adev->num_ip_blocks; i++) { 4552 if (!adev->ip_blocks[i].status.valid) 4553 continue; 4554 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4555 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4556 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4557 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4558 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4559 if (adev->ip_blocks[i].status.hang) { 4560 dev_info(adev->dev, "Some block need full reset!\n"); 4561 return true; 4562 } 4563 } 4564 } 4565 return false; 4566 } 4567 4568 /** 4569 * amdgpu_device_ip_soft_reset - do a soft reset 4570 * 4571 * @adev: amdgpu_device pointer 4572 * 4573 * The list of all the hardware IPs that make up the asic is walked and the 4574 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4575 * IP specific hardware or software state changes that are necessary to soft 4576 * reset the IP. 4577 * Returns 0 on success, negative error code on failure. 4578 */ 4579 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4580 { 4581 int i, r = 0; 4582 4583 for (i = 0; i < adev->num_ip_blocks; i++) { 4584 if (!adev->ip_blocks[i].status.valid) 4585 continue; 4586 if (adev->ip_blocks[i].status.hang && 4587 adev->ip_blocks[i].version->funcs->soft_reset) { 4588 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4589 if (r) 4590 return r; 4591 } 4592 } 4593 4594 return 0; 4595 } 4596 4597 /** 4598 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4599 * 4600 * @adev: amdgpu_device pointer 4601 * 4602 * The list of all the hardware IPs that make up the asic is walked and the 4603 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4604 * handles any IP specific hardware or software state changes that are 4605 * necessary after the IP has been soft reset. 4606 * Returns 0 on success, negative error code on failure. 
4607 */ 4608 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4609 { 4610 int i, r = 0; 4611 4612 for (i = 0; i < adev->num_ip_blocks; i++) { 4613 if (!adev->ip_blocks[i].status.valid) 4614 continue; 4615 if (adev->ip_blocks[i].status.hang && 4616 adev->ip_blocks[i].version->funcs->post_soft_reset) 4617 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4618 if (r) 4619 return r; 4620 } 4621 4622 return 0; 4623 } 4624 4625 /** 4626 * amdgpu_device_recover_vram - Recover some VRAM contents 4627 * 4628 * @adev: amdgpu_device pointer 4629 * 4630 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4631 * restore things like GPUVM page tables after a GPU reset where 4632 * the contents of VRAM might be lost. 4633 * 4634 * Returns: 4635 * 0 on success, negative error code on failure. 4636 */ 4637 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4638 { 4639 struct dma_fence *fence = NULL, *next = NULL; 4640 struct amdgpu_bo *shadow; 4641 struct amdgpu_bo_vm *vmbo; 4642 long r = 1, tmo; 4643 4644 if (amdgpu_sriov_runtime(adev)) 4645 tmo = msecs_to_jiffies(8000); 4646 else 4647 tmo = msecs_to_jiffies(100); 4648 4649 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4650 mutex_lock(&adev->shadow_list_lock); 4651 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4652 /* If vm is compute context or adev is APU, shadow will be NULL */ 4653 if (!vmbo->shadow) 4654 continue; 4655 shadow = vmbo->shadow; 4656 4657 /* No need to recover an evicted BO */ 4658 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4659 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4660 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4661 continue; 4662 4663 r = amdgpu_bo_restore_shadow(shadow, &next); 4664 if (r) 4665 break; 4666 4667 if (fence) { 4668 tmo = dma_fence_wait_timeout(fence, false, tmo); 4669 dma_fence_put(fence); 4670 fence = next; 4671 if (tmo == 0) { 4672 r = -ETIMEDOUT; 4673 break; 4674 } else if (tmo < 0) { 4675 r = tmo; 4676 break; 4677 } 4678 } else { 4679 fence = next; 4680 } 4681 } 4682 mutex_unlock(&adev->shadow_list_lock); 4683 4684 if (fence) 4685 tmo = dma_fence_wait_timeout(fence, false, tmo); 4686 dma_fence_put(fence); 4687 4688 if (r < 0 || tmo <= 0) { 4689 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4690 return -EIO; 4691 } 4692 4693 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4694 return 0; 4695 } 4696 4697 4698 /** 4699 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4700 * 4701 * @adev: amdgpu_device pointer 4702 * @from_hypervisor: request from hypervisor 4703 * 4704 * do VF FLR and reinitialize Asic 4705 * return 0 means succeeded otherwise failed 4706 */ 4707 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4708 bool from_hypervisor) 4709 { 4710 int r; 4711 struct amdgpu_hive_info *hive = NULL; 4712 int retry_limit = 0; 4713 4714 retry: 4715 amdgpu_amdkfd_pre_reset(adev); 4716 4717 if (from_hypervisor) 4718 r = amdgpu_virt_request_full_gpu(adev, true); 4719 else 4720 r = amdgpu_virt_reset_gpu(adev); 4721 if (r) 4722 return r; 4723 4724 /* Resume IP prior to SMC */ 4725 r = amdgpu_device_ip_reinit_early_sriov(adev); 4726 if (r) 4727 goto error; 4728 4729 amdgpu_virt_init_data_exchange(adev); 4730 4731 r = amdgpu_device_fw_loading(adev); 4732 if (r) 4733 return r; 4734 4735 /* now we are okay to resume SMC/CP/SDMA */ 4736 r = amdgpu_device_ip_reinit_late_sriov(adev); 4737 if (r) 4738 goto error; 4739 
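/* look up the XGMI hive, if any; the reference taken here is dropped again once the topology update below is done */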
4740 hive = amdgpu_get_xgmi_hive(adev); 4741 /* Update PSP FW topology after reset */ 4742 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4743 r = amdgpu_xgmi_update_topology(hive, adev); 4744 4745 if (hive) 4746 amdgpu_put_xgmi_hive(hive); 4747 4748 if (!r) { 4749 amdgpu_irq_gpu_reset_resume_helper(adev); 4750 r = amdgpu_ib_ring_tests(adev); 4751 4752 amdgpu_amdkfd_post_reset(adev); 4753 } 4754 4755 error: 4756 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4757 amdgpu_inc_vram_lost(adev); 4758 r = amdgpu_device_recover_vram(adev); 4759 } 4760 amdgpu_virt_release_full_gpu(adev, true); 4761 4762 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4763 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4764 retry_limit++; 4765 goto retry; 4766 } else 4767 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4768 } 4769 4770 return r; 4771 } 4772 4773 /** 4774 * amdgpu_device_has_job_running - check if there is any job in mirror list 4775 * 4776 * @adev: amdgpu_device pointer 4777 * 4778 * check if there is any job in mirror list 4779 */ 4780 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4781 { 4782 int i; 4783 struct drm_sched_job *job; 4784 4785 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4786 struct amdgpu_ring *ring = adev->rings[i]; 4787 4788 if (!ring || !ring->sched.thread) 4789 continue; 4790 4791 spin_lock(&ring->sched.job_list_lock); 4792 job = list_first_entry_or_null(&ring->sched.pending_list, 4793 struct drm_sched_job, list); 4794 spin_unlock(&ring->sched.job_list_lock); 4795 if (job) 4796 return true; 4797 } 4798 return false; 4799 } 4800 4801 /** 4802 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4803 * 4804 * @adev: amdgpu_device pointer 4805 * 4806 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4807 * a hung GPU. 
4808 */ 4809 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4810 { 4811 4812 if (amdgpu_gpu_recovery == 0) 4813 goto disabled; 4814 4815 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4816 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4817 return false; 4818 } 4819 4820 if (amdgpu_sriov_vf(adev)) 4821 return true; 4822 4823 if (amdgpu_gpu_recovery == -1) { 4824 switch (adev->asic_type) { 4825 #ifdef CONFIG_DRM_AMDGPU_SI 4826 case CHIP_VERDE: 4827 case CHIP_TAHITI: 4828 case CHIP_PITCAIRN: 4829 case CHIP_OLAND: 4830 case CHIP_HAINAN: 4831 #endif 4832 #ifdef CONFIG_DRM_AMDGPU_CIK 4833 case CHIP_KAVERI: 4834 case CHIP_KABINI: 4835 case CHIP_MULLINS: 4836 #endif 4837 case CHIP_CARRIZO: 4838 case CHIP_STONEY: 4839 case CHIP_CYAN_SKILLFISH: 4840 goto disabled; 4841 default: 4842 break; 4843 } 4844 } 4845 4846 return true; 4847 4848 disabled: 4849 dev_info(adev->dev, "GPU recovery disabled.\n"); 4850 return false; 4851 } 4852 4853 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4854 { 4855 u32 i; 4856 int ret = 0; 4857 4858 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4859 4860 dev_info(adev->dev, "GPU mode1 reset\n"); 4861 4862 /* disable BM */ 4863 pci_clear_master(adev->pdev); 4864 4865 amdgpu_device_cache_pci_state(adev->pdev); 4866 4867 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4868 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4869 ret = amdgpu_dpm_mode1_reset(adev); 4870 } else { 4871 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4872 ret = psp_gpu_reset(adev); 4873 } 4874 4875 if (ret) 4876 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4877 4878 amdgpu_device_load_pci_state(adev->pdev); 4879 4880 /* wait for asic to come out of reset */ 4881 for (i = 0; i < adev->usec_timeout; i++) { 4882 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4883 4884 if (memsize != 0xffffffff) 4885 break; 4886 udelay(1); 4887 } 4888 4889 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4890 return ret; 4891 } 4892 4893 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4894 struct amdgpu_reset_context *reset_context) 4895 { 4896 int i, r = 0; 4897 struct amdgpu_job *job = NULL; 4898 bool need_full_reset = 4899 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4900 4901 if (reset_context->reset_req_dev == adev) 4902 job = reset_context->job; 4903 4904 if (amdgpu_sriov_vf(adev)) { 4905 /* stop the data exchange thread */ 4906 amdgpu_virt_fini_data_exchange(adev); 4907 } 4908 4909 amdgpu_fence_driver_isr_toggle(adev, true); 4910 4911 /* block all schedulers and reset given job's ring */ 4912 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4913 struct amdgpu_ring *ring = adev->rings[i]; 4914 4915 if (!ring || !ring->sched.thread) 4916 continue; 4917 4918 /*clear job fence from fence drv to avoid force_completion 4919 *leave NULL and vm flush fence in fence drv */ 4920 amdgpu_fence_driver_clear_job_fences(ring); 4921 4922 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4923 amdgpu_fence_driver_force_completion(ring); 4924 } 4925 4926 amdgpu_fence_driver_isr_toggle(adev, false); 4927 4928 if (job && job->vm) 4929 drm_sched_increase_karma(&job->base); 4930 4931 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4932 /* If reset handler not implemented, continue; otherwise return */ 4933 if (r == -ENOSYS) 4934 r = 0; 4935 else 4936 return r; 4937 4938 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4939 if (!amdgpu_sriov_vf(adev)) { 4940 4941 if (!need_full_reset) 4942 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4943 4944 if (!need_full_reset && amdgpu_gpu_recovery) { 4945 amdgpu_device_ip_pre_soft_reset(adev); 4946 r = amdgpu_device_ip_soft_reset(adev); 4947 amdgpu_device_ip_post_soft_reset(adev); 4948 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4949 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4950 need_full_reset = true; 4951 } 4952 } 4953 4954 if (need_full_reset) 4955 r = amdgpu_device_ip_suspend(adev); 4956 if (need_full_reset) 4957 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4958 else 4959 clear_bit(AMDGPU_NEED_FULL_RESET, 4960 &reset_context->flags); 4961 } 4962 4963 return r; 4964 } 4965 4966 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4967 { 4968 int i; 4969 4970 lockdep_assert_held(&adev->reset_domain->sem); 4971 4972 for (i = 0; i < adev->num_regs; i++) { 4973 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4974 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4975 adev->reset_dump_reg_value[i]); 4976 } 4977 4978 return 0; 4979 } 4980 4981 #ifdef CONFIG_DEV_COREDUMP 4982 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4983 size_t count, void *data, size_t datalen) 4984 { 4985 struct drm_printer p; 4986 struct amdgpu_device *adev = data; 4987 struct drm_print_iterator iter; 4988 int i; 4989 4990 iter.data = buffer; 4991 iter.offset = 0; 4992 iter.start = offset; 4993 iter.remain = count; 4994 4995 p = drm_coredump_printer(&iter); 4996 4997 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4998 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4999 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 5000 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 5001 if (adev->reset_task_info.pid) 5002 drm_printf(&p, "process_name: %s PID: %d\n", 5003 adev->reset_task_info.process_name, 5004 adev->reset_task_info.pid); 5005 5006 if (adev->reset_vram_lost) 5007 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 5008 if (adev->num_regs) { 5009 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 5010 5011 for (i = 0; i < adev->num_regs; i++) 5012 drm_printf(&p, "0x%08x: 0x%08x\n", 5013 adev->reset_dump_reg_list[i], 5014 adev->reset_dump_reg_value[i]); 5015 } 5016 5017 return count - iter.remain; 5018 } 5019 5020 static void amdgpu_devcoredump_free(void *data) 5021 { 5022 } 5023 5024 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 5025 { 5026 struct drm_device *dev = adev_to_drm(adev); 5027 5028 ktime_get_ts64(&adev->reset_time); 5029 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 5030 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5031 } 5032 #endif 5033 5034 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5035 struct amdgpu_reset_context *reset_context) 5036 { 5037 struct amdgpu_device *tmp_adev = NULL; 5038 bool need_full_reset, skip_hw_reset, vram_lost = false; 5039 int r = 0; 5040 bool gpu_reset_for_dev_remove = 0; 5041 5042 /* Try reset handler method first */ 5043 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5044 reset_list); 5045 amdgpu_reset_reg_dumps(tmp_adev); 5046 5047 reset_context->reset_device_list = device_list_handle; 5048 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5049 /* If reset handler not implemented, continue; otherwise return */ 5050 if (r == -ENOSYS) 5051 r = 0; 5052 else 5053 return r; 5054 5055 /* Reset handler not implemented, use the default method */ 5056 
need_full_reset = 5057 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5058 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5059 5060 gpu_reset_for_dev_remove = 5061 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5062 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5063 5064 /* 5065 * ASIC reset has to be done on all XGMI hive nodes ASAP 5066 * to allow proper links negotiation in FW (within 1 sec) 5067 */ 5068 if (!skip_hw_reset && need_full_reset) { 5069 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5070 /* For XGMI run all resets in parallel to speed up the process */ 5071 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5072 tmp_adev->gmc.xgmi.pending_reset = false; 5073 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5074 r = -EALREADY; 5075 } else 5076 r = amdgpu_asic_reset(tmp_adev); 5077 5078 if (r) { 5079 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5080 r, adev_to_drm(tmp_adev)->unique); 5081 break; 5082 } 5083 } 5084 5085 /* For XGMI wait for all resets to complete before proceed */ 5086 if (!r) { 5087 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5088 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5089 flush_work(&tmp_adev->xgmi_reset_work); 5090 r = tmp_adev->asic_reset_res; 5091 if (r) 5092 break; 5093 } 5094 } 5095 } 5096 } 5097 5098 if (!r && amdgpu_ras_intr_triggered()) { 5099 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5100 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5101 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5102 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5103 } 5104 5105 amdgpu_ras_intr_cleared(); 5106 } 5107 5108 /* Since the mode1 reset affects base ip blocks, the 5109 * phase1 ip blocks need to be resumed. Otherwise there 5110 * will be a BIOS signature error and the psp bootloader 5111 * can't load kdb on the next amdgpu install. 5112 */ 5113 if (gpu_reset_for_dev_remove) { 5114 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5115 amdgpu_device_ip_resume_phase1(tmp_adev); 5116 5117 goto end; 5118 } 5119 5120 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5121 if (need_full_reset) { 5122 /* post card */ 5123 r = amdgpu_device_asic_init(tmp_adev); 5124 if (r) { 5125 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5126 } else { 5127 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5128 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5129 if (r) 5130 goto out; 5131 5132 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5133 if (r) 5134 goto out; 5135 5136 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5137 #ifdef CONFIG_DEV_COREDUMP 5138 tmp_adev->reset_vram_lost = vram_lost; 5139 memset(&tmp_adev->reset_task_info, 0, 5140 sizeof(tmp_adev->reset_task_info)); 5141 if (reset_context->job && reset_context->job->vm) 5142 tmp_adev->reset_task_info = 5143 reset_context->job->vm->task_info; 5144 amdgpu_reset_capture_coredumpm(tmp_adev); 5145 #endif 5146 if (vram_lost) { 5147 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5148 amdgpu_inc_vram_lost(tmp_adev); 5149 } 5150 5151 r = amdgpu_device_fw_loading(tmp_adev); 5152 if (r) 5153 return r; 5154 5155 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5156 if (r) 5157 goto out; 5158 5159 if (vram_lost) 5160 amdgpu_device_fill_reset_magic(tmp_adev); 5161 5162 /* 5163 * Add this ASIC as tracked as reset was already 5164 * complete successfully. 
5165 */ 5166 amdgpu_register_gpu_instance(tmp_adev); 5167 5168 if (!reset_context->hive && 5169 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5170 amdgpu_xgmi_add_device(tmp_adev); 5171 5172 r = amdgpu_device_ip_late_init(tmp_adev); 5173 if (r) 5174 goto out; 5175 5176 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5177 5178 /* 5179 * The GPU enters a bad state once the number of faulty 5180 * pages detected by ECC reaches the threshold, and RAS 5181 * recovery is scheduled next. So add one check here to 5182 * break recovery if the bad page threshold is indeed 5183 * exceeded, and remind the user to retire this GPU or 5184 * set a bigger bad_page_threshold value to fix this 5185 * the next time the driver is probed. 5186 5187 */ 5188 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5189 /* must succeed. */ 5190 amdgpu_ras_resume(tmp_adev); 5191 } else { 5192 r = -EINVAL; 5193 goto out; 5194 } 5195 5196 /* Update PSP FW topology after reset */ 5197 if (reset_context->hive && 5198 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5199 r = amdgpu_xgmi_update_topology( 5200 reset_context->hive, tmp_adev); 5201 } 5202 } 5203 5204 out: 5205 if (!r) { 5206 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5207 r = amdgpu_ib_ring_tests(tmp_adev); 5208 if (r) { 5209 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5210 need_full_reset = true; 5211 r = -EAGAIN; 5212 goto end; 5213 } 5214 } 5215 5216 if (!r) 5217 r = amdgpu_device_recover_vram(tmp_adev); 5218 else 5219 tmp_adev->asic_reset_res = r; 5220 5221 } 5222 5223 end: 5224 if (need_full_reset) 5225 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5226 else 5227 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5228 return r; 5229 } 5230 5231 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5232 { 5233 5234 switch (amdgpu_asic_reset_method(adev)) { 5235 case AMD_RESET_METHOD_MODE1: 5236 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5237 break; 5238 case AMD_RESET_METHOD_MODE2: 5239 adev->mp1_state = PP_MP1_STATE_RESET; 5240 break; 5241 default: 5242 adev->mp1_state = PP_MP1_STATE_NONE; 5243 break; 5244 } 5245 5246 } 5247 5248 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5249 { 5250 amdgpu_vf_error_trans_all(adev); 5251 adev->mp1_state = PP_MP1_STATE_NONE; 5252 } 5253 5254 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5255 { 5256 STUB(); 5257 #ifdef notyet 5258 struct pci_dev *p = NULL; 5259 5260 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5261 adev->pdev->bus->number, 1); 5262 if (p) { 5263 pm_runtime_enable(&(p->dev)); 5264 pm_runtime_resume(&(p->dev)); 5265 } 5266 #endif 5267 } 5268 5269 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5270 { 5271 enum amd_reset_method reset_method; 5272 struct pci_dev *p = NULL; 5273 u64 expires; 5274 5275 /* 5276 * For now, only BACO and mode1 reset are confirmed 5277 * to suffer the audio issue if the codec is not properly suspended.
5278 */ 5279 reset_method = amdgpu_asic_reset_method(adev); 5280 if ((reset_method != AMD_RESET_METHOD_BACO) && 5281 (reset_method != AMD_RESET_METHOD_MODE1)) 5282 return -EINVAL; 5283 5284 STUB(); 5285 return -ENOSYS; 5286 #ifdef notyet 5287 5288 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5289 adev->pdev->bus->number, 1); 5290 if (!p) 5291 return -ENODEV; 5292 5293 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5294 if (!expires) 5295 /* 5296 * If we cannot get the audio device autosuspend delay, 5297 * a fixed 4S interval will be used. Since 3S is the 5298 * audio controller's default autosuspend delay setting, 5299 * the 4S used here is guaranteed to cover it. 5300 */ 5301 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5302 5303 while (!pm_runtime_status_suspended(&(p->dev))) { 5304 if (!pm_runtime_suspend(&(p->dev))) 5305 break; 5306 5307 if (expires < ktime_get_mono_fast_ns()) { 5308 dev_warn(adev->dev, "failed to suspend display audio\n"); 5309 pci_dev_put(p); 5310 /* TODO: abort the succeeding gpu reset? */ 5311 return -ETIMEDOUT; 5312 } 5313 } 5314 5315 pm_runtime_disable(&(p->dev)); 5316 5317 pci_dev_put(p); 5318 return 0; 5319 #endif 5320 } 5321 5322 static void amdgpu_device_recheck_guilty_jobs( 5323 struct amdgpu_device *adev, struct list_head *device_list_handle, 5324 struct amdgpu_reset_context *reset_context) 5325 { 5326 int i, r = 0; 5327 5328 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5329 struct amdgpu_ring *ring = adev->rings[i]; 5330 int ret = 0; 5331 struct drm_sched_job *s_job; 5332 5333 if (!ring || !ring->sched.thread) 5334 continue; 5335 5336 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5337 struct drm_sched_job, list); 5338 if (s_job == NULL) 5339 continue; 5340 5341 /* clear the job's guilty flag and rely on the following step to decide the real one */ 5342 drm_sched_reset_karma(s_job); 5343 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5344 5345 if (!s_job->s_fence->parent) { 5346 DRM_WARN("Failed to get a HW fence for job!"); 5347 continue; 5348 } 5349 5350 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5351 if (ret == 0) { /* timeout */ 5352 DRM_ERROR("Found the real bad job!
ring:%s, job_id:%llx\n", 5353 ring->sched.name, s_job->id); 5354 5355 5356 amdgpu_fence_driver_isr_toggle(adev, true); 5357 5358 /* Clear this failed job from fence array */ 5359 amdgpu_fence_driver_clear_job_fences(ring); 5360 5361 amdgpu_fence_driver_isr_toggle(adev, false); 5362 5363 /* Since the job won't signal and we go for 5364 * another resubmit drop this parent pointer 5365 */ 5366 dma_fence_put(s_job->s_fence->parent); 5367 s_job->s_fence->parent = NULL; 5368 5369 /* set guilty */ 5370 drm_sched_increase_karma(s_job); 5371 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5372 retry: 5373 /* do hw reset */ 5374 if (amdgpu_sriov_vf(adev)) { 5375 amdgpu_virt_fini_data_exchange(adev); 5376 r = amdgpu_device_reset_sriov(adev, false); 5377 if (r) 5378 adev->asic_reset_res = r; 5379 } else { 5380 clear_bit(AMDGPU_SKIP_HW_RESET, 5381 &reset_context->flags); 5382 r = amdgpu_do_asic_reset(device_list_handle, 5383 reset_context); 5384 if (r && r == -EAGAIN) 5385 goto retry; 5386 } 5387 5388 /* 5389 * add reset counter so that the following 5390 * resubmitted job could flush vmid 5391 */ 5392 atomic_inc(&adev->gpu_reset_counter); 5393 continue; 5394 } 5395 5396 /* got the hw fence, signal finished fence */ 5397 atomic_dec(ring->sched.score); 5398 dma_fence_get(&s_job->s_fence->finished); 5399 dma_fence_signal(&s_job->s_fence->finished); 5400 dma_fence_put(&s_job->s_fence->finished); 5401 5402 /* remove node from list and free the job */ 5403 spin_lock(&ring->sched.job_list_lock); 5404 list_del_init(&s_job->list); 5405 spin_unlock(&ring->sched.job_list_lock); 5406 ring->sched.ops->free_job(s_job); 5407 } 5408 } 5409 5410 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5411 { 5412 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5413 5414 #if defined(CONFIG_DEBUG_FS) 5415 if (!amdgpu_sriov_vf(adev)) 5416 cancel_work(&adev->reset_work); 5417 #endif 5418 5419 if (adev->kfd.dev) 5420 cancel_work(&adev->kfd.reset_work); 5421 5422 if (amdgpu_sriov_vf(adev)) 5423 cancel_work(&adev->virt.flr_work); 5424 5425 if (con && adev->ras_enabled) 5426 cancel_work(&con->recovery_work); 5427 5428 } 5429 5430 5431 /** 5432 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5433 * 5434 * @adev: amdgpu_device pointer 5435 * @job: which job trigger hang 5436 * 5437 * Attempt to reset the GPU if it has hung (all asics). 5438 * Attempt to do soft-reset or full-reset and reinitialize Asic 5439 * Returns 0 for success or an error on failure. 5440 */ 5441 5442 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5443 struct amdgpu_job *job, 5444 struct amdgpu_reset_context *reset_context) 5445 { 5446 struct list_head device_list, *device_list_handle = NULL; 5447 bool job_signaled = false; 5448 struct amdgpu_hive_info *hive = NULL; 5449 struct amdgpu_device *tmp_adev = NULL; 5450 int i, r = 0; 5451 bool need_emergency_restart = false; 5452 bool audio_suspended = false; 5453 int tmp_vram_lost_counter; 5454 bool gpu_reset_for_dev_remove = false; 5455 5456 gpu_reset_for_dev_remove = 5457 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5458 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5459 5460 /* 5461 * Special case: RAS triggered and full reset isn't supported 5462 */ 5463 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5464 5465 /* 5466 * Flush RAM to disk so that after reboot 5467 * the user can read log and see why the system rebooted. 
5468 */ 5469 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5470 amdgpu_ras_get_context(adev)->reboot) { 5471 DRM_WARN("Emergency reboot."); 5472 5473 #ifdef notyet 5474 ksys_sync_helper(); 5475 emergency_restart(); 5476 #else 5477 panic("emergency_restart"); 5478 #endif 5479 } 5480 5481 dev_info(adev->dev, "GPU %s begin!\n", 5482 need_emergency_restart ? "jobs stop":"reset"); 5483 5484 if (!amdgpu_sriov_vf(adev)) 5485 hive = amdgpu_get_xgmi_hive(adev); 5486 if (hive) 5487 mutex_lock(&hive->hive_lock); 5488 5489 reset_context->job = job; 5490 reset_context->hive = hive; 5491 /* 5492 * Build list of devices to reset. 5493 * In case we are in XGMI hive mode, resort the device list 5494 * to put adev in the 1st position. 5495 */ 5496 INIT_LIST_HEAD(&device_list); 5497 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5498 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5499 list_add_tail(&tmp_adev->reset_list, &device_list); 5500 if (gpu_reset_for_dev_remove && adev->shutdown) 5501 tmp_adev->shutdown = true; 5502 } 5503 if (!list_is_first(&adev->reset_list, &device_list)) 5504 list_rotate_to_front(&adev->reset_list, &device_list); 5505 device_list_handle = &device_list; 5506 } else { 5507 list_add_tail(&adev->reset_list, &device_list); 5508 device_list_handle = &device_list; 5509 } 5510 5511 /* We need to lock reset domain only once both for XGMI and single device */ 5512 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5513 reset_list); 5514 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5515 5516 /* block all schedulers and reset given job's ring */ 5517 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5518 5519 amdgpu_device_set_mp1_state(tmp_adev); 5520 5521 /* 5522 * Try to put the audio codec into suspend state 5523 * before the gpu reset starts. 5524 * 5525 * The power domain of the graphics device is shared 5526 * with the AZ power domain, so without this we may 5527 * change the audio hardware from behind the audio 5528 * driver's back and trigger audio codec errors. 5529 5530 */ 5531 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5532 audio_suspended = true; 5533 5534 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5535 5536 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5537 5538 if (!amdgpu_sriov_vf(tmp_adev)) 5539 amdgpu_amdkfd_pre_reset(tmp_adev); 5540 5541 /* 5542 * Mark the ASICs that are about to be reset as untracked first, 5543 * and add them back after the reset completes. 5544 */ 5545 amdgpu_unregister_gpu_instance(tmp_adev); 5546 5547 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5548 5549 /* disable ras on ALL IPs */ 5550 if (!need_emergency_restart && 5551 amdgpu_device_ip_need_full_reset(tmp_adev)) 5552 amdgpu_ras_suspend(tmp_adev); 5553 5554 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5555 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5556 5557 if (!ring || !ring->sched.thread) 5558 continue; 5559 5560 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5561 5562 if (need_emergency_restart) 5563 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5564 } 5565 atomic_inc(&tmp_adev->gpu_reset_counter); 5566 } 5567 5568 if (need_emergency_restart) 5569 goto skip_sched_resume; 5570 5571 /* 5572 * Must check guilty signal here since after this point all old 5573 * HW fences are force signaled.
5574 * 5575 * job->base holds a reference to parent fence 5576 */ 5577 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5578 job_signaled = true; 5579 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5580 goto skip_hw_reset; 5581 } 5582 5583 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5584 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5585 if (gpu_reset_for_dev_remove) { 5586 /* Workaround for ASICs that need to disable SMC first */ 5587 amdgpu_device_smu_fini_early(tmp_adev); 5588 } 5589 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5590 /* TODO: Should we stop? */ 5591 if (r) { 5592 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5593 r, adev_to_drm(tmp_adev)->unique); 5594 tmp_adev->asic_reset_res = r; 5595 } 5596 5597 /* 5598 * Drop all pending non scheduler resets. Scheduler resets 5599 * were already dropped during drm_sched_stop 5600 */ 5601 amdgpu_device_stop_pending_resets(tmp_adev); 5602 } 5603 5604 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5605 /* Actual ASIC resets if needed. */ 5606 /* Host driver will handle XGMI hive reset for SRIOV */ 5607 if (amdgpu_sriov_vf(adev)) { 5608 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5609 if (r) 5610 adev->asic_reset_res = r; 5611 5612 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5613 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5614 amdgpu_ras_resume(adev); 5615 } else { 5616 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5617 if (r && r == -EAGAIN) 5618 goto retry; 5619 5620 if (!r && gpu_reset_for_dev_remove) 5621 goto recover_end; 5622 } 5623 5624 skip_hw_reset: 5625 5626 /* Post ASIC reset for all devs. */ 5627 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5628 5629 /* 5630 * Sometimes a later bad compute job can block a good gfx job because 5631 * the gfx and compute rings share internal GC HW mutually. We add an 5632 * additional guilty-job recheck step to find the real guilty job: it 5633 * synchronously submits and waits for the first job to be signaled. 5634 * If that times out, we identify it as the real guilty job. 5635 */ 5636 if (amdgpu_gpu_recovery == 2 && 5637 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5638 amdgpu_device_recheck_guilty_jobs( 5639 tmp_adev, device_list_handle, reset_context); 5640 5641 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5642 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5643 5644 if (!ring || !ring->sched.thread) 5645 continue; 5646 5647 /* No point in resubmitting jobs if we didn't HW reset */ 5648 if (!tmp_adev->asic_reset_res && !job_signaled) 5649 drm_sched_resubmit_jobs(&ring->sched); 5650 5651 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5652 } 5653 5654 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5655 amdgpu_mes_self_test(tmp_adev); 5656 5657 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5658 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5659 } 5660 5661 if (tmp_adev->asic_reset_res) 5662 r = tmp_adev->asic_reset_res; 5663 5664 tmp_adev->asic_reset_res = 0; 5665 5666 if (r) { 5667 /* bad news, how to tell it to userspace ?
			 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/*
		 * kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it was not initialized
		 * before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in.  Handles APUs and
 * virtualized environments where PCIe config space may not be available.
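 *
 * As an illustration (assuming an otherwise unconstrained setup): on a
 * Gen4 x16 slot the code below would typically end up with pcie_gen_mask
 * containing the GEN1..GEN4 support bits and pcie_mlw_mask containing the
 * X1..X16 support bits, unless the amdgpu_pcie_gen_cap or
 * amdgpu_pcie_lane_cap module parameters override them.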
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask
				|= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
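 *
 * As a rough guide to the (currently stubbed-out) body below:
 * pci_channel_io_normal maps to PCI_ERS_RESULT_CAN_RECOVER,
 * pci_channel_io_frozen stops all schedulers and returns
 * PCI_ERS_RESULT_NEED_RESET, and pci_channel_io_perm_failure returns
 * PCI_ERS_RESULT_DISCONNECT.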
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external
		 * access to the GPU during PCI error recovery.
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for a regular GPU reset
		 * for the duration of the recovery.
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/*
	 * This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER.  Read/write to the device still
	 * works, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
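 *
 * In outline (see the notyet-guarded body below): restore the saved PCI
 * config space, poll amdgpu_asic_get_config_memsize() until the ASIC
 * responds again, then run a full re-init with AMDGPU_NEED_FULL_RESET and
 * AMDGPU_SKIP_HW_RESET set, presumably because the slot reset has already
 * reset the hardware.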
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
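 *
 * Note (based on the notyet-guarded body below): this only takes action
 * when the channel state recorded by amdgpu_pci_error_detected() was
 * pci_channel_io_frozen; it then restarts the schedulers and releases the
 * MP1 state and the reset domain taken in the detection callback.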
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch
 * it any more.  This helps preserve the error context when an error
 * occurs.  Compared to a simple hang, the system stays stable at least
 * for SSH access, so it should be trivial to inspect the hardware state
 * and see what's going on.  Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space
 *    (IOCTLs, etc.), clears all CPU mappings to the device and disallows
 *    remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
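 *
 * Illustrative caller pattern (a sketch, not a specific in-tree caller):
 *
 *	struct dma_fence *old;
 *
 *	do {
 *		old = amdgpu_device_switch_gang(adev, gang);
 *		if (!old)
 *			break;
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	} while (1);
 *
 * i.e. if the previous gang leader has not signaled yet, wait for it and
 * retry the switch.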
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}