/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86) && defined(__linux__)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
"TOPAZ", 112 "TONGA", 113 "FIJI", 114 "CARRIZO", 115 "STONEY", 116 "POLARIS10", 117 "POLARIS11", 118 "POLARIS12", 119 "VEGAM", 120 "VEGA10", 121 "VEGA12", 122 "VEGA20", 123 "RAVEN", 124 "ARCTURUS", 125 "RENOIR", 126 "ALDEBARAN", 127 "NAVI10", 128 "CYAN_SKILLFISH", 129 "NAVI14", 130 "NAVI12", 131 "SIENNA_CICHLID", 132 "NAVY_FLOUNDER", 133 "VANGOGH", 134 "DIMGREY_CAVEFISH", 135 "BEIGE_GOBY", 136 "YELLOW_CARP", 137 "IP DISCOVERY", 138 "LAST", 139 }; 140 141 /** 142 * DOC: pcie_replay_count 143 * 144 * The amdgpu driver provides a sysfs API for reporting the total number 145 * of PCIe replays (NAKs) 146 * The file pcie_replay_count is used for this and returns the total 147 * number of replays as a sum of the NAKs generated and NAKs received 148 */ 149 150 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 151 struct device_attribute *attr, char *buf) 152 { 153 struct drm_device *ddev = dev_get_drvdata(dev); 154 struct amdgpu_device *adev = drm_to_adev(ddev); 155 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 156 157 return sysfs_emit(buf, "%llu\n", cnt); 158 } 159 160 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 161 amdgpu_device_get_pcie_replay_count, NULL); 162 163 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 164 165 /** 166 * DOC: product_name 167 * 168 * The amdgpu driver provides a sysfs API for reporting the product name 169 * for the device 170 * The file serial_number is used for this and returns the product name 171 * as returned from the FRU. 172 * NOTE: This is only available for certain server cards 173 */ 174 175 static ssize_t amdgpu_device_get_product_name(struct device *dev, 176 struct device_attribute *attr, char *buf) 177 { 178 struct drm_device *ddev = dev_get_drvdata(dev); 179 struct amdgpu_device *adev = drm_to_adev(ddev); 180 181 return sysfs_emit(buf, "%s\n", adev->product_name); 182 } 183 184 static DEVICE_ATTR(product_name, S_IRUGO, 185 amdgpu_device_get_product_name, NULL); 186 187 /** 188 * DOC: product_number 189 * 190 * The amdgpu driver provides a sysfs API for reporting the part number 191 * for the device 192 * The file serial_number is used for this and returns the part number 193 * as returned from the FRU. 194 * NOTE: This is only available for certain server cards 195 */ 196 197 static ssize_t amdgpu_device_get_product_number(struct device *dev, 198 struct device_attribute *attr, char *buf) 199 { 200 struct drm_device *ddev = dev_get_drvdata(dev); 201 struct amdgpu_device *adev = drm_to_adev(ddev); 202 203 return sysfs_emit(buf, "%s\n", adev->product_number); 204 } 205 206 static DEVICE_ATTR(product_number, S_IRUGO, 207 amdgpu_device_get_product_number, NULL); 208 209 /** 210 * DOC: serial_number 211 * 212 * The amdgpu driver provides a sysfs API for reporting the serial number 213 * for the device 214 * The file serial_number is used for this and returns the serial number 215 * as returned from the FRU. 

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}
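
/*
 * Illustrative sketch (not from the original file): callers typically probe
 * these helpers once at init time to choose a runtime power-management
 * strategy, roughly along the lines of:
 *
 *	if (amdgpu_device_supports_px(ddev))
 *		;	// ATPX dGPU power control
 *	else if (amdgpu_device_supports_boco(ddev))
 *		;	// ACPI power resources (PR3 / hybrid ATPX)
 *	else if (amdgpu_device_supports_baco(ddev))
 *		;	// Bus Active, Chip Off
 */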

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
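
/*
 * Minimal usage sketch (an illustration, not code from the original file):
 * reading the first two dwords of VRAM into a stack buffer.  Offsets and
 * sizes must be dword aligned, as amdgpu_device_mm_access() enforces with a
 * BUG_ON.
 *
 *	uint32_t tmp[2];
 *
 *	amdgpu_device_vram_access(adev, 0, tmp, sizeof(tmp), false);
 */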

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}
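
/*
 * Editorial note (an assumption about typical call sites, not taken from this
 * file): most driver code does not call amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly but goes through RREG32()/WREG32()-style macros that expand to these
 * helpers.  The helpers then pick one of three paths: KIQ access for SR-IOV at
 * runtime, a direct readl()/writel() on the MMIO BAR for offsets inside
 * rmmio_size, or the indirect pcie_rreg()/pcie_wreg() callbacks beyond it.
 */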

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
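
/*
 * Note on the pattern above (editorial comment, not from the original file):
 * the indirect helpers drive a classic index/data register pair.  The target
 * address is written to the index register, a dummy readl() flushes the
 * posted write so it lands before the data access, and the payload is then
 * moved through the data register.  The 64-bit variants repeat the sequence
 * for the low and high dwords while holding pcie_idx_lock.
 */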

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
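
/*
 * Usage sketch (illustrative only; the register name below is a placeholder,
 * not taken from this file): golden register tables are flat arrays of
 * {offset, and_mask, or_mask} triplets.
 *
 *	static const u32 golden_settings_example[] = {
 *		mmSOME_REG, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 */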

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

#ifdef __linux__
	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;
#endif

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
#ifdef __linux__
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
#endif

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment+1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should + 1 page (0x400 in dword)
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

#ifdef __linux__
	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;
#endif

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
#ifdef __linux__
	iounmap(adev->doorbell.ptr);
#else
	if (adev->doorbell.size > 0)
		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
		    adev->doorbell.size);
#endif
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
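
/*
 * Usage sketch (illustrative, not from the original file): ring setup code
 * typically reserves a writeback slot, derives CPU and GPU addresses from the
 * returned dword offset, and releases it on teardown:
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb) == 0) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */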

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
#ifdef __linux__
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
#endif /* __linux__ */

	return 0;
}
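
/*
 * Worked example (editorial note, based on the PCI resizable-BAR encoding in
 * which size n means 2^(n + 20) bytes): for a board with 8 GB of VRAM,
 * pci_rebar_bytes_to_size() yields 13, i.e. a request for a 2^33 byte BAR0,
 * which is then clamped to the largest size the bridge actually advertises
 * via pci_rebar_get_possible_sizes().
 */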

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpu_info *ci = curcpu();

	return !(ci->ci_family == 6 && ci->ci_model == 0x97);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
#ifdef notyet
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
#endif

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
#ifdef __linux__
	struct sysinfo si;
#endif
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
#ifdef __linux__
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;
#else
	total_memory = ptoa(physmem);
#endif

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}
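
/*
 * Editorial note with a worked example (not from the original file): the
 * amdgpu_smu_memory_pool_size module parameter is expressed in units of
 * 256 MB, hence the "<< 28" above.  A value of 2 therefore requests a 512 MB
 * pool and is only honoured with roughly 3 GB of system RAM or more, while
 * values of 4 or 8 require roughly 7 GB.
 */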

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

#ifdef __linux__
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}
#endif /* __linux__ */

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
#ifdef notyet
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
#endif
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}
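
/*
 * Usage sketch (illustrative, not from the original file): suspend/resume and
 * power-tuning paths typically drive these per-IP helpers along the lines of
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 * with the block-type and state enums coming from amd_shared.h.
 */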

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

#ifdef notyet
	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
#endif
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1993 */ 1994 if (adev->asic_type != CHIP_NAVI12) 1995 return 0; 1996 } 1997 1998 switch (adev->asic_type) { 1999 default: 2000 return 0; 2001 case CHIP_VEGA10: 2002 chip_name = "vega10"; 2003 break; 2004 case CHIP_VEGA12: 2005 chip_name = "vega12"; 2006 break; 2007 case CHIP_RAVEN: 2008 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2009 chip_name = "raven2"; 2010 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2011 chip_name = "picasso"; 2012 else 2013 chip_name = "raven"; 2014 break; 2015 case CHIP_ARCTURUS: 2016 chip_name = "arcturus"; 2017 break; 2018 case CHIP_NAVI12: 2019 chip_name = "navi12"; 2020 break; 2021 } 2022 2023 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2024 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2025 if (err) { 2026 dev_err(adev->dev, 2027 "Failed to load gpu_info firmware \"%s\"\n", 2028 fw_name); 2029 goto out; 2030 } 2031 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2032 if (err) { 2033 dev_err(adev->dev, 2034 "Failed to validate gpu_info firmware \"%s\"\n", 2035 fw_name); 2036 goto out; 2037 } 2038 2039 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2040 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2041 2042 switch (hdr->version_major) { 2043 case 1: 2044 { 2045 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2046 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2047 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2048 2049 /* 2050 * Should be droped when DAL no longer needs it. 2051 */ 2052 if (adev->asic_type == CHIP_NAVI12) 2053 goto parse_soc_bounding_box; 2054 2055 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2056 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2057 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2058 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2059 adev->gfx.config.max_texture_channel_caches = 2060 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2061 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2062 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2063 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2064 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2065 adev->gfx.config.double_offchip_lds_buf = 2066 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2067 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2068 adev->gfx.cu_info.max_waves_per_simd = 2069 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2070 adev->gfx.cu_info.max_scratch_slots_per_cu = 2071 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2072 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2073 if (hdr->version_minor >= 1) { 2074 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2075 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2076 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2077 adev->gfx.config.num_sc_per_sh = 2078 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2079 adev->gfx.config.num_packer_per_sc = 2080 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2081 } 2082 2083 parse_soc_bounding_box: 2084 /* 2085 * soc bounding box info is not integrated in disocovery table, 2086 * we always need to parse it from gpu info firmware if needed. 
2087 */ 2088 if (hdr->version_minor == 2) { 2089 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2090 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2091 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2092 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2093 } 2094 break; 2095 } 2096 default: 2097 dev_err(adev->dev, 2098 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2099 err = -EINVAL; 2100 goto out; 2101 } 2102 out: 2103 return err; 2104 } 2105 2106 /** 2107 * amdgpu_device_ip_early_init - run early init for hardware IPs 2108 * 2109 * @adev: amdgpu_device pointer 2110 * 2111 * Early initialization pass for hardware IPs. The hardware IPs that make 2112 * up each asic are discovered each IP's early_init callback is run. This 2113 * is the first stage in initializing the asic. 2114 * Returns 0 on success, negative error code on failure. 2115 */ 2116 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2117 { 2118 struct drm_device *dev = adev_to_drm(adev); 2119 struct pci_dev *parent; 2120 int i, r; 2121 2122 amdgpu_device_enable_virtual_display(adev); 2123 2124 if (amdgpu_sriov_vf(adev)) { 2125 r = amdgpu_virt_request_full_gpu(adev, true); 2126 if (r) 2127 return r; 2128 } 2129 2130 switch (adev->asic_type) { 2131 #ifdef CONFIG_DRM_AMDGPU_SI 2132 case CHIP_VERDE: 2133 case CHIP_TAHITI: 2134 case CHIP_PITCAIRN: 2135 case CHIP_OLAND: 2136 case CHIP_HAINAN: 2137 adev->family = AMDGPU_FAMILY_SI; 2138 r = si_set_ip_blocks(adev); 2139 if (r) 2140 return r; 2141 break; 2142 #endif 2143 #ifdef CONFIG_DRM_AMDGPU_CIK 2144 case CHIP_BONAIRE: 2145 case CHIP_HAWAII: 2146 case CHIP_KAVERI: 2147 case CHIP_KABINI: 2148 case CHIP_MULLINS: 2149 if (adev->flags & AMD_IS_APU) 2150 adev->family = AMDGPU_FAMILY_KV; 2151 else 2152 adev->family = AMDGPU_FAMILY_CI; 2153 2154 r = cik_set_ip_blocks(adev); 2155 if (r) 2156 return r; 2157 break; 2158 #endif 2159 case CHIP_TOPAZ: 2160 case CHIP_TONGA: 2161 case CHIP_FIJI: 2162 case CHIP_POLARIS10: 2163 case CHIP_POLARIS11: 2164 case CHIP_POLARIS12: 2165 case CHIP_VEGAM: 2166 case CHIP_CARRIZO: 2167 case CHIP_STONEY: 2168 if (adev->flags & AMD_IS_APU) 2169 adev->family = AMDGPU_FAMILY_CZ; 2170 else 2171 adev->family = AMDGPU_FAMILY_VI; 2172 2173 r = vi_set_ip_blocks(adev); 2174 if (r) 2175 return r; 2176 break; 2177 default: 2178 r = amdgpu_discovery_set_ip_blocks(adev); 2179 if (r) 2180 return r; 2181 break; 2182 } 2183 2184 if (amdgpu_has_atpx() && 2185 (amdgpu_is_atpx_hybrid() || 2186 amdgpu_has_atpx_dgpu_power_cntl()) && 2187 ((adev->flags & AMD_IS_APU) == 0) && 2188 !pci_is_thunderbolt_attached(dev->pdev)) 2189 adev->flags |= AMD_IS_PX; 2190 2191 if (!(adev->flags & AMD_IS_APU)) { 2192 parent = pci_upstream_bridge(adev->pdev); 2193 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2194 } 2195 2196 amdgpu_amdkfd_device_probe(adev); 2197 2198 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2199 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2200 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2201 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2202 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2203 2204 for (i = 0; i < adev->num_ip_blocks; i++) { 2205 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2206 DRM_ERROR("disabled ip block: %d <%s>\n", 2207 i, adev->ip_blocks[i].version->funcs->name); 2208 adev->ip_blocks[i].status.valid = false; 2209 } else { 2210 if (adev->ip_blocks[i].version->funcs->early_init) { 2211 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2212 if (r == -ENOENT) { 2213 adev->ip_blocks[i].status.valid = false; 2214 } else if (r) { 2215 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2216 adev->ip_blocks[i].version->funcs->name, r); 2217 return r; 2218 } else { 2219 adev->ip_blocks[i].status.valid = true; 2220 } 2221 } else { 2222 adev->ip_blocks[i].status.valid = true; 2223 } 2224 } 2225 /* get the vbios after the asic_funcs are set up */ 2226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2227 r = amdgpu_device_parse_gpu_info_fw(adev); 2228 if (r) 2229 return r; 2230 2231 /* Read BIOS */ 2232 if (!amdgpu_get_bios(adev)) 2233 return -EINVAL; 2234 2235 r = amdgpu_atombios_init(adev); 2236 if (r) { 2237 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2238 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2239 return r; 2240 } 2241 2242 /*get pf2vf msg info at it's earliest time*/ 2243 if (amdgpu_sriov_vf(adev)) 2244 amdgpu_virt_init_data_exchange(adev); 2245 2246 } 2247 } 2248 2249 adev->cg_flags &= amdgpu_cg_mask; 2250 adev->pg_flags &= amdgpu_pg_mask; 2251 2252 return 0; 2253 } 2254 2255 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2256 { 2257 int i, r; 2258 2259 for (i = 0; i < adev->num_ip_blocks; i++) { 2260 if (!adev->ip_blocks[i].status.sw) 2261 continue; 2262 if (adev->ip_blocks[i].status.hw) 2263 continue; 2264 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2265 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2266 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2267 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2268 if (r) { 2269 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2270 adev->ip_blocks[i].version->funcs->name, r); 2271 return r; 2272 } 2273 adev->ip_blocks[i].status.hw = true; 2274 } 2275 } 2276 2277 return 0; 2278 } 2279 2280 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2281 { 2282 int i, r; 2283 2284 for (i = 0; i < adev->num_ip_blocks; i++) { 2285 if (!adev->ip_blocks[i].status.sw) 2286 continue; 2287 if (adev->ip_blocks[i].status.hw) 2288 continue; 2289 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2290 if (r) { 2291 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2292 adev->ip_blocks[i].version->funcs->name, r); 2293 return r; 2294 } 2295 adev->ip_blocks[i].status.hw = true; 2296 } 2297 2298 return 0; 2299 } 2300 2301 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2302 { 2303 int r = 0; 2304 int i; 2305 uint32_t smu_version; 2306 2307 if (adev->asic_type >= CHIP_VEGA10) { 2308 for (i = 0; i < adev->num_ip_blocks; i++) { 2309 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2310 continue; 2311 2312 if 
(!adev->ip_blocks[i].status.sw) 2313 continue; 2314 2315 /* no need to do the fw loading again if already done*/ 2316 if (adev->ip_blocks[i].status.hw == true) 2317 break; 2318 2319 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2320 r = adev->ip_blocks[i].version->funcs->resume(adev); 2321 if (r) { 2322 DRM_ERROR("resume of IP block <%s> failed %d\n", 2323 adev->ip_blocks[i].version->funcs->name, r); 2324 return r; 2325 } 2326 } else { 2327 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2328 if (r) { 2329 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2330 adev->ip_blocks[i].version->funcs->name, r); 2331 return r; 2332 } 2333 } 2334 2335 adev->ip_blocks[i].status.hw = true; 2336 break; 2337 } 2338 } 2339 2340 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2341 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2342 2343 return r; 2344 } 2345 2346 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2347 { 2348 long timeout; 2349 int r, i; 2350 2351 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2352 struct amdgpu_ring *ring = adev->rings[i]; 2353 2354 /* No need to setup the GPU scheduler for rings that don't need it */ 2355 if (!ring || ring->no_scheduler) 2356 continue; 2357 2358 switch (ring->funcs->type) { 2359 case AMDGPU_RING_TYPE_GFX: 2360 timeout = adev->gfx_timeout; 2361 break; 2362 case AMDGPU_RING_TYPE_COMPUTE: 2363 timeout = adev->compute_timeout; 2364 break; 2365 case AMDGPU_RING_TYPE_SDMA: 2366 timeout = adev->sdma_timeout; 2367 break; 2368 default: 2369 timeout = adev->video_timeout; 2370 break; 2371 } 2372 2373 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2374 ring->num_hw_submission, amdgpu_job_hang_limit, 2375 timeout, adev->reset_domain->wq, 2376 ring->sched_score, ring->name, 2377 adev->dev); 2378 if (r) { 2379 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2380 ring->name); 2381 return r; 2382 } 2383 } 2384 2385 return 0; 2386 } 2387 2388 2389 /** 2390 * amdgpu_device_ip_init - run init for hardware IPs 2391 * 2392 * @adev: amdgpu_device pointer 2393 * 2394 * Main initialization pass for hardware IPs. The list of all the hardware 2395 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2396 * are run. sw_init initializes the software state associated with each IP 2397 * and hw_init initializes the hardware associated with each IP. 2398 * Returns 0 on success, negative error code on failure. 
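 *
 * For orientation, a minimal illustrative sketch of the two callbacks this
 * function drives for every registered block. The "foo" names are
 * hypothetical (not a real IP block), only the callbacks discussed here are
 * filled in, and error handling is trimmed:
 *
 *   static int foo_sw_init(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           return foo_alloc_sw_state(adev);        // software state only
 *   }
 *
 *   static int foo_hw_init(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           return foo_program_hw(adev);            // touches the hardware
 *   }
 *
 *   static const struct amd_ip_funcs foo_ip_funcs = {
 *           .name = "foo",
 *           .sw_init = foo_sw_init,
 *           .hw_init = foo_hw_init,
 *   };
 *
 *   const struct amdgpu_ip_block_version foo_ip_block = {
 *           .type = AMD_IP_BLOCK_TYPE_COMMON,
 *           .major = 1, .minor = 0, .rev = 0,
 *           .funcs = &foo_ip_funcs,
 *   };
 *
 * Such a block would have been registered earlier with
 * amdgpu_device_ip_block_add(adev, &foo_ip_block).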
2399 */ 2400 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2401 { 2402 int i, r; 2403 2404 r = amdgpu_ras_init(adev); 2405 if (r) 2406 return r; 2407 2408 for (i = 0; i < adev->num_ip_blocks; i++) { 2409 if (!adev->ip_blocks[i].status.valid) 2410 continue; 2411 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2412 if (r) { 2413 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2414 adev->ip_blocks[i].version->funcs->name, r); 2415 goto init_failed; 2416 } 2417 adev->ip_blocks[i].status.sw = true; 2418 2419 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2420 /* need to do common hw init early so everything is set up for gmc */ 2421 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2422 if (r) { 2423 DRM_ERROR("hw_init %d failed %d\n", i, r); 2424 goto init_failed; 2425 } 2426 adev->ip_blocks[i].status.hw = true; 2427 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2428 /* need to do gmc hw init early so we can allocate gpu mem */ 2429 /* Try to reserve bad pages early */ 2430 if (amdgpu_sriov_vf(adev)) 2431 amdgpu_virt_exchange_data(adev); 2432 2433 r = amdgpu_device_vram_scratch_init(adev); 2434 if (r) { 2435 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2436 goto init_failed; 2437 } 2438 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2439 if (r) { 2440 DRM_ERROR("hw_init %d failed %d\n", i, r); 2441 goto init_failed; 2442 } 2443 r = amdgpu_device_wb_init(adev); 2444 if (r) { 2445 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2446 goto init_failed; 2447 } 2448 adev->ip_blocks[i].status.hw = true; 2449 2450 /* right after GMC hw init, we create CSA */ 2451 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2452 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2453 AMDGPU_GEM_DOMAIN_VRAM, 2454 AMDGPU_CSA_SIZE); 2455 if (r) { 2456 DRM_ERROR("allocate CSA failed %d\n", r); 2457 goto init_failed; 2458 } 2459 } 2460 } 2461 } 2462 2463 if (amdgpu_sriov_vf(adev)) 2464 amdgpu_virt_init_data_exchange(adev); 2465 2466 r = amdgpu_ib_pool_init(adev); 2467 if (r) { 2468 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2469 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2470 goto init_failed; 2471 } 2472 2473 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2474 if (r) 2475 goto init_failed; 2476 2477 r = amdgpu_device_ip_hw_init_phase1(adev); 2478 if (r) 2479 goto init_failed; 2480 2481 r = amdgpu_device_fw_loading(adev); 2482 if (r) 2483 goto init_failed; 2484 2485 r = amdgpu_device_ip_hw_init_phase2(adev); 2486 if (r) 2487 goto init_failed; 2488 2489 /* 2490 * retired pages will be loaded from eeprom and reserved here, 2491 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2492 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2493 * for I2C communication which only true at this point. 2494 * 2495 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2496 * failure from bad gpu situation and stop amdgpu init process 2497 * accordingly. For other failed cases, it will still release all 2498 * the resource and print error message, rather than returning one 2499 * negative value to upper level. 
2500 *
2501 * Note: theoretically, this should be called before all vram allocations
2502 * to protect retired pages from being abused.
2503 */
2504 r = amdgpu_ras_recovery_init(adev);
2505 if (r)
2506 goto init_failed;
2507
2508 /*
2509 * In case of XGMI, grab an extra reference on the reset domain for this device.
2510 */
2511 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2512 if (amdgpu_xgmi_add_device(adev) == 0) {
2513 if (!amdgpu_sriov_vf(adev)) {
2514 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2515
2516 if (WARN_ON(!hive)) {
2517 r = -ENOENT;
2518 goto init_failed;
2519 }
2520
2521 if (!hive->reset_domain ||
2522 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2523 r = -ENOENT;
2524 amdgpu_put_xgmi_hive(hive);
2525 goto init_failed;
2526 }
2527
2528 /* Drop the early temporary reset domain we created for device */
2529 amdgpu_reset_put_reset_domain(adev->reset_domain);
2530 adev->reset_domain = hive->reset_domain;
2531 amdgpu_put_xgmi_hive(hive);
2532 }
2533 }
2534 }
2535
2536 r = amdgpu_device_init_schedulers(adev);
2537 if (r)
2538 goto init_failed;
2539
2540 /* Don't init kfd if the whole hive needs to be reset during init */
2541 if (!adev->gmc.xgmi.pending_reset)
2542 amdgpu_amdkfd_device_init(adev);
2543
2544 amdgpu_fru_get_product_info(adev);
2545
2546 init_failed:
2547 if (amdgpu_sriov_vf(adev))
2548 amdgpu_virt_release_full_gpu(adev, true);
2549
2550 return r;
2551 }
2552
2553 /**
2554 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2555 *
2556 * @adev: amdgpu_device pointer
2557 *
2558 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2559 * this function before a GPU reset. If the value is retained after a
2560 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2561 */
2562 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2563 {
2564 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2565 }
2566
2567 /**
2568 * amdgpu_device_check_vram_lost - check if vram is valid
2569 *
2570 * @adev: amdgpu_device pointer
2571 *
2572 * Checks the reset magic value written to the gart pointer in VRAM.
2573 * The driver calls this after a GPU reset to see if the contents of
2574 * VRAM have been lost or not.
2575 * Returns true if vram is lost, false if not.
2576 */
2577 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2578 {
2579 if (memcmp(adev->gart.ptr, adev->reset_magic,
2580 AMDGPU_RESET_MAGIC_NUM))
2581 return true;
2582
2583 if (!amdgpu_in_reset(adev))
2584 return false;
2585
2586 /*
2587 * For all ASICs with baco/mode1 reset, the VRAM is
2588 * always assumed to be lost.
2589 */
2590 switch (amdgpu_asic_reset_method(adev)) {
2591 case AMD_RESET_METHOD_BACO:
2592 case AMD_RESET_METHOD_MODE1:
2593 return true;
2594 default:
2595 return false;
2596 }
2597 }
2598
2599 /**
2600 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2601 *
2602 * @adev: amdgpu_device pointer
2603 * @state: clockgating state (gate or ungate)
2604 *
2605 * The list of all the hardware IPs that make up the asic is walked and the
2606 * set_clockgating_state callbacks are run.
2607 * The late initialization pass enables clockgating for hardware IPs;
2608 * the fini and suspend passes disable it.
2609 * Returns 0 on success, negative error code on failure.
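 *
 * Illustrative pairing, mirroring how this file itself uses the helper (the
 * late init path gates, the fini and suspend paths ungate):
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);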
2610 */ 2611 2612 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2613 enum amd_clockgating_state state) 2614 { 2615 int i, j, r; 2616 2617 if (amdgpu_emu_mode == 1) 2618 return 0; 2619 2620 for (j = 0; j < adev->num_ip_blocks; j++) { 2621 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2622 if (!adev->ip_blocks[i].status.late_initialized) 2623 continue; 2624 /* skip CG for GFX on S0ix */ 2625 if (adev->in_s0ix && 2626 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2627 continue; 2628 /* skip CG for VCE/UVD, it's handled specially */ 2629 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2630 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2631 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2632 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2633 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2634 /* enable clockgating to save power */ 2635 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2636 state); 2637 if (r) { 2638 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2639 adev->ip_blocks[i].version->funcs->name, r); 2640 return r; 2641 } 2642 } 2643 } 2644 2645 return 0; 2646 } 2647 2648 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2649 enum amd_powergating_state state) 2650 { 2651 int i, j, r; 2652 2653 if (amdgpu_emu_mode == 1) 2654 return 0; 2655 2656 for (j = 0; j < adev->num_ip_blocks; j++) { 2657 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2658 if (!adev->ip_blocks[i].status.late_initialized) 2659 continue; 2660 /* skip PG for GFX on S0ix */ 2661 if (adev->in_s0ix && 2662 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2663 continue; 2664 /* skip CG for VCE/UVD, it's handled specially */ 2665 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2666 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2667 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2668 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2669 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2670 /* enable powergating to save power */ 2671 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2672 state); 2673 if (r) { 2674 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2675 adev->ip_blocks[i].version->funcs->name, r); 2676 return r; 2677 } 2678 } 2679 } 2680 return 0; 2681 } 2682 2683 static int amdgpu_device_enable_mgpu_fan_boost(void) 2684 { 2685 struct amdgpu_gpu_instance *gpu_ins; 2686 struct amdgpu_device *adev; 2687 int i, ret = 0; 2688 2689 mutex_lock(&mgpu_info.mutex); 2690 2691 /* 2692 * MGPU fan boost feature should be enabled 2693 * only when there are two or more dGPUs in 2694 * the system 2695 */ 2696 if (mgpu_info.num_dgpu < 2) 2697 goto out; 2698 2699 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2700 gpu_ins = &(mgpu_info.gpu_ins[i]); 2701 adev = gpu_ins->adev; 2702 if (!(adev->flags & AMD_IS_APU) && 2703 !gpu_ins->mgpu_fan_enabled) { 2704 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2705 if (ret) 2706 break; 2707 2708 gpu_ins->mgpu_fan_enabled = 1; 2709 } 2710 } 2711 2712 out: 2713 mutex_unlock(&mgpu_info.mutex); 2714 2715 return ret; 2716 } 2717 2718 /** 2719 * amdgpu_device_ip_late_init - run late init for hardware IPs 2720 * 2721 * @adev: amdgpu_device pointer 2722 * 2723 * Late initialization pass for hardware IPs. 
The list of all the hardware
2724 * IPs that make up the asic is walked and the late_init callbacks are run.
2725 * late_init covers any special initialization that an IP requires
2726 * after all of them have been initialized or something that needs to happen
2727 * late in the init process.
2728 * Returns 0 on success, negative error code on failure.
2729 */
2730 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2731 {
2732 struct amdgpu_gpu_instance *gpu_instance;
2733 int i = 0, r;
2734
2735 for (i = 0; i < adev->num_ip_blocks; i++) {
2736 if (!adev->ip_blocks[i].status.hw)
2737 continue;
2738 if (adev->ip_blocks[i].version->funcs->late_init) {
2739 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2740 if (r) {
2741 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2742 adev->ip_blocks[i].version->funcs->name, r);
2743 return r;
2744 }
2745 }
2746 adev->ip_blocks[i].status.late_initialized = true;
2747 }
2748
2749 r = amdgpu_ras_late_init(adev);
2750 if (r) {
2751 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2752 return r;
2753 }
2754
2755 amdgpu_ras_set_error_query_ready(adev, true);
2756
2757 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2758 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2759
2760 amdgpu_device_fill_reset_magic(adev);
2761
2762 r = amdgpu_device_enable_mgpu_fan_boost();
2763 if (r)
2764 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2765
2766 /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
2767 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2768 adev->asic_type == CHIP_ALDEBARAN))
2769 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2770
2771 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2772 mutex_lock(&mgpu_info.mutex);
2773
2774 /*
2775 * Reset the device p-state to low, as it was booted with high.
2776 *
2777 * This should be performed only after all devices from the same
2778 * hive get initialized.
2779 *
2780 * However, the number of devices in a hive is not known in advance;
2781 * it is counted one by one as the devices initialize.
2782 *
2783 * So we wait until all XGMI interlinked devices are initialized.
2784 * This may bring some delays as those devices may come from
2785 * different hives. But that should be OK.
2786 */ 2787 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2788 for (i = 0; i < mgpu_info.num_gpu; i++) { 2789 gpu_instance = &(mgpu_info.gpu_ins[i]); 2790 if (gpu_instance->adev->flags & AMD_IS_APU) 2791 continue; 2792 2793 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2794 AMDGPU_XGMI_PSTATE_MIN); 2795 if (r) { 2796 DRM_ERROR("pstate setting failed (%d).\n", r); 2797 break; 2798 } 2799 } 2800 } 2801 2802 mutex_unlock(&mgpu_info.mutex); 2803 } 2804 2805 return 0; 2806 } 2807 2808 /** 2809 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2810 * 2811 * @adev: amdgpu_device pointer 2812 * 2813 * For ASICs need to disable SMC first 2814 */ 2815 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2816 { 2817 int i, r; 2818 2819 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2820 return; 2821 2822 for (i = 0; i < adev->num_ip_blocks; i++) { 2823 if (!adev->ip_blocks[i].status.hw) 2824 continue; 2825 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2826 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2827 /* XXX handle errors */ 2828 if (r) { 2829 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2830 adev->ip_blocks[i].version->funcs->name, r); 2831 } 2832 adev->ip_blocks[i].status.hw = false; 2833 break; 2834 } 2835 } 2836 } 2837 2838 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2839 { 2840 int i, r; 2841 2842 for (i = 0; i < adev->num_ip_blocks; i++) { 2843 if (!adev->ip_blocks[i].version->funcs->early_fini) 2844 continue; 2845 2846 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2847 if (r) { 2848 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2849 adev->ip_blocks[i].version->funcs->name, r); 2850 } 2851 } 2852 2853 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2854 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2855 2856 amdgpu_amdkfd_suspend(adev, false); 2857 2858 /* Workaroud for ASICs need to disable SMC first */ 2859 amdgpu_device_smu_fini_early(adev); 2860 2861 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2862 if (!adev->ip_blocks[i].status.hw) 2863 continue; 2864 2865 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2866 /* XXX handle errors */ 2867 if (r) { 2868 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2869 adev->ip_blocks[i].version->funcs->name, r); 2870 } 2871 2872 adev->ip_blocks[i].status.hw = false; 2873 } 2874 2875 if (amdgpu_sriov_vf(adev)) { 2876 if (amdgpu_virt_release_full_gpu(adev, false)) 2877 DRM_ERROR("failed to release exclusive mode on fini\n"); 2878 } 2879 2880 return 0; 2881 } 2882 2883 /** 2884 * amdgpu_device_ip_fini - run fini for hardware IPs 2885 * 2886 * @adev: amdgpu_device pointer 2887 * 2888 * Main teardown pass for hardware IPs. The list of all the hardware 2889 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2890 * are run. hw_fini tears down the hardware associated with each IP 2891 * and sw_fini tears down any software state associated with each IP. 2892 * Returns 0 on success, negative error code on failure. 
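 *
 * For reference, an illustrative summary of the per-IP lifecycle driven by
 * this file (bring-up left to right; teardown runs in reverse registration
 * order):
 *
 *   early_init -> sw_init -> hw_init -> late_init
 *   early_fini -> hw_fini -> sw_fini -> late_fini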
2893 */ 2894 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2895 { 2896 int i, r; 2897 2898 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2899 amdgpu_virt_release_ras_err_handler_data(adev); 2900 2901 if (adev->gmc.xgmi.num_physical_nodes > 1) 2902 amdgpu_xgmi_remove_device(adev); 2903 2904 amdgpu_amdkfd_device_fini_sw(adev); 2905 2906 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2907 if (!adev->ip_blocks[i].status.sw) 2908 continue; 2909 2910 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2911 amdgpu_ucode_free_bo(adev); 2912 amdgpu_free_static_csa(&adev->virt.csa_obj); 2913 amdgpu_device_wb_fini(adev); 2914 amdgpu_device_vram_scratch_fini(adev); 2915 amdgpu_ib_pool_fini(adev); 2916 } 2917 2918 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2919 /* XXX handle errors */ 2920 if (r) { 2921 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2922 adev->ip_blocks[i].version->funcs->name, r); 2923 } 2924 adev->ip_blocks[i].status.sw = false; 2925 adev->ip_blocks[i].status.valid = false; 2926 } 2927 2928 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2929 if (!adev->ip_blocks[i].status.late_initialized) 2930 continue; 2931 if (adev->ip_blocks[i].version->funcs->late_fini) 2932 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2933 adev->ip_blocks[i].status.late_initialized = false; 2934 } 2935 2936 amdgpu_ras_fini(adev); 2937 2938 return 0; 2939 } 2940 2941 /** 2942 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2943 * 2944 * @work: work_struct. 2945 */ 2946 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2947 { 2948 struct amdgpu_device *adev = 2949 container_of(work, struct amdgpu_device, delayed_init_work.work); 2950 int r; 2951 2952 r = amdgpu_ib_ring_tests(adev); 2953 if (r) 2954 DRM_ERROR("ib ring test failed (%d).\n", r); 2955 } 2956 2957 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2958 { 2959 struct amdgpu_device *adev = 2960 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2961 2962 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2963 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2964 2965 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2966 adev->gfx.gfx_off_state = true; 2967 } 2968 2969 /** 2970 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2971 * 2972 * @adev: amdgpu_device pointer 2973 * 2974 * Main suspend function for hardware IPs. The list of all the hardware 2975 * IPs that make up the asic is walked, clockgating is disabled and the 2976 * suspend callbacks are run. suspend puts the hardware and software state 2977 * in each IP into a state suitable for suspend. 2978 * Returns 0 on success, negative error code on failure. 2979 */ 2980 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2981 { 2982 int i, r; 2983 2984 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2985 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2986 2987 /* 2988 * Per PMFW team's suggestion, driver needs to handle gfxoff 2989 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2990 * scenario. Add the missing df cstate disablement here. 
2991 */ 2992 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2993 dev_warn(adev->dev, "Failed to disallow df cstate"); 2994 2995 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2996 if (!adev->ip_blocks[i].status.valid) 2997 continue; 2998 2999 /* displays are handled separately */ 3000 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3001 continue; 3002 3003 /* XXX handle errors */ 3004 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3005 /* XXX handle errors */ 3006 if (r) { 3007 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3008 adev->ip_blocks[i].version->funcs->name, r); 3009 return r; 3010 } 3011 3012 adev->ip_blocks[i].status.hw = false; 3013 } 3014 3015 return 0; 3016 } 3017 3018 /** 3019 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3020 * 3021 * @adev: amdgpu_device pointer 3022 * 3023 * Main suspend function for hardware IPs. The list of all the hardware 3024 * IPs that make up the asic is walked, clockgating is disabled and the 3025 * suspend callbacks are run. suspend puts the hardware and software state 3026 * in each IP into a state suitable for suspend. 3027 * Returns 0 on success, negative error code on failure. 3028 */ 3029 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3030 { 3031 int i, r; 3032 3033 if (adev->in_s0ix) 3034 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3035 3036 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3037 if (!adev->ip_blocks[i].status.valid) 3038 continue; 3039 /* displays are handled in phase1 */ 3040 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3041 continue; 3042 /* PSP lost connection when err_event_athub occurs */ 3043 if (amdgpu_ras_intr_triggered() && 3044 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3045 adev->ip_blocks[i].status.hw = false; 3046 continue; 3047 } 3048 3049 /* skip unnecessary suspend if we do not initialize them yet */ 3050 if (adev->gmc.xgmi.pending_reset && 3051 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3053 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3054 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3055 adev->ip_blocks[i].status.hw = false; 3056 continue; 3057 } 3058 3059 /* skip suspend of gfx/mes and psp for S0ix 3060 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3061 * like at runtime. PSP is also part of the always on hardware 3062 * so no need to suspend it. 3063 */ 3064 if (adev->in_s0ix && 3065 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3066 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3067 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3068 continue; 3069 3070 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3071 if (adev->in_s0ix && 3072 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3073 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3074 continue; 3075 3076 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3077 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3078 * from this location and RLC Autoload automatically also gets loaded 3079 * from here based on PMFW -> PSP message during re-init sequence. 3080 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3081 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3082 */ 3083 if (amdgpu_in_reset(adev) && 3084 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3085 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3086 continue; 3087 3088 /* XXX handle errors */ 3089 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3090 /* XXX handle errors */ 3091 if (r) { 3092 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3093 adev->ip_blocks[i].version->funcs->name, r); 3094 } 3095 adev->ip_blocks[i].status.hw = false; 3096 /* handle putting the SMC in the appropriate state */ 3097 if(!amdgpu_sriov_vf(adev)){ 3098 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3099 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3100 if (r) { 3101 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3102 adev->mp1_state, r); 3103 return r; 3104 } 3105 } 3106 } 3107 } 3108 3109 return 0; 3110 } 3111 3112 /** 3113 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3114 * 3115 * @adev: amdgpu_device pointer 3116 * 3117 * Main suspend function for hardware IPs. The list of all the hardware 3118 * IPs that make up the asic is walked, clockgating is disabled and the 3119 * suspend callbacks are run. suspend puts the hardware and software state 3120 * in each IP into a state suitable for suspend. 3121 * Returns 0 on success, negative error code on failure. 3122 */ 3123 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3124 { 3125 int r; 3126 3127 if (amdgpu_sriov_vf(adev)) { 3128 amdgpu_virt_fini_data_exchange(adev); 3129 amdgpu_virt_request_full_gpu(adev, false); 3130 } 3131 3132 r = amdgpu_device_ip_suspend_phase1(adev); 3133 if (r) 3134 return r; 3135 r = amdgpu_device_ip_suspend_phase2(adev); 3136 3137 if (amdgpu_sriov_vf(adev)) 3138 amdgpu_virt_release_full_gpu(adev, false); 3139 3140 return r; 3141 } 3142 3143 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3144 { 3145 int i, r; 3146 3147 static enum amd_ip_block_type ip_order[] = { 3148 AMD_IP_BLOCK_TYPE_COMMON, 3149 AMD_IP_BLOCK_TYPE_GMC, 3150 AMD_IP_BLOCK_TYPE_PSP, 3151 AMD_IP_BLOCK_TYPE_IH, 3152 }; 3153 3154 for (i = 0; i < adev->num_ip_blocks; i++) { 3155 int j; 3156 struct amdgpu_ip_block *block; 3157 3158 block = &adev->ip_blocks[i]; 3159 block->status.hw = false; 3160 3161 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3162 3163 if (block->version->type != ip_order[j] || 3164 !block->status.valid) 3165 continue; 3166 3167 r = block->version->funcs->hw_init(adev); 3168 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3169 if (r) 3170 return r; 3171 block->status.hw = true; 3172 } 3173 } 3174 3175 return 0; 3176 } 3177 3178 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3179 { 3180 int i, r; 3181 3182 static enum amd_ip_block_type ip_order[] = { 3183 AMD_IP_BLOCK_TYPE_SMC, 3184 AMD_IP_BLOCK_TYPE_DCE, 3185 AMD_IP_BLOCK_TYPE_GFX, 3186 AMD_IP_BLOCK_TYPE_SDMA, 3187 AMD_IP_BLOCK_TYPE_UVD, 3188 AMD_IP_BLOCK_TYPE_VCE, 3189 AMD_IP_BLOCK_TYPE_VCN 3190 }; 3191 3192 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3193 int j; 3194 struct amdgpu_ip_block *block; 3195 3196 for (j = 0; j < adev->num_ip_blocks; j++) { 3197 block = &adev->ip_blocks[j]; 3198 3199 if (block->version->type != ip_order[i] || 3200 !block->status.valid || 3201 block->status.hw) 3202 continue; 3203 3204 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3205 r = block->version->funcs->resume(adev); 3206 else 3207 r = block->version->funcs->hw_init(adev); 3208 3209 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3210 if (r) 3211 return r; 3212 block->status.hw = true; 3213 } 3214 } 3215 3216 return 0; 3217 } 3218 3219 /** 3220 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3221 * 3222 * @adev: amdgpu_device pointer 3223 * 3224 * First resume function for hardware IPs. The list of all the hardware 3225 * IPs that make up the asic is walked and the resume callbacks are run for 3226 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3227 * after a suspend and updates the software state as necessary. This 3228 * function is also used for restoring the GPU after a GPU reset. 3229 * Returns 0 on success, negative error code on failure. 3230 */ 3231 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3232 { 3233 int i, r; 3234 3235 for (i = 0; i < adev->num_ip_blocks; i++) { 3236 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3237 continue; 3238 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3239 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3240 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3241 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3242 3243 r = adev->ip_blocks[i].version->funcs->resume(adev); 3244 if (r) { 3245 DRM_ERROR("resume of IP block <%s> failed %d\n", 3246 adev->ip_blocks[i].version->funcs->name, r); 3247 return r; 3248 } 3249 adev->ip_blocks[i].status.hw = true; 3250 } 3251 } 3252 3253 return 0; 3254 } 3255 3256 /** 3257 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3258 * 3259 * @adev: amdgpu_device pointer 3260 * 3261 * First resume function for hardware IPs. The list of all the hardware 3262 * IPs that make up the asic is walked and the resume callbacks are run for 3263 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3264 * functional state after a suspend and updates the software state as 3265 * necessary. This function is also used for restoring the GPU after a GPU 3266 * reset. 3267 * Returns 0 on success, negative error code on failure. 3268 */ 3269 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3270 { 3271 int i, r; 3272 3273 for (i = 0; i < adev->num_ip_blocks; i++) { 3274 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3275 continue; 3276 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3277 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3278 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3279 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3280 continue; 3281 r = adev->ip_blocks[i].version->funcs->resume(adev); 3282 if (r) { 3283 DRM_ERROR("resume of IP block <%s> failed %d\n", 3284 adev->ip_blocks[i].version->funcs->name, r); 3285 return r; 3286 } 3287 adev->ip_blocks[i].status.hw = true; 3288 3289 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3290 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3291 * amdgpu_device_resume() after IP resume. 3292 */ 3293 amdgpu_gfx_off_ctrl(adev, false); 3294 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3295 } 3296 3297 } 3298 3299 return 0; 3300 } 3301 3302 /** 3303 * amdgpu_device_ip_resume - run resume for hardware IPs 3304 * 3305 * @adev: amdgpu_device pointer 3306 * 3307 * Main resume function for hardware IPs. 
The hardware IPs
3308 * are split into two resume functions because they are
3309 * also used in recovering from a GPU reset and some additional
3310 * steps need to be taken between them. In this case (S3/S4) they are
3311 * run sequentially.
3312 * Returns 0 on success, negative error code on failure.
3313 */
3314 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3315 {
3316 int r;
3317
3318 r = amdgpu_amdkfd_resume_iommu(adev);
3319 if (r)
3320 return r;
3321
3322 r = amdgpu_device_ip_resume_phase1(adev);
3323 if (r)
3324 return r;
3325
3326 r = amdgpu_device_fw_loading(adev);
3327 if (r)
3328 return r;
3329
3330 r = amdgpu_device_ip_resume_phase2(adev);
3331
3332 return r;
3333 }
3334
3335 /**
3336 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3337 *
3338 * @adev: amdgpu_device pointer
3339 *
3340 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3341 */
3342 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3343 {
3344 if (amdgpu_sriov_vf(adev)) {
3345 if (adev->is_atom_fw) {
3346 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3347 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3348 } else {
3349 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3350 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3351 }
3352
3353 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3354 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3355 }
3356 }
3357
3358 /**
3359 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3360 *
3361 * @asic_type: AMD asic type
3362 *
3363 * Check if there is DC (new modesetting infrastructure) support for an asic.
3364 * Returns true if DC has support, false if not.
3365 */
3366 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3367 {
3368 switch (asic_type) {
3369 #ifdef CONFIG_DRM_AMDGPU_SI
3370 case CHIP_HAINAN:
3371 #endif
3372 case CHIP_TOPAZ:
3373 /* chips with no display hardware */
3374 return false;
3375 #if defined(CONFIG_DRM_AMD_DC)
3376 case CHIP_TAHITI:
3377 case CHIP_PITCAIRN:
3378 case CHIP_VERDE:
3379 case CHIP_OLAND:
3380 /*
3381 * We have systems in the wild with these ASICs that require
3382 * LVDS and VGA support which is not supported with DC.
3383 *
3384 * Fall back to the non-DC driver here by default so as not to
3385 * cause regressions.
3386 */
3387 #if defined(CONFIG_DRM_AMD_DC_SI)
3388 return amdgpu_dc > 0;
3389 #else
3390 return false;
3391 #endif
3392 case CHIP_BONAIRE:
3393 case CHIP_KAVERI:
3394 case CHIP_KABINI:
3395 case CHIP_MULLINS:
3396 /*
3397 * We have systems in the wild with these ASICs that require
3398 * VGA support which is not supported with DC.
3399 *
3400 * Fall back to the non-DC driver here by default so as not to
3401 * cause regressions.
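 *
 * (Users can still opt in to DC on these parts explicitly, which is what
 * the "amdgpu_dc > 0" check below honors; with the stock module option
 * naming that would be something like passing dc=1 to the amdgpu module,
 * assuming the build has DC support compiled in.)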
3402 */ 3403 return amdgpu_dc > 0; 3404 default: 3405 return amdgpu_dc != 0; 3406 #else 3407 default: 3408 if (amdgpu_dc > 0) 3409 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3410 "but isn't supported by ASIC, ignoring\n"); 3411 return false; 3412 #endif 3413 } 3414 } 3415 3416 /** 3417 * amdgpu_device_has_dc_support - check if dc is supported 3418 * 3419 * @adev: amdgpu_device pointer 3420 * 3421 * Returns true for supported, false for not supported 3422 */ 3423 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3424 { 3425 if (amdgpu_sriov_vf(adev) || 3426 adev->enable_virtual_display || 3427 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3428 return false; 3429 3430 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3431 } 3432 3433 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3434 { 3435 struct amdgpu_device *adev = 3436 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3437 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3438 3439 /* It's a bug to not have a hive within this function */ 3440 if (WARN_ON(!hive)) 3441 return; 3442 3443 /* 3444 * Use task barrier to synchronize all xgmi reset works across the 3445 * hive. task_barrier_enter and task_barrier_exit will block 3446 * until all the threads running the xgmi reset works reach 3447 * those points. task_barrier_full will do both blocks. 3448 */ 3449 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3450 3451 task_barrier_enter(&hive->tb); 3452 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3453 3454 if (adev->asic_reset_res) 3455 goto fail; 3456 3457 task_barrier_exit(&hive->tb); 3458 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3459 3460 if (adev->asic_reset_res) 3461 goto fail; 3462 3463 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3464 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3465 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3466 } else { 3467 3468 task_barrier_full(&hive->tb); 3469 adev->asic_reset_res = amdgpu_asic_reset(adev); 3470 } 3471 3472 fail: 3473 if (adev->asic_reset_res) 3474 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3475 adev->asic_reset_res, adev_to_drm(adev)->unique); 3476 amdgpu_put_xgmi_hive(hive); 3477 } 3478 3479 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3480 { 3481 char *input = amdgpu_lockup_timeout; 3482 char *timeout_setting = NULL; 3483 int index = 0; 3484 long timeout; 3485 int ret = 0; 3486 3487 /* 3488 * By default timeout for non compute jobs is 10000 3489 * and 60000 for compute jobs. 3490 * In SR-IOV or passthrough mode, timeout for compute 3491 * jobs are 60000 by default. 3492 */ 3493 adev->gfx_timeout = msecs_to_jiffies(10000); 3494 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3495 if (amdgpu_sriov_vf(adev)) 3496 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3497 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3498 else 3499 adev->compute_timeout = msecs_to_jiffies(60000); 3500 3501 #ifdef notyet 3502 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3503 while ((timeout_setting = strsep(&input, ",")) && 3504 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3505 ret = kstrtol(timeout_setting, 0, &timeout); 3506 if (ret) 3507 return ret; 3508 3509 if (timeout == 0) { 3510 index++; 3511 continue; 3512 } else if (timeout < 0) { 3513 timeout = MAX_SCHEDULE_TIMEOUT; 3514 dev_warn(adev->dev, "lockup timeout disabled"); 3515 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3516 } else { 3517 timeout = msecs_to_jiffies(timeout); 3518 } 3519 3520 switch (index++) { 3521 case 0: 3522 adev->gfx_timeout = timeout; 3523 break; 3524 case 1: 3525 adev->compute_timeout = timeout; 3526 break; 3527 case 2: 3528 adev->sdma_timeout = timeout; 3529 break; 3530 case 3: 3531 adev->video_timeout = timeout; 3532 break; 3533 default: 3534 break; 3535 } 3536 } 3537 /* 3538 * There is only one value specified and 3539 * it should apply to all non-compute jobs. 3540 */ 3541 if (index == 1) { 3542 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3543 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3544 adev->compute_timeout = adev->gfx_timeout; 3545 } 3546 } 3547 #endif 3548 3549 return ret; 3550 } 3551 3552 /** 3553 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3554 * 3555 * @adev: amdgpu_device pointer 3556 * 3557 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3558 */ 3559 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3560 { 3561 #ifdef notyet 3562 struct iommu_domain *domain; 3563 3564 domain = iommu_get_domain_for_dev(adev->dev); 3565 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3566 #endif 3567 adev->ram_is_direct_mapped = true; 3568 } 3569 3570 static const struct attribute *amdgpu_dev_attributes[] = { 3571 &dev_attr_product_name.attr, 3572 &dev_attr_product_number.attr, 3573 &dev_attr_serial_number.attr, 3574 &dev_attr_pcie_replay_count.attr, 3575 NULL 3576 }; 3577 3578 /** 3579 * amdgpu_device_init - initialize the driver 3580 * 3581 * @adev: amdgpu_device pointer 3582 * @flags: driver flags 3583 * 3584 * Initializes the driver info and hw (all asics). 3585 * Returns 0 for success or an error on failure. 3586 * Called at driver startup. 
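 *
 * A simplified, illustrative sketch of how the driver load path invokes
 * this function once per device (error handling trimmed):
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;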
3587 */ 3588 int amdgpu_device_init(struct amdgpu_device *adev, 3589 uint32_t flags) 3590 { 3591 struct drm_device *ddev = adev_to_drm(adev); 3592 struct pci_dev *pdev = adev->pdev; 3593 int r, i; 3594 bool px = false; 3595 u32 max_MBps; 3596 3597 adev->shutdown = false; 3598 adev->flags = flags; 3599 3600 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3601 adev->asic_type = amdgpu_force_asic_type; 3602 else 3603 adev->asic_type = flags & AMD_ASIC_MASK; 3604 3605 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3606 if (amdgpu_emu_mode == 1) 3607 adev->usec_timeout *= 10; 3608 adev->gmc.gart_size = 512 * 1024 * 1024; 3609 adev->accel_working = false; 3610 adev->num_rings = 0; 3611 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3612 adev->mman.buffer_funcs = NULL; 3613 adev->mman.buffer_funcs_ring = NULL; 3614 adev->vm_manager.vm_pte_funcs = NULL; 3615 adev->vm_manager.vm_pte_num_scheds = 0; 3616 adev->gmc.gmc_funcs = NULL; 3617 adev->harvest_ip_mask = 0x0; 3618 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3619 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3620 3621 adev->smc_rreg = &amdgpu_invalid_rreg; 3622 adev->smc_wreg = &amdgpu_invalid_wreg; 3623 adev->pcie_rreg = &amdgpu_invalid_rreg; 3624 adev->pcie_wreg = &amdgpu_invalid_wreg; 3625 adev->pciep_rreg = &amdgpu_invalid_rreg; 3626 adev->pciep_wreg = &amdgpu_invalid_wreg; 3627 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3628 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3629 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3630 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3631 adev->didt_rreg = &amdgpu_invalid_rreg; 3632 adev->didt_wreg = &amdgpu_invalid_wreg; 3633 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3634 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3635 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3636 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3637 3638 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3639 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3640 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3641 3642 /* mutex initialization are all done here so we 3643 * can recall function without having locking issues */ 3644 rw_init(&adev->firmware.mutex, "agfw"); 3645 rw_init(&adev->pm.mutex, "agpm"); 3646 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3647 rw_init(&adev->srbm_mutex, "srbm"); 3648 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3649 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3650 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3651 rw_init(&adev->mn_lock, "agpumn"); 3652 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3653 hash_init(adev->mn_hash); 3654 rw_init(&adev->psp.mutex, "agpsp"); 3655 rw_init(&adev->notifier_lock, "agnf"); 3656 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3657 rw_init(&adev->benchmark_mutex, "agbm"); 3658 3659 amdgpu_device_init_apu_flags(adev); 3660 3661 r = amdgpu_device_check_arguments(adev); 3662 if (r) 3663 return r; 3664 3665 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3666 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3667 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3668 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3669 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3670 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3671 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3672 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3673 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3674 3675 INIT_LIST_HEAD(&adev->shadow_list); 3676 rw_init(&adev->shadow_list_lock, 
"sdwlst"); 3677 3678 INIT_LIST_HEAD(&adev->reset_list); 3679 3680 INIT_LIST_HEAD(&adev->ras_list); 3681 3682 INIT_DELAYED_WORK(&adev->delayed_init_work, 3683 amdgpu_device_delayed_init_work_handler); 3684 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3685 amdgpu_device_delay_enable_gfx_off); 3686 3687 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3688 3689 adev->gfx.gfx_off_req_count = 1; 3690 adev->gfx.gfx_off_residency = 0; 3691 adev->gfx.gfx_off_entrycount = 0; 3692 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3693 3694 atomic_set(&adev->throttling_logging_enabled, 1); 3695 /* 3696 * If throttling continues, logging will be performed every minute 3697 * to avoid log flooding. "-1" is subtracted since the thermal 3698 * throttling interrupt comes every second. Thus, the total logging 3699 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3700 * for throttling interrupt) = 60 seconds. 3701 */ 3702 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3703 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3704 3705 #ifdef __linux__ 3706 /* Registers mapping */ 3707 /* TODO: block userspace mapping of io register */ 3708 if (adev->asic_type >= CHIP_BONAIRE) { 3709 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3710 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3711 } else { 3712 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3713 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3714 } 3715 3716 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3717 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3718 3719 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3720 if (adev->rmmio == NULL) { 3721 return -ENOMEM; 3722 } 3723 #endif 3724 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3725 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3726 3727 amdgpu_device_get_pcie_info(adev); 3728 3729 if (amdgpu_mcbp) 3730 DRM_INFO("MCBP is enabled\n"); 3731 3732 /* 3733 * Reset domain needs to be present early, before XGMI hive discovered 3734 * (if any) and intitialized to use reset sem and in_gpu reset flag 3735 * early on during init and before calling to RREG32. 
3736 */ 3737 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3738 if (!adev->reset_domain) 3739 return -ENOMEM; 3740 3741 /* detect hw virtualization here */ 3742 amdgpu_detect_virtualization(adev); 3743 3744 r = amdgpu_device_get_job_timeout_settings(adev); 3745 if (r) { 3746 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3747 return r; 3748 } 3749 3750 /* early init functions */ 3751 r = amdgpu_device_ip_early_init(adev); 3752 if (r) 3753 return r; 3754 3755 /* Get rid of things like offb */ 3756 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3757 if (r) 3758 return r; 3759 3760 /* Enable TMZ based on IP_VERSION */ 3761 amdgpu_gmc_tmz_set(adev); 3762 3763 amdgpu_gmc_noretry_set(adev); 3764 /* Need to get xgmi info early to decide the reset behavior*/ 3765 if (adev->gmc.xgmi.supported) { 3766 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3767 if (r) 3768 return r; 3769 } 3770 3771 /* enable PCIE atomic ops */ 3772 #ifdef notyet 3773 if (amdgpu_sriov_vf(adev)) 3774 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3775 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3776 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3777 else 3778 adev->have_atomics_support = 3779 !pci_enable_atomic_ops_to_root(adev->pdev, 3780 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3781 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3782 if (!adev->have_atomics_support) 3783 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3784 #else 3785 adev->have_atomics_support = false; 3786 #endif 3787 3788 /* doorbell bar mapping and doorbell index init*/ 3789 amdgpu_device_doorbell_init(adev); 3790 3791 if (amdgpu_emu_mode == 1) { 3792 /* post the asic on emulation mode */ 3793 emu_soc_asic_init(adev); 3794 goto fence_driver_init; 3795 } 3796 3797 amdgpu_reset_init(adev); 3798 3799 /* detect if we are with an SRIOV vbios */ 3800 amdgpu_device_detect_sriov_bios(adev); 3801 3802 /* check if we need to reset the asic 3803 * E.g., driver was not cleanly unloaded previously, etc. 
3804 */ 3805 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3806 if (adev->gmc.xgmi.num_physical_nodes) { 3807 dev_info(adev->dev, "Pending hive reset.\n"); 3808 adev->gmc.xgmi.pending_reset = true; 3809 /* Only need to init necessary block for SMU to handle the reset */ 3810 for (i = 0; i < adev->num_ip_blocks; i++) { 3811 if (!adev->ip_blocks[i].status.valid) 3812 continue; 3813 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3814 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3815 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3816 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3817 DRM_DEBUG("IP %s disabled for hw_init.\n", 3818 adev->ip_blocks[i].version->funcs->name); 3819 adev->ip_blocks[i].status.hw = true; 3820 } 3821 } 3822 } else { 3823 r = amdgpu_asic_reset(adev); 3824 if (r) { 3825 dev_err(adev->dev, "asic reset on init failed\n"); 3826 goto failed; 3827 } 3828 } 3829 } 3830 3831 pci_enable_pcie_error_reporting(adev->pdev); 3832 3833 /* Post card if necessary */ 3834 if (amdgpu_device_need_post(adev)) { 3835 if (!adev->bios) { 3836 dev_err(adev->dev, "no vBIOS found\n"); 3837 r = -EINVAL; 3838 goto failed; 3839 } 3840 DRM_INFO("GPU posting now...\n"); 3841 r = amdgpu_device_asic_init(adev); 3842 if (r) { 3843 dev_err(adev->dev, "gpu post error!\n"); 3844 goto failed; 3845 } 3846 } 3847 3848 if (adev->is_atom_fw) { 3849 /* Initialize clocks */ 3850 r = amdgpu_atomfirmware_get_clock_info(adev); 3851 if (r) { 3852 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3853 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3854 goto failed; 3855 } 3856 } else { 3857 /* Initialize clocks */ 3858 r = amdgpu_atombios_get_clock_info(adev); 3859 if (r) { 3860 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3861 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3862 goto failed; 3863 } 3864 /* init i2c buses */ 3865 if (!amdgpu_device_has_dc_support(adev)) 3866 amdgpu_atombios_i2c_init(adev); 3867 } 3868 3869 fence_driver_init: 3870 /* Fence driver */ 3871 r = amdgpu_fence_driver_sw_init(adev); 3872 if (r) { 3873 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3874 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3875 goto failed; 3876 } 3877 3878 /* init the mode config */ 3879 drm_mode_config_init(adev_to_drm(adev)); 3880 3881 r = amdgpu_device_ip_init(adev); 3882 if (r) { 3883 /* failed in exclusive mode due to timeout */ 3884 if (amdgpu_sriov_vf(adev) && 3885 !amdgpu_sriov_runtime(adev) && 3886 amdgpu_virt_mmio_blocked(adev) && 3887 !amdgpu_virt_wait_reset(adev)) { 3888 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3889 /* Don't send request since VF is inactive. 
*/ 3890 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3891 adev->virt.ops = NULL; 3892 r = -EAGAIN; 3893 goto release_ras_con; 3894 } 3895 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3896 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3897 goto release_ras_con; 3898 } 3899 3900 amdgpu_fence_driver_hw_init(adev); 3901 3902 dev_info(adev->dev, 3903 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3904 adev->gfx.config.max_shader_engines, 3905 adev->gfx.config.max_sh_per_se, 3906 adev->gfx.config.max_cu_per_sh, 3907 adev->gfx.cu_info.number); 3908 3909 #ifdef __OpenBSD__ 3910 { 3911 const char *chip_name; 3912 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3913 int maj, min, rev; 3914 3915 switch (adev->asic_type) { 3916 case CHIP_RAVEN: 3917 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3918 chip_name = "RAVEN2"; 3919 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3920 chip_name = "PICASSO"; 3921 else 3922 chip_name = "RAVEN"; 3923 break; 3924 case CHIP_RENOIR: 3925 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3926 chip_name = "RENOIR"; 3927 else 3928 chip_name = "GREEN_SARDINE"; 3929 break; 3930 default: 3931 chip_name = amdgpu_asic_name[adev->asic_type]; 3932 } 3933 3934 printf("%s: %s", adev->self.dv_xname, chip_name); 3935 /* show graphics/compute ip block version, not set on < GFX9 */ 3936 if (version) { 3937 maj = IP_VERSION_MAJ(version); 3938 min = IP_VERSION_MIN(version); 3939 rev = IP_VERSION_REV(version); 3940 printf(" GC %d.%d.%d", maj, min, rev); 3941 } 3942 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3943 } 3944 #endif 3945 3946 adev->accel_working = true; 3947 3948 amdgpu_vm_check_compute_bug(adev); 3949 3950 /* Initialize the buffer migration limit. */ 3951 if (amdgpu_moverate >= 0) 3952 max_MBps = amdgpu_moverate; 3953 else 3954 max_MBps = 8; /* Allow 8 MB/s. */ 3955 /* Get a log2 for easy divisions. */ 3956 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3957 3958 r = amdgpu_pm_sysfs_init(adev); 3959 if (r) { 3960 adev->pm_sysfs_en = false; 3961 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3962 } else 3963 adev->pm_sysfs_en = true; 3964 3965 r = amdgpu_ucode_sysfs_init(adev); 3966 if (r) { 3967 adev->ucode_sysfs_en = false; 3968 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3969 } else 3970 adev->ucode_sysfs_en = true; 3971 3972 r = amdgpu_psp_sysfs_init(adev); 3973 if (r) { 3974 adev->psp_sysfs_en = false; 3975 if (!amdgpu_sriov_vf(adev)) 3976 DRM_ERROR("Creating psp sysfs failed\n"); 3977 } else 3978 adev->psp_sysfs_en = true; 3979 3980 /* 3981 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3982 * Otherwise the mgpu fan boost feature will be skipped due to the 3983 * gpu instance is counted less. 3984 */ 3985 amdgpu_register_gpu_instance(adev); 3986 3987 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3988 * explicit gating rather than handling it automatically. 3989 */ 3990 if (!adev->gmc.xgmi.pending_reset) { 3991 r = amdgpu_device_ip_late_init(adev); 3992 if (r) { 3993 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3994 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3995 goto release_ras_con; 3996 } 3997 /* must succeed. 
*/ 3998 amdgpu_ras_resume(adev); 3999 queue_delayed_work(system_wq, &adev->delayed_init_work, 4000 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4001 } 4002 4003 if (amdgpu_sriov_vf(adev)) 4004 flush_delayed_work(&adev->delayed_init_work); 4005 4006 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4007 if (r) 4008 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4009 4010 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4011 r = amdgpu_pmu_init(adev); 4012 if (r) 4013 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4014 4015 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4016 if (amdgpu_device_cache_pci_state(adev->pdev)) 4017 pci_restore_state(pdev); 4018 4019 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4020 /* this will fail for cards that aren't VGA class devices, just 4021 * ignore it */ 4022 #ifdef notyet 4023 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4024 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4025 #endif 4026 4027 px = amdgpu_device_supports_px(ddev); 4028 4029 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4030 apple_gmux_detect(NULL, NULL))) 4031 vga_switcheroo_register_client(adev->pdev, 4032 &amdgpu_switcheroo_ops, px); 4033 4034 if (px) 4035 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4036 4037 if (adev->gmc.xgmi.pending_reset) 4038 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4039 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4040 4041 amdgpu_device_check_iommu_direct_map(adev); 4042 4043 return 0; 4044 4045 release_ras_con: 4046 amdgpu_release_ras_context(adev); 4047 4048 failed: 4049 amdgpu_vf_error_trans_all(adev); 4050 4051 return r; 4052 } 4053 4054 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4055 { 4056 STUB(); 4057 #ifdef notyet 4058 /* Clear all CPU mappings pointing to this device */ 4059 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4060 #endif 4061 4062 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4063 amdgpu_device_doorbell_fini(adev); 4064 4065 #ifdef __linux__ 4066 iounmap(adev->rmmio); 4067 adev->rmmio = NULL; 4068 if (adev->mman.aper_base_kaddr) 4069 iounmap(adev->mman.aper_base_kaddr); 4070 adev->mman.aper_base_kaddr = NULL; 4071 #else 4072 if (adev->rmmio_size > 0) 4073 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4074 adev->rmmio_size); 4075 adev->rmmio_size = 0; 4076 adev->rmmio = NULL; 4077 if (adev->mman.aper_base_kaddr) 4078 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4079 adev->gmc.visible_vram_size); 4080 adev->mman.aper_base_kaddr = NULL; 4081 #endif 4082 4083 /* Memory manager related */ 4084 if (!adev->gmc.xgmi.connected_to_cpu) { 4085 #ifdef __linux__ 4086 arch_phys_wc_del(adev->gmc.vram_mtrr); 4087 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4088 #else 4089 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4090 #endif 4091 } 4092 } 4093 4094 /** 4095 * amdgpu_device_fini_hw - tear down the driver 4096 * 4097 * @adev: amdgpu_device pointer 4098 * 4099 * Tear down the driver info (all asics). 4100 * Called at driver shutdown. 
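 * Hardware-side teardown only; the remaining software state is released afterwards by amdgpu_device_fini_sw().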
4101 */ 4102 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4103 { 4104 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4105 flush_delayed_work(&adev->delayed_init_work); 4106 adev->shutdown = true; 4107 4108 /* make sure IB test finished before entering exclusive mode 4109 * to avoid preemption on IB test 4110 * */ 4111 if (amdgpu_sriov_vf(adev)) { 4112 amdgpu_virt_request_full_gpu(adev, false); 4113 amdgpu_virt_fini_data_exchange(adev); 4114 } 4115 4116 /* disable all interrupts */ 4117 amdgpu_irq_disable_all(adev); 4118 if (adev->mode_info.mode_config_initialized){ 4119 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4120 drm_helper_force_disable_all(adev_to_drm(adev)); 4121 else 4122 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4123 } 4124 amdgpu_fence_driver_hw_fini(adev); 4125 4126 if (adev->mman.initialized) { 4127 flush_delayed_work(&adev->mman.bdev.wq); 4128 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4129 } 4130 4131 if (adev->pm_sysfs_en) 4132 amdgpu_pm_sysfs_fini(adev); 4133 if (adev->ucode_sysfs_en) 4134 amdgpu_ucode_sysfs_fini(adev); 4135 if (adev->psp_sysfs_en) 4136 amdgpu_psp_sysfs_fini(adev); 4137 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4138 4139 /* disable ras feature must before hw fini */ 4140 amdgpu_ras_pre_fini(adev); 4141 4142 amdgpu_device_ip_fini_early(adev); 4143 4144 amdgpu_irq_fini_hw(adev); 4145 4146 if (adev->mman.initialized) 4147 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4148 4149 amdgpu_gart_dummy_page_fini(adev); 4150 4151 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4152 amdgpu_device_unmap_mmio(adev); 4153 4154 } 4155 4156 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4157 { 4158 int idx; 4159 bool px; 4160 4161 amdgpu_fence_driver_sw_fini(adev); 4162 amdgpu_device_ip_fini(adev); 4163 release_firmware(adev->firmware.gpu_info_fw); 4164 adev->firmware.gpu_info_fw = NULL; 4165 adev->accel_working = false; 4166 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4167 4168 amdgpu_reset_fini(adev); 4169 4170 /* free i2c buses */ 4171 if (!amdgpu_device_has_dc_support(adev)) 4172 amdgpu_i2c_fini(adev); 4173 4174 if (amdgpu_emu_mode != 1) 4175 amdgpu_atombios_fini(adev); 4176 4177 kfree(adev->bios); 4178 adev->bios = NULL; 4179 4180 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4181 4182 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4183 apple_gmux_detect(NULL, NULL))) 4184 vga_switcheroo_unregister_client(adev->pdev); 4185 4186 if (px) 4187 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4188 4189 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4190 vga_client_unregister(adev->pdev); 4191 4192 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4193 #ifdef __linux__ 4194 iounmap(adev->rmmio); 4195 adev->rmmio = NULL; 4196 #else 4197 if (adev->rmmio_size > 0) 4198 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4199 adev->rmmio_size); 4200 adev->rmmio_size = 0; 4201 adev->rmmio = NULL; 4202 #endif 4203 amdgpu_device_doorbell_fini(adev); 4204 drm_dev_exit(idx); 4205 } 4206 4207 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4208 amdgpu_pmu_fini(adev); 4209 if (adev->mman.discovery_bin) 4210 amdgpu_discovery_fini(adev); 4211 4212 amdgpu_reset_put_reset_domain(adev->reset_domain); 4213 adev->reset_domain = NULL; 4214 4215 kfree(adev->pci_state); 4216 4217 } 4218 4219 /** 4220 * amdgpu_device_evict_resources - evict device resources 4221 * @adev: amdgpu device object 4222 * 4223 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4224 * of the vram memory type. 
Mainly used for evicting device resources 4225 * at suspend time. 4226 * 4227 */ 4228 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4229 { 4230 int ret; 4231 4232 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4233 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4234 return 0; 4235 4236 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4237 if (ret) 4238 DRM_WARN("evicting device resources failed\n"); 4239 return ret; 4240 } 4241 4242 /* 4243 * Suspend & resume. 4244 */ 4245 /** 4246 * amdgpu_device_suspend - initiate device suspend 4247 * 4248 * @dev: drm dev pointer 4249 * @fbcon : notify the fbdev of suspend 4250 * 4251 * Puts the hw in the suspend state (all asics). 4252 * Returns 0 for success or an error on failure. 4253 * Called at driver suspend. 4254 */ 4255 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4256 { 4257 struct amdgpu_device *adev = drm_to_adev(dev); 4258 int r = 0; 4259 4260 if (adev->shutdown) 4261 return 0; 4262 4263 #ifdef notyet 4264 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4265 return 0; 4266 #endif 4267 4268 adev->in_suspend = true; 4269 4270 if (amdgpu_sriov_vf(adev)) { 4271 amdgpu_virt_fini_data_exchange(adev); 4272 r = amdgpu_virt_request_full_gpu(adev, false); 4273 if (r) 4274 return r; 4275 } 4276 4277 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4278 DRM_WARN("smart shift update failed\n"); 4279 4280 drm_kms_helper_poll_disable(dev); 4281 4282 if (fbcon) 4283 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4284 4285 cancel_delayed_work_sync(&adev->delayed_init_work); 4286 4287 amdgpu_ras_suspend(adev); 4288 4289 amdgpu_device_ip_suspend_phase1(adev); 4290 4291 if (!adev->in_s0ix) 4292 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4293 4294 r = amdgpu_device_evict_resources(adev); 4295 if (r) 4296 return r; 4297 4298 amdgpu_fence_driver_hw_fini(adev); 4299 4300 amdgpu_device_ip_suspend_phase2(adev); 4301 4302 if (amdgpu_sriov_vf(adev)) 4303 amdgpu_virt_release_full_gpu(adev, false); 4304 4305 return 0; 4306 } 4307 4308 /** 4309 * amdgpu_device_resume - initiate device resume 4310 * 4311 * @dev: drm dev pointer 4312 * @fbcon : notify the fbdev of resume 4313 * 4314 * Bring the hw back to operating state (all asics). 4315 * Returns 0 for success or an error on failure. 4316 * Called at driver resume. 
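 * For SR-IOV, full GPU access is requested from the host first; for S0ix resume, the GFX power state is moved back to D0 before the IP blocks are resumed.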
4317 */ 4318 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4319 { 4320 struct amdgpu_device *adev = drm_to_adev(dev); 4321 int r = 0; 4322 4323 if (amdgpu_sriov_vf(adev)) { 4324 r = amdgpu_virt_request_full_gpu(adev, true); 4325 if (r) 4326 return r; 4327 } 4328 4329 #ifdef notyet 4330 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4331 return 0; 4332 #endif 4333 4334 if (adev->in_s0ix) 4335 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4336 4337 /* post card */ 4338 if (amdgpu_device_need_post(adev)) { 4339 r = amdgpu_device_asic_init(adev); 4340 if (r) 4341 dev_err(adev->dev, "amdgpu asic init failed\n"); 4342 } 4343 4344 r = amdgpu_device_ip_resume(adev); 4345 4346 /* no matter what r is, always need to properly release full GPU */ 4347 if (amdgpu_sriov_vf(adev)) { 4348 amdgpu_virt_init_data_exchange(adev); 4349 amdgpu_virt_release_full_gpu(adev, true); 4350 } 4351 4352 if (r) { 4353 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4354 return r; 4355 } 4356 amdgpu_fence_driver_hw_init(adev); 4357 4358 r = amdgpu_device_ip_late_init(adev); 4359 if (r) 4360 return r; 4361 4362 queue_delayed_work(system_wq, &adev->delayed_init_work, 4363 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4364 4365 if (!adev->in_s0ix) { 4366 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4367 if (r) 4368 return r; 4369 } 4370 4371 /* Make sure IB tests flushed */ 4372 flush_delayed_work(&adev->delayed_init_work); 4373 4374 if (adev->in_s0ix) { 4375 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4376 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 4377 */ 4378 amdgpu_gfx_off_ctrl(adev, true); 4379 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4380 } 4381 if (fbcon) 4382 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4383 4384 drm_kms_helper_poll_enable(dev); 4385 4386 amdgpu_ras_resume(adev); 4387 4388 /* 4389 * Most of the connector probing functions try to acquire runtime pm 4390 * refs to ensure that the GPU is powered on when connector polling is 4391 * performed. Since we're calling this from a runtime PM callback, 4392 * trying to acquire rpm refs will cause us to deadlock. 4393 * 4394 * Since we're guaranteed to be holding the rpm lock, it's safe to 4395 * temporarily disable the rpm helpers so this doesn't deadlock us. 4396 */ 4397 #if defined(CONFIG_PM) && defined(__linux__) 4398 dev->dev->power.disable_depth++; 4399 #endif 4400 if (!amdgpu_device_has_dc_support(adev)) 4401 drm_helper_hpd_irq_event(dev); 4402 else 4403 drm_kms_helper_hotplug_event(dev); 4404 #if defined(CONFIG_PM) && defined(__linux__) 4405 dev->dev->power.disable_depth--; 4406 #endif 4407 adev->in_suspend = false; 4408 4409 if (adev->enable_mes) 4410 amdgpu_mes_self_test(adev); 4411 4412 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4413 DRM_WARN("smart shift update failed\n"); 4414 4415 return 0; 4416 } 4417 4418 /** 4419 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4420 * 4421 * @adev: amdgpu_device pointer 4422 * 4423 * The list of all the hardware IPs that make up the asic is walked and 4424 * the check_soft_reset callbacks are run. check_soft_reset determines 4425 * if the asic is still hung or not. 4426 * Returns true if any of the IPs are still in a hung state, false if not. 
4427 */ 4428 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4429 { 4430 int i; 4431 bool asic_hang = false; 4432 4433 if (amdgpu_sriov_vf(adev)) 4434 return true; 4435 4436 if (amdgpu_asic_need_full_reset(adev)) 4437 return true; 4438 4439 for (i = 0; i < adev->num_ip_blocks; i++) { 4440 if (!adev->ip_blocks[i].status.valid) 4441 continue; 4442 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4443 adev->ip_blocks[i].status.hang = 4444 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4445 if (adev->ip_blocks[i].status.hang) { 4446 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4447 asic_hang = true; 4448 } 4449 } 4450 return asic_hang; 4451 } 4452 4453 /** 4454 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4455 * 4456 * @adev: amdgpu_device pointer 4457 * 4458 * The list of all the hardware IPs that make up the asic is walked and the 4459 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4460 * handles any IP specific hardware or software state changes that are 4461 * necessary for a soft reset to succeed. 4462 * Returns 0 on success, negative error code on failure. 4463 */ 4464 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4465 { 4466 int i, r = 0; 4467 4468 for (i = 0; i < adev->num_ip_blocks; i++) { 4469 if (!adev->ip_blocks[i].status.valid) 4470 continue; 4471 if (adev->ip_blocks[i].status.hang && 4472 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4473 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4474 if (r) 4475 return r; 4476 } 4477 } 4478 4479 return 0; 4480 } 4481 4482 /** 4483 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4484 * 4485 * @adev: amdgpu_device pointer 4486 * 4487 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4488 * reset is necessary to recover. 4489 * Returns true if a full asic reset is required, false if not. 4490 */ 4491 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4492 { 4493 int i; 4494 4495 if (amdgpu_asic_need_full_reset(adev)) 4496 return true; 4497 4498 for (i = 0; i < adev->num_ip_blocks; i++) { 4499 if (!adev->ip_blocks[i].status.valid) 4500 continue; 4501 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4502 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4503 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4504 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4505 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4506 if (adev->ip_blocks[i].status.hang) { 4507 dev_info(adev->dev, "Some block need full reset!\n"); 4508 return true; 4509 } 4510 } 4511 } 4512 return false; 4513 } 4514 4515 /** 4516 * amdgpu_device_ip_soft_reset - do a soft reset 4517 * 4518 * @adev: amdgpu_device pointer 4519 * 4520 * The list of all the hardware IPs that make up the asic is walked and the 4521 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4522 * IP specific hardware or software state changes that are necessary to soft 4523 * reset the IP. 4524 * Returns 0 on success, negative error code on failure. 
4525 */ 4526 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4527 { 4528 int i, r = 0; 4529 4530 for (i = 0; i < adev->num_ip_blocks; i++) { 4531 if (!adev->ip_blocks[i].status.valid) 4532 continue; 4533 if (adev->ip_blocks[i].status.hang && 4534 adev->ip_blocks[i].version->funcs->soft_reset) { 4535 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4536 if (r) 4537 return r; 4538 } 4539 } 4540 4541 return 0; 4542 } 4543 4544 /** 4545 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4546 * 4547 * @adev: amdgpu_device pointer 4548 * 4549 * The list of all the hardware IPs that make up the asic is walked and the 4550 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4551 * handles any IP specific hardware or software state changes that are 4552 * necessary after the IP has been soft reset. 4553 * Returns 0 on success, negative error code on failure. 4554 */ 4555 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4556 { 4557 int i, r = 0; 4558 4559 for (i = 0; i < adev->num_ip_blocks; i++) { 4560 if (!adev->ip_blocks[i].status.valid) 4561 continue; 4562 if (adev->ip_blocks[i].status.hang && 4563 adev->ip_blocks[i].version->funcs->post_soft_reset) 4564 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4565 if (r) 4566 return r; 4567 } 4568 4569 return 0; 4570 } 4571 4572 /** 4573 * amdgpu_device_recover_vram - Recover some VRAM contents 4574 * 4575 * @adev: amdgpu_device pointer 4576 * 4577 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4578 * restore things like GPUVM page tables after a GPU reset where 4579 * the contents of VRAM might be lost. 4580 * 4581 * Returns: 4582 * 0 on success, negative error code on failure. 4583 */ 4584 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4585 { 4586 struct dma_fence *fence = NULL, *next = NULL; 4587 struct amdgpu_bo *shadow; 4588 struct amdgpu_bo_vm *vmbo; 4589 long r = 1, tmo; 4590 4591 if (amdgpu_sriov_runtime(adev)) 4592 tmo = msecs_to_jiffies(8000); 4593 else 4594 tmo = msecs_to_jiffies(100); 4595 4596 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4597 mutex_lock(&adev->shadow_list_lock); 4598 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4599 shadow = &vmbo->bo; 4600 /* No need to recover an evicted BO */ 4601 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4602 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4603 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4604 continue; 4605 4606 r = amdgpu_bo_restore_shadow(shadow, &next); 4607 if (r) 4608 break; 4609 4610 if (fence) { 4611 tmo = dma_fence_wait_timeout(fence, false, tmo); 4612 dma_fence_put(fence); 4613 fence = next; 4614 if (tmo == 0) { 4615 r = -ETIMEDOUT; 4616 break; 4617 } else if (tmo < 0) { 4618 r = tmo; 4619 break; 4620 } 4621 } else { 4622 fence = next; 4623 } 4624 } 4625 mutex_unlock(&adev->shadow_list_lock); 4626 4627 if (fence) 4628 tmo = dma_fence_wait_timeout(fence, false, tmo); 4629 dma_fence_put(fence); 4630 4631 if (r < 0 || tmo <= 0) { 4632 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4633 return -EIO; 4634 } 4635 4636 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4637 return 0; 4638 } 4639 4640 4641 /** 4642 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4643 * 4644 * @adev: amdgpu_device pointer 4645 * @from_hypervisor: request from hypervisor 4646 * 4647 * do VF FLR and reinitialize Asic 4648 * return 0 means succeeded 
otherwise failed 4649 */ 4650 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4651 bool from_hypervisor) 4652 { 4653 int r; 4654 struct amdgpu_hive_info *hive = NULL; 4655 int retry_limit = 0; 4656 4657 retry: 4658 amdgpu_amdkfd_pre_reset(adev); 4659 4660 if (from_hypervisor) 4661 r = amdgpu_virt_request_full_gpu(adev, true); 4662 else 4663 r = amdgpu_virt_reset_gpu(adev); 4664 if (r) 4665 return r; 4666 4667 /* Resume IP prior to SMC */ 4668 r = amdgpu_device_ip_reinit_early_sriov(adev); 4669 if (r) 4670 goto error; 4671 4672 amdgpu_virt_init_data_exchange(adev); 4673 4674 r = amdgpu_device_fw_loading(adev); 4675 if (r) 4676 return r; 4677 4678 /* now we are okay to resume SMC/CP/SDMA */ 4679 r = amdgpu_device_ip_reinit_late_sriov(adev); 4680 if (r) 4681 goto error; 4682 4683 hive = amdgpu_get_xgmi_hive(adev); 4684 /* Update PSP FW topology after reset */ 4685 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4686 r = amdgpu_xgmi_update_topology(hive, adev); 4687 4688 if (hive) 4689 amdgpu_put_xgmi_hive(hive); 4690 4691 if (!r) { 4692 amdgpu_irq_gpu_reset_resume_helper(adev); 4693 r = amdgpu_ib_ring_tests(adev); 4694 4695 amdgpu_amdkfd_post_reset(adev); 4696 } 4697 4698 error: 4699 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4700 amdgpu_inc_vram_lost(adev); 4701 r = amdgpu_device_recover_vram(adev); 4702 } 4703 amdgpu_virt_release_full_gpu(adev, true); 4704 4705 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4706 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4707 retry_limit++; 4708 goto retry; 4709 } else 4710 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4711 } 4712 4713 return r; 4714 } 4715 4716 /** 4717 * amdgpu_device_has_job_running - check if there is any job in mirror list 4718 * 4719 * @adev: amdgpu_device pointer 4720 * 4721 * check if there is any job in mirror list 4722 */ 4723 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4724 { 4725 int i; 4726 struct drm_sched_job *job; 4727 4728 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4729 struct amdgpu_ring *ring = adev->rings[i]; 4730 4731 if (!ring || !ring->sched.thread) 4732 continue; 4733 4734 spin_lock(&ring->sched.job_list_lock); 4735 job = list_first_entry_or_null(&ring->sched.pending_list, 4736 struct drm_sched_job, list); 4737 spin_unlock(&ring->sched.job_list_lock); 4738 if (job) 4739 return true; 4740 } 4741 return false; 4742 } 4743 4744 /** 4745 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4746 * 4747 * @adev: amdgpu_device pointer 4748 * 4749 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4750 * a hung GPU. 
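 * With amdgpu_gpu_recovery=-1 (auto), recovery stays disabled on the legacy ASICs listed in the switch below.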
4751 */ 4752 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4753 { 4754 4755 if (amdgpu_gpu_recovery == 0) 4756 goto disabled; 4757 4758 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4759 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4760 return false; 4761 } 4762 4763 if (amdgpu_sriov_vf(adev)) 4764 return true; 4765 4766 if (amdgpu_gpu_recovery == -1) { 4767 switch (adev->asic_type) { 4768 #ifdef CONFIG_DRM_AMDGPU_SI 4769 case CHIP_VERDE: 4770 case CHIP_TAHITI: 4771 case CHIP_PITCAIRN: 4772 case CHIP_OLAND: 4773 case CHIP_HAINAN: 4774 #endif 4775 #ifdef CONFIG_DRM_AMDGPU_CIK 4776 case CHIP_KAVERI: 4777 case CHIP_KABINI: 4778 case CHIP_MULLINS: 4779 #endif 4780 case CHIP_CARRIZO: 4781 case CHIP_STONEY: 4782 case CHIP_CYAN_SKILLFISH: 4783 goto disabled; 4784 default: 4785 break; 4786 } 4787 } 4788 4789 return true; 4790 4791 disabled: 4792 dev_info(adev->dev, "GPU recovery disabled.\n"); 4793 return false; 4794 } 4795 4796 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4797 { 4798 u32 i; 4799 int ret = 0; 4800 4801 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4802 4803 dev_info(adev->dev, "GPU mode1 reset\n"); 4804 4805 /* disable BM */ 4806 pci_clear_master(adev->pdev); 4807 4808 amdgpu_device_cache_pci_state(adev->pdev); 4809 4810 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4811 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4812 ret = amdgpu_dpm_mode1_reset(adev); 4813 } else { 4814 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4815 ret = psp_gpu_reset(adev); 4816 } 4817 4818 if (ret) 4819 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4820 4821 amdgpu_device_load_pci_state(adev->pdev); 4822 4823 /* wait for asic to come out of reset */ 4824 for (i = 0; i < adev->usec_timeout; i++) { 4825 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4826 4827 if (memsize != 0xffffffff) 4828 break; 4829 udelay(1); 4830 } 4831 4832 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4833 return ret; 4834 } 4835 4836 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4837 struct amdgpu_reset_context *reset_context) 4838 { 4839 int i, r = 0; 4840 struct amdgpu_job *job = NULL; 4841 bool need_full_reset = 4842 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4843 4844 if (reset_context->reset_req_dev == adev) 4845 job = reset_context->job; 4846 4847 if (amdgpu_sriov_vf(adev)) { 4848 /* stop the data exchange thread */ 4849 amdgpu_virt_fini_data_exchange(adev); 4850 } 4851 4852 amdgpu_fence_driver_isr_toggle(adev, true); 4853 4854 /* block all schedulers and reset given job's ring */ 4855 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4856 struct amdgpu_ring *ring = adev->rings[i]; 4857 4858 if (!ring || !ring->sched.thread) 4859 continue; 4860 4861 /*clear job fence from fence drv to avoid force_completion 4862 *leave NULL and vm flush fence in fence drv */ 4863 amdgpu_fence_driver_clear_job_fences(ring); 4864 4865 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4866 amdgpu_fence_driver_force_completion(ring); 4867 } 4868 4869 amdgpu_fence_driver_isr_toggle(adev, false); 4870 4871 if (job && job->vm) 4872 drm_sched_increase_karma(&job->base); 4873 4874 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4875 /* If reset handler not implemented, continue; otherwise return */ 4876 if (r == -ENOSYS) 4877 r = 0; 4878 else 4879 return r; 4880 4881 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4882 if (!amdgpu_sriov_vf(adev)) { 4883 4884 if (!need_full_reset) 4885 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4886 4887 if (!need_full_reset && amdgpu_gpu_recovery) { 4888 amdgpu_device_ip_pre_soft_reset(adev); 4889 r = amdgpu_device_ip_soft_reset(adev); 4890 amdgpu_device_ip_post_soft_reset(adev); 4891 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4892 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4893 need_full_reset = true; 4894 } 4895 } 4896 4897 if (need_full_reset) 4898 r = amdgpu_device_ip_suspend(adev); 4899 if (need_full_reset) 4900 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4901 else 4902 clear_bit(AMDGPU_NEED_FULL_RESET, 4903 &reset_context->flags); 4904 } 4905 4906 return r; 4907 } 4908 4909 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4910 { 4911 int i; 4912 4913 lockdep_assert_held(&adev->reset_domain->sem); 4914 4915 for (i = 0; i < adev->num_regs; i++) { 4916 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4917 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4918 adev->reset_dump_reg_value[i]); 4919 } 4920 4921 return 0; 4922 } 4923 4924 #ifdef CONFIG_DEV_COREDUMP 4925 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4926 size_t count, void *data, size_t datalen) 4927 { 4928 struct drm_printer p; 4929 struct amdgpu_device *adev = data; 4930 struct drm_print_iterator iter; 4931 int i; 4932 4933 iter.data = buffer; 4934 iter.offset = 0; 4935 iter.start = offset; 4936 iter.remain = count; 4937 4938 p = drm_coredump_printer(&iter); 4939 4940 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4941 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4942 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4943 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4944 if (adev->reset_task_info.pid) 4945 drm_printf(&p, "process_name: %s PID: %d\n", 4946 adev->reset_task_info.process_name, 4947 adev->reset_task_info.pid); 4948 4949 if (adev->reset_vram_lost) 4950 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4951 if (adev->num_regs) { 4952 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4953 4954 for (i = 0; i < adev->num_regs; i++) 4955 drm_printf(&p, "0x%08x: 0x%08x\n", 4956 adev->reset_dump_reg_list[i], 4957 adev->reset_dump_reg_value[i]); 4958 } 4959 4960 return count - iter.remain; 4961 } 4962 4963 static void amdgpu_devcoredump_free(void *data) 4964 { 4965 } 4966 4967 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4968 { 4969 struct drm_device *dev = adev_to_drm(adev); 4970 4971 ktime_get_ts64(&adev->reset_time); 4972 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4973 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4974 } 4975 #endif 4976 4977 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4978 struct amdgpu_reset_context *reset_context) 4979 { 4980 struct amdgpu_device *tmp_adev = NULL; 4981 bool need_full_reset, skip_hw_reset, vram_lost = false; 4982 int r = 0; 4983 bool gpu_reset_for_dev_remove = 0; 4984 4985 /* Try reset handler method first */ 4986 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4987 reset_list); 4988 amdgpu_reset_reg_dumps(tmp_adev); 4989 4990 reset_context->reset_device_list = device_list_handle; 4991 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4992 /* If reset handler not implemented, continue; otherwise return */ 4993 if (r == -ENOSYS) 4994 r = 0; 4995 else 4996 return r; 4997 4998 /* Reset handler not implemented, use the default method */ 4999 
need_full_reset = 5000 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5001 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5002 5003 gpu_reset_for_dev_remove = 5004 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5005 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5006 5007 /* 5008 * ASIC reset has to be done on all XGMI hive nodes ASAP 5009 * to allow proper links negotiation in FW (within 1 sec) 5010 */ 5011 if (!skip_hw_reset && need_full_reset) { 5012 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5013 /* For XGMI run all resets in parallel to speed up the process */ 5014 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5015 tmp_adev->gmc.xgmi.pending_reset = false; 5016 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5017 r = -EALREADY; 5018 } else 5019 r = amdgpu_asic_reset(tmp_adev); 5020 5021 if (r) { 5022 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5023 r, adev_to_drm(tmp_adev)->unique); 5024 break; 5025 } 5026 } 5027 5028 /* For XGMI wait for all resets to complete before proceed */ 5029 if (!r) { 5030 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5031 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5032 flush_work(&tmp_adev->xgmi_reset_work); 5033 r = tmp_adev->asic_reset_res; 5034 if (r) 5035 break; 5036 } 5037 } 5038 } 5039 } 5040 5041 if (!r && amdgpu_ras_intr_triggered()) { 5042 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5043 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5044 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5045 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5046 } 5047 5048 amdgpu_ras_intr_cleared(); 5049 } 5050 5051 /* Since the mode1 reset affects base ip blocks, the 5052 * phase1 ip blocks need to be resumed. Otherwise there 5053 * will be a BIOS signature error and the psp bootloader 5054 * can't load kdb on the next amdgpu install. 5055 */ 5056 if (gpu_reset_for_dev_remove) { 5057 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5058 amdgpu_device_ip_resume_phase1(tmp_adev); 5059 5060 goto end; 5061 } 5062 5063 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5064 if (need_full_reset) { 5065 /* post card */ 5066 r = amdgpu_device_asic_init(tmp_adev); 5067 if (r) { 5068 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5069 } else { 5070 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5071 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5072 if (r) 5073 goto out; 5074 5075 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5076 if (r) 5077 goto out; 5078 5079 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5080 #ifdef CONFIG_DEV_COREDUMP 5081 tmp_adev->reset_vram_lost = vram_lost; 5082 memset(&tmp_adev->reset_task_info, 0, 5083 sizeof(tmp_adev->reset_task_info)); 5084 if (reset_context->job && reset_context->job->vm) 5085 tmp_adev->reset_task_info = 5086 reset_context->job->vm->task_info; 5087 amdgpu_reset_capture_coredumpm(tmp_adev); 5088 #endif 5089 if (vram_lost) { 5090 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5091 amdgpu_inc_vram_lost(tmp_adev); 5092 } 5093 5094 r = amdgpu_device_fw_loading(tmp_adev); 5095 if (r) 5096 return r; 5097 5098 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5099 if (r) 5100 goto out; 5101 5102 if (vram_lost) 5103 amdgpu_device_fill_reset_magic(tmp_adev); 5104 5105 /* 5106 * Add this ASIC as tracked as reset was already 5107 * complete successfully. 
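 * The instance was unregistered before the reset started (see amdgpu_device_gpu_recover()).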
5108 */ 5109 amdgpu_register_gpu_instance(tmp_adev); 5110 5111 if (!reset_context->hive && 5112 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5113 amdgpu_xgmi_add_device(tmp_adev); 5114 5115 r = amdgpu_device_ip_late_init(tmp_adev); 5116 if (r) 5117 goto out; 5118 5119 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5120 5121 /* 5122 * The GPU enters bad state once faulty pages 5123 * by ECC has reached the threshold, and ras 5124 * recovery is scheduled next. So add one check 5125 * here to break recovery if it indeed exceeds 5126 * bad page threshold, and remind user to 5127 * retire this GPU or setting one bigger 5128 * bad_page_threshold value to fix this once 5129 * probing driver again. 5130 */ 5131 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5132 /* must succeed. */ 5133 amdgpu_ras_resume(tmp_adev); 5134 } else { 5135 r = -EINVAL; 5136 goto out; 5137 } 5138 5139 /* Update PSP FW topology after reset */ 5140 if (reset_context->hive && 5141 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5142 r = amdgpu_xgmi_update_topology( 5143 reset_context->hive, tmp_adev); 5144 } 5145 } 5146 5147 out: 5148 if (!r) { 5149 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5150 r = amdgpu_ib_ring_tests(tmp_adev); 5151 if (r) { 5152 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5153 need_full_reset = true; 5154 r = -EAGAIN; 5155 goto end; 5156 } 5157 } 5158 5159 if (!r) 5160 r = amdgpu_device_recover_vram(tmp_adev); 5161 else 5162 tmp_adev->asic_reset_res = r; 5163 } 5164 5165 end: 5166 if (need_full_reset) 5167 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5168 else 5169 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5170 return r; 5171 } 5172 5173 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5174 { 5175 5176 switch (amdgpu_asic_reset_method(adev)) { 5177 case AMD_RESET_METHOD_MODE1: 5178 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5179 break; 5180 case AMD_RESET_METHOD_MODE2: 5181 adev->mp1_state = PP_MP1_STATE_RESET; 5182 break; 5183 default: 5184 adev->mp1_state = PP_MP1_STATE_NONE; 5185 break; 5186 } 5187 5188 pci_dev_put(p); 5189 } 5190 5191 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5192 { 5193 amdgpu_vf_error_trans_all(adev); 5194 adev->mp1_state = PP_MP1_STATE_NONE; 5195 } 5196 5197 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5198 { 5199 STUB(); 5200 #ifdef notyet 5201 struct pci_dev *p = NULL; 5202 5203 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5204 adev->pdev->bus->number, 1); 5205 if (p) { 5206 pm_runtime_enable(&(p->dev)); 5207 pm_runtime_resume(&(p->dev)); 5208 } 5209 #endif 5210 } 5211 5212 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5213 { 5214 enum amd_reset_method reset_method; 5215 struct pci_dev *p = NULL; 5216 u64 expires; 5217 5218 /* 5219 * For now, only BACO and mode1 reset are confirmed 5220 * to suffer the audio issue without proper suspended. 
5221 */ 5222 reset_method = amdgpu_asic_reset_method(adev); 5223 if ((reset_method != AMD_RESET_METHOD_BACO) && 5224 (reset_method != AMD_RESET_METHOD_MODE1)) 5225 return -EINVAL; 5226 5227 STUB(); 5228 return -ENOSYS; 5229 #ifdef notyet 5230 5231 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5232 adev->pdev->bus->number, 1); 5233 if (!p) 5234 return -ENODEV; 5235 5236 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5237 if (!expires) 5238 /* 5239 * If we cannot get the audio device autosuspend delay, 5240 * a fixed 4S interval will be used. Considering 3S is 5241 * the audio controller default autosuspend delay setting. 5242 * 4S used here is guaranteed to cover that. 5243 */ 5244 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5245 5246 while (!pm_runtime_status_suspended(&(p->dev))) { 5247 if (!pm_runtime_suspend(&(p->dev))) 5248 break; 5249 5250 if (expires < ktime_get_mono_fast_ns()) { 5251 dev_warn(adev->dev, "failed to suspend display audio\n"); 5252 pci_dev_put(p); 5253 /* TODO: abort the succeeding gpu reset? */ 5254 return -ETIMEDOUT; 5255 } 5256 } 5257 5258 pm_runtime_disable(&(p->dev)); 5259 5260 pci_dev_put(p); 5261 return 0; 5262 #endif 5263 } 5264 5265 static void amdgpu_device_recheck_guilty_jobs( 5266 struct amdgpu_device *adev, struct list_head *device_list_handle, 5267 struct amdgpu_reset_context *reset_context) 5268 { 5269 int i, r = 0; 5270 5271 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5272 struct amdgpu_ring *ring = adev->rings[i]; 5273 int ret = 0; 5274 struct drm_sched_job *s_job; 5275 5276 if (!ring || !ring->sched.thread) 5277 continue; 5278 5279 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5280 struct drm_sched_job, list); 5281 if (s_job == NULL) 5282 continue; 5283 5284 /* clear job's guilty and depend the folowing step to decide the real one */ 5285 drm_sched_reset_karma(s_job); 5286 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5287 5288 if (!s_job->s_fence->parent) { 5289 DRM_WARN("Failed to get a HW fence for job!"); 5290 continue; 5291 } 5292 5293 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5294 if (ret == 0) { /* timeout */ 5295 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5296 ring->sched.name, s_job->id); 5297 5298 5299 amdgpu_fence_driver_isr_toggle(adev, true); 5300 5301 /* Clear this failed job from fence array */ 5302 amdgpu_fence_driver_clear_job_fences(ring); 5303 5304 amdgpu_fence_driver_isr_toggle(adev, false); 5305 5306 /* Since the job won't signal and we go for 5307 * another resubmit drop this parent pointer 5308 */ 5309 dma_fence_put(s_job->s_fence->parent); 5310 s_job->s_fence->parent = NULL; 5311 5312 /* set guilty */ 5313 drm_sched_increase_karma(s_job); 5314 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5315 retry: 5316 /* do hw reset */ 5317 if (amdgpu_sriov_vf(adev)) { 5318 amdgpu_virt_fini_data_exchange(adev); 5319 r = amdgpu_device_reset_sriov(adev, false); 5320 if (r) 5321 adev->asic_reset_res = r; 5322 } else { 5323 clear_bit(AMDGPU_SKIP_HW_RESET, 5324 &reset_context->flags); 5325 r = amdgpu_do_asic_reset(device_list_handle, 5326 reset_context); 5327 if (r && r == -EAGAIN) 5328 goto retry; 5329 } 5330 5331 /* 5332 * add reset counter so that the following 5333 * resubmitted job could flush vmid 5334 */ 5335 atomic_inc(&adev->gpu_reset_counter); 5336 continue; 5337 } 5338 5339 /* got the hw fence, signal finished fence */ 5340 atomic_dec(ring->sched.score); 5341 dma_fence_get(&s_job->s_fence->finished); 5342 dma_fence_signal(&s_job->s_fence->finished); 5343 dma_fence_put(&s_job->s_fence->finished); 5344 5345 /* remove node from list and free the job */ 5346 spin_lock(&ring->sched.job_list_lock); 5347 list_del_init(&s_job->list); 5348 spin_unlock(&ring->sched.job_list_lock); 5349 ring->sched.ops->free_job(s_job); 5350 } 5351 } 5352 5353 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5354 { 5355 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5356 5357 #if defined(CONFIG_DEBUG_FS) 5358 if (!amdgpu_sriov_vf(adev)) 5359 cancel_work(&adev->reset_work); 5360 #endif 5361 5362 if (adev->kfd.dev) 5363 cancel_work(&adev->kfd.reset_work); 5364 5365 if (amdgpu_sriov_vf(adev)) 5366 cancel_work(&adev->virt.flr_work); 5367 5368 if (con && adev->ras_enabled) 5369 cancel_work(&con->recovery_work); 5370 5371 } 5372 5373 5374 /** 5375 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5376 * 5377 * @adev: amdgpu_device pointer 5378 * @job: which job trigger hang 5379 * 5380 * Attempt to reset the GPU if it has hung (all asics). 5381 * Attempt to do soft-reset or full-reset and reinitialize Asic 5382 * Returns 0 for success or an error on failure. 5383 */ 5384 5385 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5386 struct amdgpu_job *job, 5387 struct amdgpu_reset_context *reset_context) 5388 { 5389 struct list_head device_list, *device_list_handle = NULL; 5390 bool job_signaled = false; 5391 struct amdgpu_hive_info *hive = NULL; 5392 struct amdgpu_device *tmp_adev = NULL; 5393 int i, r = 0; 5394 bool need_emergency_restart = false; 5395 bool audio_suspended = false; 5396 int tmp_vram_lost_counter; 5397 bool gpu_reset_for_dev_remove = false; 5398 5399 gpu_reset_for_dev_remove = 5400 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5401 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5402 5403 /* 5404 * Special case: RAS triggered and full reset isn't supported 5405 */ 5406 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5407 5408 /* 5409 * Flush RAM to disk so that after reboot 5410 * the user can read log and see why the system rebooted. 
5411 */ 5412 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5413 DRM_WARN("Emergency reboot."); 5414 5415 #ifdef notyet 5416 ksys_sync_helper(); 5417 emergency_restart(); 5418 #else 5419 panic("emergency_restart"); 5420 #endif 5421 } 5422 5423 dev_info(adev->dev, "GPU %s begin!\n", 5424 need_emergency_restart ? "jobs stop":"reset"); 5425 5426 if (!amdgpu_sriov_vf(adev)) 5427 hive = amdgpu_get_xgmi_hive(adev); 5428 if (hive) 5429 mutex_lock(&hive->hive_lock); 5430 5431 reset_context->job = job; 5432 reset_context->hive = hive; 5433 /* 5434 * Build list of devices to reset. 5435 * In case we are in XGMI hive mode, resort the device list 5436 * to put adev in the 1st position. 5437 */ 5438 INIT_LIST_HEAD(&device_list); 5439 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5440 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5441 list_add_tail(&tmp_adev->reset_list, &device_list); 5442 if (gpu_reset_for_dev_remove && adev->shutdown) 5443 tmp_adev->shutdown = true; 5444 } 5445 if (!list_is_first(&adev->reset_list, &device_list)) 5446 list_rotate_to_front(&adev->reset_list, &device_list); 5447 device_list_handle = &device_list; 5448 } else { 5449 list_add_tail(&adev->reset_list, &device_list); 5450 device_list_handle = &device_list; 5451 } 5452 5453 /* We need to lock reset domain only once both for XGMI and single device */ 5454 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5455 reset_list); 5456 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5457 5458 /* block all schedulers and reset given job's ring */ 5459 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5460 5461 amdgpu_device_set_mp1_state(tmp_adev); 5462 5463 /* 5464 * Try to put the audio codec into suspend state 5465 * before gpu reset started. 5466 * 5467 * Due to the power domain of the graphics device 5468 * is shared with AZ power domain. Without this, 5469 * we may change the audio hardware from behind 5470 * the audio driver's back. That will trigger 5471 * some audio codec errors. 5472 */ 5473 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5474 audio_suspended = true; 5475 5476 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5477 5478 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5479 5480 if (!amdgpu_sriov_vf(tmp_adev)) 5481 amdgpu_amdkfd_pre_reset(tmp_adev); 5482 5483 /* 5484 * Mark these ASICs to be reseted as untracked first 5485 * And add them back after reset completed 5486 */ 5487 amdgpu_unregister_gpu_instance(tmp_adev); 5488 5489 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5490 5491 /* disable ras on ALL IPs */ 5492 if (!need_emergency_restart && 5493 amdgpu_device_ip_need_full_reset(tmp_adev)) 5494 amdgpu_ras_suspend(tmp_adev); 5495 5496 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5497 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5498 5499 if (!ring || !ring->sched.thread) 5500 continue; 5501 5502 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5503 5504 if (need_emergency_restart) 5505 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5506 } 5507 atomic_inc(&tmp_adev->gpu_reset_counter); 5508 } 5509 5510 if (need_emergency_restart) 5511 goto skip_sched_resume; 5512 5513 /* 5514 * Must check guilty signal here since after this point all old 5515 * HW fences are force signaled. 
5516 * 5517 * job->base holds a reference to parent fence 5518 */ 5519 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5520 job_signaled = true; 5521 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5522 goto skip_hw_reset; 5523 } 5524 5525 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5526 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5527 if (gpu_reset_for_dev_remove) { 5528 /* Workaroud for ASICs need to disable SMC first */ 5529 amdgpu_device_smu_fini_early(tmp_adev); 5530 } 5531 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5532 /*TODO Should we stop ?*/ 5533 if (r) { 5534 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5535 r, adev_to_drm(tmp_adev)->unique); 5536 tmp_adev->asic_reset_res = r; 5537 } 5538 5539 /* 5540 * Drop all pending non scheduler resets. Scheduler resets 5541 * were already dropped during drm_sched_stop 5542 */ 5543 amdgpu_device_stop_pending_resets(tmp_adev); 5544 } 5545 5546 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5547 /* Actual ASIC resets if needed.*/ 5548 /* Host driver will handle XGMI hive reset for SRIOV */ 5549 if (amdgpu_sriov_vf(adev)) { 5550 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5551 if (r) 5552 adev->asic_reset_res = r; 5553 5554 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5555 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5556 amdgpu_ras_resume(adev); 5557 } else { 5558 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5559 if (r && r == -EAGAIN) 5560 goto retry; 5561 5562 if (!r && gpu_reset_for_dev_remove) 5563 goto recover_end; 5564 } 5565 5566 skip_hw_reset: 5567 5568 /* Post ASIC reset for all devs .*/ 5569 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5570 5571 /* 5572 * Sometimes a later bad compute job can block a good gfx job as gfx 5573 * and compute ring share internal GC HW mutually. We add an additional 5574 * guilty jobs recheck step to find the real guilty job, it synchronously 5575 * submits and pends for the first job being signaled. If it gets timeout, 5576 * we identify it as a real guilty job. 5577 */ 5578 if (amdgpu_gpu_recovery == 2 && 5579 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5580 amdgpu_device_recheck_guilty_jobs( 5581 tmp_adev, device_list_handle, reset_context); 5582 5583 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5584 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5585 5586 if (!ring || !ring->sched.thread) 5587 continue; 5588 5589 /* No point to resubmit jobs if we didn't HW reset*/ 5590 if (!tmp_adev->asic_reset_res && !job_signaled) 5591 drm_sched_resubmit_jobs(&ring->sched); 5592 5593 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5594 } 5595 5596 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5597 amdgpu_mes_self_test(tmp_adev); 5598 5599 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5600 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5601 } 5602 5603 if (tmp_adev->asic_reset_res) 5604 r = tmp_adev->asic_reset_res; 5605 5606 tmp_adev->asic_reset_res = 0; 5607 5608 if (r) { 5609 /* bad news, how to tell it to userspace ? 
*/ 5610 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5611 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5612 } else { 5613 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5614 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5615 DRM_WARN("smart shift update failed\n"); 5616 } 5617 } 5618 5619 skip_sched_resume: 5620 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5621 /* unlock kfd: SRIOV would do it separately */ 5622 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5623 amdgpu_amdkfd_post_reset(tmp_adev); 5624 5625 /* kfd_post_reset will do nothing if kfd device is not initialized, 5626 * need to bring up kfd here if it was not initialized before 5627 */ 5628 if (!adev->kfd.init_complete) 5629 amdgpu_amdkfd_device_init(adev); 5630 5631 if (audio_suspended) 5632 amdgpu_device_resume_display_audio(tmp_adev); 5633 5634 amdgpu_device_unset_mp1_state(tmp_adev); 5635 } 5636 5637 recover_end: 5638 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5639 reset_list); 5640 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5641 5642 if (hive) { 5643 mutex_unlock(&hive->hive_lock); 5644 amdgpu_put_xgmi_hive(hive); 5645 } 5646 5647 if (r) 5648 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5649 5650 atomic_set(&adev->reset_domain->reset_res, r); 5651 return r; 5652 } 5653 5654 /** 5655 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5656 * 5657 * @adev: amdgpu_device pointer 5658 * 5659 * Fetches and stores in the driver the PCIE capabilities (gen speed 5660 * and lanes) of the slot the device is in. Handles APUs and 5661 * virtualized environments where PCIE config space may not be available.
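 * Both masks can be overridden via the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters.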
5662 */ 5663 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5664 { 5665 struct pci_dev *pdev; 5666 enum pci_bus_speed speed_cap, platform_speed_cap; 5667 enum pcie_link_width platform_link_width; 5668 5669 if (amdgpu_pcie_gen_cap) 5670 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5671 5672 if (amdgpu_pcie_lane_cap) 5673 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5674 5675 /* covers APUs as well */ 5676 if (pci_is_root_bus(adev->pdev->bus)) { 5677 if (adev->pm.pcie_gen_mask == 0) 5678 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5679 if (adev->pm.pcie_mlw_mask == 0) 5680 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5681 return; 5682 } 5683 5684 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5685 return; 5686 5687 pcie_bandwidth_available(adev->pdev, NULL, 5688 &platform_speed_cap, &platform_link_width); 5689 5690 if (adev->pm.pcie_gen_mask == 0) { 5691 /* asic caps */ 5692 pdev = adev->pdev; 5693 speed_cap = pcie_get_speed_cap(pdev); 5694 if (speed_cap == PCI_SPEED_UNKNOWN) { 5695 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5696 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5697 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5698 } else { 5699 if (speed_cap == PCIE_SPEED_32_0GT) 5700 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5701 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5702 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5703 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5704 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5705 else if (speed_cap == PCIE_SPEED_16_0GT) 5706 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5707 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5708 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5709 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5710 else if (speed_cap == PCIE_SPEED_8_0GT) 5711 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5712 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5713 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5714 else if (speed_cap == PCIE_SPEED_5_0GT) 5715 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5716 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5717 else 5718 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5719 } 5720 /* platform caps */ 5721 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5722 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5723 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5724 } else { 5725 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5726 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5727 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5728 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5729 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5730 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5731 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5732 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5733 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5734 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5735 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5736 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5737 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5738 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5739 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5740 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5741 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5742 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5743 else 5744 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5745 5746 } 5747 } 5748 if (adev->pm.pcie_mlw_mask == 0) { 5749 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5750 adev->pm.pcie_mlw_mask 
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
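
/*
 * Example (editor's sketch, not part of the upstream driver): one way a
 * caller could reduce the combined mask built by
 * amdgpu_device_get_pcie_info() to the highest platform-supported PCIe
 * generation. The helper name amdgpu_example_max_pcie_gen is hypothetical;
 * only the CAIL_* bits used by the function above are assumed.
 */
static inline int amdgpu_example_max_pcie_gen(struct amdgpu_device *adev)
{
	u32 mask = adev->pm.pcie_gen_mask;

	if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5)
		return 5;
	if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
		return 4;
	if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
		return 3;
	if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
		return 2;
	return 1;
}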

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
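
/*
 * Example (editor's sketch, not part of the upstream driver): a minimal
 * BACO cycle built from the two helpers above. Real users, such as the
 * runtime-PM path, wrap this with additional state handling. The function
 * name amdgpu_example_baco_cycle is hypothetical.
 */
static inline int amdgpu_example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* The chip now sits in BACO (bus active, chip off). */

	return amdgpu_device_baco_exit(dev);
}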

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
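
/*
 * Example (editor's sketch): on Linux these callbacks, together with
 * amdgpu_pci_slot_reset() and amdgpu_pci_resume() below, are registered
 * with the PCI core through a struct pci_error_handlers; the real driver
 * does this in amdgpu_drv.c. Kept under "#if 0" because it is only an
 * illustration and the variable name here is hypothetical.
 */
#if 0
static const struct pci_error_handlers amdgpu_example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};
#endif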

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
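
/*
 * Example (editor's sketch, not part of the upstream driver): the usual
 * pairing of the HDP helpers defined above, here without a specific ring.
 * The function name amdgpu_example_hdp_roundtrip is hypothetical.
 */
static inline void amdgpu_example_hdp_roundtrip(struct amdgpu_device *adev)
{
	/* CPU just wrote data through the BAR; make it visible to the GPU. */
	amdgpu_device_flush_hdp(adev, NULL);

	/* ... GPU work runs and writes results to VRAM ... */

	/* Drop stale data from the HDP read cache before the CPU reads back. */
	amdgpu_device_invalidate_hdp(adev, NULL);
}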

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
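
/*
 * Example (editor's sketch, not part of the upstream driver): a
 * read-modify-write through the indirect index/data pair wrapped by the
 * two port accessors above. Note that pcie_idx_lock is only held per
 * access, so the RMW as a whole is not atomic; callers needing that must
 * add their own locking. The helper name is hypothetical.
 */
static inline void amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
						u32 reg, u32 set_bits)
{
	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);

	tmp |= set_bits;
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}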

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
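
/*
 * Example (editor's sketch, not part of the upstream driver): how a
 * submission path might use amdgpu_device_switch_gang(). A real caller
 * would more likely turn the returned fence into a scheduler dependency
 * instead of blocking; the synchronous wait just keeps the sketch short.
 * The function name amdgpu_example_become_gang_leader is hypothetical.
 */
static inline int amdgpu_example_become_gang_leader(struct amdgpu_device *adev,
						    struct dma_fence *gang)
{
	struct dma_fence *old;
	long r;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		/* The previous gang leader is still running: wait, then retry. */
		r = dma_fence_wait(old, false);
		dma_fence_put(old);
		if (r)
			return r;
	}
	/* NULL means @gang is now the active gang leader. */
	return 0;
}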