1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 39 #include <drm/drm_aperture.h> 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_probe_helper.h> 42 #include <drm/amdgpu_drm.h> 43 #include <linux/vgaarb.h> 44 #include <linux/vga_switcheroo.h> 45 #include <linux/efi.h> 46 #include "amdgpu.h" 47 #include "amdgpu_trace.h" 48 #include "amdgpu_i2c.h" 49 #include "atom.h" 50 #include "amdgpu_atombios.h" 51 #include "amdgpu_atomfirmware.h" 52 #include "amd_pcie.h" 53 #ifdef CONFIG_DRM_AMDGPU_SI 54 #include "si.h" 55 #endif 56 #ifdef CONFIG_DRM_AMDGPU_CIK 57 #include "cik.h" 58 #endif 59 #include "vi.h" 60 #include "soc15.h" 61 #include "nv.h" 62 #include "bif/bif_4_1_d.h" 63 #include <linux/firmware.h> 64 #include "amdgpu_vf_error.h" 65 66 #include "amdgpu_amdkfd.h" 67 #include "amdgpu_pm.h" 68 69 #include "amdgpu_xgmi.h" 70 #include "amdgpu_ras.h" 71 #include "amdgpu_pmu.h" 72 #include "amdgpu_fru_eeprom.h" 73 #include "amdgpu_reset.h" 74 75 #include <linux/suspend.h> 76 #include <drm/task_barrier.h> 77 #include <linux/pm_runtime.h> 78 79 #include <drm/drm_drv.h> 80 81 #if IS_ENABLED(CONFIG_X86) && defined(__linux__) 82 #include <asm/intel-family.h> 83 #endif 84 85 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 92 93 #define AMDGPU_RESUME_MS 2000 94 #define AMDGPU_MAX_RETRY_LIMIT 2 95 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 96 97 static const struct drm_driver amdgpu_kms_driver; 98 99 const char *amdgpu_asic_name[] = { 100 "TAHITI", 101 "PITCAIRN", 102 "VERDE", 103 "OLAND", 104 "HAINAN", 105 "BONAIRE", 106 "KAVERI", 107 "KABINI", 108 "HAWAII", 109 "MULLINS", 110 "TOPAZ", 111 "TONGA", 112 "FIJI", 113 
"CARRIZO", 114 "STONEY", 115 "POLARIS10", 116 "POLARIS11", 117 "POLARIS12", 118 "VEGAM", 119 "VEGA10", 120 "VEGA12", 121 "VEGA20", 122 "RAVEN", 123 "ARCTURUS", 124 "RENOIR", 125 "ALDEBARAN", 126 "NAVI10", 127 "CYAN_SKILLFISH", 128 "NAVI14", 129 "NAVI12", 130 "SIENNA_CICHLID", 131 "NAVY_FLOUNDER", 132 "VANGOGH", 133 "DIMGREY_CAVEFISH", 134 "BEIGE_GOBY", 135 "YELLOW_CARP", 136 "IP DISCOVERY", 137 "LAST", 138 }; 139 140 /** 141 * DOC: pcie_replay_count 142 * 143 * The amdgpu driver provides a sysfs API for reporting the total number 144 * of PCIe replays (NAKs) 145 * The file pcie_replay_count is used for this and returns the total 146 * number of replays as a sum of the NAKs generated and NAKs received 147 */ 148 149 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 150 struct device_attribute *attr, char *buf) 151 { 152 struct drm_device *ddev = dev_get_drvdata(dev); 153 struct amdgpu_device *adev = drm_to_adev(ddev); 154 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 155 156 return sysfs_emit(buf, "%llu\n", cnt); 157 } 158 159 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 160 amdgpu_device_get_pcie_replay_count, NULL); 161 162 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 163 164 /** 165 * DOC: product_name 166 * 167 * The amdgpu driver provides a sysfs API for reporting the product name 168 * for the device 169 * The file serial_number is used for this and returns the product name 170 * as returned from the FRU. 171 * NOTE: This is only available for certain server cards 172 */ 173 174 static ssize_t amdgpu_device_get_product_name(struct device *dev, 175 struct device_attribute *attr, char *buf) 176 { 177 struct drm_device *ddev = dev_get_drvdata(dev); 178 struct amdgpu_device *adev = drm_to_adev(ddev); 179 180 return sysfs_emit(buf, "%s\n", adev->product_name); 181 } 182 183 static DEVICE_ATTR(product_name, S_IRUGO, 184 amdgpu_device_get_product_name, NULL); 185 186 /** 187 * DOC: product_number 188 * 189 * The amdgpu driver provides a sysfs API for reporting the part number 190 * for the device 191 * The file serial_number is used for this and returns the part number 192 * as returned from the FRU. 193 * NOTE: This is only available for certain server cards 194 */ 195 196 static ssize_t amdgpu_device_get_product_number(struct device *dev, 197 struct device_attribute *attr, char *buf) 198 { 199 struct drm_device *ddev = dev_get_drvdata(dev); 200 struct amdgpu_device *adev = drm_to_adev(ddev); 201 202 return sysfs_emit(buf, "%s\n", adev->product_number); 203 } 204 205 static DEVICE_ATTR(product_number, S_IRUGO, 206 amdgpu_device_get_product_number, NULL); 207 208 /** 209 * DOC: serial_number 210 * 211 * The amdgpu driver provides a sysfs API for reporting the serial number 212 * for the device 213 * The file serial_number is used for this and returns the serial number 214 * as returned from the FRU. 
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
                amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
                return true;
        return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if (adev->has_pr3 ||
            ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
                return true;
        return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
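 *
 * Smart Shift lets the platform shift power budget between the APU and the
 * dGPU, which is why support is only reported when BOCO and the ACPI power
 * shift control method are both present.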
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
        return (amdgpu_device_supports_boco(dev) &&
                amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
                             void *buf, size_t size, bool write)
{
        unsigned long flags;
        uint32_t hi = ~0, tmp = 0;
        uint32_t *data = buf;
        uint64_t last;
        int idx;

        if (!drm_dev_enter(adev_to_drm(adev), &idx))
                return;

        BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                tmp = pos >> 31;

                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *data++);
                else
                        *data++ = RREG32_NO_KIQ(mmMM_DATA);
        }

        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try to use the vram aperture to access vram first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM_INDEX/MM_DATA to access the rest of vram */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}

/*
 * register access helper functions.
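 *
 * A register access is routed one of three ways: plain MMIO when the dword
 * offset falls inside the MMIO BAR, the KIQ ring when running as an SR-IOV
 * guest at runtime, or the indirect PCIE index/data pair for offsets beyond
 * the BAR.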
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore, if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t acc_flags)
{
        uint32_t ret;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

        return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (offset < adev->rmmio_size)
                writeb(value, adev->rmmio + offset);
        else
                BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
537 */ 538 void amdgpu_device_wreg(struct amdgpu_device *adev, 539 uint32_t reg, uint32_t v, 540 uint32_t acc_flags) 541 { 542 if (amdgpu_device_skip_hw_access(adev)) 543 return; 544 545 if ((reg * 4) < adev->rmmio_size) { 546 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 547 amdgpu_sriov_runtime(adev) && 548 down_read_trylock(&adev->reset_domain->sem)) { 549 amdgpu_kiq_wreg(adev, reg, v); 550 up_read(&adev->reset_domain->sem); 551 } else { 552 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 553 } 554 } else { 555 adev->pcie_wreg(adev, reg * 4, v); 556 } 557 558 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 559 } 560 561 /** 562 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 563 * 564 * @adev: amdgpu_device pointer 565 * @reg: mmio/rlc register 566 * @v: value to write 567 * 568 * this function is invoked only for the debugfs register access 569 */ 570 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 571 uint32_t reg, uint32_t v) 572 { 573 if (amdgpu_device_skip_hw_access(adev)) 574 return; 575 576 if (amdgpu_sriov_fullaccess(adev) && 577 adev->gfx.rlc.funcs && 578 adev->gfx.rlc.funcs->is_rlcg_access_range) { 579 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 580 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 581 } else if ((reg * 4) >= adev->rmmio_size) { 582 adev->pcie_wreg(adev, reg * 4, v); 583 } else { 584 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 585 } 586 } 587 588 /** 589 * amdgpu_mm_rdoorbell - read a doorbell dword 590 * 591 * @adev: amdgpu_device pointer 592 * @index: doorbell index 593 * 594 * Returns the value in the doorbell aperture at the 595 * requested doorbell index (CIK). 596 */ 597 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 598 { 599 if (amdgpu_device_skip_hw_access(adev)) 600 return 0; 601 602 if (index < adev->doorbell.num_doorbells) { 603 return readl(adev->doorbell.ptr + index); 604 } else { 605 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 606 return 0; 607 } 608 } 609 610 /** 611 * amdgpu_mm_wdoorbell - write a doorbell dword 612 * 613 * @adev: amdgpu_device pointer 614 * @index: doorbell index 615 * @v: value to write 616 * 617 * Writes @v to the doorbell aperture at the 618 * requested doorbell index (CIK). 619 */ 620 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 621 { 622 if (amdgpu_device_skip_hw_access(adev)) 623 return; 624 625 if (index < adev->doorbell.num_doorbells) { 626 writel(v, adev->doorbell.ptr + index); 627 } else { 628 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 629 } 630 } 631 632 /** 633 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 634 * 635 * @adev: amdgpu_device pointer 636 * @index: doorbell index 637 * 638 * Returns the value in the doorbell aperture at the 639 * requested doorbell index (VEGA10+). 640 */ 641 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 642 { 643 if (amdgpu_device_skip_hw_access(adev)) 644 return 0; 645 646 if (index < adev->doorbell.num_doorbells) { 647 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 648 } else { 649 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 650 return 0; 651 } 652 } 653 654 /** 655 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 656 * 657 * @adev: amdgpu_device pointer 658 * @index: doorbell index 659 * @v: value to write 660 * 661 * Writes @v to the doorbell aperture at the 662 * requested doorbell index (VEGA10+). 
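 *
 * The qword doorbell helpers go through the atomic64 accessors so that the
 * value is read or written as a single 64-bit access where the architecture
 * allows it, rather than as two independent 32-bit accesses.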
663 */ 664 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 665 { 666 if (amdgpu_device_skip_hw_access(adev)) 667 return; 668 669 if (index < adev->doorbell.num_doorbells) { 670 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 671 } else { 672 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 673 } 674 } 675 676 /** 677 * amdgpu_device_indirect_rreg - read an indirect register 678 * 679 * @adev: amdgpu_device pointer 680 * @pcie_index: mmio register offset 681 * @pcie_data: mmio register offset 682 * @reg_addr: indirect register address to read from 683 * 684 * Returns the value of indirect register @reg_addr 685 */ 686 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 687 u32 pcie_index, u32 pcie_data, 688 u32 reg_addr) 689 { 690 unsigned long flags; 691 u32 r; 692 void __iomem *pcie_index_offset; 693 void __iomem *pcie_data_offset; 694 695 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 696 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 697 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 698 699 writel(reg_addr, pcie_index_offset); 700 readl(pcie_index_offset); 701 r = readl(pcie_data_offset); 702 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 703 704 return r; 705 } 706 707 /** 708 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 709 * 710 * @adev: amdgpu_device pointer 711 * @pcie_index: mmio register offset 712 * @pcie_data: mmio register offset 713 * @reg_addr: indirect register address to read from 714 * 715 * Returns the value of indirect register @reg_addr 716 */ 717 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 718 u32 pcie_index, u32 pcie_data, 719 u32 reg_addr) 720 { 721 unsigned long flags; 722 u64 r; 723 void __iomem *pcie_index_offset; 724 void __iomem *pcie_data_offset; 725 726 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 727 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 728 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 729 730 /* read low 32 bits */ 731 writel(reg_addr, pcie_index_offset); 732 readl(pcie_index_offset); 733 r = readl(pcie_data_offset); 734 /* read high 32 bits */ 735 writel(reg_addr + 4, pcie_index_offset); 736 readl(pcie_index_offset); 737 r |= ((u64)readl(pcie_data_offset) << 32); 738 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 739 740 return r; 741 } 742 743 /** 744 * amdgpu_device_indirect_wreg - write an indirect register address 745 * 746 * @adev: amdgpu_device pointer 747 * @pcie_index: mmio register offset 748 * @pcie_data: mmio register offset 749 * @reg_addr: indirect register offset 750 * @reg_data: indirect register data 751 * 752 */ 753 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 754 u32 pcie_index, u32 pcie_data, 755 u32 reg_addr, u32 reg_data) 756 { 757 unsigned long flags; 758 void __iomem *pcie_index_offset; 759 void __iomem *pcie_data_offset; 760 761 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 762 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 763 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 764 765 writel(reg_addr, pcie_index_offset); 766 readl(pcie_index_offset); 767 writel(reg_data, pcie_data_offset); 768 readl(pcie_data_offset); 769 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 770 } 771 772 /** 773 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 774 * 775 * @adev: amdgpu_device pointer 776 * @pcie_index: mmio register offset 777 * @pcie_data: mmio register 
offset 778 * @reg_addr: indirect register offset 779 * @reg_data: indirect register data 780 * 781 */ 782 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 783 u32 pcie_index, u32 pcie_data, 784 u32 reg_addr, u64 reg_data) 785 { 786 unsigned long flags; 787 void __iomem *pcie_index_offset; 788 void __iomem *pcie_data_offset; 789 790 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 791 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 792 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 793 794 /* write low 32 bits */ 795 writel(reg_addr, pcie_index_offset); 796 readl(pcie_index_offset); 797 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 798 readl(pcie_data_offset); 799 /* write high 32 bits */ 800 writel(reg_addr + 4, pcie_index_offset); 801 readl(pcie_index_offset); 802 writel((u32)(reg_data >> 32), pcie_data_offset); 803 readl(pcie_data_offset); 804 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 805 } 806 807 /** 808 * amdgpu_invalid_rreg - dummy reg read function 809 * 810 * @adev: amdgpu_device pointer 811 * @reg: offset of register 812 * 813 * Dummy register read function. Used for register blocks 814 * that certain asics don't have (all asics). 815 * Returns the value in the register. 816 */ 817 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 818 { 819 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 820 BUG(); 821 return 0; 822 } 823 824 /** 825 * amdgpu_invalid_wreg - dummy reg write function 826 * 827 * @adev: amdgpu_device pointer 828 * @reg: offset of register 829 * @v: value to write to the register 830 * 831 * Dummy register read function. Used for register blocks 832 * that certain asics don't have (all asics). 833 */ 834 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 835 { 836 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 837 reg, v); 838 BUG(); 839 } 840 841 /** 842 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 843 * 844 * @adev: amdgpu_device pointer 845 * @reg: offset of register 846 * 847 * Dummy register read function. Used for register blocks 848 * that certain asics don't have (all asics). 849 * Returns the value in the register. 850 */ 851 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 852 { 853 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 854 BUG(); 855 return 0; 856 } 857 858 /** 859 * amdgpu_invalid_wreg64 - dummy reg write function 860 * 861 * @adev: amdgpu_device pointer 862 * @reg: offset of register 863 * @v: value to write to the register 864 * 865 * Dummy register read function. Used for register blocks 866 * that certain asics don't have (all asics). 867 */ 868 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 869 { 870 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 871 reg, v); 872 BUG(); 873 } 874 875 /** 876 * amdgpu_block_invalid_rreg - dummy reg read function 877 * 878 * @adev: amdgpu_device pointer 879 * @block: offset of instance 880 * @reg: offset of register 881 * 882 * Dummy register read function. Used for register blocks 883 * that certain asics don't have (all asics). 884 * Returns the value in the register. 
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
                                          uint32_t block, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
                  reg, block);
        BUG();
        return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
                                      uint32_t block,
                                      uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
                  reg, block, v);
        BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
        amdgpu_asic_pre_asic_init(adev);

        if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
                return amdgpu_atomfirmware_asic_init(adev, true);
        else
                return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
        return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
                                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
                                       &adev->vram_scratch.robj,
                                       &adev->vram_scratch.gpu_addr,
                                       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
        amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 *registers,
                                             const u32 array_size)
{
        u32 tmp, reg, and_mask, or_mask;
        int i;

        if (array_size % 3)
                return;

        for (i = 0; i < array_size; i += 3) {
                reg = registers[i + 0];
                and_mask = registers[i + 1];
                or_mask = registers[i + 2];

                if (and_mask == 0xffffffff) {
                        tmp = or_mask;
                } else {
                        tmp = RREG32(reg);
                        tmp &= ~and_mask;
                        if (adev->family >= AMDGPU_FAMILY_AI)
                                tmp |= (or_mask & and_mask);
                        else
                                tmp |= or_mask;
                }
                WREG32(reg, tmp);
        }
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
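 *
 * The reset is triggered by writing AMDGPU_ASIC_RESET_DATA to the PCI
 * config space register at offset 0x7c.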
1007 */ 1008 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1009 { 1010 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1011 } 1012 1013 /** 1014 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1015 * 1016 * @adev: amdgpu_device pointer 1017 * 1018 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1019 */ 1020 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1021 { 1022 STUB(); 1023 return -ENOSYS; 1024 #ifdef notyet 1025 return pci_reset_function(adev->pdev); 1026 #endif 1027 } 1028 1029 /* 1030 * GPU doorbell aperture helpers function. 1031 */ 1032 /** 1033 * amdgpu_device_doorbell_init - Init doorbell driver information. 1034 * 1035 * @adev: amdgpu_device pointer 1036 * 1037 * Init doorbell driver information (CIK) 1038 * Returns 0 on success, error on failure. 1039 */ 1040 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1041 { 1042 1043 /* No doorbell on SI hardware generation */ 1044 if (adev->asic_type < CHIP_BONAIRE) { 1045 adev->doorbell.base = 0; 1046 adev->doorbell.size = 0; 1047 adev->doorbell.num_doorbells = 0; 1048 adev->doorbell.ptr = NULL; 1049 return 0; 1050 } 1051 1052 #ifdef __linux__ 1053 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1054 return -EINVAL; 1055 #endif 1056 1057 amdgpu_asic_init_doorbell_index(adev); 1058 1059 /* doorbell bar mapping */ 1060 #ifdef __linux__ 1061 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1062 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1063 #endif 1064 1065 if (adev->enable_mes) { 1066 adev->doorbell.num_doorbells = 1067 adev->doorbell.size / sizeof(u32); 1068 } else { 1069 adev->doorbell.num_doorbells = 1070 min_t(u32, adev->doorbell.size / sizeof(u32), 1071 adev->doorbell_index.max_assignment+1); 1072 if (adev->doorbell.num_doorbells == 0) 1073 return -EINVAL; 1074 1075 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1076 * paging queue doorbell use the second page. The 1077 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1078 * doorbells are in the first page. So with paging queue enabled, 1079 * the max num_doorbells should + 1 page (0x400 in dword) 1080 */ 1081 if (adev->asic_type >= CHIP_VEGA10) 1082 adev->doorbell.num_doorbells += 0x400; 1083 } 1084 1085 #ifdef __linux__ 1086 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1087 adev->doorbell.num_doorbells * 1088 sizeof(u32)); 1089 if (adev->doorbell.ptr == NULL) 1090 return -ENOMEM; 1091 #endif 1092 1093 return 0; 1094 } 1095 1096 /** 1097 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1098 * 1099 * @adev: amdgpu_device pointer 1100 * 1101 * Tear down doorbell driver information (CIK) 1102 */ 1103 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1104 { 1105 #ifdef __linux__ 1106 iounmap(adev->doorbell.ptr); 1107 #else 1108 if (adev->doorbell.size > 0) 1109 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1110 adev->doorbell.size); 1111 #endif 1112 adev->doorbell.ptr = NULL; 1113 } 1114 1115 1116 1117 /* 1118 * amdgpu_device_wb_*() 1119 * Writeback is the method by which the GPU updates special pages in memory 1120 * with the status of certain GPU events (fences, ring pointers,etc.). 1121 */ 1122 1123 /** 1124 * amdgpu_device_wb_fini - Disable Writeback and free memory 1125 * 1126 * @adev: amdgpu_device pointer 1127 * 1128 * Disables Writeback and frees the Writeback memory (all asics). 1129 * Used at driver shutdown. 
1130 */ 1131 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1132 { 1133 if (adev->wb.wb_obj) { 1134 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1135 &adev->wb.gpu_addr, 1136 (void **)&adev->wb.wb); 1137 adev->wb.wb_obj = NULL; 1138 } 1139 } 1140 1141 /** 1142 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1143 * 1144 * @adev: amdgpu_device pointer 1145 * 1146 * Initializes writeback and allocates writeback memory (all asics). 1147 * Used at driver startup. 1148 * Returns 0 on success or an -error on failure. 1149 */ 1150 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1151 { 1152 int r; 1153 1154 if (adev->wb.wb_obj == NULL) { 1155 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1156 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1157 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1158 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1159 (void **)&adev->wb.wb); 1160 if (r) { 1161 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1162 return r; 1163 } 1164 1165 adev->wb.num_wb = AMDGPU_MAX_WB; 1166 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1167 1168 /* clear wb memory */ 1169 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1170 } 1171 1172 return 0; 1173 } 1174 1175 /** 1176 * amdgpu_device_wb_get - Allocate a wb entry 1177 * 1178 * @adev: amdgpu_device pointer 1179 * @wb: wb index 1180 * 1181 * Allocate a wb slot for use by the driver (all asics). 1182 * Returns 0 on success or -EINVAL on failure. 1183 */ 1184 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1185 { 1186 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1187 1188 if (offset < adev->wb.num_wb) { 1189 __set_bit(offset, adev->wb.used); 1190 *wb = offset << 3; /* convert to dw offset */ 1191 return 0; 1192 } else { 1193 return -EINVAL; 1194 } 1195 } 1196 1197 /** 1198 * amdgpu_device_wb_free - Free a wb entry 1199 * 1200 * @adev: amdgpu_device pointer 1201 * @wb: wb index 1202 * 1203 * Free a wb slot allocated for use by the driver (all asics) 1204 */ 1205 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1206 { 1207 wb >>= 3; 1208 if (wb < adev->wb.num_wb) 1209 __clear_bit(wb, adev->wb.used); 1210 } 1211 1212 /** 1213 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1214 * 1215 * @adev: amdgpu_device pointer 1216 * 1217 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1218 * to fail, but if any of the BARs is not accessible after the size we abort 1219 * driver loading by returning -ENODEV. 
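 *
 * Resizing is only attempted when the root bus exposes a 64-bit window above
 * 4GB; the doorbell and VRAM BARs are released, BAR0 is resized and the bus
 * resources reassigned, with memory decoding disabled while this happens.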
1220 */ 1221 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1222 { 1223 #ifdef __linux__ 1224 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1225 struct pci_bus *root; 1226 struct resource *res; 1227 unsigned i; 1228 u16 cmd; 1229 int r; 1230 1231 /* Bypass for VF */ 1232 if (amdgpu_sriov_vf(adev)) 1233 return 0; 1234 1235 /* skip if the bios has already enabled large BAR */ 1236 if (adev->gmc.real_vram_size && 1237 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1238 return 0; 1239 1240 /* Check if the root BUS has 64bit memory resources */ 1241 root = adev->pdev->bus; 1242 while (root->parent) 1243 root = root->parent; 1244 1245 pci_bus_for_each_resource(root, res, i) { 1246 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1247 res->start > 0x100000000ull) 1248 break; 1249 } 1250 1251 /* Trying to resize is pointless without a root hub window above 4GB */ 1252 if (!res) 1253 return 0; 1254 1255 /* Limit the BAR size to what is available */ 1256 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1257 rbar_size); 1258 1259 /* Disable memory decoding while we change the BAR addresses and size */ 1260 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1261 pci_write_config_word(adev->pdev, PCI_COMMAND, 1262 cmd & ~PCI_COMMAND_MEMORY); 1263 1264 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1265 amdgpu_device_doorbell_fini(adev); 1266 if (adev->asic_type >= CHIP_BONAIRE) 1267 pci_release_resource(adev->pdev, 2); 1268 1269 pci_release_resource(adev->pdev, 0); 1270 1271 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1272 if (r == -ENOSPC) 1273 DRM_INFO("Not enough PCI address space for a large BAR."); 1274 else if (r && r != -ENOTSUPP) 1275 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1276 1277 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1278 1279 /* When the doorbell or fb BAR isn't available we have no chance of 1280 * using the device. 1281 */ 1282 r = amdgpu_device_doorbell_init(adev); 1283 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1284 return -ENODEV; 1285 1286 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1287 #endif /* __linux__ */ 1288 1289 return 0; 1290 } 1291 1292 /* 1293 * GPU helpers function. 1294 */ 1295 /** 1296 * amdgpu_device_need_post - check if the hw need post or not 1297 * 1298 * @adev: amdgpu_device pointer 1299 * 1300 * Check if the asic has been initialized (all asics) at driver startup 1301 * or post is needed if hw reset is performed. 1302 * Returns true if need or false if not. 
1303 */ 1304 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1305 { 1306 uint32_t reg; 1307 1308 if (amdgpu_sriov_vf(adev)) 1309 return false; 1310 1311 if (amdgpu_passthrough(adev)) { 1312 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1313 * some old smc fw still need driver do vPost otherwise gpu hang, while 1314 * those smc fw version above 22.15 doesn't have this flaw, so we force 1315 * vpost executed for smc version below 22.15 1316 */ 1317 if (adev->asic_type == CHIP_FIJI) { 1318 int err; 1319 uint32_t fw_ver; 1320 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1321 /* force vPost if error occured */ 1322 if (err) 1323 return true; 1324 1325 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1326 if (fw_ver < 0x00160e00) 1327 return true; 1328 } 1329 } 1330 1331 /* Don't post if we need to reset whole hive on init */ 1332 if (adev->gmc.xgmi.pending_reset) 1333 return false; 1334 1335 if (adev->has_hw_reset) { 1336 adev->has_hw_reset = false; 1337 return true; 1338 } 1339 1340 /* bios scratch used on CIK+ */ 1341 if (adev->asic_type >= CHIP_BONAIRE) 1342 return amdgpu_atombios_scratch_need_asic_init(adev); 1343 1344 /* check MEM_SIZE for older asics */ 1345 reg = amdgpu_asic_get_config_memsize(adev); 1346 1347 if ((reg != 0) && (reg != 0xffffffff)) 1348 return false; 1349 1350 return true; 1351 } 1352 1353 /** 1354 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1355 * 1356 * @adev: amdgpu_device pointer 1357 * 1358 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1359 * be set for this device. 1360 * 1361 * Returns true if it should be used or false if not. 1362 */ 1363 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1364 { 1365 switch (amdgpu_aspm) { 1366 case -1: 1367 break; 1368 case 0: 1369 return false; 1370 case 1: 1371 return true; 1372 default: 1373 return false; 1374 } 1375 return pcie_aspm_enabled(adev->pdev); 1376 } 1377 1378 bool amdgpu_device_aspm_support_quirk(void) 1379 { 1380 #if IS_ENABLED(CONFIG_X86) 1381 struct cpu_info *ci = curcpu(); 1382 1383 return !(ci->ci_family == 6 && ci->ci_model == 0x97); 1384 #else 1385 return true; 1386 #endif 1387 } 1388 1389 /* if we get transitioned to only one device, take VGA back */ 1390 /** 1391 * amdgpu_device_vga_set_decode - enable/disable vga decode 1392 * 1393 * @pdev: PCI device pointer 1394 * @state: enable/disable vga decode 1395 * 1396 * Enable/disable vga decode (all asics). 1397 * Returns VGA resource flags. 1398 */ 1399 #ifdef notyet 1400 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1401 bool state) 1402 { 1403 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1404 amdgpu_asic_set_vga_state(adev, state); 1405 if (state) 1406 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1407 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1408 else 1409 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1410 } 1411 #endif 1412 1413 /** 1414 * amdgpu_device_check_block_size - validate the vm block size 1415 * 1416 * @adev: amdgpu_device pointer 1417 * 1418 * Validates the vm block size specified via module parameter. 1419 * The vm block size defines number of bits in page table versus page directory, 1420 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1421 * page table and the remaining bits are in the page directory. 
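 *
 * For example, with the minimum block size of 9 a single page table maps
 * 2^9 pages * 4KB = 2MB of address space per page directory entry.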
1422 */ 1423 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1424 { 1425 /* defines number of bits in page table versus page directory, 1426 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1427 * page table and the remaining bits are in the page directory */ 1428 if (amdgpu_vm_block_size == -1) 1429 return; 1430 1431 if (amdgpu_vm_block_size < 9) { 1432 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1433 amdgpu_vm_block_size); 1434 amdgpu_vm_block_size = -1; 1435 } 1436 } 1437 1438 /** 1439 * amdgpu_device_check_vm_size - validate the vm size 1440 * 1441 * @adev: amdgpu_device pointer 1442 * 1443 * Validates the vm size in GB specified via module parameter. 1444 * The VM size is the size of the GPU virtual memory space in GB. 1445 */ 1446 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1447 { 1448 /* no need to check the default value */ 1449 if (amdgpu_vm_size == -1) 1450 return; 1451 1452 if (amdgpu_vm_size < 1) { 1453 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1454 amdgpu_vm_size); 1455 amdgpu_vm_size = -1; 1456 } 1457 } 1458 1459 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1460 { 1461 #ifdef __linux__ 1462 struct sysinfo si; 1463 #endif 1464 bool is_os_64 = (sizeof(void *) == 8); 1465 uint64_t total_memory; 1466 uint64_t dram_size_seven_GB = 0x1B8000000; 1467 uint64_t dram_size_three_GB = 0xB8000000; 1468 1469 if (amdgpu_smu_memory_pool_size == 0) 1470 return; 1471 1472 if (!is_os_64) { 1473 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1474 goto def_value; 1475 } 1476 #ifdef __linux__ 1477 si_meminfo(&si); 1478 total_memory = (uint64_t)si.totalram * si.mem_unit; 1479 #else 1480 total_memory = ptoa(physmem); 1481 #endif 1482 1483 if ((amdgpu_smu_memory_pool_size == 1) || 1484 (amdgpu_smu_memory_pool_size == 2)) { 1485 if (total_memory < dram_size_three_GB) 1486 goto def_value1; 1487 } else if ((amdgpu_smu_memory_pool_size == 4) || 1488 (amdgpu_smu_memory_pool_size == 8)) { 1489 if (total_memory < dram_size_seven_GB) 1490 goto def_value1; 1491 } else { 1492 DRM_WARN("Smu memory pool size not supported\n"); 1493 goto def_value; 1494 } 1495 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1496 1497 return; 1498 1499 def_value1: 1500 DRM_WARN("No enough system memory\n"); 1501 def_value: 1502 adev->pm.smu_prv_buffer_size = 0; 1503 } 1504 1505 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1506 { 1507 if (!(adev->flags & AMD_IS_APU) || 1508 adev->asic_type < CHIP_RAVEN) 1509 return 0; 1510 1511 switch (adev->asic_type) { 1512 case CHIP_RAVEN: 1513 if (adev->pdev->device == 0x15dd) 1514 adev->apu_flags |= AMD_APU_IS_RAVEN; 1515 if (adev->pdev->device == 0x15d8) 1516 adev->apu_flags |= AMD_APU_IS_PICASSO; 1517 break; 1518 case CHIP_RENOIR: 1519 if ((adev->pdev->device == 0x1636) || 1520 (adev->pdev->device == 0x164c)) 1521 adev->apu_flags |= AMD_APU_IS_RENOIR; 1522 else 1523 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1524 break; 1525 case CHIP_VANGOGH: 1526 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1527 break; 1528 case CHIP_YELLOW_CARP: 1529 break; 1530 case CHIP_CYAN_SKILLFISH: 1531 if ((adev->pdev->device == 0x13FE) || 1532 (adev->pdev->device == 0x143F)) 1533 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1534 break; 1535 default: 1536 break; 1537 } 1538 1539 return 0; 1540 } 1541 1542 /** 1543 * amdgpu_device_check_arguments - validate module params 1544 * 1545 * @adev: amdgpu_device pointer 1546 * 1547 * Validates certain 
module parameters and updates 1548 * the associated values used by the driver (all asics). 1549 */ 1550 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1551 { 1552 if (amdgpu_sched_jobs < 4) { 1553 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1554 amdgpu_sched_jobs); 1555 amdgpu_sched_jobs = 4; 1556 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1557 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1558 amdgpu_sched_jobs); 1559 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1560 } 1561 1562 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1563 /* gart size must be greater or equal to 32M */ 1564 dev_warn(adev->dev, "gart size (%d) too small\n", 1565 amdgpu_gart_size); 1566 amdgpu_gart_size = -1; 1567 } 1568 1569 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1570 /* gtt size must be greater or equal to 32M */ 1571 dev_warn(adev->dev, "gtt size (%d) too small\n", 1572 amdgpu_gtt_size); 1573 amdgpu_gtt_size = -1; 1574 } 1575 1576 /* valid range is between 4 and 9 inclusive */ 1577 if (amdgpu_vm_fragment_size != -1 && 1578 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1579 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1580 amdgpu_vm_fragment_size = -1; 1581 } 1582 1583 if (amdgpu_sched_hw_submission < 2) { 1584 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1585 amdgpu_sched_hw_submission); 1586 amdgpu_sched_hw_submission = 2; 1587 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1588 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1589 amdgpu_sched_hw_submission); 1590 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1591 } 1592 1593 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1594 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1595 amdgpu_reset_method = -1; 1596 } 1597 1598 amdgpu_device_check_smu_prv_buffer_size(adev); 1599 1600 amdgpu_device_check_vm_size(adev); 1601 1602 amdgpu_device_check_block_size(adev); 1603 1604 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1605 1606 return 0; 1607 } 1608 1609 #ifdef __linux__ 1610 /** 1611 * amdgpu_switcheroo_set_state - set switcheroo state 1612 * 1613 * @pdev: pci dev pointer 1614 * @state: vga_switcheroo state 1615 * 1616 * Callback for the switcheroo driver. Suspends or resumes the 1617 * the asics before or after it is powered up using ACPI methods. 
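 *
 * When switched off, the card is suspended, its PCI state cached and the
 * device put into D3cold; switching on reverses those steps.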
1618 */ 1619 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1620 enum vga_switcheroo_state state) 1621 { 1622 struct drm_device *dev = pci_get_drvdata(pdev); 1623 int r; 1624 1625 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1626 return; 1627 1628 if (state == VGA_SWITCHEROO_ON) { 1629 pr_info("switched on\n"); 1630 /* don't suspend or resume card normally */ 1631 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1632 1633 pci_set_power_state(pdev, PCI_D0); 1634 amdgpu_device_load_pci_state(pdev); 1635 r = pci_enable_device(pdev); 1636 if (r) 1637 DRM_WARN("pci_enable_device failed (%d)\n", r); 1638 amdgpu_device_resume(dev, true); 1639 1640 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1641 } else { 1642 pr_info("switched off\n"); 1643 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1644 amdgpu_device_suspend(dev, true); 1645 amdgpu_device_cache_pci_state(pdev); 1646 /* Shut down the device */ 1647 pci_disable_device(pdev); 1648 pci_set_power_state(pdev, PCI_D3cold); 1649 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1650 } 1651 } 1652 1653 /** 1654 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1655 * 1656 * @pdev: pci dev pointer 1657 * 1658 * Callback for the switcheroo driver. Check of the switcheroo 1659 * state can be changed. 1660 * Returns true if the state can be changed, false if not. 1661 */ 1662 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1663 { 1664 struct drm_device *dev = pci_get_drvdata(pdev); 1665 1666 /* 1667 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1668 * locking inversion with the driver load path. And the access here is 1669 * completely racy anyway. So don't bother with locking for now. 1670 */ 1671 return atomic_read(&dev->open_count) == 0; 1672 } 1673 #endif /* __linux__ */ 1674 1675 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1676 #ifdef notyet 1677 .set_gpu_state = amdgpu_switcheroo_set_state, 1678 .reprobe = NULL, 1679 .can_switch = amdgpu_switcheroo_can_switch, 1680 #endif 1681 }; 1682 1683 /** 1684 * amdgpu_device_ip_set_clockgating_state - set the CG state 1685 * 1686 * @dev: amdgpu_device pointer 1687 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1688 * @state: clockgating state (gate or ungate) 1689 * 1690 * Sets the requested clockgating state for all instances of 1691 * the hardware IP specified. 1692 * Returns the error code from the last instance. 1693 */ 1694 int amdgpu_device_ip_set_clockgating_state(void *dev, 1695 enum amd_ip_block_type block_type, 1696 enum amd_clockgating_state state) 1697 { 1698 struct amdgpu_device *adev = dev; 1699 int i, r = 0; 1700 1701 for (i = 0; i < adev->num_ip_blocks; i++) { 1702 if (!adev->ip_blocks[i].status.valid) 1703 continue; 1704 if (adev->ip_blocks[i].version->type != block_type) 1705 continue; 1706 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1707 continue; 1708 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1709 (void *)adev, state); 1710 if (r) 1711 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1712 adev->ip_blocks[i].version->funcs->name, r); 1713 } 1714 return r; 1715 } 1716 1717 /** 1718 * amdgpu_device_ip_set_powergating_state - set the PG state 1719 * 1720 * @dev: amdgpu_device pointer 1721 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1722 * @state: powergating state (gate or ungate) 1723 * 1724 * Sets the requested powergating state for all instances of 1725 * the hardware IP specified. 1726 * Returns the error code from the last instance. 1727 */ 1728 int amdgpu_device_ip_set_powergating_state(void *dev, 1729 enum amd_ip_block_type block_type, 1730 enum amd_powergating_state state) 1731 { 1732 struct amdgpu_device *adev = dev; 1733 int i, r = 0; 1734 1735 for (i = 0; i < adev->num_ip_blocks; i++) { 1736 if (!adev->ip_blocks[i].status.valid) 1737 continue; 1738 if (adev->ip_blocks[i].version->type != block_type) 1739 continue; 1740 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1741 continue; 1742 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1743 (void *)adev, state); 1744 if (r) 1745 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1746 adev->ip_blocks[i].version->funcs->name, r); 1747 } 1748 return r; 1749 } 1750 1751 /** 1752 * amdgpu_device_ip_get_clockgating_state - get the CG state 1753 * 1754 * @adev: amdgpu_device pointer 1755 * @flags: clockgating feature flags 1756 * 1757 * Walks the list of IPs on the device and updates the clockgating 1758 * flags for each IP. 1759 * Updates @flags with the feature flags for each hardware IP where 1760 * clockgating is enabled. 1761 */ 1762 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1763 u64 *flags) 1764 { 1765 int i; 1766 1767 for (i = 0; i < adev->num_ip_blocks; i++) { 1768 if (!adev->ip_blocks[i].status.valid) 1769 continue; 1770 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1771 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1772 } 1773 } 1774 1775 /** 1776 * amdgpu_device_ip_wait_for_idle - wait for idle 1777 * 1778 * @adev: amdgpu_device pointer 1779 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1780 * 1781 * Waits for the request hardware IP to be idle. 1782 * Returns 0 for success or a negative error code on failure. 1783 */ 1784 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1785 enum amd_ip_block_type block_type) 1786 { 1787 int i, r; 1788 1789 for (i = 0; i < adev->num_ip_blocks; i++) { 1790 if (!adev->ip_blocks[i].status.valid) 1791 continue; 1792 if (adev->ip_blocks[i].version->type == block_type) { 1793 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1794 if (r) 1795 return r; 1796 break; 1797 } 1798 } 1799 return 0; 1800 1801 } 1802 1803 /** 1804 * amdgpu_device_ip_is_idle - is the hardware IP idle 1805 * 1806 * @adev: amdgpu_device pointer 1807 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1808 * 1809 * Check if the hardware IP is idle or not. 1810 * Returns true if it the IP is idle, false if not. 1811 */ 1812 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1813 enum amd_ip_block_type block_type) 1814 { 1815 int i; 1816 1817 for (i = 0; i < adev->num_ip_blocks; i++) { 1818 if (!adev->ip_blocks[i].status.valid) 1819 continue; 1820 if (adev->ip_blocks[i].version->type == block_type) 1821 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1822 } 1823 return true; 1824 1825 } 1826 1827 /** 1828 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1829 * 1830 * @adev: amdgpu_device pointer 1831 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1832 * 1833 * Returns a pointer to the hardware IP block structure 1834 * if it exists for the asic, otherwise NULL. 
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
                              enum amd_ip_block_type type)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++)
                if (adev->ip_blocks[i].version->type == type)
                        return &adev->ip_blocks[i];

        return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
                                       enum amd_ip_block_type type,
                                       u32 major, u32 minor)
{
        struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

        if (ip_block && ((ip_block->version->major > major) ||
                        ((ip_block->version->major == major) &&
                        (ip_block->version->minor >= minor))))
                return 0;

        return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
                               const struct amdgpu_ip_block_version *ip_block_version)
{
        if (!ip_block_version)
                return -EINVAL;

        switch (ip_block_version->type) {
        case AMD_IP_BLOCK_TYPE_VCN:
                if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
                        return 0;
                break;
        case AMD_IP_BLOCK_TYPE_JPEG:
                if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
                        return 0;
                break;
        default:
                break;
        }

        DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
                 ip_block_version->funcs->name);

        adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

        return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
        adev->enable_virtual_display = false;

#ifdef notyet
        if (amdgpu_virtual_display) {
                const char *pci_address_name = pci_name(adev->pdev);
                char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

                pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
                pciaddstr_tmp = pciaddstr;
                while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
                        pciaddname = strsep(&pciaddname_tmp, ",");
                        if (!strcmp("all", pciaddname)
                            || !strcmp(pci_address_name, pciaddname)) {
                                long num_crtc;
                                int res = -1;

                                adev->enable_virtual_display = true;

                                if (pciaddname_tmp)
                                        res = kstrtol(pciaddname_tmp, 10,
                                                      &num_crtc);

                                if (!res) {
                                        if (num_crtc < 1)
                                                num_crtc = 1;
                                        if (num_crtc > 6)
                                                num_crtc = 6;
                                        adev->mode_info.num_crtc = num_crtc;
                                } else {
                                        adev->mode_info.num_crtc = 1;
                                }
                                break;
                        }
                }

                DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
                         amdgpu_virtual_display, pci_address_name,
                         adev->enable_virtual_display, adev->mode_info.num_crtc);

                kfree(pciaddstr);
        }
#endif
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
        const char *chip_name;
        char fw_name[40];
        int err;
        const struct gpu_info_firmware_header_v1_0 *hdr;

        adev->firmware.gpu_info_fw = NULL;

        if (adev->mman.discovery_bin) {
                /*
                 * FIXME: The bounding box is still needed by Navi12, so
                 * temporarily read it from gpu_info firmware. Should be dropped
                 * when DAL no longer needs it.
1992 */ 1993 if (adev->asic_type != CHIP_NAVI12) 1994 return 0; 1995 } 1996 1997 switch (adev->asic_type) { 1998 default: 1999 return 0; 2000 case CHIP_VEGA10: 2001 chip_name = "vega10"; 2002 break; 2003 case CHIP_VEGA12: 2004 chip_name = "vega12"; 2005 break; 2006 case CHIP_RAVEN: 2007 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2008 chip_name = "raven2"; 2009 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2010 chip_name = "picasso"; 2011 else 2012 chip_name = "raven"; 2013 break; 2014 case CHIP_ARCTURUS: 2015 chip_name = "arcturus"; 2016 break; 2017 case CHIP_NAVI12: 2018 chip_name = "navi12"; 2019 break; 2020 } 2021 2022 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2023 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2024 if (err) { 2025 dev_err(adev->dev, 2026 "Failed to load gpu_info firmware \"%s\"\n", 2027 fw_name); 2028 goto out; 2029 } 2030 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2031 if (err) { 2032 dev_err(adev->dev, 2033 "Failed to validate gpu_info firmware \"%s\"\n", 2034 fw_name); 2035 goto out; 2036 } 2037 2038 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2039 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2040 2041 switch (hdr->version_major) { 2042 case 1: 2043 { 2044 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2045 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2046 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2047 2048 /* 2049 * Should be droped when DAL no longer needs it. 2050 */ 2051 if (adev->asic_type == CHIP_NAVI12) 2052 goto parse_soc_bounding_box; 2053 2054 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2055 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2056 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2057 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2058 adev->gfx.config.max_texture_channel_caches = 2059 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2060 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2061 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2062 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2063 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2064 adev->gfx.config.double_offchip_lds_buf = 2065 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2066 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2067 adev->gfx.cu_info.max_waves_per_simd = 2068 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2069 adev->gfx.cu_info.max_scratch_slots_per_cu = 2070 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2071 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2072 if (hdr->version_minor >= 1) { 2073 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2074 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2075 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2076 adev->gfx.config.num_sc_per_sh = 2077 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2078 adev->gfx.config.num_packer_per_sc = 2079 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2080 } 2081 2082 parse_soc_bounding_box: 2083 /* 2084 * soc bounding box info is not integrated in disocovery table, 2085 * we always need to parse it from gpu info firmware if needed. 
2086 */ 2087 if (hdr->version_minor == 2) { 2088 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2089 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2090 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2091 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2092 } 2093 break; 2094 } 2095 default: 2096 dev_err(adev->dev, 2097 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2098 err = -EINVAL; 2099 goto out; 2100 } 2101 out: 2102 return err; 2103 } 2104 2105 /** 2106 * amdgpu_device_ip_early_init - run early init for hardware IPs 2107 * 2108 * @adev: amdgpu_device pointer 2109 * 2110 * Early initialization pass for hardware IPs. The hardware IPs that make 2111 * up each asic are discovered each IP's early_init callback is run. This 2112 * is the first stage in initializing the asic. 2113 * Returns 0 on success, negative error code on failure. 2114 */ 2115 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2116 { 2117 struct drm_device *dev = adev_to_drm(adev); 2118 struct pci_dev *parent; 2119 int i, r; 2120 2121 amdgpu_device_enable_virtual_display(adev); 2122 2123 if (amdgpu_sriov_vf(adev)) { 2124 r = amdgpu_virt_request_full_gpu(adev, true); 2125 if (r) 2126 return r; 2127 } 2128 2129 switch (adev->asic_type) { 2130 #ifdef CONFIG_DRM_AMDGPU_SI 2131 case CHIP_VERDE: 2132 case CHIP_TAHITI: 2133 case CHIP_PITCAIRN: 2134 case CHIP_OLAND: 2135 case CHIP_HAINAN: 2136 adev->family = AMDGPU_FAMILY_SI; 2137 r = si_set_ip_blocks(adev); 2138 if (r) 2139 return r; 2140 break; 2141 #endif 2142 #ifdef CONFIG_DRM_AMDGPU_CIK 2143 case CHIP_BONAIRE: 2144 case CHIP_HAWAII: 2145 case CHIP_KAVERI: 2146 case CHIP_KABINI: 2147 case CHIP_MULLINS: 2148 if (adev->flags & AMD_IS_APU) 2149 adev->family = AMDGPU_FAMILY_KV; 2150 else 2151 adev->family = AMDGPU_FAMILY_CI; 2152 2153 r = cik_set_ip_blocks(adev); 2154 if (r) 2155 return r; 2156 break; 2157 #endif 2158 case CHIP_TOPAZ: 2159 case CHIP_TONGA: 2160 case CHIP_FIJI: 2161 case CHIP_POLARIS10: 2162 case CHIP_POLARIS11: 2163 case CHIP_POLARIS12: 2164 case CHIP_VEGAM: 2165 case CHIP_CARRIZO: 2166 case CHIP_STONEY: 2167 if (adev->flags & AMD_IS_APU) 2168 adev->family = AMDGPU_FAMILY_CZ; 2169 else 2170 adev->family = AMDGPU_FAMILY_VI; 2171 2172 r = vi_set_ip_blocks(adev); 2173 if (r) 2174 return r; 2175 break; 2176 default: 2177 r = amdgpu_discovery_set_ip_blocks(adev); 2178 if (r) 2179 return r; 2180 break; 2181 } 2182 2183 if (amdgpu_has_atpx() && 2184 (amdgpu_is_atpx_hybrid() || 2185 amdgpu_has_atpx_dgpu_power_cntl()) && 2186 ((adev->flags & AMD_IS_APU) == 0) && 2187 !pci_is_thunderbolt_attached(dev->pdev)) 2188 adev->flags |= AMD_IS_PX; 2189 2190 if (!(adev->flags & AMD_IS_APU)) { 2191 parent = pci_upstream_bridge(adev->pdev); 2192 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2193 } 2194 2195 amdgpu_amdkfd_device_probe(adev); 2196 2197 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2198 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2199 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2200 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2201 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2202 2203 for (i = 0; i < adev->num_ip_blocks; i++) { 2204 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2205 DRM_ERROR("disabled ip block: %d <%s>\n", 2206 i, adev->ip_blocks[i].version->funcs->name); 2207 adev->ip_blocks[i].status.valid = false; 2208 } else { 2209 if (adev->ip_blocks[i].version->funcs->early_init) { 2210 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2211 if (r == -ENOENT) { 2212 adev->ip_blocks[i].status.valid = false; 2213 } else if (r) { 2214 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2215 adev->ip_blocks[i].version->funcs->name, r); 2216 return r; 2217 } else { 2218 adev->ip_blocks[i].status.valid = true; 2219 } 2220 } else { 2221 adev->ip_blocks[i].status.valid = true; 2222 } 2223 } 2224 /* get the vbios after the asic_funcs are set up */ 2225 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2226 r = amdgpu_device_parse_gpu_info_fw(adev); 2227 if (r) 2228 return r; 2229 2230 /* Read BIOS */ 2231 if (!amdgpu_get_bios(adev)) 2232 return -EINVAL; 2233 2234 r = amdgpu_atombios_init(adev); 2235 if (r) { 2236 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2237 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2238 return r; 2239 } 2240 2241 /*get pf2vf msg info at it's earliest time*/ 2242 if (amdgpu_sriov_vf(adev)) 2243 amdgpu_virt_init_data_exchange(adev); 2244 2245 } 2246 } 2247 2248 adev->cg_flags &= amdgpu_cg_mask; 2249 adev->pg_flags &= amdgpu_pg_mask; 2250 2251 return 0; 2252 } 2253 2254 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2255 { 2256 int i, r; 2257 2258 for (i = 0; i < adev->num_ip_blocks; i++) { 2259 if (!adev->ip_blocks[i].status.sw) 2260 continue; 2261 if (adev->ip_blocks[i].status.hw) 2262 continue; 2263 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2264 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2265 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2266 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2267 if (r) { 2268 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2269 adev->ip_blocks[i].version->funcs->name, r); 2270 return r; 2271 } 2272 adev->ip_blocks[i].status.hw = true; 2273 } 2274 } 2275 2276 return 0; 2277 } 2278 2279 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2280 { 2281 int i, r; 2282 2283 for (i = 0; i < adev->num_ip_blocks; i++) { 2284 if (!adev->ip_blocks[i].status.sw) 2285 continue; 2286 if (adev->ip_blocks[i].status.hw) 2287 continue; 2288 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2289 if (r) { 2290 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2291 adev->ip_blocks[i].version->funcs->name, r); 2292 return r; 2293 } 2294 adev->ip_blocks[i].status.hw = true; 2295 } 2296 2297 return 0; 2298 } 2299 2300 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2301 { 2302 int r = 0; 2303 int i; 2304 uint32_t smu_version; 2305 2306 if (adev->asic_type >= CHIP_VEGA10) { 2307 for (i = 0; i < adev->num_ip_blocks; i++) { 2308 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2309 continue; 2310 2311 if 
(!adev->ip_blocks[i].status.sw) 2312 continue; 2313 2314 /* no need to do the fw loading again if already done*/ 2315 if (adev->ip_blocks[i].status.hw == true) 2316 break; 2317 2318 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2319 r = adev->ip_blocks[i].version->funcs->resume(adev); 2320 if (r) { 2321 DRM_ERROR("resume of IP block <%s> failed %d\n", 2322 adev->ip_blocks[i].version->funcs->name, r); 2323 return r; 2324 } 2325 } else { 2326 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2327 if (r) { 2328 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2329 adev->ip_blocks[i].version->funcs->name, r); 2330 return r; 2331 } 2332 } 2333 2334 adev->ip_blocks[i].status.hw = true; 2335 break; 2336 } 2337 } 2338 2339 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2340 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2341 2342 return r; 2343 } 2344 2345 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2346 { 2347 long timeout; 2348 int r, i; 2349 2350 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2351 struct amdgpu_ring *ring = adev->rings[i]; 2352 2353 /* No need to setup the GPU scheduler for rings that don't need it */ 2354 if (!ring || ring->no_scheduler) 2355 continue; 2356 2357 switch (ring->funcs->type) { 2358 case AMDGPU_RING_TYPE_GFX: 2359 timeout = adev->gfx_timeout; 2360 break; 2361 case AMDGPU_RING_TYPE_COMPUTE: 2362 timeout = adev->compute_timeout; 2363 break; 2364 case AMDGPU_RING_TYPE_SDMA: 2365 timeout = adev->sdma_timeout; 2366 break; 2367 default: 2368 timeout = adev->video_timeout; 2369 break; 2370 } 2371 2372 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2373 ring->num_hw_submission, amdgpu_job_hang_limit, 2374 timeout, adev->reset_domain->wq, 2375 ring->sched_score, ring->name, 2376 adev->dev); 2377 if (r) { 2378 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2379 ring->name); 2380 return r; 2381 } 2382 } 2383 2384 return 0; 2385 } 2386 2387 2388 /** 2389 * amdgpu_device_ip_init - run init for hardware IPs 2390 * 2391 * @adev: amdgpu_device pointer 2392 * 2393 * Main initialization pass for hardware IPs. The list of all the hardware 2394 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2395 * are run. sw_init initializes the software state associated with each IP 2396 * and hw_init initializes the hardware associated with each IP. 2397 * Returns 0 on success, negative error code on failure. 
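 * As a rough illustrative sketch of the ordering implemented below: sw_init runs for every valid block (COMMON and GMC also get an early hw_init so GPU memory can be allocated), then amdgpu_device_ip_hw_init_phase1() brings up COMMON/IH (and PSP for SR-IOV), amdgpu_device_fw_loading() takes care of PSP/SMU firmware, and amdgpu_device_ip_hw_init_phase2() initializes the remaining blocks.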
2398 */ 2399 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2400 { 2401 int i, r; 2402 2403 r = amdgpu_ras_init(adev); 2404 if (r) 2405 return r; 2406 2407 for (i = 0; i < adev->num_ip_blocks; i++) { 2408 if (!adev->ip_blocks[i].status.valid) 2409 continue; 2410 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2411 if (r) { 2412 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2413 adev->ip_blocks[i].version->funcs->name, r); 2414 goto init_failed; 2415 } 2416 adev->ip_blocks[i].status.sw = true; 2417 2418 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2419 /* need to do common hw init early so everything is set up for gmc */ 2420 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2421 if (r) { 2422 DRM_ERROR("hw_init %d failed %d\n", i, r); 2423 goto init_failed; 2424 } 2425 adev->ip_blocks[i].status.hw = true; 2426 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2427 /* need to do gmc hw init early so we can allocate gpu mem */ 2428 /* Try to reserve bad pages early */ 2429 if (amdgpu_sriov_vf(adev)) 2430 amdgpu_virt_exchange_data(adev); 2431 2432 r = amdgpu_device_vram_scratch_init(adev); 2433 if (r) { 2434 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2435 goto init_failed; 2436 } 2437 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2438 if (r) { 2439 DRM_ERROR("hw_init %d failed %d\n", i, r); 2440 goto init_failed; 2441 } 2442 r = amdgpu_device_wb_init(adev); 2443 if (r) { 2444 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2445 goto init_failed; 2446 } 2447 adev->ip_blocks[i].status.hw = true; 2448 2449 /* right after GMC hw init, we create CSA */ 2450 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2451 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2452 AMDGPU_GEM_DOMAIN_VRAM, 2453 AMDGPU_CSA_SIZE); 2454 if (r) { 2455 DRM_ERROR("allocate CSA failed %d\n", r); 2456 goto init_failed; 2457 } 2458 } 2459 } 2460 } 2461 2462 if (amdgpu_sriov_vf(adev)) 2463 amdgpu_virt_init_data_exchange(adev); 2464 2465 r = amdgpu_ib_pool_init(adev); 2466 if (r) { 2467 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2468 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2469 goto init_failed; 2470 } 2471 2472 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2473 if (r) 2474 goto init_failed; 2475 2476 r = amdgpu_device_ip_hw_init_phase1(adev); 2477 if (r) 2478 goto init_failed; 2479 2480 r = amdgpu_device_fw_loading(adev); 2481 if (r) 2482 goto init_failed; 2483 2484 r = amdgpu_device_ip_hw_init_phase2(adev); 2485 if (r) 2486 goto init_failed; 2487 2488 /* 2489 * retired pages will be loaded from eeprom and reserved here, 2490 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2491 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2492 * for I2C communication which only true at this point. 2493 * 2494 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2495 * failure from bad gpu situation and stop amdgpu init process 2496 * accordingly. For other failed cases, it will still release all 2497 * the resource and print error message, rather than returning one 2498 * negative value to upper level. 
2499 * 2500 * Note: theoretically, this should be called before all vram allocations 2501 * to protect retired page from abusing 2502 */ 2503 r = amdgpu_ras_recovery_init(adev); 2504 if (r) 2505 goto init_failed; 2506 2507 /** 2508 * In case of XGMI grab extra reference for reset domain for this device 2509 */ 2510 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2511 if (amdgpu_xgmi_add_device(adev) == 0) { 2512 if (!amdgpu_sriov_vf(adev)) { 2513 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2514 2515 if (WARN_ON(!hive)) { 2516 r = -ENOENT; 2517 goto init_failed; 2518 } 2519 2520 if (!hive->reset_domain || 2521 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2522 r = -ENOENT; 2523 amdgpu_put_xgmi_hive(hive); 2524 goto init_failed; 2525 } 2526 2527 /* Drop the early temporary reset domain we created for device */ 2528 amdgpu_reset_put_reset_domain(adev->reset_domain); 2529 adev->reset_domain = hive->reset_domain; 2530 amdgpu_put_xgmi_hive(hive); 2531 } 2532 } 2533 } 2534 2535 r = amdgpu_device_init_schedulers(adev); 2536 if (r) 2537 goto init_failed; 2538 2539 /* Don't init kfd if whole hive need to be reset during init */ 2540 if (!adev->gmc.xgmi.pending_reset) 2541 amdgpu_amdkfd_device_init(adev); 2542 2543 amdgpu_fru_get_product_info(adev); 2544 2545 init_failed: 2546 if (amdgpu_sriov_vf(adev)) 2547 amdgpu_virt_release_full_gpu(adev, true); 2548 2549 return r; 2550 } 2551 2552 /** 2553 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2554 * 2555 * @adev: amdgpu_device pointer 2556 * 2557 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2558 * this function before a GPU reset. If the value is retained after a 2559 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2560 */ 2561 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2562 { 2563 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2564 } 2565 2566 /** 2567 * amdgpu_device_check_vram_lost - check if vram is valid 2568 * 2569 * @adev: amdgpu_device pointer 2570 * 2571 * Checks the reset magic value written to the gart pointer in VRAM. 2572 * The driver calls this after a GPU reset to see if the contents of 2573 * VRAM is lost or now. 2574 * returns true if vram is lost, false if not. 2575 */ 2576 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2577 { 2578 if (memcmp(adev->gart.ptr, adev->reset_magic, 2579 AMDGPU_RESET_MAGIC_NUM)) 2580 return true; 2581 2582 if (!amdgpu_in_reset(adev)) 2583 return false; 2584 2585 /* 2586 * For all ASICs with baco/mode1 reset, the VRAM is 2587 * always assumed to be lost. 2588 */ 2589 switch (amdgpu_asic_reset_method(adev)) { 2590 case AMD_RESET_METHOD_BACO: 2591 case AMD_RESET_METHOD_MODE1: 2592 return true; 2593 default: 2594 return false; 2595 } 2596 } 2597 2598 /** 2599 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2600 * 2601 * @adev: amdgpu_device pointer 2602 * @state: clockgating state (gate or ungate) 2603 * 2604 * The list of all the hardware IPs that make up the asic is walked and the 2605 * set_clockgating_state callbacks are run. 2606 * Late initialization pass enabling clockgating for hardware IPs. 2607 * Fini or suspend, pass disabling clockgating for hardware IPs. 2608 * Returns 0 on success, negative error code on failure. 
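 * For instance, the late init path below gates clocks with amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE), while the fini and suspend paths call it again with AMD_CG_STATE_UNGATE.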
2609 */ 2610 2611 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2612 enum amd_clockgating_state state) 2613 { 2614 int i, j, r; 2615 2616 if (amdgpu_emu_mode == 1) 2617 return 0; 2618 2619 for (j = 0; j < adev->num_ip_blocks; j++) { 2620 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2621 if (!adev->ip_blocks[i].status.late_initialized) 2622 continue; 2623 /* skip CG for GFX on S0ix */ 2624 if (adev->in_s0ix && 2625 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2626 continue; 2627 /* skip CG for VCE/UVD, it's handled specially */ 2628 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2629 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2630 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2631 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2632 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2633 /* enable clockgating to save power */ 2634 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2635 state); 2636 if (r) { 2637 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2638 adev->ip_blocks[i].version->funcs->name, r); 2639 return r; 2640 } 2641 } 2642 } 2643 2644 return 0; 2645 } 2646 2647 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2648 enum amd_powergating_state state) 2649 { 2650 int i, j, r; 2651 2652 if (amdgpu_emu_mode == 1) 2653 return 0; 2654 2655 for (j = 0; j < adev->num_ip_blocks; j++) { 2656 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2657 if (!adev->ip_blocks[i].status.late_initialized) 2658 continue; 2659 /* skip PG for GFX on S0ix */ 2660 if (adev->in_s0ix && 2661 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2662 continue; 2663 /* skip CG for VCE/UVD, it's handled specially */ 2664 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2665 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2666 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2667 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2668 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2669 /* enable powergating to save power */ 2670 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2671 state); 2672 if (r) { 2673 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2674 adev->ip_blocks[i].version->funcs->name, r); 2675 return r; 2676 } 2677 } 2678 } 2679 return 0; 2680 } 2681 2682 static int amdgpu_device_enable_mgpu_fan_boost(void) 2683 { 2684 struct amdgpu_gpu_instance *gpu_ins; 2685 struct amdgpu_device *adev; 2686 int i, ret = 0; 2687 2688 mutex_lock(&mgpu_info.mutex); 2689 2690 /* 2691 * MGPU fan boost feature should be enabled 2692 * only when there are two or more dGPUs in 2693 * the system 2694 */ 2695 if (mgpu_info.num_dgpu < 2) 2696 goto out; 2697 2698 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2699 gpu_ins = &(mgpu_info.gpu_ins[i]); 2700 adev = gpu_ins->adev; 2701 if (!(adev->flags & AMD_IS_APU) && 2702 !gpu_ins->mgpu_fan_enabled) { 2703 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2704 if (ret) 2705 break; 2706 2707 gpu_ins->mgpu_fan_enabled = 1; 2708 } 2709 } 2710 2711 out: 2712 mutex_unlock(&mgpu_info.mutex); 2713 2714 return ret; 2715 } 2716 2717 /** 2718 * amdgpu_device_ip_late_init - run late init for hardware IPs 2719 * 2720 * @adev: amdgpu_device pointer 2721 * 2722 * Late initialization pass for hardware IPs. 
The list of all the hardware 2723 * IPs that make up the asic is walked and the late_init callbacks are run. 2724 * late_init covers any special initialization that an IP requires 2725 * after all of the IPs have been initialized or something that needs to happen 2726 * late in the init process. 2727 * Returns 0 on success, negative error code on failure. 2728 */ 2729 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2730 { 2731 struct amdgpu_gpu_instance *gpu_instance; 2732 int i = 0, r; 2733 2734 for (i = 0; i < adev->num_ip_blocks; i++) { 2735 if (!adev->ip_blocks[i].status.hw) 2736 continue; 2737 if (adev->ip_blocks[i].version->funcs->late_init) { 2738 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2739 if (r) { 2740 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2741 adev->ip_blocks[i].version->funcs->name, r); 2742 return r; 2743 } 2744 } 2745 adev->ip_blocks[i].status.late_initialized = true; 2746 } 2747 2748 r = amdgpu_ras_late_init(adev); 2749 if (r) { 2750 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2751 return r; 2752 } 2753 2754 amdgpu_ras_set_error_query_ready(adev, true); 2755 2756 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2757 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2758 2759 amdgpu_device_fill_reset_magic(adev); 2760 2761 r = amdgpu_device_enable_mgpu_fan_boost(); 2762 if (r) 2763 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2764 2765 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 2766 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2767 adev->asic_type == CHIP_ALDEBARAN)) 2768 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2769 2770 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2771 mutex_lock(&mgpu_info.mutex); 2772 2773 /* 2774 * Reset the device p-state to low, as the device boots in the high p-state. 2775 * 2776 * This should be performed only after all devices from the same 2777 * hive get initialized. 2778 * 2779 * However, we don't know in advance how many devices are in the hive, 2780 * since they are counted one by one as each device initializes. 2781 * 2782 * So we wait until all XGMI interlinked devices are initialized. 2783 * This may add some delay, as those devices may come from 2784 * different hives. But that should be OK.
2785 */ 2786 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2787 for (i = 0; i < mgpu_info.num_gpu; i++) { 2788 gpu_instance = &(mgpu_info.gpu_ins[i]); 2789 if (gpu_instance->adev->flags & AMD_IS_APU) 2790 continue; 2791 2792 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2793 AMDGPU_XGMI_PSTATE_MIN); 2794 if (r) { 2795 DRM_ERROR("pstate setting failed (%d).\n", r); 2796 break; 2797 } 2798 } 2799 } 2800 2801 mutex_unlock(&mgpu_info.mutex); 2802 } 2803 2804 return 0; 2805 } 2806 2807 /** 2808 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2809 * 2810 * @adev: amdgpu_device pointer 2811 * 2812 * For ASICs need to disable SMC first 2813 */ 2814 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2815 { 2816 int i, r; 2817 2818 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2819 return; 2820 2821 for (i = 0; i < adev->num_ip_blocks; i++) { 2822 if (!adev->ip_blocks[i].status.hw) 2823 continue; 2824 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2825 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2826 /* XXX handle errors */ 2827 if (r) { 2828 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2829 adev->ip_blocks[i].version->funcs->name, r); 2830 } 2831 adev->ip_blocks[i].status.hw = false; 2832 break; 2833 } 2834 } 2835 } 2836 2837 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2838 { 2839 int i, r; 2840 2841 for (i = 0; i < adev->num_ip_blocks; i++) { 2842 if (!adev->ip_blocks[i].version->funcs->early_fini) 2843 continue; 2844 2845 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2846 if (r) { 2847 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2848 adev->ip_blocks[i].version->funcs->name, r); 2849 } 2850 } 2851 2852 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2853 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2854 2855 amdgpu_amdkfd_suspend(adev, false); 2856 2857 /* Workaroud for ASICs need to disable SMC first */ 2858 amdgpu_device_smu_fini_early(adev); 2859 2860 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2861 if (!adev->ip_blocks[i].status.hw) 2862 continue; 2863 2864 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2865 /* XXX handle errors */ 2866 if (r) { 2867 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2868 adev->ip_blocks[i].version->funcs->name, r); 2869 } 2870 2871 adev->ip_blocks[i].status.hw = false; 2872 } 2873 2874 if (amdgpu_sriov_vf(adev)) { 2875 if (amdgpu_virt_release_full_gpu(adev, false)) 2876 DRM_ERROR("failed to release exclusive mode on fini\n"); 2877 } 2878 2879 return 0; 2880 } 2881 2882 /** 2883 * amdgpu_device_ip_fini - run fini for hardware IPs 2884 * 2885 * @adev: amdgpu_device pointer 2886 * 2887 * Main teardown pass for hardware IPs. The list of all the hardware 2888 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2889 * are run. hw_fini tears down the hardware associated with each IP 2890 * and sw_fini tears down any software state associated with each IP. 2891 * Returns 0 on success, negative error code on failure. 
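 * Roughly, teardown runs in the reverse of the init order: sw_fini walks from the last IP block back to the first, followed by late_fini in the same reverse order.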
2892 */ 2893 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2894 { 2895 int i, r; 2896 2897 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2898 amdgpu_virt_release_ras_err_handler_data(adev); 2899 2900 if (adev->gmc.xgmi.num_physical_nodes > 1) 2901 amdgpu_xgmi_remove_device(adev); 2902 2903 amdgpu_amdkfd_device_fini_sw(adev); 2904 2905 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2906 if (!adev->ip_blocks[i].status.sw) 2907 continue; 2908 2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2910 amdgpu_ucode_free_bo(adev); 2911 amdgpu_free_static_csa(&adev->virt.csa_obj); 2912 amdgpu_device_wb_fini(adev); 2913 amdgpu_device_vram_scratch_fini(adev); 2914 amdgpu_ib_pool_fini(adev); 2915 } 2916 2917 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2918 /* XXX handle errors */ 2919 if (r) { 2920 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2921 adev->ip_blocks[i].version->funcs->name, r); 2922 } 2923 adev->ip_blocks[i].status.sw = false; 2924 adev->ip_blocks[i].status.valid = false; 2925 } 2926 2927 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2928 if (!adev->ip_blocks[i].status.late_initialized) 2929 continue; 2930 if (adev->ip_blocks[i].version->funcs->late_fini) 2931 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2932 adev->ip_blocks[i].status.late_initialized = false; 2933 } 2934 2935 amdgpu_ras_fini(adev); 2936 2937 return 0; 2938 } 2939 2940 /** 2941 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2942 * 2943 * @work: work_struct. 2944 */ 2945 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2946 { 2947 struct amdgpu_device *adev = 2948 container_of(work, struct amdgpu_device, delayed_init_work.work); 2949 int r; 2950 2951 r = amdgpu_ib_ring_tests(adev); 2952 if (r) 2953 DRM_ERROR("ib ring test failed (%d).\n", r); 2954 } 2955 2956 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2957 { 2958 struct amdgpu_device *adev = 2959 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2960 2961 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2962 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2963 2964 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2965 adev->gfx.gfx_off_state = true; 2966 } 2967 2968 /** 2969 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2970 * 2971 * @adev: amdgpu_device pointer 2972 * 2973 * Main suspend function for hardware IPs. The list of all the hardware 2974 * IPs that make up the asic is walked, clockgating is disabled and the 2975 * suspend callbacks are run. suspend puts the hardware and software state 2976 * in each IP into a state suitable for suspend. 2977 * Returns 0 on success, negative error code on failure. 2978 */ 2979 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2980 { 2981 int i, r; 2982 2983 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2984 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2985 2986 /* 2987 * Per PMFW team's suggestion, driver needs to handle gfxoff 2988 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2989 * scenario. Add the missing df cstate disablement here. 
2990 */ 2991 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2992 dev_warn(adev->dev, "Failed to disallow df cstate"); 2993 2994 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2995 if (!adev->ip_blocks[i].status.valid) 2996 continue; 2997 2998 /* displays are handled separately */ 2999 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3000 continue; 3001 3002 /* XXX handle errors */ 3003 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3004 /* XXX handle errors */ 3005 if (r) { 3006 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3007 adev->ip_blocks[i].version->funcs->name, r); 3008 return r; 3009 } 3010 3011 adev->ip_blocks[i].status.hw = false; 3012 } 3013 3014 return 0; 3015 } 3016 3017 /** 3018 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3019 * 3020 * @adev: amdgpu_device pointer 3021 * 3022 * Main suspend function for hardware IPs. The list of all the hardware 3023 * IPs that make up the asic is walked, clockgating is disabled and the 3024 * suspend callbacks are run. suspend puts the hardware and software state 3025 * in each IP into a state suitable for suspend. 3026 * Returns 0 on success, negative error code on failure. 3027 */ 3028 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3029 { 3030 int i, r; 3031 3032 if (adev->in_s0ix) 3033 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3034 3035 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3036 if (!adev->ip_blocks[i].status.valid) 3037 continue; 3038 /* displays are handled in phase1 */ 3039 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3040 continue; 3041 /* PSP lost connection when err_event_athub occurs */ 3042 if (amdgpu_ras_intr_triggered() && 3043 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3044 adev->ip_blocks[i].status.hw = false; 3045 continue; 3046 } 3047 3048 /* skip unnecessary suspend if we do not initialize them yet */ 3049 if (adev->gmc.xgmi.pending_reset && 3050 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3051 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3052 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3053 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3054 adev->ip_blocks[i].status.hw = false; 3055 continue; 3056 } 3057 3058 /* skip suspend of gfx/mes and psp for S0ix 3059 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3060 * like at runtime. PSP is also part of the always on hardware 3061 * so no need to suspend it. 3062 */ 3063 if (adev->in_s0ix && 3064 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3065 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3066 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3067 continue; 3068 3069 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3070 if (adev->in_s0ix && 3071 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3072 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3073 continue; 3074 3075 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3076 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3077 * from this location and RLC Autoload automatically also gets loaded 3078 * from here based on PMFW -> PSP message during re-init sequence. 3079 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3080 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3081 */ 3082 if (amdgpu_in_reset(adev) && 3083 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3084 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3085 continue; 3086 3087 /* XXX handle errors */ 3088 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3089 /* XXX handle errors */ 3090 if (r) { 3091 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3092 adev->ip_blocks[i].version->funcs->name, r); 3093 } 3094 adev->ip_blocks[i].status.hw = false; 3095 /* handle putting the SMC in the appropriate state */ 3096 if(!amdgpu_sriov_vf(adev)){ 3097 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3098 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3099 if (r) { 3100 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3101 adev->mp1_state, r); 3102 return r; 3103 } 3104 } 3105 } 3106 } 3107 3108 return 0; 3109 } 3110 3111 /** 3112 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3113 * 3114 * @adev: amdgpu_device pointer 3115 * 3116 * Main suspend function for hardware IPs. The list of all the hardware 3117 * IPs that make up the asic is walked, clockgating is disabled and the 3118 * suspend callbacks are run. suspend puts the hardware and software state 3119 * in each IP into a state suitable for suspend. 3120 * Returns 0 on success, negative error code on failure. 3121 */ 3122 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3123 { 3124 int r; 3125 3126 if (amdgpu_sriov_vf(adev)) { 3127 amdgpu_virt_fini_data_exchange(adev); 3128 amdgpu_virt_request_full_gpu(adev, false); 3129 } 3130 3131 r = amdgpu_device_ip_suspend_phase1(adev); 3132 if (r) 3133 return r; 3134 r = amdgpu_device_ip_suspend_phase2(adev); 3135 3136 if (amdgpu_sriov_vf(adev)) 3137 amdgpu_virt_release_full_gpu(adev, false); 3138 3139 return r; 3140 } 3141 3142 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3143 { 3144 int i, r; 3145 3146 static enum amd_ip_block_type ip_order[] = { 3147 AMD_IP_BLOCK_TYPE_COMMON, 3148 AMD_IP_BLOCK_TYPE_GMC, 3149 AMD_IP_BLOCK_TYPE_PSP, 3150 AMD_IP_BLOCK_TYPE_IH, 3151 }; 3152 3153 for (i = 0; i < adev->num_ip_blocks; i++) { 3154 int j; 3155 struct amdgpu_ip_block *block; 3156 3157 block = &adev->ip_blocks[i]; 3158 block->status.hw = false; 3159 3160 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3161 3162 if (block->version->type != ip_order[j] || 3163 !block->status.valid) 3164 continue; 3165 3166 r = block->version->funcs->hw_init(adev); 3167 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3168 if (r) 3169 return r; 3170 block->status.hw = true; 3171 } 3172 } 3173 3174 return 0; 3175 } 3176 3177 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3178 { 3179 int i, r; 3180 3181 static enum amd_ip_block_type ip_order[] = { 3182 AMD_IP_BLOCK_TYPE_SMC, 3183 AMD_IP_BLOCK_TYPE_DCE, 3184 AMD_IP_BLOCK_TYPE_GFX, 3185 AMD_IP_BLOCK_TYPE_SDMA, 3186 AMD_IP_BLOCK_TYPE_UVD, 3187 AMD_IP_BLOCK_TYPE_VCE, 3188 AMD_IP_BLOCK_TYPE_VCN 3189 }; 3190 3191 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3192 int j; 3193 struct amdgpu_ip_block *block; 3194 3195 for (j = 0; j < adev->num_ip_blocks; j++) { 3196 block = &adev->ip_blocks[j]; 3197 3198 if (block->version->type != ip_order[i] || 3199 !block->status.valid || 3200 block->status.hw) 3201 continue; 3202 3203 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3204 r = block->version->funcs->resume(adev); 3205 else 3206 r = block->version->funcs->hw_init(adev); 3207 3208 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3209 if (r) 3210 return r; 3211 block->status.hw = true; 3212 } 3213 } 3214 3215 return 0; 3216 } 3217 3218 /** 3219 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3220 * 3221 * @adev: amdgpu_device pointer 3222 * 3223 * First resume function for hardware IPs. The list of all the hardware 3224 * IPs that make up the asic is walked and the resume callbacks are run for 3225 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3226 * after a suspend and updates the software state as necessary. This 3227 * function is also used for restoring the GPU after a GPU reset. 3228 * Returns 0 on success, negative error code on failure. 3229 */ 3230 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3231 { 3232 int i, r; 3233 3234 for (i = 0; i < adev->num_ip_blocks; i++) { 3235 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3236 continue; 3237 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3238 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3239 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3240 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3241 3242 r = adev->ip_blocks[i].version->funcs->resume(adev); 3243 if (r) { 3244 DRM_ERROR("resume of IP block <%s> failed %d\n", 3245 adev->ip_blocks[i].version->funcs->name, r); 3246 return r; 3247 } 3248 adev->ip_blocks[i].status.hw = true; 3249 } 3250 } 3251 3252 return 0; 3253 } 3254 3255 /** 3256 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3257 * 3258 * @adev: amdgpu_device pointer 3259 * 3260 * First resume function for hardware IPs. The list of all the hardware 3261 * IPs that make up the asic is walked and the resume callbacks are run for 3262 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3263 * functional state after a suspend and updates the software state as 3264 * necessary. This function is also used for restoring the GPU after a GPU 3265 * reset. 3266 * Returns 0 on success, negative error code on failure. 3267 */ 3268 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3269 { 3270 int i, r; 3271 3272 for (i = 0; i < adev->num_ip_blocks; i++) { 3273 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3274 continue; 3275 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3276 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3277 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3278 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3279 continue; 3280 r = adev->ip_blocks[i].version->funcs->resume(adev); 3281 if (r) { 3282 DRM_ERROR("resume of IP block <%s> failed %d\n", 3283 adev->ip_blocks[i].version->funcs->name, r); 3284 return r; 3285 } 3286 adev->ip_blocks[i].status.hw = true; 3287 3288 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3289 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in 3290 * amdgpu_device_resume() after IP resume. 3291 */ 3292 amdgpu_gfx_off_ctrl(adev, false); 3293 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n"); 3294 } 3295 3296 } 3297 3298 return 0; 3299 } 3300 3301 /** 3302 * amdgpu_device_ip_resume - run resume for hardware IPs 3303 * 3304 * @adev: amdgpu_device pointer 3305 * 3306 * Main resume function for hardware IPs. 
The hardware IPs 3307 * are split into two resume functions because they are also 3308 * used in recovering from a GPU reset, and some additional 3309 * steps need to be taken between them. In this case (S3/S4) they are 3310 * run sequentially. 3311 * Returns 0 on success, negative error code on failure. 3312 */ 3313 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3314 { 3315 int r; 3316 3317 r = amdgpu_amdkfd_resume_iommu(adev); 3318 if (r) 3319 return r; 3320 3321 r = amdgpu_device_ip_resume_phase1(adev); 3322 if (r) 3323 return r; 3324 3325 r = amdgpu_device_fw_loading(adev); 3326 if (r) 3327 return r; 3328 3329 r = amdgpu_device_ip_resume_phase2(adev); 3330 3331 return r; 3332 } 3333 3334 /** 3335 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3336 * 3337 * @adev: amdgpu_device pointer 3338 * 3339 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3340 */ 3341 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3342 { 3343 if (amdgpu_sriov_vf(adev)) { 3344 if (adev->is_atom_fw) { 3345 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3346 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3347 } else { 3348 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3349 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3350 } 3351 3352 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3353 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3354 } 3355 } 3356 3357 /** 3358 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3359 * 3360 * @asic_type: AMD asic type 3361 * 3362 * Check if there is DC (the new modesetting infrastructure) support for an asic. 3363 * Returns true if DC has support, false if not. 3364 */ 3365 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3366 { 3367 switch (asic_type) { 3368 #ifdef CONFIG_DRM_AMDGPU_SI 3369 case CHIP_HAINAN: 3370 #endif 3371 case CHIP_TOPAZ: 3372 /* chips with no display hardware */ 3373 return false; 3374 #if defined(CONFIG_DRM_AMD_DC) 3375 case CHIP_TAHITI: 3376 case CHIP_PITCAIRN: 3377 case CHIP_VERDE: 3378 case CHIP_OLAND: 3379 /* 3380 * We have systems in the wild with these ASICs that require 3381 * LVDS and VGA support which is not supported with DC. 3382 * 3383 * Fall back to the non-DC driver here by default so as not to 3384 * cause regressions. 3385 */ 3386 #if defined(CONFIG_DRM_AMD_DC_SI) 3387 return amdgpu_dc > 0; 3388 #else 3389 return false; 3390 #endif 3391 case CHIP_BONAIRE: 3392 case CHIP_KAVERI: 3393 case CHIP_KABINI: 3394 case CHIP_MULLINS: 3395 /* 3396 * We have systems in the wild with these ASICs that require 3397 * VGA support which is not supported with DC. 3398 * 3399 * Fall back to the non-DC driver here by default so as not to 3400 * cause regressions.
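 * Users who still want DC on these parts can opt in explicitly, e.g. by booting with the amdgpu.dc=1 module parameter (illustrative; the default here remains the non-DC path).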
3401 */ 3402 return amdgpu_dc > 0; 3403 default: 3404 return amdgpu_dc != 0; 3405 #else 3406 default: 3407 if (amdgpu_dc > 0) 3408 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3409 "but isn't supported by ASIC, ignoring\n"); 3410 return false; 3411 #endif 3412 } 3413 } 3414 3415 /** 3416 * amdgpu_device_has_dc_support - check if dc is supported 3417 * 3418 * @adev: amdgpu_device pointer 3419 * 3420 * Returns true for supported, false for not supported 3421 */ 3422 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3423 { 3424 if (amdgpu_sriov_vf(adev) || 3425 adev->enable_virtual_display || 3426 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3427 return false; 3428 3429 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3430 } 3431 3432 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3433 { 3434 struct amdgpu_device *adev = 3435 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3436 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3437 3438 /* It's a bug to not have a hive within this function */ 3439 if (WARN_ON(!hive)) 3440 return; 3441 3442 /* 3443 * Use task barrier to synchronize all xgmi reset works across the 3444 * hive. task_barrier_enter and task_barrier_exit will block 3445 * until all the threads running the xgmi reset works reach 3446 * those points. task_barrier_full will do both blocks. 3447 */ 3448 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3449 3450 task_barrier_enter(&hive->tb); 3451 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3452 3453 if (adev->asic_reset_res) 3454 goto fail; 3455 3456 task_barrier_exit(&hive->tb); 3457 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3458 3459 if (adev->asic_reset_res) 3460 goto fail; 3461 3462 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3463 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3464 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3465 } else { 3466 3467 task_barrier_full(&hive->tb); 3468 adev->asic_reset_res = amdgpu_asic_reset(adev); 3469 } 3470 3471 fail: 3472 if (adev->asic_reset_res) 3473 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3474 adev->asic_reset_res, adev_to_drm(adev)->unique); 3475 amdgpu_put_xgmi_hive(hive); 3476 } 3477 3478 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3479 { 3480 char *input = amdgpu_lockup_timeout; 3481 char *timeout_setting = NULL; 3482 int index = 0; 3483 long timeout; 3484 int ret = 0; 3485 3486 /* 3487 * By default timeout for non compute jobs is 10000 3488 * and 60000 for compute jobs. 3489 * In SR-IOV or passthrough mode, timeout for compute 3490 * jobs are 60000 by default. 3491 */ 3492 adev->gfx_timeout = msecs_to_jiffies(10000); 3493 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3494 if (amdgpu_sriov_vf(adev)) 3495 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3496 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3497 else 3498 adev->compute_timeout = msecs_to_jiffies(60000); 3499 3500 #ifdef notyet 3501 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3502 while ((timeout_setting = strsep(&input, ",")) && 3503 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3504 ret = kstrtol(timeout_setting, 0, &timeout); 3505 if (ret) 3506 return ret; 3507 3508 if (timeout == 0) { 3509 index++; 3510 continue; 3511 } else if (timeout < 0) { 3512 timeout = MAX_SCHEDULE_TIMEOUT; 3513 dev_warn(adev->dev, "lockup timeout disabled"); 3514 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3515 } else { 3516 timeout = msecs_to_jiffies(timeout); 3517 } 3518 3519 switch (index++) { 3520 case 0: 3521 adev->gfx_timeout = timeout; 3522 break; 3523 case 1: 3524 adev->compute_timeout = timeout; 3525 break; 3526 case 2: 3527 adev->sdma_timeout = timeout; 3528 break; 3529 case 3: 3530 adev->video_timeout = timeout; 3531 break; 3532 default: 3533 break; 3534 } 3535 } 3536 /* 3537 * There is only one value specified and 3538 * it should apply to all non-compute jobs. 3539 */ 3540 if (index == 1) { 3541 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3542 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3543 adev->compute_timeout = adev->gfx_timeout; 3544 } 3545 } 3546 #endif 3547 3548 return ret; 3549 } 3550 3551 /** 3552 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3553 * 3554 * @adev: amdgpu_device pointer 3555 * 3556 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3557 */ 3558 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3559 { 3560 #ifdef notyet 3561 struct iommu_domain *domain; 3562 3563 domain = iommu_get_domain_for_dev(adev->dev); 3564 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3565 #endif 3566 adev->ram_is_direct_mapped = true; 3567 } 3568 3569 static const struct attribute *amdgpu_dev_attributes[] = { 3570 &dev_attr_product_name.attr, 3571 &dev_attr_product_number.attr, 3572 &dev_attr_serial_number.attr, 3573 &dev_attr_pcie_replay_count.attr, 3574 NULL 3575 }; 3576 3577 /** 3578 * amdgpu_device_init - initialize the driver 3579 * 3580 * @adev: amdgpu_device pointer 3581 * @flags: driver flags 3582 * 3583 * Initializes the driver info and hw (all asics). 3584 * Returns 0 for success or an error on failure. 3585 * Called at driver startup. 
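 * At a high level, the bring-up sequence below is roughly: structure/lock setup, MMIO mapping, amdgpu_device_ip_early_init(), vBIOS post and atombios init, amdgpu_device_ip_init(), sysfs registration, and finally amdgpu_device_ip_late_init() (an illustrative summary, not an exhaustive list).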
3586 */ 3587 int amdgpu_device_init(struct amdgpu_device *adev, 3588 uint32_t flags) 3589 { 3590 struct drm_device *ddev = adev_to_drm(adev); 3591 struct pci_dev *pdev = adev->pdev; 3592 int r, i; 3593 bool px = false; 3594 u32 max_MBps; 3595 3596 adev->shutdown = false; 3597 adev->flags = flags; 3598 3599 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3600 adev->asic_type = amdgpu_force_asic_type; 3601 else 3602 adev->asic_type = flags & AMD_ASIC_MASK; 3603 3604 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3605 if (amdgpu_emu_mode == 1) 3606 adev->usec_timeout *= 10; 3607 adev->gmc.gart_size = 512 * 1024 * 1024; 3608 adev->accel_working = false; 3609 adev->num_rings = 0; 3610 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3611 adev->mman.buffer_funcs = NULL; 3612 adev->mman.buffer_funcs_ring = NULL; 3613 adev->vm_manager.vm_pte_funcs = NULL; 3614 adev->vm_manager.vm_pte_num_scheds = 0; 3615 adev->gmc.gmc_funcs = NULL; 3616 adev->harvest_ip_mask = 0x0; 3617 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3618 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3619 3620 adev->smc_rreg = &amdgpu_invalid_rreg; 3621 adev->smc_wreg = &amdgpu_invalid_wreg; 3622 adev->pcie_rreg = &amdgpu_invalid_rreg; 3623 adev->pcie_wreg = &amdgpu_invalid_wreg; 3624 adev->pciep_rreg = &amdgpu_invalid_rreg; 3625 adev->pciep_wreg = &amdgpu_invalid_wreg; 3626 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3627 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3628 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3629 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3630 adev->didt_rreg = &amdgpu_invalid_rreg; 3631 adev->didt_wreg = &amdgpu_invalid_wreg; 3632 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3633 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3634 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3635 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3636 3637 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3638 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3639 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3640 3641 /* mutex initialization are all done here so we 3642 * can recall function without having locking issues */ 3643 rw_init(&adev->firmware.mutex, "agfw"); 3644 rw_init(&adev->pm.mutex, "agpm"); 3645 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3646 rw_init(&adev->srbm_mutex, "srbm"); 3647 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3648 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3649 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3650 rw_init(&adev->mn_lock, "agpumn"); 3651 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3652 hash_init(adev->mn_hash); 3653 rw_init(&adev->psp.mutex, "agpsp"); 3654 rw_init(&adev->notifier_lock, "agnf"); 3655 rw_init(&adev->pm.stable_pstate_ctx_lock, "agps"); 3656 rw_init(&adev->benchmark_mutex, "agbm"); 3657 3658 amdgpu_device_init_apu_flags(adev); 3659 3660 r = amdgpu_device_check_arguments(adev); 3661 if (r) 3662 return r; 3663 3664 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3665 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3666 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3667 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3668 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3669 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3670 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3671 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3672 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3673 3674 INIT_LIST_HEAD(&adev->shadow_list); 3675 rw_init(&adev->shadow_list_lock, 
"sdwlst"); 3676 3677 INIT_LIST_HEAD(&adev->reset_list); 3678 3679 INIT_LIST_HEAD(&adev->ras_list); 3680 3681 INIT_DELAYED_WORK(&adev->delayed_init_work, 3682 amdgpu_device_delayed_init_work_handler); 3683 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3684 amdgpu_device_delay_enable_gfx_off); 3685 3686 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3687 3688 adev->gfx.gfx_off_req_count = 1; 3689 adev->gfx.gfx_off_residency = 0; 3690 adev->gfx.gfx_off_entrycount = 0; 3691 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3692 3693 atomic_set(&adev->throttling_logging_enabled, 1); 3694 /* 3695 * If throttling continues, logging will be performed every minute 3696 * to avoid log flooding. "-1" is subtracted since the thermal 3697 * throttling interrupt comes every second. Thus, the total logging 3698 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3699 * for throttling interrupt) = 60 seconds. 3700 */ 3701 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3702 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3703 3704 #ifdef __linux__ 3705 /* Registers mapping */ 3706 /* TODO: block userspace mapping of io register */ 3707 if (adev->asic_type >= CHIP_BONAIRE) { 3708 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3709 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3710 } else { 3711 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3712 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3713 } 3714 3715 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3716 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3717 3718 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3719 if (adev->rmmio == NULL) { 3720 return -ENOMEM; 3721 } 3722 #endif 3723 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3724 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3725 3726 amdgpu_device_get_pcie_info(adev); 3727 3728 if (amdgpu_mcbp) 3729 DRM_INFO("MCBP is enabled\n"); 3730 3731 /* 3732 * Reset domain needs to be present early, before XGMI hive discovered 3733 * (if any) and intitialized to use reset sem and in_gpu reset flag 3734 * early on during init and before calling to RREG32. 
3735 */ 3736 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3737 if (!adev->reset_domain) 3738 return -ENOMEM; 3739 3740 /* detect hw virtualization here */ 3741 amdgpu_detect_virtualization(adev); 3742 3743 r = amdgpu_device_get_job_timeout_settings(adev); 3744 if (r) { 3745 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3746 return r; 3747 } 3748 3749 /* early init functions */ 3750 r = amdgpu_device_ip_early_init(adev); 3751 if (r) 3752 return r; 3753 3754 /* Get rid of things like offb */ 3755 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3756 if (r) 3757 return r; 3758 3759 /* Enable TMZ based on IP_VERSION */ 3760 amdgpu_gmc_tmz_set(adev); 3761 3762 amdgpu_gmc_noretry_set(adev); 3763 /* Need to get xgmi info early to decide the reset behavior*/ 3764 if (adev->gmc.xgmi.supported) { 3765 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3766 if (r) 3767 return r; 3768 } 3769 3770 /* enable PCIE atomic ops */ 3771 #ifdef notyet 3772 if (amdgpu_sriov_vf(adev)) 3773 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3774 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3775 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3776 else 3777 adev->have_atomics_support = 3778 !pci_enable_atomic_ops_to_root(adev->pdev, 3779 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3780 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3781 if (!adev->have_atomics_support) 3782 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3783 #else 3784 adev->have_atomics_support = false; 3785 #endif 3786 3787 /* doorbell bar mapping and doorbell index init*/ 3788 amdgpu_device_doorbell_init(adev); 3789 3790 if (amdgpu_emu_mode == 1) { 3791 /* post the asic on emulation mode */ 3792 emu_soc_asic_init(adev); 3793 goto fence_driver_init; 3794 } 3795 3796 amdgpu_reset_init(adev); 3797 3798 /* detect if we are with an SRIOV vbios */ 3799 amdgpu_device_detect_sriov_bios(adev); 3800 3801 /* check if we need to reset the asic 3802 * E.g., driver was not cleanly unloaded previously, etc. 
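 * For XGMI hives the reset is deferred (gmc.xgmi.pending_reset) so that only the blocks the SMU needs are brought up here and the whole hive can be reset together later.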
3803 */ 3804 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3805 if (adev->gmc.xgmi.num_physical_nodes) { 3806 dev_info(adev->dev, "Pending hive reset.\n"); 3807 adev->gmc.xgmi.pending_reset = true; 3808 /* Only need to init necessary block for SMU to handle the reset */ 3809 for (i = 0; i < adev->num_ip_blocks; i++) { 3810 if (!adev->ip_blocks[i].status.valid) 3811 continue; 3812 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3813 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3814 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3815 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3816 DRM_DEBUG("IP %s disabled for hw_init.\n", 3817 adev->ip_blocks[i].version->funcs->name); 3818 adev->ip_blocks[i].status.hw = true; 3819 } 3820 } 3821 } else { 3822 r = amdgpu_asic_reset(adev); 3823 if (r) { 3824 dev_err(adev->dev, "asic reset on init failed\n"); 3825 goto failed; 3826 } 3827 } 3828 } 3829 3830 pci_enable_pcie_error_reporting(adev->pdev); 3831 3832 /* Post card if necessary */ 3833 if (amdgpu_device_need_post(adev)) { 3834 if (!adev->bios) { 3835 dev_err(adev->dev, "no vBIOS found\n"); 3836 r = -EINVAL; 3837 goto failed; 3838 } 3839 DRM_INFO("GPU posting now...\n"); 3840 r = amdgpu_device_asic_init(adev); 3841 if (r) { 3842 dev_err(adev->dev, "gpu post error!\n"); 3843 goto failed; 3844 } 3845 } 3846 3847 if (adev->is_atom_fw) { 3848 /* Initialize clocks */ 3849 r = amdgpu_atomfirmware_get_clock_info(adev); 3850 if (r) { 3851 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3852 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3853 goto failed; 3854 } 3855 } else { 3856 /* Initialize clocks */ 3857 r = amdgpu_atombios_get_clock_info(adev); 3858 if (r) { 3859 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3860 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3861 goto failed; 3862 } 3863 /* init i2c buses */ 3864 if (!amdgpu_device_has_dc_support(adev)) 3865 amdgpu_atombios_i2c_init(adev); 3866 } 3867 3868 fence_driver_init: 3869 /* Fence driver */ 3870 r = amdgpu_fence_driver_sw_init(adev); 3871 if (r) { 3872 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3873 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3874 goto failed; 3875 } 3876 3877 /* init the mode config */ 3878 drm_mode_config_init(adev_to_drm(adev)); 3879 3880 r = amdgpu_device_ip_init(adev); 3881 if (r) { 3882 /* failed in exclusive mode due to timeout */ 3883 if (amdgpu_sriov_vf(adev) && 3884 !amdgpu_sriov_runtime(adev) && 3885 amdgpu_virt_mmio_blocked(adev) && 3886 !amdgpu_virt_wait_reset(adev)) { 3887 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3888 /* Don't send request since VF is inactive. 
*/ 3889 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3890 adev->virt.ops = NULL; 3891 r = -EAGAIN; 3892 goto release_ras_con; 3893 } 3894 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3895 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3896 goto release_ras_con; 3897 } 3898 3899 amdgpu_fence_driver_hw_init(adev); 3900 3901 dev_info(adev->dev, 3902 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3903 adev->gfx.config.max_shader_engines, 3904 adev->gfx.config.max_sh_per_se, 3905 adev->gfx.config.max_cu_per_sh, 3906 adev->gfx.cu_info.number); 3907 3908 #ifdef __OpenBSD__ 3909 { 3910 const char *chip_name; 3911 uint32_t version = adev->ip_versions[GC_HWIP][0]; 3912 int maj, min, rev; 3913 3914 switch (adev->asic_type) { 3915 case CHIP_RAVEN: 3916 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3917 chip_name = "RAVEN2"; 3918 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3919 chip_name = "PICASSO"; 3920 else 3921 chip_name = "RAVEN"; 3922 break; 3923 case CHIP_RENOIR: 3924 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3925 chip_name = "RENOIR"; 3926 else 3927 chip_name = "GREEN_SARDINE"; 3928 break; 3929 default: 3930 chip_name = amdgpu_asic_name[adev->asic_type]; 3931 } 3932 3933 printf("%s: %s", adev->self.dv_xname, chip_name); 3934 /* show graphics/compute ip block version, not set on < GFX9 */ 3935 if (version) { 3936 maj = IP_VERSION_MAJ(version); 3937 min = IP_VERSION_MIN(version); 3938 rev = IP_VERSION_REV(version); 3939 printf(" GC %d.%d.%d", maj, min, rev); 3940 } 3941 printf(" %d CU rev 0x%02x\n", adev->gfx.cu_info.number, adev->rev_id); 3942 } 3943 #endif 3944 3945 adev->accel_working = true; 3946 3947 amdgpu_vm_check_compute_bug(adev); 3948 3949 /* Initialize the buffer migration limit. */ 3950 if (amdgpu_moverate >= 0) 3951 max_MBps = amdgpu_moverate; 3952 else 3953 max_MBps = 8; /* Allow 8 MB/s. */ 3954 /* Get a log2 for easy divisions. */ 3955 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3956 3957 r = amdgpu_pm_sysfs_init(adev); 3958 if (r) { 3959 adev->pm_sysfs_en = false; 3960 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3961 } else 3962 adev->pm_sysfs_en = true; 3963 3964 r = amdgpu_ucode_sysfs_init(adev); 3965 if (r) { 3966 adev->ucode_sysfs_en = false; 3967 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3968 } else 3969 adev->ucode_sysfs_en = true; 3970 3971 r = amdgpu_psp_sysfs_init(adev); 3972 if (r) { 3973 adev->psp_sysfs_en = false; 3974 if (!amdgpu_sriov_vf(adev)) 3975 DRM_ERROR("Creating psp sysfs failed\n"); 3976 } else 3977 adev->psp_sysfs_en = true; 3978 3979 /* 3980 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3981 * Otherwise the mgpu fan boost feature will be skipped due to the 3982 * gpu instance is counted less. 3983 */ 3984 amdgpu_register_gpu_instance(adev); 3985 3986 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3987 * explicit gating rather than handling it automatically. 3988 */ 3989 if (!adev->gmc.xgmi.pending_reset) { 3990 r = amdgpu_device_ip_late_init(adev); 3991 if (r) { 3992 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3993 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3994 goto release_ras_con; 3995 } 3996 /* must succeed. 
*/ 3997 amdgpu_ras_resume(adev); 3998 queue_delayed_work(system_wq, &adev->delayed_init_work, 3999 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4000 } 4001 4002 if (amdgpu_sriov_vf(adev)) 4003 flush_delayed_work(&adev->delayed_init_work); 4004 4005 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4006 if (r) 4007 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4008 4009 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4010 r = amdgpu_pmu_init(adev); 4011 if (r) 4012 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4013 4014 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4015 if (amdgpu_device_cache_pci_state(adev->pdev)) 4016 pci_restore_state(pdev); 4017 4018 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4019 /* this will fail for cards that aren't VGA class devices, just 4020 * ignore it */ 4021 #ifdef notyet 4022 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4023 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4024 #endif 4025 4026 if (amdgpu_device_supports_px(ddev)) { 4027 px = true; 4028 vga_switcheroo_register_client(adev->pdev, 4029 &amdgpu_switcheroo_ops, px); 4030 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4031 } 4032 4033 if (adev->gmc.xgmi.pending_reset) 4034 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4035 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4036 4037 amdgpu_device_check_iommu_direct_map(adev); 4038 4039 return 0; 4040 4041 release_ras_con: 4042 amdgpu_release_ras_context(adev); 4043 4044 failed: 4045 amdgpu_vf_error_trans_all(adev); 4046 4047 return r; 4048 } 4049 4050 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4051 { 4052 STUB(); 4053 #ifdef notyet 4054 /* Clear all CPU mappings pointing to this device */ 4055 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4056 #endif 4057 4058 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4059 amdgpu_device_doorbell_fini(adev); 4060 4061 #ifdef __linux__ 4062 iounmap(adev->rmmio); 4063 adev->rmmio = NULL; 4064 if (adev->mman.aper_base_kaddr) 4065 iounmap(adev->mman.aper_base_kaddr); 4066 adev->mman.aper_base_kaddr = NULL; 4067 #else 4068 if (adev->rmmio_size > 0) 4069 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4070 adev->rmmio_size); 4071 adev->rmmio_size = 0; 4072 adev->rmmio = NULL; 4073 if (adev->mman.aper_base_kaddr) 4074 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 4075 adev->gmc.visible_vram_size); 4076 adev->mman.aper_base_kaddr = NULL; 4077 #endif 4078 4079 /* Memory manager related */ 4080 if (!adev->gmc.xgmi.connected_to_cpu) { 4081 #ifdef __linux__ 4082 arch_phys_wc_del(adev->gmc.vram_mtrr); 4083 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4084 #else 4085 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 4086 #endif 4087 } 4088 } 4089 4090 /** 4091 * amdgpu_device_fini_hw - tear down the driver 4092 * 4093 * @adev: amdgpu_device pointer 4094 * 4095 * Tear down the driver info (all asics). 4096 * Called at driver shutdown. 
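 *
 * Note: this is the hardware-facing half of teardown (interrupts, fence
 * hardware, TTM DMA mappings); the remaining software state is released
 * afterwards in amdgpu_device_fini_sw().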
4097 */ 4098 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4099 { 4100 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4101 flush_delayed_work(&adev->delayed_init_work); 4102 adev->shutdown = true; 4103 4104 /* make sure IB test finished before entering exclusive mode 4105 * to avoid preemption on IB test 4106 * */ 4107 if (amdgpu_sriov_vf(adev)) { 4108 amdgpu_virt_request_full_gpu(adev, false); 4109 amdgpu_virt_fini_data_exchange(adev); 4110 } 4111 4112 /* disable all interrupts */ 4113 amdgpu_irq_disable_all(adev); 4114 if (adev->mode_info.mode_config_initialized){ 4115 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4116 drm_helper_force_disable_all(adev_to_drm(adev)); 4117 else 4118 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4119 } 4120 amdgpu_fence_driver_hw_fini(adev); 4121 4122 if (adev->mman.initialized) { 4123 flush_delayed_work(&adev->mman.bdev.wq); 4124 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 4125 } 4126 4127 if (adev->pm_sysfs_en) 4128 amdgpu_pm_sysfs_fini(adev); 4129 if (adev->ucode_sysfs_en) 4130 amdgpu_ucode_sysfs_fini(adev); 4131 if (adev->psp_sysfs_en) 4132 amdgpu_psp_sysfs_fini(adev); 4133 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4134 4135 /* disable ras feature must before hw fini */ 4136 amdgpu_ras_pre_fini(adev); 4137 4138 amdgpu_device_ip_fini_early(adev); 4139 4140 amdgpu_irq_fini_hw(adev); 4141 4142 if (adev->mman.initialized) 4143 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4144 4145 amdgpu_gart_dummy_page_fini(adev); 4146 4147 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4148 amdgpu_device_unmap_mmio(adev); 4149 4150 } 4151 4152 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4153 { 4154 int idx; 4155 4156 amdgpu_fence_driver_sw_fini(adev); 4157 amdgpu_device_ip_fini(adev); 4158 release_firmware(adev->firmware.gpu_info_fw); 4159 adev->firmware.gpu_info_fw = NULL; 4160 adev->accel_working = false; 4161 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4162 4163 amdgpu_reset_fini(adev); 4164 4165 /* free i2c buses */ 4166 if (!amdgpu_device_has_dc_support(adev)) 4167 amdgpu_i2c_fini(adev); 4168 4169 if (amdgpu_emu_mode != 1) 4170 amdgpu_atombios_fini(adev); 4171 4172 kfree(adev->bios); 4173 adev->bios = NULL; 4174 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4175 vga_switcheroo_unregister_client(adev->pdev); 4176 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4177 } 4178 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4179 vga_client_unregister(adev->pdev); 4180 4181 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4182 #ifdef __linux__ 4183 iounmap(adev->rmmio); 4184 adev->rmmio = NULL; 4185 #else 4186 if (adev->rmmio_size > 0) 4187 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 4188 adev->rmmio_size); 4189 adev->rmmio_size = 0; 4190 adev->rmmio = NULL; 4191 #endif 4192 amdgpu_device_doorbell_fini(adev); 4193 drm_dev_exit(idx); 4194 } 4195 4196 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4197 amdgpu_pmu_fini(adev); 4198 if (adev->mman.discovery_bin) 4199 amdgpu_discovery_fini(adev); 4200 4201 amdgpu_reset_put_reset_domain(adev->reset_domain); 4202 adev->reset_domain = NULL; 4203 4204 kfree(adev->pci_state); 4205 4206 } 4207 4208 /** 4209 * amdgpu_device_evict_resources - evict device resources 4210 * @adev: amdgpu device object 4211 * 4212 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4213 * of the vram memory type. Mainly used for evicting device resources 4214 * at suspend time. 
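 * Note: eviction is skipped on APUs when entering S3 or S0ix (see the
 * AMD_IS_APU check below), since their VRAM is carved out of system
 * memory and is not lost in those states.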
4215 * 4216 */ 4217 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4218 { 4219 int ret; 4220 4221 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4222 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4223 return 0; 4224 4225 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4226 if (ret) 4227 DRM_WARN("evicting device resources failed\n"); 4228 return ret; 4229 } 4230 4231 /* 4232 * Suspend & resume. 4233 */ 4234 /** 4235 * amdgpu_device_suspend - initiate device suspend 4236 * 4237 * @dev: drm dev pointer 4238 * @fbcon : notify the fbdev of suspend 4239 * 4240 * Puts the hw in the suspend state (all asics). 4241 * Returns 0 for success or an error on failure. 4242 * Called at driver suspend. 4243 */ 4244 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4245 { 4246 struct amdgpu_device *adev = drm_to_adev(dev); 4247 int r = 0; 4248 4249 if (adev->shutdown) 4250 return 0; 4251 4252 #ifdef notyet 4253 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4254 return 0; 4255 #endif 4256 4257 adev->in_suspend = true; 4258 4259 if (amdgpu_sriov_vf(adev)) { 4260 amdgpu_virt_fini_data_exchange(adev); 4261 r = amdgpu_virt_request_full_gpu(adev, false); 4262 if (r) 4263 return r; 4264 } 4265 4266 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4267 DRM_WARN("smart shift update failed\n"); 4268 4269 drm_kms_helper_poll_disable(dev); 4270 4271 if (fbcon) 4272 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4273 4274 cancel_delayed_work_sync(&adev->delayed_init_work); 4275 4276 amdgpu_ras_suspend(adev); 4277 4278 amdgpu_device_ip_suspend_phase1(adev); 4279 4280 if (!adev->in_s0ix) 4281 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4282 4283 r = amdgpu_device_evict_resources(adev); 4284 if (r) 4285 return r; 4286 4287 amdgpu_fence_driver_hw_fini(adev); 4288 4289 amdgpu_device_ip_suspend_phase2(adev); 4290 4291 if (amdgpu_sriov_vf(adev)) 4292 amdgpu_virt_release_full_gpu(adev, false); 4293 4294 return 0; 4295 } 4296 4297 /** 4298 * amdgpu_device_resume - initiate device resume 4299 * 4300 * @dev: drm dev pointer 4301 * @fbcon : notify the fbdev of resume 4302 * 4303 * Bring the hw back to operating state (all asics). 4304 * Returns 0 for success or an error on failure. 4305 * Called at driver resume. 
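 *
 * A minimal sketch of a caller, for illustration only (the helper name is
 * hypothetical and not part of this file):
 *
 *	static int my_pm_resume(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_resume(drm_dev, true);
 *	}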
4306 */ 4307 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4308 { 4309 struct amdgpu_device *adev = drm_to_adev(dev); 4310 int r = 0; 4311 4312 if (amdgpu_sriov_vf(adev)) { 4313 r = amdgpu_virt_request_full_gpu(adev, true); 4314 if (r) 4315 return r; 4316 } 4317 4318 #ifdef notyet 4319 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4320 return 0; 4321 #endif 4322 4323 if (adev->in_s0ix) 4324 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4325 4326 /* post card */ 4327 if (amdgpu_device_need_post(adev)) { 4328 r = amdgpu_device_asic_init(adev); 4329 if (r) 4330 dev_err(adev->dev, "amdgpu asic init failed\n"); 4331 } 4332 4333 r = amdgpu_device_ip_resume(adev); 4334 4335 /* no matter what r is, always need to properly release full GPU */ 4336 if (amdgpu_sriov_vf(adev)) { 4337 amdgpu_virt_init_data_exchange(adev); 4338 amdgpu_virt_release_full_gpu(adev, true); 4339 } 4340 4341 if (r) { 4342 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4343 return r; 4344 } 4345 amdgpu_fence_driver_hw_init(adev); 4346 4347 r = amdgpu_device_ip_late_init(adev); 4348 if (r) 4349 return r; 4350 4351 queue_delayed_work(system_wq, &adev->delayed_init_work, 4352 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4353 4354 if (!adev->in_s0ix) { 4355 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4356 if (r) 4357 return r; 4358 } 4359 4360 /* Make sure IB tests flushed */ 4361 flush_delayed_work(&adev->delayed_init_work); 4362 4363 if (adev->in_s0ix) { 4364 /* re-enable gfxoff after IP resume. This re-enables gfxoff after 4365 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2(). 4366 */ 4367 amdgpu_gfx_off_ctrl(adev, true); 4368 DRM_DEBUG("will enable gfxoff for the mission mode\n"); 4369 } 4370 if (fbcon) 4371 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4372 4373 drm_kms_helper_poll_enable(dev); 4374 4375 amdgpu_ras_resume(adev); 4376 4377 /* 4378 * Most of the connector probing functions try to acquire runtime pm 4379 * refs to ensure that the GPU is powered on when connector polling is 4380 * performed. Since we're calling this from a runtime PM callback, 4381 * trying to acquire rpm refs will cause us to deadlock. 4382 * 4383 * Since we're guaranteed to be holding the rpm lock, it's safe to 4384 * temporarily disable the rpm helpers so this doesn't deadlock us. 4385 */ 4386 #if defined(CONFIG_PM) && defined(__linux__) 4387 dev->dev->power.disable_depth++; 4388 #endif 4389 if (!amdgpu_device_has_dc_support(adev)) 4390 drm_helper_hpd_irq_event(dev); 4391 else 4392 drm_kms_helper_hotplug_event(dev); 4393 #if defined(CONFIG_PM) && defined(__linux__) 4394 dev->dev->power.disable_depth--; 4395 #endif 4396 adev->in_suspend = false; 4397 4398 if (adev->enable_mes) 4399 amdgpu_mes_self_test(adev); 4400 4401 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4402 DRM_WARN("smart shift update failed\n"); 4403 4404 return 0; 4405 } 4406 4407 /** 4408 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4409 * 4410 * @adev: amdgpu_device pointer 4411 * 4412 * The list of all the hardware IPs that make up the asic is walked and 4413 * the check_soft_reset callbacks are run. check_soft_reset determines 4414 * if the asic is still hung or not. 4415 * Returns true if any of the IPs are still in a hung state, false if not. 
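 * Note: SR-IOV VFs and ASICs for which amdgpu_asic_need_full_reset() is
 * true are reported as hung unconditionally, without consulting the
 * per-IP check_soft_reset callbacks.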
4416 */ 4417 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4418 { 4419 int i; 4420 bool asic_hang = false; 4421 4422 if (amdgpu_sriov_vf(adev)) 4423 return true; 4424 4425 if (amdgpu_asic_need_full_reset(adev)) 4426 return true; 4427 4428 for (i = 0; i < adev->num_ip_blocks; i++) { 4429 if (!adev->ip_blocks[i].status.valid) 4430 continue; 4431 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4432 adev->ip_blocks[i].status.hang = 4433 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4434 if (adev->ip_blocks[i].status.hang) { 4435 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4436 asic_hang = true; 4437 } 4438 } 4439 return asic_hang; 4440 } 4441 4442 /** 4443 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4444 * 4445 * @adev: amdgpu_device pointer 4446 * 4447 * The list of all the hardware IPs that make up the asic is walked and the 4448 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4449 * handles any IP specific hardware or software state changes that are 4450 * necessary for a soft reset to succeed. 4451 * Returns 0 on success, negative error code on failure. 4452 */ 4453 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4454 { 4455 int i, r = 0; 4456 4457 for (i = 0; i < adev->num_ip_blocks; i++) { 4458 if (!adev->ip_blocks[i].status.valid) 4459 continue; 4460 if (adev->ip_blocks[i].status.hang && 4461 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4462 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4463 if (r) 4464 return r; 4465 } 4466 } 4467 4468 return 0; 4469 } 4470 4471 /** 4472 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4473 * 4474 * @adev: amdgpu_device pointer 4475 * 4476 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4477 * reset is necessary to recover. 4478 * Returns true if a full asic reset is required, false if not. 4479 */ 4480 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4481 { 4482 int i; 4483 4484 if (amdgpu_asic_need_full_reset(adev)) 4485 return true; 4486 4487 for (i = 0; i < adev->num_ip_blocks; i++) { 4488 if (!adev->ip_blocks[i].status.valid) 4489 continue; 4490 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4491 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4492 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4493 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4494 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4495 if (adev->ip_blocks[i].status.hang) { 4496 dev_info(adev->dev, "Some block need full reset!\n"); 4497 return true; 4498 } 4499 } 4500 } 4501 return false; 4502 } 4503 4504 /** 4505 * amdgpu_device_ip_soft_reset - do a soft reset 4506 * 4507 * @adev: amdgpu_device pointer 4508 * 4509 * The list of all the hardware IPs that make up the asic is walked and the 4510 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4511 * IP specific hardware or software state changes that are necessary to soft 4512 * reset the IP. 4513 * Returns 0 on success, negative error code on failure. 
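 * Only blocks that were flagged as hung by
 * amdgpu_device_ip_check_soft_reset() and that implement a soft_reset
 * callback are touched.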
4514 */ 4515 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4516 { 4517 int i, r = 0; 4518 4519 for (i = 0; i < adev->num_ip_blocks; i++) { 4520 if (!adev->ip_blocks[i].status.valid) 4521 continue; 4522 if (adev->ip_blocks[i].status.hang && 4523 adev->ip_blocks[i].version->funcs->soft_reset) { 4524 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4525 if (r) 4526 return r; 4527 } 4528 } 4529 4530 return 0; 4531 } 4532 4533 /** 4534 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4535 * 4536 * @adev: amdgpu_device pointer 4537 * 4538 * The list of all the hardware IPs that make up the asic is walked and the 4539 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4540 * handles any IP specific hardware or software state changes that are 4541 * necessary after the IP has been soft reset. 4542 * Returns 0 on success, negative error code on failure. 4543 */ 4544 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4545 { 4546 int i, r = 0; 4547 4548 for (i = 0; i < adev->num_ip_blocks; i++) { 4549 if (!adev->ip_blocks[i].status.valid) 4550 continue; 4551 if (adev->ip_blocks[i].status.hang && 4552 adev->ip_blocks[i].version->funcs->post_soft_reset) 4553 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4554 if (r) 4555 return r; 4556 } 4557 4558 return 0; 4559 } 4560 4561 /** 4562 * amdgpu_device_recover_vram - Recover some VRAM contents 4563 * 4564 * @adev: amdgpu_device pointer 4565 * 4566 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4567 * restore things like GPUVM page tables after a GPU reset where 4568 * the contents of VRAM might be lost. 4569 * 4570 * Returns: 4571 * 0 on success, negative error code on failure. 4572 */ 4573 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4574 { 4575 struct dma_fence *fence = NULL, *next = NULL; 4576 struct amdgpu_bo *shadow; 4577 struct amdgpu_bo_vm *vmbo; 4578 long r = 1, tmo; 4579 4580 if (amdgpu_sriov_runtime(adev)) 4581 tmo = msecs_to_jiffies(8000); 4582 else 4583 tmo = msecs_to_jiffies(100); 4584 4585 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4586 mutex_lock(&adev->shadow_list_lock); 4587 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4588 shadow = &vmbo->bo; 4589 /* No need to recover an evicted BO */ 4590 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4591 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4592 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4593 continue; 4594 4595 r = amdgpu_bo_restore_shadow(shadow, &next); 4596 if (r) 4597 break; 4598 4599 if (fence) { 4600 tmo = dma_fence_wait_timeout(fence, false, tmo); 4601 dma_fence_put(fence); 4602 fence = next; 4603 if (tmo == 0) { 4604 r = -ETIMEDOUT; 4605 break; 4606 } else if (tmo < 0) { 4607 r = tmo; 4608 break; 4609 } 4610 } else { 4611 fence = next; 4612 } 4613 } 4614 mutex_unlock(&adev->shadow_list_lock); 4615 4616 if (fence) 4617 tmo = dma_fence_wait_timeout(fence, false, tmo); 4618 dma_fence_put(fence); 4619 4620 if (r < 0 || tmo <= 0) { 4621 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4622 return -EIO; 4623 } 4624 4625 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4626 return 0; 4627 } 4628 4629 4630 /** 4631 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4632 * 4633 * @adev: amdgpu_device pointer 4634 * @from_hypervisor: request from hypervisor 4635 * 4636 * do VF FLR and reinitialize Asic 4637 * return 0 means succeeded 
otherwise failed 4638 */ 4639 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4640 bool from_hypervisor) 4641 { 4642 int r; 4643 struct amdgpu_hive_info *hive = NULL; 4644 int retry_limit = 0; 4645 4646 retry: 4647 amdgpu_amdkfd_pre_reset(adev); 4648 4649 if (from_hypervisor) 4650 r = amdgpu_virt_request_full_gpu(adev, true); 4651 else 4652 r = amdgpu_virt_reset_gpu(adev); 4653 if (r) 4654 return r; 4655 4656 /* Resume IP prior to SMC */ 4657 r = amdgpu_device_ip_reinit_early_sriov(adev); 4658 if (r) 4659 goto error; 4660 4661 amdgpu_virt_init_data_exchange(adev); 4662 4663 r = amdgpu_device_fw_loading(adev); 4664 if (r) 4665 return r; 4666 4667 /* now we are okay to resume SMC/CP/SDMA */ 4668 r = amdgpu_device_ip_reinit_late_sriov(adev); 4669 if (r) 4670 goto error; 4671 4672 hive = amdgpu_get_xgmi_hive(adev); 4673 /* Update PSP FW topology after reset */ 4674 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4675 r = amdgpu_xgmi_update_topology(hive, adev); 4676 4677 if (hive) 4678 amdgpu_put_xgmi_hive(hive); 4679 4680 if (!r) { 4681 amdgpu_irq_gpu_reset_resume_helper(adev); 4682 r = amdgpu_ib_ring_tests(adev); 4683 4684 amdgpu_amdkfd_post_reset(adev); 4685 } 4686 4687 error: 4688 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4689 amdgpu_inc_vram_lost(adev); 4690 r = amdgpu_device_recover_vram(adev); 4691 } 4692 amdgpu_virt_release_full_gpu(adev, true); 4693 4694 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4695 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4696 retry_limit++; 4697 goto retry; 4698 } else 4699 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4700 } 4701 4702 return r; 4703 } 4704 4705 /** 4706 * amdgpu_device_has_job_running - check if there is any job in mirror list 4707 * 4708 * @adev: amdgpu_device pointer 4709 * 4710 * check if there is any job in mirror list 4711 */ 4712 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4713 { 4714 int i; 4715 struct drm_sched_job *job; 4716 4717 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4718 struct amdgpu_ring *ring = adev->rings[i]; 4719 4720 if (!ring || !ring->sched.thread) 4721 continue; 4722 4723 spin_lock(&ring->sched.job_list_lock); 4724 job = list_first_entry_or_null(&ring->sched.pending_list, 4725 struct drm_sched_job, list); 4726 spin_unlock(&ring->sched.job_list_lock); 4727 if (job) 4728 return true; 4729 } 4730 return false; 4731 } 4732 4733 /** 4734 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4735 * 4736 * @adev: amdgpu_device pointer 4737 * 4738 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4739 * a hung GPU. 
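 * The amdgpu_gpu_recovery module parameter steers the decision: 0 disables
 * recovery, -1 (auto) disables it only on the older ASICs listed below,
 * and any other value leaves it enabled.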
4740 */ 4741 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4742 { 4743 4744 if (amdgpu_gpu_recovery == 0) 4745 goto disabled; 4746 4747 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4748 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4749 return false; 4750 } 4751 4752 if (amdgpu_sriov_vf(adev)) 4753 return true; 4754 4755 if (amdgpu_gpu_recovery == -1) { 4756 switch (adev->asic_type) { 4757 #ifdef CONFIG_DRM_AMDGPU_SI 4758 case CHIP_VERDE: 4759 case CHIP_TAHITI: 4760 case CHIP_PITCAIRN: 4761 case CHIP_OLAND: 4762 case CHIP_HAINAN: 4763 #endif 4764 #ifdef CONFIG_DRM_AMDGPU_CIK 4765 case CHIP_KAVERI: 4766 case CHIP_KABINI: 4767 case CHIP_MULLINS: 4768 #endif 4769 case CHIP_CARRIZO: 4770 case CHIP_STONEY: 4771 case CHIP_CYAN_SKILLFISH: 4772 goto disabled; 4773 default: 4774 break; 4775 } 4776 } 4777 4778 return true; 4779 4780 disabled: 4781 dev_info(adev->dev, "GPU recovery disabled.\n"); 4782 return false; 4783 } 4784 4785 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4786 { 4787 u32 i; 4788 int ret = 0; 4789 4790 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4791 4792 dev_info(adev->dev, "GPU mode1 reset\n"); 4793 4794 /* disable BM */ 4795 pci_clear_master(adev->pdev); 4796 4797 amdgpu_device_cache_pci_state(adev->pdev); 4798 4799 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4800 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4801 ret = amdgpu_dpm_mode1_reset(adev); 4802 } else { 4803 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4804 ret = psp_gpu_reset(adev); 4805 } 4806 4807 if (ret) 4808 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4809 4810 amdgpu_device_load_pci_state(adev->pdev); 4811 4812 /* wait for asic to come out of reset */ 4813 for (i = 0; i < adev->usec_timeout; i++) { 4814 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4815 4816 if (memsize != 0xffffffff) 4817 break; 4818 udelay(1); 4819 } 4820 4821 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4822 return ret; 4823 } 4824 4825 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4826 struct amdgpu_reset_context *reset_context) 4827 { 4828 int i, r = 0; 4829 struct amdgpu_job *job = NULL; 4830 bool need_full_reset = 4831 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4832 4833 if (reset_context->reset_req_dev == adev) 4834 job = reset_context->job; 4835 4836 if (amdgpu_sriov_vf(adev)) { 4837 /* stop the data exchange thread */ 4838 amdgpu_virt_fini_data_exchange(adev); 4839 } 4840 4841 amdgpu_fence_driver_isr_toggle(adev, true); 4842 4843 /* block all schedulers and reset given job's ring */ 4844 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4845 struct amdgpu_ring *ring = adev->rings[i]; 4846 4847 if (!ring || !ring->sched.thread) 4848 continue; 4849 4850 /*clear job fence from fence drv to avoid force_completion 4851 *leave NULL and vm flush fence in fence drv */ 4852 amdgpu_fence_driver_clear_job_fences(ring); 4853 4854 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4855 amdgpu_fence_driver_force_completion(ring); 4856 } 4857 4858 amdgpu_fence_driver_isr_toggle(adev, false); 4859 4860 if (job && job->vm) 4861 drm_sched_increase_karma(&job->base); 4862 4863 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4864 /* If reset handler not implemented, continue; otherwise return */ 4865 if (r == -ENOSYS) 4866 r = 0; 4867 else 4868 return r; 4869 4870 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4871 if (!amdgpu_sriov_vf(adev)) { 4872 4873 if (!need_full_reset) 4874 
need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4875 4876 if (!need_full_reset && amdgpu_gpu_recovery) { 4877 amdgpu_device_ip_pre_soft_reset(adev); 4878 r = amdgpu_device_ip_soft_reset(adev); 4879 amdgpu_device_ip_post_soft_reset(adev); 4880 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4881 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4882 need_full_reset = true; 4883 } 4884 } 4885 4886 if (need_full_reset) 4887 r = amdgpu_device_ip_suspend(adev); 4888 if (need_full_reset) 4889 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4890 else 4891 clear_bit(AMDGPU_NEED_FULL_RESET, 4892 &reset_context->flags); 4893 } 4894 4895 return r; 4896 } 4897 4898 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4899 { 4900 int i; 4901 4902 lockdep_assert_held(&adev->reset_domain->sem); 4903 4904 for (i = 0; i < adev->num_regs; i++) { 4905 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4906 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4907 adev->reset_dump_reg_value[i]); 4908 } 4909 4910 return 0; 4911 } 4912 4913 #ifdef CONFIG_DEV_COREDUMP 4914 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4915 size_t count, void *data, size_t datalen) 4916 { 4917 struct drm_printer p; 4918 struct amdgpu_device *adev = data; 4919 struct drm_print_iterator iter; 4920 int i; 4921 4922 iter.data = buffer; 4923 iter.offset = 0; 4924 iter.start = offset; 4925 iter.remain = count; 4926 4927 p = drm_coredump_printer(&iter); 4928 4929 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4930 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4931 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4932 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4933 if (adev->reset_task_info.pid) 4934 drm_printf(&p, "process_name: %s PID: %d\n", 4935 adev->reset_task_info.process_name, 4936 adev->reset_task_info.pid); 4937 4938 if (adev->reset_vram_lost) 4939 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4940 if (adev->num_regs) { 4941 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4942 4943 for (i = 0; i < adev->num_regs; i++) 4944 drm_printf(&p, "0x%08x: 0x%08x\n", 4945 adev->reset_dump_reg_list[i], 4946 adev->reset_dump_reg_value[i]); 4947 } 4948 4949 return count - iter.remain; 4950 } 4951 4952 static void amdgpu_devcoredump_free(void *data) 4953 { 4954 } 4955 4956 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4957 { 4958 struct drm_device *dev = adev_to_drm(adev); 4959 4960 ktime_get_ts64(&adev->reset_time); 4961 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4962 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4963 } 4964 #endif 4965 4966 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4967 struct amdgpu_reset_context *reset_context) 4968 { 4969 struct amdgpu_device *tmp_adev = NULL; 4970 bool need_full_reset, skip_hw_reset, vram_lost = false; 4971 int r = 0; 4972 bool gpu_reset_for_dev_remove = 0; 4973 4974 /* Try reset handler method first */ 4975 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4976 reset_list); 4977 amdgpu_reset_reg_dumps(tmp_adev); 4978 4979 reset_context->reset_device_list = device_list_handle; 4980 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4981 /* If reset handler not implemented, continue; otherwise return */ 4982 if (r == -ENOSYS) 4983 r = 0; 4984 else 4985 return r; 4986 4987 /* Reset handler not implemented, use the default method */ 4988 
need_full_reset = 4989 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4990 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4991 4992 gpu_reset_for_dev_remove = 4993 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4994 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4995 4996 /* 4997 * ASIC reset has to be done on all XGMI hive nodes ASAP 4998 * to allow proper links negotiation in FW (within 1 sec) 4999 */ 5000 if (!skip_hw_reset && need_full_reset) { 5001 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5002 /* For XGMI run all resets in parallel to speed up the process */ 5003 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5004 tmp_adev->gmc.xgmi.pending_reset = false; 5005 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5006 r = -EALREADY; 5007 } else 5008 r = amdgpu_asic_reset(tmp_adev); 5009 5010 if (r) { 5011 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5012 r, adev_to_drm(tmp_adev)->unique); 5013 break; 5014 } 5015 } 5016 5017 /* For XGMI wait for all resets to complete before proceed */ 5018 if (!r) { 5019 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5020 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5021 flush_work(&tmp_adev->xgmi_reset_work); 5022 r = tmp_adev->asic_reset_res; 5023 if (r) 5024 break; 5025 } 5026 } 5027 } 5028 } 5029 5030 if (!r && amdgpu_ras_intr_triggered()) { 5031 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5032 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5033 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5034 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5035 } 5036 5037 amdgpu_ras_intr_cleared(); 5038 } 5039 5040 /* Since the mode1 reset affects base ip blocks, the 5041 * phase1 ip blocks need to be resumed. Otherwise there 5042 * will be a BIOS signature error and the psp bootloader 5043 * can't load kdb on the next amdgpu install. 5044 */ 5045 if (gpu_reset_for_dev_remove) { 5046 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5047 amdgpu_device_ip_resume_phase1(tmp_adev); 5048 5049 goto end; 5050 } 5051 5052 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5053 if (need_full_reset) { 5054 /* post card */ 5055 r = amdgpu_device_asic_init(tmp_adev); 5056 if (r) { 5057 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5058 } else { 5059 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5060 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 5061 if (r) 5062 goto out; 5063 5064 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5065 if (r) 5066 goto out; 5067 5068 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5069 #ifdef CONFIG_DEV_COREDUMP 5070 tmp_adev->reset_vram_lost = vram_lost; 5071 memset(&tmp_adev->reset_task_info, 0, 5072 sizeof(tmp_adev->reset_task_info)); 5073 if (reset_context->job && reset_context->job->vm) 5074 tmp_adev->reset_task_info = 5075 reset_context->job->vm->task_info; 5076 amdgpu_reset_capture_coredumpm(tmp_adev); 5077 #endif 5078 if (vram_lost) { 5079 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5080 amdgpu_inc_vram_lost(tmp_adev); 5081 } 5082 5083 r = amdgpu_device_fw_loading(tmp_adev); 5084 if (r) 5085 return r; 5086 5087 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5088 if (r) 5089 goto out; 5090 5091 if (vram_lost) 5092 amdgpu_device_fill_reset_magic(tmp_adev); 5093 5094 /* 5095 * Add this ASIC as tracked as reset was already 5096 * complete successfully. 
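 * (This mirrors the amdgpu_unregister_gpu_instance() call made while the
 * devices were being prepared for reset.)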
5097 */ 5098 amdgpu_register_gpu_instance(tmp_adev); 5099 5100 if (!reset_context->hive && 5101 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5102 amdgpu_xgmi_add_device(tmp_adev); 5103 5104 r = amdgpu_device_ip_late_init(tmp_adev); 5105 if (r) 5106 goto out; 5107 5108 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5109 5110 /* 5111 * The GPU enters a bad state once the number of faulty pages 5112 * detected by ECC has reached the threshold, and RAS 5113 * recovery is scheduled next. So add one check 5114 * here to break recovery if it indeed exceeds the 5115 * bad page threshold, and remind the user to 5116 * retire this GPU or set a bigger 5117 * bad_page_threshold value to fix this before 5118 * probing the driver again. 5119 */ 5120 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5121 /* must succeed. */ 5122 amdgpu_ras_resume(tmp_adev); 5123 } else { 5124 r = -EINVAL; 5125 goto out; 5126 } 5127 5128 /* Update PSP FW topology after reset */ 5129 if (reset_context->hive && 5130 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5131 r = amdgpu_xgmi_update_topology( 5132 reset_context->hive, tmp_adev); 5133 } 5134 } 5135 5136 out: 5137 if (!r) { 5138 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5139 r = amdgpu_ib_ring_tests(tmp_adev); 5140 if (r) { 5141 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5142 need_full_reset = true; 5143 r = -EAGAIN; 5144 goto end; 5145 } 5146 } 5147 5148 if (!r) 5149 r = amdgpu_device_recover_vram(tmp_adev); 5150 else 5151 tmp_adev->asic_reset_res = r; 5152 } 5153 5154 end: 5155 if (need_full_reset) 5156 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5157 else 5158 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5159 return r; 5160 } 5161 5162 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5163 { 5164 5165 switch (amdgpu_asic_reset_method(adev)) { 5166 case AMD_RESET_METHOD_MODE1: 5167 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5168 break; 5169 case AMD_RESET_METHOD_MODE2: 5170 adev->mp1_state = PP_MP1_STATE_RESET; 5171 break; 5172 default: 5173 adev->mp1_state = PP_MP1_STATE_NONE; 5174 break; 5175 } 5176 5177 5178 } 5179 5180 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5181 { 5182 amdgpu_vf_error_trans_all(adev); 5183 adev->mp1_state = PP_MP1_STATE_NONE; 5184 } 5185 5186 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5187 { 5188 STUB(); 5189 #ifdef notyet 5190 struct pci_dev *p = NULL; 5191 5192 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5193 adev->pdev->bus->number, 1); 5194 if (p) { 5195 pm_runtime_enable(&(p->dev)); 5196 pm_runtime_resume(&(p->dev)); 5197 } 5198 #endif 5199 } 5200 5201 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5202 { 5203 enum amd_reset_method reset_method; 5204 struct pci_dev *p = NULL; 5205 u64 expires; 5206 5207 /* 5208 * For now, only BACO and mode1 reset are confirmed 5209 * to suffer the audio issue without being properly suspended.
5210 */ 5211 reset_method = amdgpu_asic_reset_method(adev); 5212 if ((reset_method != AMD_RESET_METHOD_BACO) && 5213 (reset_method != AMD_RESET_METHOD_MODE1)) 5214 return -EINVAL; 5215 5216 STUB(); 5217 return -ENOSYS; 5218 #ifdef notyet 5219 5220 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5221 adev->pdev->bus->number, 1); 5222 if (!p) 5223 return -ENODEV; 5224 5225 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5226 if (!expires) 5227 /* 5228 * If we cannot get the audio device autosuspend delay, 5229 * a fixed 4S interval will be used. Considering 3S is 5230 * the audio controller default autosuspend delay setting. 5231 * 4S used here is guaranteed to cover that. 5232 */ 5233 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5234 5235 while (!pm_runtime_status_suspended(&(p->dev))) { 5236 if (!pm_runtime_suspend(&(p->dev))) 5237 break; 5238 5239 if (expires < ktime_get_mono_fast_ns()) { 5240 dev_warn(adev->dev, "failed to suspend display audio\n"); 5241 pci_dev_put(p); 5242 /* TODO: abort the succeeding gpu reset? */ 5243 return -ETIMEDOUT; 5244 } 5245 } 5246 5247 pm_runtime_disable(&(p->dev)); 5248 5249 pci_dev_put(p); 5250 return 0; 5251 #endif 5252 } 5253 5254 static void amdgpu_device_recheck_guilty_jobs( 5255 struct amdgpu_device *adev, struct list_head *device_list_handle, 5256 struct amdgpu_reset_context *reset_context) 5257 { 5258 int i, r = 0; 5259 5260 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5261 struct amdgpu_ring *ring = adev->rings[i]; 5262 int ret = 0; 5263 struct drm_sched_job *s_job; 5264 5265 if (!ring || !ring->sched.thread) 5266 continue; 5267 5268 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5269 struct drm_sched_job, list); 5270 if (s_job == NULL) 5271 continue; 5272 5273 /* clear job's guilty and depend the folowing step to decide the real one */ 5274 drm_sched_reset_karma(s_job); 5275 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5276 5277 if (!s_job->s_fence->parent) { 5278 DRM_WARN("Failed to get a HW fence for job!"); 5279 continue; 5280 } 5281 5282 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5283 if (ret == 0) { /* timeout */ 5284 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5285 ring->sched.name, s_job->id); 5286 5287 5288 amdgpu_fence_driver_isr_toggle(adev, true); 5289 5290 /* Clear this failed job from fence array */ 5291 amdgpu_fence_driver_clear_job_fences(ring); 5292 5293 amdgpu_fence_driver_isr_toggle(adev, false); 5294 5295 /* Since the job won't signal and we go for 5296 * another resubmit drop this parent pointer 5297 */ 5298 dma_fence_put(s_job->s_fence->parent); 5299 s_job->s_fence->parent = NULL; 5300 5301 /* set guilty */ 5302 drm_sched_increase_karma(s_job); 5303 amdgpu_reset_prepare_hwcontext(adev, reset_context); 5304 retry: 5305 /* do hw reset */ 5306 if (amdgpu_sriov_vf(adev)) { 5307 amdgpu_virt_fini_data_exchange(adev); 5308 r = amdgpu_device_reset_sriov(adev, false); 5309 if (r) 5310 adev->asic_reset_res = r; 5311 } else { 5312 clear_bit(AMDGPU_SKIP_HW_RESET, 5313 &reset_context->flags); 5314 r = amdgpu_do_asic_reset(device_list_handle, 5315 reset_context); 5316 if (r && r == -EAGAIN) 5317 goto retry; 5318 } 5319 5320 /* 5321 * add reset counter so that the following 5322 * resubmitted job could flush vmid 5323 */ 5324 atomic_inc(&adev->gpu_reset_counter); 5325 continue; 5326 } 5327 5328 /* got the hw fence, signal finished fence */ 5329 atomic_dec(ring->sched.score); 5330 dma_fence_get(&s_job->s_fence->finished); 5331 dma_fence_signal(&s_job->s_fence->finished); 5332 dma_fence_put(&s_job->s_fence->finished); 5333 5334 /* remove node from list and free the job */ 5335 spin_lock(&ring->sched.job_list_lock); 5336 list_del_init(&s_job->list); 5337 spin_unlock(&ring->sched.job_list_lock); 5338 ring->sched.ops->free_job(s_job); 5339 } 5340 } 5341 5342 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5343 { 5344 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5345 5346 #if defined(CONFIG_DEBUG_FS) 5347 if (!amdgpu_sriov_vf(adev)) 5348 cancel_work(&adev->reset_work); 5349 #endif 5350 5351 if (adev->kfd.dev) 5352 cancel_work(&adev->kfd.reset_work); 5353 5354 if (amdgpu_sriov_vf(adev)) 5355 cancel_work(&adev->virt.flr_work); 5356 5357 if (con && adev->ras_enabled) 5358 cancel_work(&con->recovery_work); 5359 5360 } 5361 5362 5363 /** 5364 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5365 * 5366 * @adev: amdgpu_device pointer 5367 * @job: which job trigger hang 5368 * 5369 * Attempt to reset the GPU if it has hung (all asics). 5370 * Attempt to do soft-reset or full-reset and reinitialize Asic 5371 * Returns 0 for success or an error on failure. 5372 */ 5373 5374 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5375 struct amdgpu_job *job, 5376 struct amdgpu_reset_context *reset_context) 5377 { 5378 struct list_head device_list, *device_list_handle = NULL; 5379 bool job_signaled = false; 5380 struct amdgpu_hive_info *hive = NULL; 5381 struct amdgpu_device *tmp_adev = NULL; 5382 int i, r = 0; 5383 bool need_emergency_restart = false; 5384 bool audio_suspended = false; 5385 int tmp_vram_lost_counter; 5386 bool gpu_reset_for_dev_remove = false; 5387 5388 gpu_reset_for_dev_remove = 5389 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5390 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5391 5392 /* 5393 * Special case: RAS triggered and full reset isn't supported 5394 */ 5395 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5396 5397 /* 5398 * Flush RAM to disk so that after reboot 5399 * the user can read log and see why the system rebooted. 
5400 */ 5401 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5402 DRM_WARN("Emergency reboot."); 5403 5404 #ifdef notyet 5405 ksys_sync_helper(); 5406 emergency_restart(); 5407 #else 5408 panic("emergency_restart"); 5409 #endif 5410 } 5411 5412 dev_info(adev->dev, "GPU %s begin!\n", 5413 need_emergency_restart ? "jobs stop":"reset"); 5414 5415 if (!amdgpu_sriov_vf(adev)) 5416 hive = amdgpu_get_xgmi_hive(adev); 5417 if (hive) 5418 mutex_lock(&hive->hive_lock); 5419 5420 reset_context->job = job; 5421 reset_context->hive = hive; 5422 /* 5423 * Build list of devices to reset. 5424 * In case we are in XGMI hive mode, resort the device list 5425 * to put adev in the 1st position. 5426 */ 5427 INIT_LIST_HEAD(&device_list); 5428 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5429 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5430 list_add_tail(&tmp_adev->reset_list, &device_list); 5431 if (gpu_reset_for_dev_remove && adev->shutdown) 5432 tmp_adev->shutdown = true; 5433 } 5434 if (!list_is_first(&adev->reset_list, &device_list)) 5435 list_rotate_to_front(&adev->reset_list, &device_list); 5436 device_list_handle = &device_list; 5437 } else { 5438 list_add_tail(&adev->reset_list, &device_list); 5439 device_list_handle = &device_list; 5440 } 5441 5442 /* We need to lock reset domain only once both for XGMI and single device */ 5443 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5444 reset_list); 5445 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5446 5447 /* block all schedulers and reset given job's ring */ 5448 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5449 5450 amdgpu_device_set_mp1_state(tmp_adev); 5451 5452 /* 5453 * Try to put the audio codec into suspend state 5454 * before gpu reset started. 5455 * 5456 * Due to the power domain of the graphics device 5457 * is shared with AZ power domain. Without this, 5458 * we may change the audio hardware from behind 5459 * the audio driver's back. That will trigger 5460 * some audio codec errors. 5461 */ 5462 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5463 audio_suspended = true; 5464 5465 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5466 5467 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5468 5469 if (!amdgpu_sriov_vf(tmp_adev)) 5470 amdgpu_amdkfd_pre_reset(tmp_adev); 5471 5472 /* 5473 * Mark these ASICs to be reseted as untracked first 5474 * And add them back after reset completed 5475 */ 5476 amdgpu_unregister_gpu_instance(tmp_adev); 5477 5478 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5479 5480 /* disable ras on ALL IPs */ 5481 if (!need_emergency_restart && 5482 amdgpu_device_ip_need_full_reset(tmp_adev)) 5483 amdgpu_ras_suspend(tmp_adev); 5484 5485 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5486 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5487 5488 if (!ring || !ring->sched.thread) 5489 continue; 5490 5491 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5492 5493 if (need_emergency_restart) 5494 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5495 } 5496 atomic_inc(&tmp_adev->gpu_reset_counter); 5497 } 5498 5499 if (need_emergency_restart) 5500 goto skip_sched_resume; 5501 5502 /* 5503 * Must check guilty signal here since after this point all old 5504 * HW fences are force signaled. 
5505 * 5506 * job->base holds a reference to parent fence 5507 */ 5508 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5509 job_signaled = true; 5510 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5511 goto skip_hw_reset; 5512 } 5513 5514 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5515 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5516 if (gpu_reset_for_dev_remove) { 5517 /* Workaroud for ASICs need to disable SMC first */ 5518 amdgpu_device_smu_fini_early(tmp_adev); 5519 } 5520 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5521 /*TODO Should we stop ?*/ 5522 if (r) { 5523 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5524 r, adev_to_drm(tmp_adev)->unique); 5525 tmp_adev->asic_reset_res = r; 5526 } 5527 5528 /* 5529 * Drop all pending non scheduler resets. Scheduler resets 5530 * were already dropped during drm_sched_stop 5531 */ 5532 amdgpu_device_stop_pending_resets(tmp_adev); 5533 } 5534 5535 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5536 /* Actual ASIC resets if needed.*/ 5537 /* Host driver will handle XGMI hive reset for SRIOV */ 5538 if (amdgpu_sriov_vf(adev)) { 5539 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5540 if (r) 5541 adev->asic_reset_res = r; 5542 5543 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5544 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5545 amdgpu_ras_resume(adev); 5546 } else { 5547 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5548 if (r && r == -EAGAIN) 5549 goto retry; 5550 5551 if (!r && gpu_reset_for_dev_remove) 5552 goto recover_end; 5553 } 5554 5555 skip_hw_reset: 5556 5557 /* Post ASIC reset for all devs .*/ 5558 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5559 5560 /* 5561 * Sometimes a later bad compute job can block a good gfx job as gfx 5562 * and compute ring share internal GC HW mutually. We add an additional 5563 * guilty jobs recheck step to find the real guilty job, it synchronously 5564 * submits and pends for the first job being signaled. If it gets timeout, 5565 * we identify it as a real guilty job. 5566 */ 5567 if (amdgpu_gpu_recovery == 2 && 5568 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5569 amdgpu_device_recheck_guilty_jobs( 5570 tmp_adev, device_list_handle, reset_context); 5571 5572 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5573 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5574 5575 if (!ring || !ring->sched.thread) 5576 continue; 5577 5578 /* No point to resubmit jobs if we didn't HW reset*/ 5579 if (!tmp_adev->asic_reset_res && !job_signaled) 5580 drm_sched_resubmit_jobs(&ring->sched); 5581 5582 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5583 } 5584 5585 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5586 amdgpu_mes_self_test(tmp_adev); 5587 5588 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5589 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5590 } 5591 5592 if (tmp_adev->asic_reset_res) 5593 r = tmp_adev->asic_reset_res; 5594 5595 tmp_adev->asic_reset_res = 0; 5596 5597 if (r) { 5598 /* bad news, how to tell it to userspace ? 
*/ 5599 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5600 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5601 } else { 5602 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5603 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5604 DRM_WARN("smart shift update failed\n"); 5605 } 5606 } 5607 5608 skip_sched_resume: 5609 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5610 /* unlock kfd: SRIOV would do it separately */ 5611 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5612 amdgpu_amdkfd_post_reset(tmp_adev); 5613 5614 /* kfd_post_reset will do nothing if the kfd device is not initialized; 5615 * bring up kfd here if it was not initialized before 5616 */ 5617 if (!adev->kfd.init_complete) 5618 amdgpu_amdkfd_device_init(adev); 5619 5620 if (audio_suspended) 5621 amdgpu_device_resume_display_audio(tmp_adev); 5622 5623 amdgpu_device_unset_mp1_state(tmp_adev); 5624 } 5625 5626 recover_end: 5627 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5628 reset_list); 5629 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5630 5631 if (hive) { 5632 mutex_unlock(&hive->hive_lock); 5633 amdgpu_put_xgmi_hive(hive); 5634 } 5635 5636 if (r) 5637 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5638 5639 atomic_set(&adev->reset_domain->reset_res, r); 5640 return r; 5641 } 5642 5643 /** 5644 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 5645 * 5646 * @adev: amdgpu_device pointer 5647 * 5648 * Fetches and stores in the driver the PCIE capabilities (gen speed 5649 * and lanes) of the slot the device is in. Handles APUs and 5650 * virtualized environments where PCIE config space may not be available.
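 * Note: the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters,
 * when set, take precedence over the values probed below.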
5651 */ 5652 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5653 { 5654 struct pci_dev *pdev; 5655 enum pci_bus_speed speed_cap, platform_speed_cap; 5656 enum pcie_link_width platform_link_width; 5657 5658 if (amdgpu_pcie_gen_cap) 5659 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5660 5661 if (amdgpu_pcie_lane_cap) 5662 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5663 5664 /* covers APUs as well */ 5665 if (pci_is_root_bus(adev->pdev->bus)) { 5666 if (adev->pm.pcie_gen_mask == 0) 5667 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5668 if (adev->pm.pcie_mlw_mask == 0) 5669 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5670 return; 5671 } 5672 5673 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5674 return; 5675 5676 pcie_bandwidth_available(adev->pdev, NULL, 5677 &platform_speed_cap, &platform_link_width); 5678 5679 if (adev->pm.pcie_gen_mask == 0) { 5680 /* asic caps */ 5681 pdev = adev->pdev; 5682 speed_cap = pcie_get_speed_cap(pdev); 5683 if (speed_cap == PCI_SPEED_UNKNOWN) { 5684 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5685 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5686 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5687 } else { 5688 if (speed_cap == PCIE_SPEED_32_0GT) 5689 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5690 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5691 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5692 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5693 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5694 else if (speed_cap == PCIE_SPEED_16_0GT) 5695 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5696 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5697 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5698 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5699 else if (speed_cap == PCIE_SPEED_8_0GT) 5700 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5701 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5702 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5703 else if (speed_cap == PCIE_SPEED_5_0GT) 5704 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5705 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5706 else 5707 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5708 } 5709 /* platform caps */ 5710 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5711 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5712 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5713 } else { 5714 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5715 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5716 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5717 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5718 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5719 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5720 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5721 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5722 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5723 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5724 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5725 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5726 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5727 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5728 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5729 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5730 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5731 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5732 else 5733 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5734 5735 } 5736 } 5737 if (adev->pm.pcie_mlw_mask == 0) { 5738 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5739 adev->pm.pcie_mlw_mask 

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
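
/*
 * Illustrative sketch (not part of the driver): a caller deciding whether a
 * peer GPU may DMA directly into this device's VRAM BAR might gate its setup
 * on the helper above.  Both amdgpu_example_setup_p2p() and the fallback
 * path are hypothetical names.
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev))
 *		amdgpu_example_setup_p2p(adev, peer_adev);
 *	else
 *		amdgpu_example_stage_through_system_memory(adev, peer_adev);
 */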

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
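
/*
 * Illustrative sketch (not part of the driver): BACO entry and exit are meant
 * to be paired around a window in which the GPU is kept powered down, for
 * example from a runtime-suspend style path.  The wrapper functions below are
 * hypothetical.
 *
 *	static int example_runtime_suspend(struct drm_device *dev)
 *	{
 *		return amdgpu_device_baco_enter(dev);
 *	}
 *
 *	static int example_runtime_resume(struct drm_device *dev)
 *	{
 *		return amdgpu_device_baco_exit(dev);
 *	}
 */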

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	STUB();
	return 0;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
#endif
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	STUB();
	return PCI_ERS_RESULT_RECOVERED;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	drm_msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
#endif
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	STUB();
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_resubmit_jobs(&ring->sched);
		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
#endif
}
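
/*
 * Illustrative sketch (not part of this file): the four callbacks above are
 * normally registered with the PCI core through a struct pci_error_handlers
 * hung off the driver's struct pci_driver, roughly as below.  The variable
 * name used here is hypothetical.
 *
 *	static const struct pci_error_handlers example_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */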

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	STUB();
	return false;
#ifdef notyet
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
#endif
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
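
/*
 * Illustrative sketch (not part of the driver): a typical pattern is to flush
 * the HDP write path after the CPU has written into VRAM through the BAR (so
 * the GPU observes the data) and to invalidate before the CPU reads back
 * GPU-written VRAM.  'vram' and example_offset below are hypothetical.
 *
 *	// 'vram' is a hypothetical __iomem mapping of a VRAM buffer
 *	writel(value, vram + example_offset);
 *	amdgpu_device_flush_hdp(adev, NULL);
 *	// ... GPU produces data at the same location ...
 *	amdgpu_device_invalidate_hdp(adev, NULL);
 *	value = readl(vram + example_offset);
 */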

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
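
/*
 * Illustrative sketch (not part of the driver): the port register accessors
 * above hide an index/data pair behind a spinlock, so a read-modify-write of
 * a PCIe port register looks like the following.  EXAMPLE_PORT_REG and
 * EXAMPLE_BIT are hypothetical names.
 *
 *	u32 tmp = amdgpu_device_pcie_port_rreg(adev, EXAMPLE_PORT_REG);
 *
 *	tmp |= EXAMPLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, EXAMPLE_PORT_REG, tmp);
 */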

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
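
/*
 * Illustrative sketch (not part of the driver): a submitter using
 * amdgpu_device_switch_gang() above treats a non-NULL return as "the previous
 * gang leader is still running" and must not proceed until that fence has
 * signaled; in practice the driver defers via scheduler dependencies rather
 * than blocking, but a simplified blocking retry would look like this.
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang_leader))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */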