/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/pci.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
"ALDEBARAN", 119 "NAVI10", 120 "CYAN_SKILLFISH", 121 "NAVI14", 122 "NAVI12", 123 "SIENNA_CICHLID", 124 "NAVY_FLOUNDER", 125 "VANGOGH", 126 "DIMGREY_CAVEFISH", 127 "BEIGE_GOBY", 128 "YELLOW_CARP", 129 "LAST", 130 }; 131 132 /** 133 * DOC: pcie_replay_count 134 * 135 * The amdgpu driver provides a sysfs API for reporting the total number 136 * of PCIe replays (NAKs) 137 * The file pcie_replay_count is used for this and returns the total 138 * number of replays as a sum of the NAKs generated and NAKs received 139 */ 140 141 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 struct drm_device *ddev = dev_get_drvdata(dev); 145 struct amdgpu_device *adev = drm_to_adev(ddev); 146 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 147 148 return sysfs_emit(buf, "%llu\n", cnt); 149 } 150 151 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 152 amdgpu_device_get_pcie_replay_count, NULL); 153 154 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 155 156 /** 157 * DOC: product_name 158 * 159 * The amdgpu driver provides a sysfs API for reporting the product name 160 * for the device 161 * The file serial_number is used for this and returns the product name 162 * as returned from the FRU. 163 * NOTE: This is only available for certain server cards 164 */ 165 166 static ssize_t amdgpu_device_get_product_name(struct device *dev, 167 struct device_attribute *attr, char *buf) 168 { 169 struct drm_device *ddev = dev_get_drvdata(dev); 170 struct amdgpu_device *adev = drm_to_adev(ddev); 171 172 return sysfs_emit(buf, "%s\n", adev->product_name); 173 } 174 175 static DEVICE_ATTR(product_name, S_IRUGO, 176 amdgpu_device_get_product_name, NULL); 177 178 /** 179 * DOC: product_number 180 * 181 * The amdgpu driver provides a sysfs API for reporting the part number 182 * for the device 183 * The file serial_number is used for this and returns the part number 184 * as returned from the FRU. 185 * NOTE: This is only available for certain server cards 186 */ 187 188 static ssize_t amdgpu_device_get_product_number(struct device *dev, 189 struct device_attribute *attr, char *buf) 190 { 191 struct drm_device *ddev = dev_get_drvdata(dev); 192 struct amdgpu_device *adev = drm_to_adev(ddev); 193 194 return sysfs_emit(buf, "%s\n", adev->product_number); 195 } 196 197 static DEVICE_ATTR(product_number, S_IRUGO, 198 amdgpu_device_get_product_number, NULL); 199 200 /** 201 * DOC: serial_number 202 * 203 * The amdgpu driver provides a sysfs API for reporting the serial number 204 * for the device 205 * The file serial_number is used for this and returns the serial number 206 * as returned from the FRU. 207 * NOTE: This is only available for certain server cards 208 */ 209 210 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 211 struct device_attribute *attr, char *buf) 212 { 213 struct drm_device *ddev = dev_get_drvdata(dev); 214 struct amdgpu_device *adev = drm_to_adev(ddev); 215 216 return sysfs_emit(buf, "%s\n", adev->serial); 217 } 218 219 static DEVICE_ATTR(serial_number, S_IRUGO, 220 amdgpu_device_get_serial_number, NULL); 221 222 /** 223 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 224 * 225 * @dev: drm_device pointer 226 * 227 * Returns true if the device is a dGPU with ATPX power control, 228 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
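
/*
 * Illustrative sketch (not part of the driver): how a caller might use the
 * helper above to read a single dword back out of VRAM. The function name and
 * the offset are hypothetical; real callers include the debugfs and RAS paths.
 * Kept under "#if 0" so it is never compiled.
 */
#if 0	/* example only */
static uint32_t example_read_vram_dword(struct amdgpu_device *adev, loff_t offset)
{
	uint32_t value = 0;

	/* Falls back to MM_INDEX/MM_DATA when the CPU-visible aperture does
	 * not cover @offset. Offset and size must be dword aligned. */
	amdgpu_device_vram_access(adev, offset, &value, sizeof(value), false);
	return value;
}
#endif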

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}
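
/*
 * Illustrative sketch (not part of the driver): ring code typically tells the
 * hardware about a new write pointer by writing it to the ring's doorbell
 * slot with the helper above. The caller and parameters below are made up;
 * real users derive the doorbell index from their amdgpu_ring. Kept under
 * "#if 0" so it is never compiled.
 */
#if 0	/* example only */
static void example_ring_commit(struct amdgpu_device *adev,
				u32 doorbell_index, u32 wptr)
{
	/* bounds checking against num_doorbells happens inside the helper */
	amdgpu_mm_wdoorbell(adev, doorbell_index, wptr);
}
#endif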

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
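
/*
 * Illustrative sketch (not part of the driver): the indirect helpers above
 * implement the classic index/data pattern - write the target address into an
 * index register, then read or write the paired data register. ASIC code
 * usually wraps them roughly like the hypothetical wrapper below; the register
 * offsets are placeholders, not real hardware offsets. Kept under "#if 0" so
 * it is never compiled.
 */
#if 0	/* example only */
#define EXAMPLE_PCIE_INDEX	0x000e	/* placeholder index register */
#define EXAMPLE_PCIE_DATA	0x000f	/* placeholder data register */

static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
{
	return amdgpu_device_indirect_rreg(adev, EXAMPLE_PCIE_INDEX,
					   EXAMPLE_PCIE_DATA, reg);
}
#endif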

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
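
/*
 * Illustrative sketch (not part of the driver): golden register tables are
 * flat arrays of {register, AND mask, OR mask} triples consumed by the helper
 * above. The entries and names below are made up; real tables live in the
 * per-ASIC files (e.g. soc15.c, nv.c). Kept under "#if 0" so it is never
 * compiled.
 */
#if 0	/* example only */
static const u32 example_golden_settings[] = {
	/* reg          and_mask     or_mask */
	0x0000216c, 0xffffffff, 0x00000000,	/* overwrite the whole register */
	0x00002170, 0x0000000f, 0x00000002,	/* update only bits 3:0 */
};

static void example_init_golden_registers(struct amdgpu_device *adev)
{
	amdgpu_device_program_register_sequence(adev,
						example_golden_settings,
						ARRAY_SIZE(example_golden_settings));
}
#endif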

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	STUB();
	return -ENOSYS;
#ifdef notyet
	return pci_reset_function(adev->pdev);
#endif
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

#ifdef __linux__
	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;
#endif

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
#ifdef __linux__
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
#endif

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

#ifdef __linux__
	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;
#endif

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
#ifdef __linux__
	iounmap(adev->doorbell.ptr);
#else
	if (adev->doorbell.size > 0)
		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
				adev->doorbell.size);
#endif
	adev->doorbell.ptr = NULL;
}


/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
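
/*
 * Illustrative sketch (not part of the driver): a typical consumer of a
 * writeback slot (e.g. a ring's rptr/wptr writeback) grabs one entry, derives
 * the CPU and GPU views from the returned dword offset, and frees it on
 * teardown. The caller below is hypothetical and assumes the amdgpu_wb
 * bookkeeping fields used above. Kept under "#if 0" so it is never compiled.
 */
#if 0	/* example only */
static int example_alloc_wb(struct amdgpu_device *adev, u32 *wb_offs,
			    volatile uint32_t **cpu_addr, u64 *gpu_addr)
{
	int r = amdgpu_device_wb_get(adev, wb_offs);

	if (r)
		return r;

	/* *wb_offs is a dword offset into the writeback buffer */
	*cpu_addr = &adev->wb.wb[*wb_offs];
	*gpu_addr = adev->wb.gpu_addr + (*wb_offs * 4);
	return 0;
}
#endif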

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
#ifdef __linux__
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
#endif /* __linux__ */

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole-GPU pass-through virtualization, after a VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
#ifdef notyet
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
#endif

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
#ifdef __linux__
	struct sysinfo si;
#endif
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
#ifdef __linux__
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;
#else
	total_memory = ptoa(physmem);
#endif

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if (adev->pdev->device == 0x13FE)
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

#ifdef __linux__
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}
#endif /* __linux__ */

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
#ifdef notyet
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
#endif
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
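
/*
 * Illustrative sketch (not part of the driver): power management code gates or
 * ungates a whole IP type with a single call, and the helper above fans the
 * request out to every instance of that block type. The wrapper below is
 * hypothetical and only shows the calling convention. Kept under "#if 0" so
 * it is never compiled.
 */
#if 0	/* example only */
static int example_ungate_gfx_cg(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_set_clockgating_state(adev,
						      AMD_IP_BLOCK_TYPE_GFX,
						      AMD_CG_STATE_UNGATE);
}
#endif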

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
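
/*
 * Illustrative sketch (not part of the driver): per-ASIC setup code builds the
 * IP list by calling the helper above once per block, in initialization order.
 * The block version referenced below is a placeholder; the real ones (common,
 * GMC, IH, PSP, GFX, SDMA, ...) are registered from files such as soc15.c and
 * nv.c. Kept under "#if 0" so it is never compiled.
 */
#if 0	/* example only */
static int example_set_ip_blocks(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
	if (r)
		return r;
	/* ... followed by GMC, IH, PSP, GFX, SDMA, VCN, DM, ... */
	return 0;
}
#endif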

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

#ifdef notyet
	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
#endif
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1958 */ 1959 if (adev->asic_type != CHIP_NAVI12) 1960 return 0; 1961 } 1962 1963 switch (adev->asic_type) { 1964 #ifdef CONFIG_DRM_AMDGPU_SI 1965 case CHIP_VERDE: 1966 case CHIP_TAHITI: 1967 case CHIP_PITCAIRN: 1968 case CHIP_OLAND: 1969 case CHIP_HAINAN: 1970 #endif 1971 #ifdef CONFIG_DRM_AMDGPU_CIK 1972 case CHIP_BONAIRE: 1973 case CHIP_HAWAII: 1974 case CHIP_KAVERI: 1975 case CHIP_KABINI: 1976 case CHIP_MULLINS: 1977 #endif 1978 case CHIP_TOPAZ: 1979 case CHIP_TONGA: 1980 case CHIP_FIJI: 1981 case CHIP_POLARIS10: 1982 case CHIP_POLARIS11: 1983 case CHIP_POLARIS12: 1984 case CHIP_VEGAM: 1985 case CHIP_CARRIZO: 1986 case CHIP_STONEY: 1987 case CHIP_VEGA20: 1988 case CHIP_ALDEBARAN: 1989 case CHIP_SIENNA_CICHLID: 1990 case CHIP_NAVY_FLOUNDER: 1991 case CHIP_DIMGREY_CAVEFISH: 1992 case CHIP_BEIGE_GOBY: 1993 default: 1994 return 0; 1995 case CHIP_VEGA10: 1996 chip_name = "vega10"; 1997 break; 1998 case CHIP_VEGA12: 1999 chip_name = "vega12"; 2000 break; 2001 case CHIP_RAVEN: 2002 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2003 chip_name = "raven2"; 2004 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2005 chip_name = "picasso"; 2006 else 2007 chip_name = "raven"; 2008 break; 2009 case CHIP_ARCTURUS: 2010 chip_name = "arcturus"; 2011 break; 2012 case CHIP_RENOIR: 2013 if (adev->apu_flags & AMD_APU_IS_RENOIR) 2014 chip_name = "renoir"; 2015 else 2016 chip_name = "green_sardine"; 2017 break; 2018 case CHIP_NAVI10: 2019 chip_name = "navi10"; 2020 break; 2021 case CHIP_NAVI14: 2022 chip_name = "navi14"; 2023 break; 2024 case CHIP_NAVI12: 2025 chip_name = "navi12"; 2026 break; 2027 case CHIP_VANGOGH: 2028 chip_name = "vangogh"; 2029 break; 2030 case CHIP_YELLOW_CARP: 2031 chip_name = "yellow_carp"; 2032 break; 2033 } 2034 2035 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2036 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2037 if (err) { 2038 dev_err(adev->dev, 2039 "Failed to load gpu_info firmware \"%s\"\n", 2040 fw_name); 2041 goto out; 2042 } 2043 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2044 if (err) { 2045 dev_err(adev->dev, 2046 "Failed to validate gpu_info firmware \"%s\"\n", 2047 fw_name); 2048 goto out; 2049 } 2050 2051 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2052 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2053 2054 switch (hdr->version_major) { 2055 case 1: 2056 { 2057 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2058 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2059 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2060 2061 /* 2062 * Should be droped when DAL no longer needs it. 
2063 */ 2064 if (adev->asic_type == CHIP_NAVI12) 2065 goto parse_soc_bounding_box; 2066 2067 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2068 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2069 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2070 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2071 adev->gfx.config.max_texture_channel_caches = 2072 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2073 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2074 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2075 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2076 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2077 adev->gfx.config.double_offchip_lds_buf = 2078 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2079 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2080 adev->gfx.cu_info.max_waves_per_simd = 2081 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2082 adev->gfx.cu_info.max_scratch_slots_per_cu = 2083 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2084 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2085 if (hdr->version_minor >= 1) { 2086 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2087 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2088 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2089 adev->gfx.config.num_sc_per_sh = 2090 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2091 adev->gfx.config.num_packer_per_sc = 2092 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2093 } 2094 2095 parse_soc_bounding_box: 2096 /* 2097 * soc bounding box info is not integrated in the discovery table, so 2098 * we always need to parse it from the gpu info firmware when it is needed. 2099 */ 2100 if (hdr->version_minor == 2) { 2101 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2102 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2103 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2104 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2105 } 2106 break; 2107 } 2108 default: 2109 dev_err(adev->dev, 2110 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2111 err = -EINVAL; 2112 goto out; 2113 } 2114 out: 2115 return err; 2116 } 2117 2118 /** 2119 * amdgpu_device_ip_early_init - run early init for hardware IPs 2120 * 2121 * @adev: amdgpu_device pointer 2122 * 2123 * Early initialization pass for hardware IPs. The hardware IPs that make 2124 * up each asic are discovered and each IP's early_init callback is run. This 2125 * is the first stage in initializing the asic. 2126 * Returns 0 on success, negative error code on failure.
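 * Note that as soon as the COMMON block has passed early_init, the vbios is
 * fetched and atombios is initialized, since the asic_funcs set up by that
 * block are needed for those steps.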
2127 */ 2128 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2129 { 2130 struct drm_device *dev = adev_to_drm(adev); 2131 struct pci_dev *parent; 2132 int i, r; 2133 2134 amdgpu_device_enable_virtual_display(adev); 2135 2136 if (amdgpu_sriov_vf(adev)) { 2137 r = amdgpu_virt_request_full_gpu(adev, true); 2138 if (r) 2139 return r; 2140 } 2141 2142 switch (adev->asic_type) { 2143 #ifdef CONFIG_DRM_AMDGPU_SI 2144 case CHIP_VERDE: 2145 case CHIP_TAHITI: 2146 case CHIP_PITCAIRN: 2147 case CHIP_OLAND: 2148 case CHIP_HAINAN: 2149 adev->family = AMDGPU_FAMILY_SI; 2150 r = si_set_ip_blocks(adev); 2151 if (r) 2152 return r; 2153 break; 2154 #endif 2155 #ifdef CONFIG_DRM_AMDGPU_CIK 2156 case CHIP_BONAIRE: 2157 case CHIP_HAWAII: 2158 case CHIP_KAVERI: 2159 case CHIP_KABINI: 2160 case CHIP_MULLINS: 2161 if (adev->flags & AMD_IS_APU) 2162 adev->family = AMDGPU_FAMILY_KV; 2163 else 2164 adev->family = AMDGPU_FAMILY_CI; 2165 2166 r = cik_set_ip_blocks(adev); 2167 if (r) 2168 return r; 2169 break; 2170 #endif 2171 case CHIP_TOPAZ: 2172 case CHIP_TONGA: 2173 case CHIP_FIJI: 2174 case CHIP_POLARIS10: 2175 case CHIP_POLARIS11: 2176 case CHIP_POLARIS12: 2177 case CHIP_VEGAM: 2178 case CHIP_CARRIZO: 2179 case CHIP_STONEY: 2180 if (adev->flags & AMD_IS_APU) 2181 adev->family = AMDGPU_FAMILY_CZ; 2182 else 2183 adev->family = AMDGPU_FAMILY_VI; 2184 2185 r = vi_set_ip_blocks(adev); 2186 if (r) 2187 return r; 2188 break; 2189 case CHIP_VEGA10: 2190 case CHIP_VEGA12: 2191 case CHIP_VEGA20: 2192 case CHIP_RAVEN: 2193 case CHIP_ARCTURUS: 2194 case CHIP_RENOIR: 2195 case CHIP_ALDEBARAN: 2196 if (adev->flags & AMD_IS_APU) 2197 adev->family = AMDGPU_FAMILY_RV; 2198 else 2199 adev->family = AMDGPU_FAMILY_AI; 2200 2201 r = soc15_set_ip_blocks(adev); 2202 if (r) 2203 return r; 2204 break; 2205 case CHIP_NAVI10: 2206 case CHIP_NAVI14: 2207 case CHIP_NAVI12: 2208 case CHIP_SIENNA_CICHLID: 2209 case CHIP_NAVY_FLOUNDER: 2210 case CHIP_DIMGREY_CAVEFISH: 2211 case CHIP_BEIGE_GOBY: 2212 case CHIP_VANGOGH: 2213 case CHIP_YELLOW_CARP: 2214 case CHIP_CYAN_SKILLFISH: 2215 if (adev->asic_type == CHIP_VANGOGH) 2216 adev->family = AMDGPU_FAMILY_VGH; 2217 else if (adev->asic_type == CHIP_YELLOW_CARP) 2218 adev->family = AMDGPU_FAMILY_YC; 2219 else 2220 adev->family = AMDGPU_FAMILY_NV; 2221 2222 r = nv_set_ip_blocks(adev); 2223 if (r) 2224 return r; 2225 break; 2226 default: 2227 /* FIXME: not supported yet */ 2228 return -EINVAL; 2229 } 2230 2231 if (amdgpu_has_atpx() && 2232 (amdgpu_is_atpx_hybrid() || 2233 amdgpu_has_atpx_dgpu_power_cntl()) && 2234 ((adev->flags & AMD_IS_APU) == 0) && 2235 !pci_is_thunderbolt_attached(dev->pdev)) 2236 adev->flags |= AMD_IS_PX; 2237 2238 if (!(adev->flags & AMD_IS_APU)) { 2239 parent = pci_upstream_bridge(adev->pdev); 2240 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2241 } 2242 2243 amdgpu_amdkfd_device_probe(adev); 2244 2245 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2246 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2247 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2248 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2249 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2250 2251 for (i = 0; i < adev->num_ip_blocks; i++) { 2252 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2253 DRM_ERROR("disabled ip block: %d <%s>\n", 2254 i, adev->ip_blocks[i].version->funcs->name); 2255 adev->ip_blocks[i].status.valid = false; 2256 } else { 2257 if (adev->ip_blocks[i].version->funcs->early_init) { 2258 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2259 if (r == -ENOENT) { 2260 adev->ip_blocks[i].status.valid = false; 2261 } else if (r) { 2262 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2263 adev->ip_blocks[i].version->funcs->name, r); 2264 return r; 2265 } else { 2266 adev->ip_blocks[i].status.valid = true; 2267 } 2268 } else { 2269 adev->ip_blocks[i].status.valid = true; 2270 } 2271 } 2272 /* get the vbios after the asic_funcs are set up */ 2273 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2274 r = amdgpu_device_parse_gpu_info_fw(adev); 2275 if (r) 2276 return r; 2277 2278 /* Read BIOS */ 2279 if (!amdgpu_get_bios(adev)) 2280 return -EINVAL; 2281 2282 r = amdgpu_atombios_init(adev); 2283 if (r) { 2284 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2285 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2286 return r; 2287 } 2288 2289 /*get pf2vf msg info at it's earliest time*/ 2290 if (amdgpu_sriov_vf(adev)) 2291 amdgpu_virt_init_data_exchange(adev); 2292 2293 } 2294 } 2295 2296 adev->cg_flags &= amdgpu_cg_mask; 2297 adev->pg_flags &= amdgpu_pg_mask; 2298 2299 return 0; 2300 } 2301 2302 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2303 { 2304 int i, r; 2305 2306 for (i = 0; i < adev->num_ip_blocks; i++) { 2307 if (!adev->ip_blocks[i].status.sw) 2308 continue; 2309 if (adev->ip_blocks[i].status.hw) 2310 continue; 2311 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2312 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2313 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2314 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2315 if (r) { 2316 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2317 adev->ip_blocks[i].version->funcs->name, r); 2318 return r; 2319 } 2320 adev->ip_blocks[i].status.hw = true; 2321 } 2322 } 2323 2324 return 0; 2325 } 2326 2327 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2328 { 2329 int i, r; 2330 2331 for (i = 0; i < adev->num_ip_blocks; i++) { 2332 if (!adev->ip_blocks[i].status.sw) 2333 continue; 2334 if (adev->ip_blocks[i].status.hw) 2335 continue; 2336 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2337 if (r) { 2338 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2339 adev->ip_blocks[i].version->funcs->name, r); 2340 return r; 2341 } 2342 adev->ip_blocks[i].status.hw = true; 2343 } 2344 2345 return 0; 2346 } 2347 2348 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2349 { 2350 int r = 0; 2351 int i; 2352 uint32_t smu_version; 2353 2354 if (adev->asic_type >= CHIP_VEGA10) { 2355 for (i = 0; i < adev->num_ip_blocks; i++) { 2356 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2357 continue; 2358 2359 if 
(!adev->ip_blocks[i].status.sw) 2360 continue; 2361 2362 /* no need to do the fw loading again if already done*/ 2363 if (adev->ip_blocks[i].status.hw == true) 2364 break; 2365 2366 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2367 r = adev->ip_blocks[i].version->funcs->resume(adev); 2368 if (r) { 2369 DRM_ERROR("resume of IP block <%s> failed %d\n", 2370 adev->ip_blocks[i].version->funcs->name, r); 2371 return r; 2372 } 2373 } else { 2374 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2375 if (r) { 2376 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 return r; 2379 } 2380 } 2381 2382 adev->ip_blocks[i].status.hw = true; 2383 break; 2384 } 2385 } 2386 2387 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2388 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2389 2390 return r; 2391 } 2392 2393 /** 2394 * amdgpu_device_ip_init - run init for hardware IPs 2395 * 2396 * @adev: amdgpu_device pointer 2397 * 2398 * Main initialization pass for hardware IPs. The list of all the hardware 2399 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2400 * are run. sw_init initializes the software state associated with each IP 2401 * and hw_init initializes the hardware associated with each IP. 2402 * Returns 0 on success, negative error code on failure. 2403 */ 2404 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2405 { 2406 int i, r; 2407 2408 r = amdgpu_ras_init(adev); 2409 if (r) 2410 return r; 2411 2412 for (i = 0; i < adev->num_ip_blocks; i++) { 2413 if (!adev->ip_blocks[i].status.valid) 2414 continue; 2415 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2416 if (r) { 2417 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2418 adev->ip_blocks[i].version->funcs->name, r); 2419 goto init_failed; 2420 } 2421 adev->ip_blocks[i].status.sw = true; 2422 2423 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2424 /* need to do common hw init early so everything is set up for gmc */ 2425 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2426 if (r) { 2427 DRM_ERROR("hw_init %d failed %d\n", i, r); 2428 goto init_failed; 2429 } 2430 adev->ip_blocks[i].status.hw = true; 2431 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2432 /* need to do gmc hw init early so we can allocate gpu mem */ 2433 /* Try to reserve bad pages early */ 2434 if (amdgpu_sriov_vf(adev)) 2435 amdgpu_virt_exchange_data(adev); 2436 2437 r = amdgpu_device_vram_scratch_init(adev); 2438 if (r) { 2439 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2440 goto init_failed; 2441 } 2442 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2443 if (r) { 2444 DRM_ERROR("hw_init %d failed %d\n", i, r); 2445 goto init_failed; 2446 } 2447 r = amdgpu_device_wb_init(adev); 2448 if (r) { 2449 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2450 goto init_failed; 2451 } 2452 adev->ip_blocks[i].status.hw = true; 2453 2454 /* right after GMC hw init, we create CSA */ 2455 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2456 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2457 AMDGPU_GEM_DOMAIN_VRAM, 2458 AMDGPU_CSA_SIZE); 2459 if (r) { 2460 DRM_ERROR("allocate CSA failed %d\n", r); 2461 goto init_failed; 2462 } 2463 } 2464 } 2465 } 2466 2467 if (amdgpu_sriov_vf(adev)) 2468 amdgpu_virt_init_data_exchange(adev); 2469 2470 r = amdgpu_ib_pool_init(adev); 2471 if (r) { 2472 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2473 
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2474 goto init_failed; 2475 } 2476 2477 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */ 2478 if (r) 2479 goto init_failed; 2480 2481 r = amdgpu_amdkfd_resume_iommu(adev); 2482 if (r) 2483 goto init_failed; 2484 2485 r = amdgpu_device_ip_hw_init_phase1(adev); 2486 if (r) 2487 goto init_failed; 2488 2489 r = amdgpu_device_fw_loading(adev); 2490 if (r) 2491 goto init_failed; 2492 2493 r = amdgpu_device_ip_hw_init_phase2(adev); 2494 if (r) 2495 goto init_failed; 2496 2497 /* 2498 * Retired pages will be loaded from eeprom and reserved here; 2499 * this should be called after amdgpu_device_ip_hw_init_phase2 since 2500 * for some ASICs the RAS EEPROM code relies on the SMU being fully 2501 * functional for I2C communication, which is only true at this point. 2502 * 2503 * amdgpu_ras_recovery_init may fail, but the caller only cares about 2504 * failures caused by a bad gpu situation and stops the amdgpu init 2505 * process accordingly. For other failures, it still releases all 2506 * the resources and prints an error message, rather than returning a 2507 * negative value to the upper level. 2508 * 2509 * Note: theoretically, this should be called before all vram allocations 2510 * to protect retired pages from being abused. 2511 */ 2512 r = amdgpu_ras_recovery_init(adev); 2513 if (r) 2514 goto init_failed; 2515 2516 if (adev->gmc.xgmi.num_physical_nodes > 1) 2517 amdgpu_xgmi_add_device(adev); 2518 2519 /* Don't init kfd if the whole hive needs to be reset during init */ 2520 if (!adev->gmc.xgmi.pending_reset) 2521 amdgpu_amdkfd_device_init(adev); 2522 2523 amdgpu_fru_get_product_info(adev); 2524 2525 init_failed: 2526 if (amdgpu_sriov_vf(adev)) 2527 amdgpu_virt_release_full_gpu(adev, true); 2528 2529 return r; 2530 } 2531 2532 /** 2533 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2534 * 2535 * @adev: amdgpu_device pointer 2536 * 2537 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2538 * this function before a GPU reset. If the value is retained after a 2539 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2540 */ 2541 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2542 { 2543 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2544 } 2545 2546 /** 2547 * amdgpu_device_check_vram_lost - check if vram is valid 2548 * 2549 * @adev: amdgpu_device pointer 2550 * 2551 * Checks the reset magic value written to the gart pointer in VRAM. 2552 * The driver calls this after a GPU reset to see if the contents of 2553 * VRAM are lost or not. 2554 * Returns true if vram is lost, false if not. 2555 */ 2556 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2557 { 2558 if (memcmp(adev->gart.ptr, adev->reset_magic, 2559 AMDGPU_RESET_MAGIC_NUM)) 2560 return true; 2561 2562 if (!amdgpu_in_reset(adev)) 2563 return false; 2564 2565 /* 2566 * For all ASICs with baco/mode1 reset, the VRAM is 2567 * always assumed to be lost. 2568 */ 2569 switch (amdgpu_asic_reset_method(adev)) { 2570 case AMD_RESET_METHOD_BACO: 2571 case AMD_RESET_METHOD_MODE1: 2572 return true; 2573 default: 2574 return false; 2575 } 2576 } 2577 2578 /** 2579 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2580 * 2581 * @adev: amdgpu_device pointer 2582 * @state: clockgating state (gate or ungate) 2583 * 2584 * The list of all the hardware IPs that make up the asic is walked and the 2585 * set_clockgating_state callbacks are run.
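 * The blocks are walked in order when gating and in reverse order when
 * ungating.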
2586 * Late initialization pass enabling clockgating for hardware IPs. 2587 * Fini or suspend, pass disabling clockgating for hardware IPs. 2588 * Returns 0 on success, negative error code on failure. 2589 */ 2590 2591 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2592 enum amd_clockgating_state state) 2593 { 2594 int i, j, r; 2595 2596 if (amdgpu_emu_mode == 1) 2597 return 0; 2598 2599 for (j = 0; j < adev->num_ip_blocks; j++) { 2600 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2601 if (!adev->ip_blocks[i].status.late_initialized) 2602 continue; 2603 /* skip CG for GFX on S0ix */ 2604 if (adev->in_s0ix && 2605 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2606 continue; 2607 /* skip CG for VCE/UVD, it's handled specially */ 2608 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2609 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2610 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2611 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2612 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2613 /* enable clockgating to save power */ 2614 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2615 state); 2616 if (r) { 2617 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2618 adev->ip_blocks[i].version->funcs->name, r); 2619 return r; 2620 } 2621 } 2622 } 2623 2624 return 0; 2625 } 2626 2627 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2628 enum amd_powergating_state state) 2629 { 2630 int i, j, r; 2631 2632 if (amdgpu_emu_mode == 1) 2633 return 0; 2634 2635 for (j = 0; j < adev->num_ip_blocks; j++) { 2636 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2637 if (!adev->ip_blocks[i].status.late_initialized) 2638 continue; 2639 /* skip PG for GFX on S0ix */ 2640 if (adev->in_s0ix && 2641 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2642 continue; 2643 /* skip CG for VCE/UVD, it's handled specially */ 2644 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2645 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2646 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2647 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2648 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2649 /* enable powergating to save power */ 2650 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2651 state); 2652 if (r) { 2653 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2654 adev->ip_blocks[i].version->funcs->name, r); 2655 return r; 2656 } 2657 } 2658 } 2659 return 0; 2660 } 2661 2662 static int amdgpu_device_enable_mgpu_fan_boost(void) 2663 { 2664 struct amdgpu_gpu_instance *gpu_ins; 2665 struct amdgpu_device *adev; 2666 int i, ret = 0; 2667 2668 mutex_lock(&mgpu_info.mutex); 2669 2670 /* 2671 * MGPU fan boost feature should be enabled 2672 * only when there are two or more dGPUs in 2673 * the system 2674 */ 2675 if (mgpu_info.num_dgpu < 2) 2676 goto out; 2677 2678 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2679 gpu_ins = &(mgpu_info.gpu_ins[i]); 2680 adev = gpu_ins->adev; 2681 if (!(adev->flags & AMD_IS_APU) && 2682 !gpu_ins->mgpu_fan_enabled) { 2683 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2684 if (ret) 2685 break; 2686 2687 gpu_ins->mgpu_fan_enabled = 1; 2688 } 2689 } 2690 2691 out: 2692 mutex_unlock(&mgpu_info.mutex); 2693 2694 return ret; 2695 } 2696 2697 /** 2698 * amdgpu_device_ip_late_init - run late 
init for hardware IPs 2699 * 2700 * @adev: amdgpu_device pointer 2701 * 2702 * Late initialization pass for hardware IPs. The list of all the hardware 2703 * IPs that make up the asic is walked and the late_init callbacks are run. 2704 * late_init covers any special initialization that an IP requires 2705 * after all of the IPs have been initialized, or anything that needs to happen 2706 * late in the init process. 2707 * Returns 0 on success, negative error code on failure. 2708 */ 2709 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2710 { 2711 struct amdgpu_gpu_instance *gpu_instance; 2712 int i = 0, r; 2713 2714 for (i = 0; i < adev->num_ip_blocks; i++) { 2715 if (!adev->ip_blocks[i].status.hw) 2716 continue; 2717 if (adev->ip_blocks[i].version->funcs->late_init) { 2718 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2719 if (r) { 2720 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2721 adev->ip_blocks[i].version->funcs->name, r); 2722 return r; 2723 } 2724 } 2725 adev->ip_blocks[i].status.late_initialized = true; 2726 } 2727 2728 amdgpu_ras_set_error_query_ready(adev, true); 2729 2730 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2731 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2732 2733 amdgpu_device_fill_reset_magic(adev); 2734 2735 r = amdgpu_device_enable_mgpu_fan_boost(); 2736 if (r) 2737 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2738 2739 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2740 if (adev->asic_type == CHIP_ARCTURUS && 2741 amdgpu_passthrough(adev) && 2742 adev->gmc.xgmi.num_physical_nodes > 1) 2743 smu_set_light_sbr(&adev->smu, true); 2744 2745 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2746 mutex_lock(&mgpu_info.mutex); 2747 2748 /* 2749 * Reset device p-state to low as this was booted with high. 2750 * 2751 * This should be performed only after all devices from the same 2752 * hive get initialized. 2753 * 2754 * However, the number of devices in the hive is not known in advance, 2755 * as it is counted one by one as the devices initialize. 2756 * 2757 * So, we wait for all XGMI interlinked devices to be initialized. 2758 * This may bring some delays as those devices may come from 2759 * different hives. But that should be OK.
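 * (amdgpu_xgmi_set_pstate() below is therefore only issued once the number of
 * registered dGPUs matches the number of physical XGMI nodes.)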
2760 */ 2761 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2762 for (i = 0; i < mgpu_info.num_gpu; i++) { 2763 gpu_instance = &(mgpu_info.gpu_ins[i]); 2764 if (gpu_instance->adev->flags & AMD_IS_APU) 2765 continue; 2766 2767 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2768 AMDGPU_XGMI_PSTATE_MIN); 2769 if (r) { 2770 DRM_ERROR("pstate setting failed (%d).\n", r); 2771 break; 2772 } 2773 } 2774 } 2775 2776 mutex_unlock(&mgpu_info.mutex); 2777 } 2778 2779 return 0; 2780 } 2781 2782 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2783 { 2784 int i, r; 2785 2786 for (i = 0; i < adev->num_ip_blocks; i++) { 2787 if (!adev->ip_blocks[i].version->funcs->early_fini) 2788 continue; 2789 2790 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2791 if (r) { 2792 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2793 adev->ip_blocks[i].version->funcs->name, r); 2794 } 2795 } 2796 2797 amdgpu_amdkfd_suspend(adev, false); 2798 2799 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2800 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2801 2802 /* need to disable SMC first */ 2803 for (i = 0; i < adev->num_ip_blocks; i++) { 2804 if (!adev->ip_blocks[i].status.hw) 2805 continue; 2806 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2807 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2808 /* XXX handle errors */ 2809 if (r) { 2810 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2811 adev->ip_blocks[i].version->funcs->name, r); 2812 } 2813 adev->ip_blocks[i].status.hw = false; 2814 break; 2815 } 2816 } 2817 2818 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2819 if (!adev->ip_blocks[i].status.hw) 2820 continue; 2821 2822 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2823 /* XXX handle errors */ 2824 if (r) { 2825 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2826 adev->ip_blocks[i].version->funcs->name, r); 2827 } 2828 2829 adev->ip_blocks[i].status.hw = false; 2830 } 2831 2832 if (amdgpu_sriov_vf(adev)) { 2833 if (amdgpu_virt_release_full_gpu(adev, false)) 2834 DRM_ERROR("failed to release exclusive mode on fini\n"); 2835 } 2836 2837 return 0; 2838 } 2839 2840 /** 2841 * amdgpu_device_ip_fini - run fini for hardware IPs 2842 * 2843 * @adev: amdgpu_device pointer 2844 * 2845 * Main teardown pass for hardware IPs. The list of all the hardware 2846 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2847 * are run. hw_fini tears down the hardware associated with each IP 2848 * and sw_fini tears down any software state associated with each IP. 2849 * Returns 0 on success, negative error code on failure. 
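 * GMC-owned helpers (ucode bo, static CSA, writeback, vram scratch and the IB
 * pool) are released just before the GMC block's own sw_fini runs.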
2850 */ 2851 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2852 { 2853 int i, r; 2854 2855 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2856 amdgpu_virt_release_ras_err_handler_data(adev); 2857 2858 amdgpu_ras_pre_fini(adev); 2859 2860 if (adev->gmc.xgmi.num_physical_nodes > 1) 2861 amdgpu_xgmi_remove_device(adev); 2862 2863 amdgpu_amdkfd_device_fini_sw(adev); 2864 2865 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2866 if (!adev->ip_blocks[i].status.sw) 2867 continue; 2868 2869 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2870 amdgpu_ucode_free_bo(adev); 2871 amdgpu_free_static_csa(&adev->virt.csa_obj); 2872 amdgpu_device_wb_fini(adev); 2873 amdgpu_device_vram_scratch_fini(adev); 2874 amdgpu_ib_pool_fini(adev); 2875 } 2876 2877 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2878 /* XXX handle errors */ 2879 if (r) { 2880 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2881 adev->ip_blocks[i].version->funcs->name, r); 2882 } 2883 adev->ip_blocks[i].status.sw = false; 2884 adev->ip_blocks[i].status.valid = false; 2885 } 2886 2887 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2888 if (!adev->ip_blocks[i].status.late_initialized) 2889 continue; 2890 if (adev->ip_blocks[i].version->funcs->late_fini) 2891 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2892 adev->ip_blocks[i].status.late_initialized = false; 2893 } 2894 2895 amdgpu_ras_fini(adev); 2896 2897 return 0; 2898 } 2899 2900 /** 2901 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2902 * 2903 * @work: work_struct. 2904 */ 2905 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2906 { 2907 struct amdgpu_device *adev = 2908 container_of(work, struct amdgpu_device, delayed_init_work.work); 2909 int r; 2910 2911 r = amdgpu_ib_ring_tests(adev); 2912 if (r) 2913 DRM_ERROR("ib ring test failed (%d).\n", r); 2914 } 2915 2916 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2917 { 2918 struct amdgpu_device *adev = 2919 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2920 2921 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2922 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2923 2924 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2925 adev->gfx.gfx_off_state = true; 2926 } 2927 2928 /** 2929 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2930 * 2931 * @adev: amdgpu_device pointer 2932 * 2933 * Main suspend function for hardware IPs. The list of all the hardware 2934 * IPs that make up the asic is walked, clockgating is disabled and the 2935 * suspend callbacks are run. suspend puts the hardware and software state 2936 * in each IP into a state suitable for suspend. 2937 * Returns 0 on success, negative error code on failure. 
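 * Phase 1 only suspends the display (DCE) blocks; all other blocks are
 * deferred to amdgpu_device_ip_suspend_phase2().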
2938 */ 2939 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2940 { 2941 int i, r; 2942 2943 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2944 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2945 2946 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2947 if (!adev->ip_blocks[i].status.valid) 2948 continue; 2949 2950 /* displays are handled separately */ 2951 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2952 continue; 2953 2954 /* XXX handle errors */ 2955 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2956 /* XXX handle errors */ 2957 if (r) { 2958 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2959 adev->ip_blocks[i].version->funcs->name, r); 2960 return r; 2961 } 2962 2963 adev->ip_blocks[i].status.hw = false; 2964 } 2965 2966 return 0; 2967 } 2968 2969 /** 2970 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2971 * 2972 * @adev: amdgpu_device pointer 2973 * 2974 * Main suspend function for hardware IPs. The list of all the hardware 2975 * IPs that make up the asic is walked, clockgating is disabled and the 2976 * suspend callbacks are run. suspend puts the hardware and software state 2977 * in each IP into a state suitable for suspend. 2978 * Returns 0 on success, negative error code on failure. 2979 */ 2980 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2981 { 2982 int i, r; 2983 2984 if (adev->in_s0ix) 2985 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2986 2987 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2988 if (!adev->ip_blocks[i].status.valid) 2989 continue; 2990 /* displays are handled in phase1 */ 2991 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2992 continue; 2993 /* PSP lost connection when err_event_athub occurs */ 2994 if (amdgpu_ras_intr_triggered() && 2995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2996 adev->ip_blocks[i].status.hw = false; 2997 continue; 2998 } 2999 3000 /* skip unnecessary suspend if we do not initialize them yet */ 3001 if (adev->gmc.xgmi.pending_reset && 3002 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3003 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3005 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3006 adev->ip_blocks[i].status.hw = false; 3007 continue; 3008 } 3009 3010 /* skip suspend of gfx and psp for S0ix 3011 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3012 * like at runtime. PSP is also part of the always on hardware 3013 * so no need to suspend it. 
3014 */ 3015 if (adev->in_s0ix && 3016 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3017 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3018 continue; 3019 3020 /* XXX handle errors */ 3021 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3022 /* XXX handle errors */ 3023 if (r) { 3024 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3025 adev->ip_blocks[i].version->funcs->name, r); 3026 } 3027 adev->ip_blocks[i].status.hw = false; 3028 /* handle putting the SMC in the appropriate state */ 3029 if(!amdgpu_sriov_vf(adev)){ 3030 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3031 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3032 if (r) { 3033 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3034 adev->mp1_state, r); 3035 return r; 3036 } 3037 } 3038 } 3039 } 3040 3041 return 0; 3042 } 3043 3044 /** 3045 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3046 * 3047 * @adev: amdgpu_device pointer 3048 * 3049 * Main suspend function for hardware IPs. The list of all the hardware 3050 * IPs that make up the asic is walked, clockgating is disabled and the 3051 * suspend callbacks are run. suspend puts the hardware and software state 3052 * in each IP into a state suitable for suspend. 3053 * Returns 0 on success, negative error code on failure. 3054 */ 3055 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3056 { 3057 int r; 3058 3059 if (amdgpu_sriov_vf(adev)) { 3060 amdgpu_virt_fini_data_exchange(adev); 3061 amdgpu_virt_request_full_gpu(adev, false); 3062 } 3063 3064 r = amdgpu_device_ip_suspend_phase1(adev); 3065 if (r) 3066 return r; 3067 r = amdgpu_device_ip_suspend_phase2(adev); 3068 3069 if (amdgpu_sriov_vf(adev)) 3070 amdgpu_virt_release_full_gpu(adev, false); 3071 3072 return r; 3073 } 3074 3075 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3076 { 3077 int i, r; 3078 3079 static enum amd_ip_block_type ip_order[] = { 3080 AMD_IP_BLOCK_TYPE_COMMON, 3081 AMD_IP_BLOCK_TYPE_GMC, 3082 AMD_IP_BLOCK_TYPE_PSP, 3083 AMD_IP_BLOCK_TYPE_IH, 3084 }; 3085 3086 for (i = 0; i < adev->num_ip_blocks; i++) { 3087 int j; 3088 struct amdgpu_ip_block *block; 3089 3090 block = &adev->ip_blocks[i]; 3091 block->status.hw = false; 3092 3093 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3094 3095 if (block->version->type != ip_order[j] || 3096 !block->status.valid) 3097 continue; 3098 3099 r = block->version->funcs->hw_init(adev); 3100 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3101 if (r) 3102 return r; 3103 block->status.hw = true; 3104 } 3105 } 3106 3107 return 0; 3108 } 3109 3110 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3111 { 3112 int i, r; 3113 3114 static enum amd_ip_block_type ip_order[] = { 3115 AMD_IP_BLOCK_TYPE_SMC, 3116 AMD_IP_BLOCK_TYPE_DCE, 3117 AMD_IP_BLOCK_TYPE_GFX, 3118 AMD_IP_BLOCK_TYPE_SDMA, 3119 AMD_IP_BLOCK_TYPE_UVD, 3120 AMD_IP_BLOCK_TYPE_VCE, 3121 AMD_IP_BLOCK_TYPE_VCN 3122 }; 3123 3124 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3125 int j; 3126 struct amdgpu_ip_block *block; 3127 3128 for (j = 0; j < adev->num_ip_blocks; j++) { 3129 block = &adev->ip_blocks[j]; 3130 3131 if (block->version->type != ip_order[i] || 3132 !block->status.valid || 3133 block->status.hw) 3134 continue; 3135 3136 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3137 r = block->version->funcs->resume(adev); 3138 else 3139 r = block->version->funcs->hw_init(adev); 3140 3141 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3142 if (r) 3143 return r; 3144 block->status.hw = true; 3145 } 3146 } 3147 3148 return 0; 3149 } 3150 3151 /** 3152 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3153 * 3154 * @adev: amdgpu_device pointer 3155 * 3156 * First resume function for hardware IPs. The list of all the hardware 3157 * IPs that make up the asic is walked and the resume callbacks are run for 3158 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3159 * after a suspend and updates the software state as necessary. This 3160 * function is also used for restoring the GPU after a GPU reset. 3161 * Returns 0 on success, negative error code on failure. 3162 */ 3163 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3164 { 3165 int i, r; 3166 3167 for (i = 0; i < adev->num_ip_blocks; i++) { 3168 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3169 continue; 3170 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3172 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3173 3174 r = adev->ip_blocks[i].version->funcs->resume(adev); 3175 if (r) { 3176 DRM_ERROR("resume of IP block <%s> failed %d\n", 3177 adev->ip_blocks[i].version->funcs->name, r); 3178 return r; 3179 } 3180 adev->ip_blocks[i].status.hw = true; 3181 } 3182 } 3183 3184 return 0; 3185 } 3186 3187 /** 3188 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3189 * 3190 * @adev: amdgpu_device pointer 3191 * 3192 * Second resume function for hardware IPs. The list of all the hardware 3193 * IPs that make up the asic is walked and the resume callbacks are run for 3194 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3195 * functional state after a suspend and updates the software state as 3196 * necessary. This function is also used for restoring the GPU after a GPU 3197 * reset. 3198 * Returns 0 on success, negative error code on failure. 3199 */ 3200 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3201 { 3202 int i, r; 3203 3204 for (i = 0; i < adev->num_ip_blocks; i++) { 3205 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3206 continue; 3207 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3208 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3209 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3210 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3211 continue; 3212 r = adev->ip_blocks[i].version->funcs->resume(adev); 3213 if (r) { 3214 DRM_ERROR("resume of IP block <%s> failed %d\n", 3215 adev->ip_blocks[i].version->funcs->name, r); 3216 return r; 3217 } 3218 adev->ip_blocks[i].status.hw = true; 3219 } 3220 3221 return 0; 3222 } 3223 3224 /** 3225 * amdgpu_device_ip_resume - run resume for hardware IPs 3226 * 3227 * @adev: amdgpu_device pointer 3228 * 3229 * Main resume function for hardware IPs. The hardware IPs 3230 * are split into two resume functions because they are 3231 * also used in recovering from a GPU reset, and some additional 3232 * steps need to be taken between them. In this case (S3/S4) they are 3233 * run sequentially. 3234 * Returns 0 on success, negative error code on failure.
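 * Between the two phases amdgpu_device_fw_loading() is called, so PSP is
 * brought up and the SMU firmware is loaded before the remaining blocks
 * resume.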
3235 */ 3236 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3237 { 3238 int r; 3239 3240 r = amdgpu_amdkfd_resume_iommu(adev); 3241 if (r) 3242 return r; 3243 3244 r = amdgpu_device_ip_resume_phase1(adev); 3245 if (r) 3246 return r; 3247 3248 r = amdgpu_device_fw_loading(adev); 3249 if (r) 3250 return r; 3251 3252 r = amdgpu_device_ip_resume_phase2(adev); 3253 3254 return r; 3255 } 3256 3257 /** 3258 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3259 * 3260 * @adev: amdgpu_device pointer 3261 * 3262 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3263 */ 3264 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3265 { 3266 if (amdgpu_sriov_vf(adev)) { 3267 if (adev->is_atom_fw) { 3268 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3269 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3270 } else { 3271 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3272 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3273 } 3274 3275 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3276 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3277 } 3278 } 3279 3280 /** 3281 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3282 * 3283 * @asic_type: AMD asic type 3284 * 3285 * Check if there is DC (new modesetting infrastructure) support for an asic. 3286 * Returns true if DC has support, false if not. 3287 */ 3288 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3289 { 3290 switch (asic_type) { 3291 #if defined(CONFIG_DRM_AMD_DC) 3292 #if defined(CONFIG_DRM_AMD_DC_SI) 3293 case CHIP_TAHITI: 3294 case CHIP_PITCAIRN: 3295 case CHIP_VERDE: 3296 case CHIP_OLAND: 3297 #endif 3298 case CHIP_BONAIRE: 3299 case CHIP_KAVERI: 3300 case CHIP_KABINI: 3301 case CHIP_MULLINS: 3302 /* 3303 * We have systems in the wild with these ASICs that require 3304 * LVDS and VGA support which is not supported with DC. 3305 * 3306 * Fallback to the non-DC driver here by default so as not to 3307 * cause regressions.
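 * (Explicitly setting the dc parameter to 1 still opts these ASICs in, hence
 * the amdgpu_dc > 0 check below.)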
3308 */ 3309 return amdgpu_dc > 0; 3310 case CHIP_HAWAII: 3311 case CHIP_CARRIZO: 3312 case CHIP_STONEY: 3313 case CHIP_POLARIS10: 3314 case CHIP_POLARIS11: 3315 case CHIP_POLARIS12: 3316 case CHIP_VEGAM: 3317 case CHIP_TONGA: 3318 case CHIP_FIJI: 3319 case CHIP_VEGA10: 3320 case CHIP_VEGA12: 3321 case CHIP_VEGA20: 3322 #if defined(CONFIG_DRM_AMD_DC_DCN) 3323 case CHIP_RAVEN: 3324 case CHIP_NAVI10: 3325 case CHIP_NAVI14: 3326 case CHIP_NAVI12: 3327 case CHIP_RENOIR: 3328 case CHIP_SIENNA_CICHLID: 3329 case CHIP_NAVY_FLOUNDER: 3330 case CHIP_DIMGREY_CAVEFISH: 3331 case CHIP_BEIGE_GOBY: 3332 case CHIP_VANGOGH: 3333 case CHIP_YELLOW_CARP: 3334 #endif 3335 return amdgpu_dc != 0; 3336 #endif 3337 default: 3338 if (amdgpu_dc > 0) 3339 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3340 "but isn't supported by ASIC, ignoring\n"); 3341 return false; 3342 } 3343 } 3344 3345 /** 3346 * amdgpu_device_has_dc_support - check if dc is supported 3347 * 3348 * @adev: amdgpu_device pointer 3349 * 3350 * Returns true for supported, false for not supported 3351 */ 3352 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3353 { 3354 if (amdgpu_sriov_vf(adev) || 3355 adev->enable_virtual_display || 3356 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3357 return false; 3358 3359 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3360 } 3361 3362 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3363 { 3364 struct amdgpu_device *adev = 3365 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3366 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3367 3368 /* It's a bug to not have a hive within this function */ 3369 if (WARN_ON(!hive)) 3370 return; 3371 3372 /* 3373 * Use task barrier to synchronize all xgmi reset works across the 3374 * hive. task_barrier_enter and task_barrier_exit will block 3375 * until all the threads running the xgmi reset works reach 3376 * those points. task_barrier_full will do both blocks. 3377 */ 3378 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3379 3380 task_barrier_enter(&hive->tb); 3381 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3382 3383 if (adev->asic_reset_res) 3384 goto fail; 3385 3386 task_barrier_exit(&hive->tb); 3387 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3388 3389 if (adev->asic_reset_res) 3390 goto fail; 3391 3392 if (adev->mmhub.ras_funcs && 3393 adev->mmhub.ras_funcs->reset_ras_error_count) 3394 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3395 } else { 3396 3397 task_barrier_full(&hive->tb); 3398 adev->asic_reset_res = amdgpu_asic_reset(adev); 3399 } 3400 3401 fail: 3402 if (adev->asic_reset_res) 3403 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3404 adev->asic_reset_res, adev_to_drm(adev)->unique); 3405 amdgpu_put_xgmi_hive(hive); 3406 } 3407 3408 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3409 { 3410 char *input = amdgpu_lockup_timeout; 3411 char *timeout_setting = NULL; 3412 int index = 0; 3413 long timeout; 3414 int ret = 0; 3415 3416 /* 3417 * By default timeout for non compute jobs is 10000 3418 * and 60000 for compute jobs. 3419 * In SR-IOV or passthrough mode, timeout for compute 3420 * jobs are 60000 by default. 3421 */ 3422 adev->gfx_timeout = msecs_to_jiffies(10000); 3423 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3424 if (amdgpu_sriov_vf(adev)) 3425 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3426 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3427 else 3428 adev->compute_timeout = msecs_to_jiffies(60000); 3429 3430 #ifdef notyet 3431 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3432 while ((timeout_setting = strsep(&input, ",")) && 3433 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3434 ret = kstrtol(timeout_setting, 0, &timeout); 3435 if (ret) 3436 return ret; 3437 3438 if (timeout == 0) { 3439 index++; 3440 continue; 3441 } else if (timeout < 0) { 3442 timeout = MAX_SCHEDULE_TIMEOUT; 3443 } else { 3444 timeout = msecs_to_jiffies(timeout); 3445 } 3446 3447 switch (index++) { 3448 case 0: 3449 adev->gfx_timeout = timeout; 3450 break; 3451 case 1: 3452 adev->compute_timeout = timeout; 3453 break; 3454 case 2: 3455 adev->sdma_timeout = timeout; 3456 break; 3457 case 3: 3458 adev->video_timeout = timeout; 3459 break; 3460 default: 3461 break; 3462 } 3463 } 3464 /* 3465 * There is only one value specified and 3466 * it should apply to all non-compute jobs. 3467 */ 3468 if (index == 1) { 3469 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3470 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3471 adev->compute_timeout = adev->gfx_timeout; 3472 } 3473 } 3474 #endif 3475 3476 return ret; 3477 } 3478 3479 static const struct attribute *amdgpu_dev_attributes[] = { 3480 &dev_attr_product_name.attr, 3481 &dev_attr_product_number.attr, 3482 &dev_attr_serial_number.attr, 3483 &dev_attr_pcie_replay_count.attr, 3484 NULL 3485 }; 3486 3487 /** 3488 * amdgpu_device_init - initialize the driver 3489 * 3490 * @adev: amdgpu_device pointer 3491 * @flags: driver flags 3492 * 3493 * Initializes the driver info and hw (all asics). 3494 * Returns 0 for success or an error on failure. 3495 * Called at driver startup. 
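 * Note that adev->asic_type is normally derived from @flags, unless it is
 * overridden via amdgpu_force_asic_type.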
3496 */ 3497 int amdgpu_device_init(struct amdgpu_device *adev, 3498 uint32_t flags) 3499 { 3500 struct drm_device *ddev = adev_to_drm(adev); 3501 struct pci_dev *pdev = adev->pdev; 3502 int r, i; 3503 bool px = false; 3504 u32 max_MBps; 3505 3506 adev->shutdown = false; 3507 adev->flags = flags; 3508 3509 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3510 adev->asic_type = amdgpu_force_asic_type; 3511 else 3512 adev->asic_type = flags & AMD_ASIC_MASK; 3513 3514 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3515 if (amdgpu_emu_mode == 1) 3516 adev->usec_timeout *= 10; 3517 adev->gmc.gart_size = 512 * 1024 * 1024; 3518 adev->accel_working = false; 3519 adev->num_rings = 0; 3520 adev->mman.buffer_funcs = NULL; 3521 adev->mman.buffer_funcs_ring = NULL; 3522 adev->vm_manager.vm_pte_funcs = NULL; 3523 adev->vm_manager.vm_pte_num_scheds = 0; 3524 adev->gmc.gmc_funcs = NULL; 3525 adev->harvest_ip_mask = 0x0; 3526 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3527 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3528 3529 adev->smc_rreg = &amdgpu_invalid_rreg; 3530 adev->smc_wreg = &amdgpu_invalid_wreg; 3531 adev->pcie_rreg = &amdgpu_invalid_rreg; 3532 adev->pcie_wreg = &amdgpu_invalid_wreg; 3533 adev->pciep_rreg = &amdgpu_invalid_rreg; 3534 adev->pciep_wreg = &amdgpu_invalid_wreg; 3535 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3536 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3537 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3538 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3539 adev->didt_rreg = &amdgpu_invalid_rreg; 3540 adev->didt_wreg = &amdgpu_invalid_wreg; 3541 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3542 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3543 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3544 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3545 3546 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3547 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3548 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3549 3550 /* mutex initialization are all done here so we 3551 * can recall function without having locking issues */ 3552 rw_init(&adev->firmware.mutex, "agfw"); 3553 rw_init(&adev->pm.mutex, "agpm"); 3554 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3555 rw_init(&adev->srbm_mutex, "srbm"); 3556 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3557 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3558 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3559 rw_init(&adev->mn_lock, "agpumn"); 3560 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3561 hash_init(adev->mn_hash); 3562 atomic_set(&adev->in_gpu_reset, 0); 3563 rw_init(&adev->reset_sem, "amrs"); 3564 rw_init(&adev->psp.mutex, "agpsp"); 3565 rw_init(&adev->notifier_lock, "agnf"); 3566 3567 r = amdgpu_device_init_apu_flags(adev); 3568 if (r) 3569 return r; 3570 3571 r = amdgpu_device_check_arguments(adev); 3572 if (r) 3573 return r; 3574 3575 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3576 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3577 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3578 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3579 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3580 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3581 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3582 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3583 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3584 3585 INIT_LIST_HEAD(&adev->shadow_list); 3586 rw_init(&adev->shadow_list_lock, "sdwlst"); 3587 3588 INIT_LIST_HEAD(&adev->reset_list); 
3589 3590 INIT_DELAYED_WORK(&adev->delayed_init_work, 3591 amdgpu_device_delayed_init_work_handler); 3592 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3593 amdgpu_device_delay_enable_gfx_off); 3594 3595 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3596 3597 adev->gfx.gfx_off_req_count = 1; 3598 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3599 3600 atomic_set(&adev->throttling_logging_enabled, 1); 3601 /* 3602 * If throttling continues, logging will be performed every minute 3603 * to avoid log flooding. "-1" is subtracted since the thermal 3604 * throttling interrupt comes every second. Thus, the total logging 3605 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3606 * for throttling interrupt) = 60 seconds. 3607 */ 3608 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3609 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3610 3611 #ifdef __linux__ 3612 /* Registers mapping */ 3613 /* TODO: block userspace mapping of io register */ 3614 if (adev->asic_type >= CHIP_BONAIRE) { 3615 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3616 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3617 } else { 3618 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3619 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3620 } 3621 3622 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3623 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3624 3625 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3626 if (adev->rmmio == NULL) { 3627 return -ENOMEM; 3628 } 3629 #endif 3630 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3631 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3632 3633 /* enable PCIE atomic ops */ 3634 #ifdef notyet 3635 r = pci_enable_atomic_ops_to_root(adev->pdev, 3636 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3637 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3638 if (r) { 3639 adev->have_atomics_support = false; 3640 DRM_INFO("PCIE atomic ops is not supported\n"); 3641 } else { 3642 adev->have_atomics_support = true; 3643 } 3644 #else 3645 adev->have_atomics_support = false; 3646 #endif 3647 3648 amdgpu_device_get_pcie_info(adev); 3649 3650 if (amdgpu_mcbp) 3651 DRM_INFO("MCBP is enabled\n"); 3652 3653 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3654 adev->enable_mes = true; 3655 3656 /* detect hw virtualization here */ 3657 amdgpu_detect_virtualization(adev); 3658 3659 r = amdgpu_device_get_job_timeout_settings(adev); 3660 if (r) { 3661 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3662 return r; 3663 } 3664 3665 /* early init functions */ 3666 r = amdgpu_device_ip_early_init(adev); 3667 if (r) 3668 return r; 3669 3670 /* doorbell bar mapping and doorbell index init*/ 3671 amdgpu_device_doorbell_init(adev); 3672 3673 if (amdgpu_emu_mode == 1) { 3674 /* post the asic on emulation mode */ 3675 emu_soc_asic_init(adev); 3676 goto fence_driver_init; 3677 } 3678 3679 amdgpu_reset_init(adev); 3680 3681 /* detect if we are with an SRIOV vbios */ 3682 amdgpu_device_detect_sriov_bios(adev); 3683 3684 /* check if we need to reset the asic 3685 * E.g., driver was not cleanly unloaded previously, etc. 
3686 */ 3687 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3688 if (adev->gmc.xgmi.num_physical_nodes) { 3689 dev_info(adev->dev, "Pending hive reset.\n"); 3690 adev->gmc.xgmi.pending_reset = true; 3691 /* Only need to init necessary block for SMU to handle the reset */ 3692 for (i = 0; i < adev->num_ip_blocks; i++) { 3693 if (!adev->ip_blocks[i].status.valid) 3694 continue; 3695 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3696 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3697 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3698 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3699 DRM_DEBUG("IP %s disabled for hw_init.\n", 3700 adev->ip_blocks[i].version->funcs->name); 3701 adev->ip_blocks[i].status.hw = true; 3702 } 3703 } 3704 } else { 3705 r = amdgpu_asic_reset(adev); 3706 if (r) { 3707 dev_err(adev->dev, "asic reset on init failed\n"); 3708 goto failed; 3709 } 3710 } 3711 } 3712 3713 pci_enable_pcie_error_reporting(adev->pdev); 3714 3715 /* Post card if necessary */ 3716 if (amdgpu_device_need_post(adev)) { 3717 if (!adev->bios) { 3718 dev_err(adev->dev, "no vBIOS found\n"); 3719 r = -EINVAL; 3720 goto failed; 3721 } 3722 DRM_INFO("GPU posting now...\n"); 3723 r = amdgpu_device_asic_init(adev); 3724 if (r) { 3725 dev_err(adev->dev, "gpu post error!\n"); 3726 goto failed; 3727 } 3728 } 3729 3730 if (adev->is_atom_fw) { 3731 /* Initialize clocks */ 3732 r = amdgpu_atomfirmware_get_clock_info(adev); 3733 if (r) { 3734 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3735 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3736 goto failed; 3737 } 3738 } else { 3739 /* Initialize clocks */ 3740 r = amdgpu_atombios_get_clock_info(adev); 3741 if (r) { 3742 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3744 goto failed; 3745 } 3746 /* init i2c buses */ 3747 if (!amdgpu_device_has_dc_support(adev)) 3748 amdgpu_atombios_i2c_init(adev); 3749 } 3750 3751 fence_driver_init: 3752 /* Fence driver */ 3753 r = amdgpu_fence_driver_sw_init(adev); 3754 if (r) { 3755 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3756 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3757 goto failed; 3758 } 3759 3760 /* init the mode config */ 3761 drm_mode_config_init(adev_to_drm(adev)); 3762 3763 r = amdgpu_device_ip_init(adev); 3764 if (r) { 3765 /* failed in exclusive mode due to timeout */ 3766 if (amdgpu_sriov_vf(adev) && 3767 !amdgpu_sriov_runtime(adev) && 3768 amdgpu_virt_mmio_blocked(adev) && 3769 !amdgpu_virt_wait_reset(adev)) { 3770 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3771 /* Don't send request since VF is inactive. 
*/ 3772 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3773 adev->virt.ops = NULL; 3774 r = -EAGAIN; 3775 goto release_ras_con; 3776 } 3777 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3778 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3779 goto release_ras_con; 3780 } 3781 3782 amdgpu_fence_driver_hw_init(adev); 3783 3784 dev_info(adev->dev, 3785 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3786 adev->gfx.config.max_shader_engines, 3787 adev->gfx.config.max_sh_per_se, 3788 adev->gfx.config.max_cu_per_sh, 3789 adev->gfx.cu_info.number); 3790 3791 #ifdef __OpenBSD__ 3792 { 3793 const char *chip_name; 3794 3795 switch (adev->asic_type) { 3796 case CHIP_RAVEN: 3797 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3798 chip_name = "RAVEN2"; 3799 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3800 chip_name = "PICASSO"; 3801 else 3802 chip_name = "RAVEN"; 3803 break; 3804 case CHIP_RENOIR: 3805 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3806 chip_name = "RENOIR"; 3807 else 3808 chip_name = "GREEN_SARDINE"; 3809 break; 3810 default: 3811 chip_name = amdgpu_asic_name[adev->asic_type]; 3812 } 3813 printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname, 3814 chip_name, adev->gfx.cu_info.number, adev->rev_id); 3815 } 3816 #endif 3817 3818 adev->accel_working = true; 3819 3820 amdgpu_vm_check_compute_bug(adev); 3821 3822 /* Initialize the buffer migration limit. */ 3823 if (amdgpu_moverate >= 0) 3824 max_MBps = amdgpu_moverate; 3825 else 3826 max_MBps = 8; /* Allow 8 MB/s. */ 3827 /* Get a log2 for easy divisions. */ 3828 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3829 3830 amdgpu_fbdev_init(adev); 3831 3832 r = amdgpu_pm_sysfs_init(adev); 3833 if (r) { 3834 adev->pm_sysfs_en = false; 3835 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3836 } else 3837 adev->pm_sysfs_en = true; 3838 3839 r = amdgpu_ucode_sysfs_init(adev); 3840 if (r) { 3841 adev->ucode_sysfs_en = false; 3842 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3843 } else 3844 adev->ucode_sysfs_en = true; 3845 3846 if ((amdgpu_testing & 1)) { 3847 if (adev->accel_working) 3848 amdgpu_test_moves(adev); 3849 else 3850 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3851 } 3852 if (amdgpu_benchmarking) { 3853 if (adev->accel_working) 3854 amdgpu_benchmark(adev, amdgpu_benchmarking); 3855 else 3856 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3857 } 3858 3859 /* 3860 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3861 * Otherwise the mgpu fan boost feature will be skipped due to the 3862 * gpu instance is counted less. 3863 */ 3864 amdgpu_register_gpu_instance(adev); 3865 3866 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3867 * explicit gating rather than handling it automatically. 3868 */ 3869 if (!adev->gmc.xgmi.pending_reset) { 3870 r = amdgpu_device_ip_late_init(adev); 3871 if (r) { 3872 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3873 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3874 goto release_ras_con; 3875 } 3876 /* must succeed. 
		 */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev))
		flush_delayed_work(&adev->delayed_init_work);

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Keep a copy of the PCI config space at hand for restore after a sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it */
#ifdef notyet
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
#endif

	if (amdgpu_device_supports_px(ddev)) {
		px = true;
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
	}

	if (adev->gmc.xgmi.pending_reset)
		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));

	return 0;

release_ras_con:
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{
	STUB();
#ifdef notyet
	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
#endif

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_device_doorbell_fini(adev);

#ifdef __linux__
	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;
#else
	if (adev->rmmio_size > 0)
		bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
		    adev->rmmio_size);
	adev->rmmio_size = 0;
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		bus_space_unmap(adev->memt, adev->mman.aper_bsh,
		    adev->gmc.visible_vram_size);
	adev->mman.aper_base_kaddr = NULL;
#endif

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu) {
#ifdef __linux__
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
#else
		drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
#endif
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
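 * This is the hardware-side half of the teardown; the remaining software
 * state is released afterwards by amdgpu_device_fini_sw().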
3975 */ 3976 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3977 { 3978 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3979 flush_delayed_work(&adev->delayed_init_work); 3980 if (adev->mman.initialized) { 3981 flush_delayed_work(&adev->mman.bdev.wq); 3982 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3983 } 3984 adev->shutdown = true; 3985 3986 /* make sure IB test finished before entering exclusive mode 3987 * to avoid preemption on IB test 3988 * */ 3989 if (amdgpu_sriov_vf(adev)) { 3990 amdgpu_virt_request_full_gpu(adev, false); 3991 amdgpu_virt_fini_data_exchange(adev); 3992 } 3993 3994 /* disable all interrupts */ 3995 amdgpu_irq_disable_all(adev); 3996 if (adev->mode_info.mode_config_initialized){ 3997 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3998 drm_helper_force_disable_all(adev_to_drm(adev)); 3999 else 4000 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4001 } 4002 amdgpu_fence_driver_hw_fini(adev); 4003 4004 if (adev->pm_sysfs_en) 4005 amdgpu_pm_sysfs_fini(adev); 4006 if (adev->ucode_sysfs_en) 4007 amdgpu_ucode_sysfs_fini(adev); 4008 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4009 4010 amdgpu_fbdev_fini(adev); 4011 4012 amdgpu_irq_fini_hw(adev); 4013 4014 amdgpu_device_ip_fini_early(adev); 4015 4016 amdgpu_gart_dummy_page_fini(adev); 4017 4018 amdgpu_device_unmap_mmio(adev); 4019 } 4020 4021 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4022 { 4023 amdgpu_fence_driver_sw_fini(adev); 4024 amdgpu_device_ip_fini(adev); 4025 release_firmware(adev->firmware.gpu_info_fw); 4026 adev->firmware.gpu_info_fw = NULL; 4027 adev->accel_working = false; 4028 4029 amdgpu_reset_fini(adev); 4030 4031 /* free i2c buses */ 4032 if (!amdgpu_device_has_dc_support(adev)) 4033 amdgpu_i2c_fini(adev); 4034 4035 if (amdgpu_emu_mode != 1) 4036 amdgpu_atombios_fini(adev); 4037 4038 kfree(adev->bios); 4039 adev->bios = NULL; 4040 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4041 vga_switcheroo_unregister_client(adev->pdev); 4042 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4043 } 4044 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4045 vga_client_unregister(adev->pdev); 4046 4047 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4048 amdgpu_pmu_fini(adev); 4049 if (adev->mman.discovery_bin) 4050 amdgpu_discovery_fini(adev); 4051 4052 kfree(adev->pci_state); 4053 4054 } 4055 4056 /** 4057 * amdgpu_device_evict_resources - evict device resources 4058 * @adev: amdgpu device object 4059 * 4060 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4061 * of the vram memory type. Mainly used for evicting device resources 4062 * at suspend time. 4063 * 4064 */ 4065 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4066 { 4067 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4068 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4069 return; 4070 4071 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4072 DRM_WARN("evicting device resources failed\n"); 4073 4074 } 4075 4076 /* 4077 * Suspend & resume. 4078 */ 4079 /** 4080 * amdgpu_device_suspend - initiate device suspend 4081 * 4082 * @dev: drm dev pointer 4083 * @fbcon : notify the fbdev of suspend 4084 * 4085 * Puts the hw in the suspend state (all asics). 4086 * Returns 0 for success or an error on failure. 4087 * Called at driver suspend. 
4088 */ 4089 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4090 { 4091 struct amdgpu_device *adev = drm_to_adev(dev); 4092 4093 if (adev->shutdown) 4094 return 0; 4095 4096 #ifdef notyet 4097 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4098 return 0; 4099 #endif 4100 4101 adev->in_suspend = true; 4102 4103 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4104 DRM_WARN("smart shift update failed\n"); 4105 4106 drm_kms_helper_poll_disable(dev); 4107 4108 if (fbcon) 4109 amdgpu_fbdev_set_suspend(adev, 1); 4110 4111 cancel_delayed_work_sync(&adev->delayed_init_work); 4112 4113 amdgpu_ras_suspend(adev); 4114 4115 amdgpu_device_ip_suspend_phase1(adev); 4116 4117 if (!adev->in_s0ix) 4118 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4119 4120 /* First evict vram memory */ 4121 amdgpu_device_evict_resources(adev); 4122 4123 amdgpu_fence_driver_hw_fini(adev); 4124 4125 amdgpu_device_ip_suspend_phase2(adev); 4126 /* This second call to evict device resources is to evict 4127 * the gart page table using the CPU. 4128 */ 4129 amdgpu_device_evict_resources(adev); 4130 4131 return 0; 4132 } 4133 4134 /** 4135 * amdgpu_device_resume - initiate device resume 4136 * 4137 * @dev: drm dev pointer 4138 * @fbcon : notify the fbdev of resume 4139 * 4140 * Bring the hw back to operating state (all asics). 4141 * Returns 0 for success or an error on failure. 4142 * Called at driver resume. 4143 */ 4144 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4145 { 4146 struct amdgpu_device *adev = drm_to_adev(dev); 4147 int r = 0; 4148 4149 #ifdef notyet 4150 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4151 return 0; 4152 #endif 4153 4154 if (adev->in_s0ix) 4155 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 4156 4157 /* post card */ 4158 if (amdgpu_device_need_post(adev)) { 4159 r = amdgpu_device_asic_init(adev); 4160 if (r) 4161 dev_err(adev->dev, "amdgpu asic init failed\n"); 4162 } 4163 4164 r = amdgpu_device_ip_resume(adev); 4165 if (r) { 4166 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4167 return r; 4168 } 4169 amdgpu_fence_driver_hw_init(adev); 4170 4171 r = amdgpu_device_ip_late_init(adev); 4172 if (r) 4173 return r; 4174 4175 queue_delayed_work(system_wq, &adev->delayed_init_work, 4176 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4177 4178 if (!adev->in_s0ix) { 4179 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4180 if (r) 4181 return r; 4182 } 4183 4184 /* Make sure IB tests flushed */ 4185 flush_delayed_work(&adev->delayed_init_work); 4186 4187 if (fbcon) 4188 amdgpu_fbdev_set_suspend(adev, 0); 4189 4190 drm_kms_helper_poll_enable(dev); 4191 4192 amdgpu_ras_resume(adev); 4193 4194 /* 4195 * Most of the connector probing functions try to acquire runtime pm 4196 * refs to ensure that the GPU is powered on when connector polling is 4197 * performed. Since we're calling this from a runtime PM callback, 4198 * trying to acquire rpm refs will cause us to deadlock. 4199 * 4200 * Since we're guaranteed to be holding the rpm lock, it's safe to 4201 * temporarily disable the rpm helpers so this doesn't deadlock us. 
4202 */ 4203 #if defined(CONFIG_PM) && defined(__linux__) 4204 dev->dev->power.disable_depth++; 4205 #endif 4206 if (!amdgpu_device_has_dc_support(adev)) 4207 drm_helper_hpd_irq_event(dev); 4208 else 4209 drm_kms_helper_hotplug_event(dev); 4210 #if defined(CONFIG_PM) && defined(__linux__) 4211 dev->dev->power.disable_depth--; 4212 #endif 4213 adev->in_suspend = false; 4214 4215 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4216 DRM_WARN("smart shift update failed\n"); 4217 4218 return 0; 4219 } 4220 4221 /** 4222 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4223 * 4224 * @adev: amdgpu_device pointer 4225 * 4226 * The list of all the hardware IPs that make up the asic is walked and 4227 * the check_soft_reset callbacks are run. check_soft_reset determines 4228 * if the asic is still hung or not. 4229 * Returns true if any of the IPs are still in a hung state, false if not. 4230 */ 4231 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4232 { 4233 int i; 4234 bool asic_hang = false; 4235 4236 if (amdgpu_sriov_vf(adev)) 4237 return true; 4238 4239 if (amdgpu_asic_need_full_reset(adev)) 4240 return true; 4241 4242 for (i = 0; i < adev->num_ip_blocks; i++) { 4243 if (!adev->ip_blocks[i].status.valid) 4244 continue; 4245 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4246 adev->ip_blocks[i].status.hang = 4247 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4248 if (adev->ip_blocks[i].status.hang) { 4249 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4250 asic_hang = true; 4251 } 4252 } 4253 return asic_hang; 4254 } 4255 4256 /** 4257 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4258 * 4259 * @adev: amdgpu_device pointer 4260 * 4261 * The list of all the hardware IPs that make up the asic is walked and the 4262 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4263 * handles any IP specific hardware or software state changes that are 4264 * necessary for a soft reset to succeed. 4265 * Returns 0 on success, negative error code on failure. 4266 */ 4267 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4268 { 4269 int i, r = 0; 4270 4271 for (i = 0; i < adev->num_ip_blocks; i++) { 4272 if (!adev->ip_blocks[i].status.valid) 4273 continue; 4274 if (adev->ip_blocks[i].status.hang && 4275 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4276 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4277 if (r) 4278 return r; 4279 } 4280 } 4281 4282 return 0; 4283 } 4284 4285 /** 4286 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4287 * 4288 * @adev: amdgpu_device pointer 4289 * 4290 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4291 * reset is necessary to recover. 4292 * Returns true if a full asic reset is required, false if not. 
4293 */ 4294 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4295 { 4296 int i; 4297 4298 if (amdgpu_asic_need_full_reset(adev)) 4299 return true; 4300 4301 for (i = 0; i < adev->num_ip_blocks; i++) { 4302 if (!adev->ip_blocks[i].status.valid) 4303 continue; 4304 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4305 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4306 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4307 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4308 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4309 if (adev->ip_blocks[i].status.hang) { 4310 dev_info(adev->dev, "Some block need full reset!\n"); 4311 return true; 4312 } 4313 } 4314 } 4315 return false; 4316 } 4317 4318 /** 4319 * amdgpu_device_ip_soft_reset - do a soft reset 4320 * 4321 * @adev: amdgpu_device pointer 4322 * 4323 * The list of all the hardware IPs that make up the asic is walked and the 4324 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4325 * IP specific hardware or software state changes that are necessary to soft 4326 * reset the IP. 4327 * Returns 0 on success, negative error code on failure. 4328 */ 4329 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4330 { 4331 int i, r = 0; 4332 4333 for (i = 0; i < adev->num_ip_blocks; i++) { 4334 if (!adev->ip_blocks[i].status.valid) 4335 continue; 4336 if (adev->ip_blocks[i].status.hang && 4337 adev->ip_blocks[i].version->funcs->soft_reset) { 4338 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4339 if (r) 4340 return r; 4341 } 4342 } 4343 4344 return 0; 4345 } 4346 4347 /** 4348 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4349 * 4350 * @adev: amdgpu_device pointer 4351 * 4352 * The list of all the hardware IPs that make up the asic is walked and the 4353 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4354 * handles any IP specific hardware or software state changes that are 4355 * necessary after the IP has been soft reset. 4356 * Returns 0 on success, negative error code on failure. 4357 */ 4358 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4359 { 4360 int i, r = 0; 4361 4362 for (i = 0; i < adev->num_ip_blocks; i++) { 4363 if (!adev->ip_blocks[i].status.valid) 4364 continue; 4365 if (adev->ip_blocks[i].status.hang && 4366 adev->ip_blocks[i].version->funcs->post_soft_reset) 4367 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4368 if (r) 4369 return r; 4370 } 4371 4372 return 0; 4373 } 4374 4375 /** 4376 * amdgpu_device_recover_vram - Recover some VRAM contents 4377 * 4378 * @adev: amdgpu_device pointer 4379 * 4380 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4381 * restore things like GPUVM page tables after a GPU reset where 4382 * the contents of VRAM might be lost. 4383 * 4384 * Returns: 4385 * 0 on success, negative error code on failure. 
4386 */ 4387 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4388 { 4389 struct dma_fence *fence = NULL, *next = NULL; 4390 struct amdgpu_bo *shadow; 4391 struct amdgpu_bo_vm *vmbo; 4392 long r = 1, tmo; 4393 4394 if (amdgpu_sriov_runtime(adev)) 4395 tmo = msecs_to_jiffies(8000); 4396 else 4397 tmo = msecs_to_jiffies(100); 4398 4399 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4400 mutex_lock(&adev->shadow_list_lock); 4401 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4402 shadow = &vmbo->bo; 4403 /* No need to recover an evicted BO */ 4404 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4405 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4406 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4407 continue; 4408 4409 r = amdgpu_bo_restore_shadow(shadow, &next); 4410 if (r) 4411 break; 4412 4413 if (fence) { 4414 tmo = dma_fence_wait_timeout(fence, false, tmo); 4415 dma_fence_put(fence); 4416 fence = next; 4417 if (tmo == 0) { 4418 r = -ETIMEDOUT; 4419 break; 4420 } else if (tmo < 0) { 4421 r = tmo; 4422 break; 4423 } 4424 } else { 4425 fence = next; 4426 } 4427 } 4428 mutex_unlock(&adev->shadow_list_lock); 4429 4430 if (fence) 4431 tmo = dma_fence_wait_timeout(fence, false, tmo); 4432 dma_fence_put(fence); 4433 4434 if (r < 0 || tmo <= 0) { 4435 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4436 return -EIO; 4437 } 4438 4439 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4440 return 0; 4441 } 4442 4443 4444 /** 4445 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4446 * 4447 * @adev: amdgpu_device pointer 4448 * @from_hypervisor: request from hypervisor 4449 * 4450 * do VF FLR and reinitialize Asic 4451 * return 0 means succeeded otherwise failed 4452 */ 4453 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4454 bool from_hypervisor) 4455 { 4456 int r; 4457 4458 if (from_hypervisor) 4459 r = amdgpu_virt_request_full_gpu(adev, true); 4460 else 4461 r = amdgpu_virt_reset_gpu(adev); 4462 if (r) 4463 return r; 4464 4465 amdgpu_amdkfd_pre_reset(adev); 4466 4467 /* Resume IP prior to SMC */ 4468 r = amdgpu_device_ip_reinit_early_sriov(adev); 4469 if (r) 4470 goto error; 4471 4472 amdgpu_virt_init_data_exchange(adev); 4473 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4474 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4475 4476 r = amdgpu_device_fw_loading(adev); 4477 if (r) 4478 return r; 4479 4480 /* now we are okay to resume SMC/CP/SDMA */ 4481 r = amdgpu_device_ip_reinit_late_sriov(adev); 4482 if (r) 4483 goto error; 4484 4485 amdgpu_irq_gpu_reset_resume_helper(adev); 4486 r = amdgpu_ib_ring_tests(adev); 4487 amdgpu_amdkfd_post_reset(adev); 4488 4489 error: 4490 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4491 amdgpu_inc_vram_lost(adev); 4492 r = amdgpu_device_recover_vram(adev); 4493 } 4494 amdgpu_virt_release_full_gpu(adev, true); 4495 4496 return r; 4497 } 4498 4499 /** 4500 * amdgpu_device_has_job_running - check if there is any job in mirror list 4501 * 4502 * @adev: amdgpu_device pointer 4503 * 4504 * check if there is any job in mirror list 4505 */ 4506 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4507 { 4508 int i; 4509 struct drm_sched_job *job; 4510 4511 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4512 struct amdgpu_ring *ring = adev->rings[i]; 4513 4514 if (!ring || !ring->sched.thread) 4515 continue; 4516 4517 spin_lock(&ring->sched.job_list_lock); 4518 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4519 struct drm_sched_job, list); 4520 spin_unlock(&ring->sched.job_list_lock); 4521 if (job) 4522 return true; 4523 } 4524 return false; 4525 } 4526 4527 /** 4528 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4529 * 4530 * @adev: amdgpu_device pointer 4531 * 4532 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4533 * a hung GPU. 4534 */ 4535 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4536 { 4537 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4538 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4539 return false; 4540 } 4541 4542 if (amdgpu_gpu_recovery == 0) 4543 goto disabled; 4544 4545 if (amdgpu_sriov_vf(adev)) 4546 return true; 4547 4548 if (amdgpu_gpu_recovery == -1) { 4549 switch (adev->asic_type) { 4550 case CHIP_BONAIRE: 4551 case CHIP_HAWAII: 4552 case CHIP_TOPAZ: 4553 case CHIP_TONGA: 4554 case CHIP_FIJI: 4555 case CHIP_POLARIS10: 4556 case CHIP_POLARIS11: 4557 case CHIP_POLARIS12: 4558 case CHIP_VEGAM: 4559 case CHIP_VEGA20: 4560 case CHIP_VEGA10: 4561 case CHIP_VEGA12: 4562 case CHIP_RAVEN: 4563 case CHIP_ARCTURUS: 4564 case CHIP_RENOIR: 4565 case CHIP_NAVI10: 4566 case CHIP_NAVI14: 4567 case CHIP_NAVI12: 4568 case CHIP_SIENNA_CICHLID: 4569 case CHIP_NAVY_FLOUNDER: 4570 case CHIP_DIMGREY_CAVEFISH: 4571 case CHIP_BEIGE_GOBY: 4572 case CHIP_VANGOGH: 4573 case CHIP_ALDEBARAN: 4574 break; 4575 default: 4576 goto disabled; 4577 } 4578 } 4579 4580 return true; 4581 4582 disabled: 4583 dev_info(adev->dev, "GPU recovery disabled.\n"); 4584 return false; 4585 } 4586 4587 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4588 { 4589 u32 i; 4590 int ret = 0; 4591 4592 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4593 4594 dev_info(adev->dev, "GPU mode1 reset\n"); 4595 4596 /* disable BM */ 4597 pci_clear_master(adev->pdev); 4598 4599 amdgpu_device_cache_pci_state(adev->pdev); 4600 4601 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4602 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4603 ret = amdgpu_dpm_mode1_reset(adev); 4604 } else { 4605 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4606 ret = psp_gpu_reset(adev); 4607 } 4608 4609 if (ret) 4610 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4611 4612 amdgpu_device_load_pci_state(adev->pdev); 4613 4614 /* wait for asic to come out of reset */ 4615 for (i = 0; i < adev->usec_timeout; i++) { 4616 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4617 4618 if (memsize != 0xffffffff) 4619 break; 4620 udelay(1); 4621 } 4622 4623 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4624 return ret; 4625 } 4626 4627 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4628 struct amdgpu_reset_context *reset_context) 4629 { 4630 int i, j, r = 0; 4631 struct amdgpu_job *job = NULL; 4632 bool need_full_reset = 4633 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4634 4635 if (reset_context->reset_req_dev == adev) 4636 job = reset_context->job; 4637 4638 if (amdgpu_sriov_vf(adev)) { 4639 /* stop the data exchange thread */ 4640 amdgpu_virt_fini_data_exchange(adev); 4641 } 4642 4643 /* block all schedulers and reset given job's ring */ 4644 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4645 struct amdgpu_ring *ring = adev->rings[i]; 4646 4647 if (!ring || !ring->sched.thread) 4648 continue; 4649 4650 /*clear job fence from fence drv to avoid force_completion 4651 *leave NULL and vm flush fence in fence drv */ 4652 for (j = 0; j <= ring->fence_drv.num_fences_mask; 
j++) { 4653 struct dma_fence *old, **ptr; 4654 4655 ptr = &ring->fence_drv.fences[j]; 4656 old = rcu_dereference_protected(*ptr, 1); 4657 if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) { 4658 RCU_INIT_POINTER(*ptr, NULL); 4659 } 4660 } 4661 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4662 amdgpu_fence_driver_force_completion(ring); 4663 } 4664 4665 if (job && job->vm) 4666 drm_sched_increase_karma(&job->base); 4667 4668 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4669 /* If reset handler not implemented, continue; otherwise return */ 4670 if (r == -ENOSYS) 4671 r = 0; 4672 else 4673 return r; 4674 4675 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4676 if (!amdgpu_sriov_vf(adev)) { 4677 4678 if (!need_full_reset) 4679 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4680 4681 if (!need_full_reset) { 4682 amdgpu_device_ip_pre_soft_reset(adev); 4683 r = amdgpu_device_ip_soft_reset(adev); 4684 amdgpu_device_ip_post_soft_reset(adev); 4685 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4686 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4687 need_full_reset = true; 4688 } 4689 } 4690 4691 if (need_full_reset) 4692 r = amdgpu_device_ip_suspend(adev); 4693 if (need_full_reset) 4694 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4695 else 4696 clear_bit(AMDGPU_NEED_FULL_RESET, 4697 &reset_context->flags); 4698 } 4699 4700 return r; 4701 } 4702 4703 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4704 struct amdgpu_reset_context *reset_context) 4705 { 4706 struct amdgpu_device *tmp_adev = NULL; 4707 bool need_full_reset, skip_hw_reset, vram_lost = false; 4708 int r = 0; 4709 4710 /* Try reset handler method first */ 4711 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4712 reset_list); 4713 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4714 /* If reset handler not implemented, continue; otherwise return */ 4715 if (r == -ENOSYS) 4716 r = 0; 4717 else 4718 return r; 4719 4720 /* Reset handler not implemented, use the default method */ 4721 need_full_reset = 4722 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4723 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4724 4725 /* 4726 * ASIC reset has to be done on all XGMI hive nodes ASAP 4727 * to allow proper links negotiation in FW (within 1 sec) 4728 */ 4729 if (!skip_hw_reset && need_full_reset) { 4730 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4731 /* For XGMI run all resets in parallel to speed up the process */ 4732 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4733 tmp_adev->gmc.xgmi.pending_reset = false; 4734 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4735 r = -EALREADY; 4736 } else 4737 r = amdgpu_asic_reset(tmp_adev); 4738 4739 if (r) { 4740 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4741 r, adev_to_drm(tmp_adev)->unique); 4742 break; 4743 } 4744 } 4745 4746 /* For XGMI wait for all resets to complete before proceed */ 4747 if (!r) { 4748 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4749 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4750 flush_work(&tmp_adev->xgmi_reset_work); 4751 r = tmp_adev->asic_reset_res; 4752 if (r) 4753 break; 4754 } 4755 } 4756 } 4757 } 4758 4759 if (!r && amdgpu_ras_intr_triggered()) { 4760 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4761 if (tmp_adev->mmhub.ras_funcs && 
			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		if (need_full_reset) {
			/* post card */
			r = amdgpu_device_asic_init(tmp_adev);
			if (r) {
				dev_warn(tmp_adev->dev, "asic atom init failed!");
			} else {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
				if (r)
					goto out;

				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back as tracked now that the
				 * reset has completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				if (!reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					amdgpu_xgmi_add_device(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				amdgpu_fbdev_set_suspend(tmp_adev, 0);

				/*
				 * The GPU enters a bad state once the number
				 * of faulty pages reported by ECC reaches the
				 * threshold, and RAS recovery is scheduled
				 * next.  Check here and abort the recovery if
				 * the bad page threshold has indeed been
				 * exceeded, and remind the user to retire this
				 * GPU or to set a bigger bad_page_threshold so
				 * the driver can be probed again.
				 */
				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
					/* must succeed.
					 */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
						reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	if (need_full_reset)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
	}

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}

/*
 * Lock a list of amdgpu devices in a hive safely.  For a device that is not
 * part of a multi-node hive this behaves like amdgpu_device_lock_adev.
 *
 * Unlocking does not require a rollback.
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * If the lock iteration broke off in the middle of a hive,
		 * there may be a race, or a hive device may have locked up on
		 * its own.  We may or may not be in trouble, so roll back the
		 * locks we already took and print a warning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); 4942 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4943 amdgpu_device_unlock_adev(tmp_adev); 4944 } 4945 } 4946 return -EAGAIN; 4947 } 4948 4949 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4950 { 4951 STUB(); 4952 #ifdef notyet 4953 struct pci_dev *p = NULL; 4954 4955 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4956 adev->pdev->bus->number, 1); 4957 if (p) { 4958 pm_runtime_enable(&(p->dev)); 4959 pm_runtime_resume(&(p->dev)); 4960 } 4961 #endif 4962 } 4963 4964 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4965 { 4966 enum amd_reset_method reset_method; 4967 struct pci_dev *p = NULL; 4968 u64 expires; 4969 4970 /* 4971 * For now, only BACO and mode1 reset are confirmed 4972 * to suffer the audio issue without proper suspended. 4973 */ 4974 reset_method = amdgpu_asic_reset_method(adev); 4975 if ((reset_method != AMD_RESET_METHOD_BACO) && 4976 (reset_method != AMD_RESET_METHOD_MODE1)) 4977 return -EINVAL; 4978 4979 STUB(); 4980 return -ENOSYS; 4981 #ifdef notyet 4982 4983 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4984 adev->pdev->bus->number, 1); 4985 if (!p) 4986 return -ENODEV; 4987 4988 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4989 if (!expires) 4990 /* 4991 * If we cannot get the audio device autosuspend delay, 4992 * a fixed 4S interval will be used. Considering 3S is 4993 * the audio controller default autosuspend delay setting. 4994 * 4S used here is guaranteed to cover that. 4995 */ 4996 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4997 4998 while (!pm_runtime_status_suspended(&(p->dev))) { 4999 if (!pm_runtime_suspend(&(p->dev))) 5000 break; 5001 5002 if (expires < ktime_get_mono_fast_ns()) { 5003 dev_warn(adev->dev, "failed to suspend display audio\n"); 5004 /* TODO: abort the succeeding gpu reset? */ 5005 return -ETIMEDOUT; 5006 } 5007 } 5008 5009 pm_runtime_disable(&(p->dev)); 5010 5011 return 0; 5012 #endif 5013 } 5014 5015 static void amdgpu_device_recheck_guilty_jobs( 5016 struct amdgpu_device *adev, struct list_head *device_list_handle, 5017 struct amdgpu_reset_context *reset_context) 5018 { 5019 int i, r = 0; 5020 5021 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5022 struct amdgpu_ring *ring = adev->rings[i]; 5023 int ret = 0; 5024 struct drm_sched_job *s_job; 5025 5026 if (!ring || !ring->sched.thread) 5027 continue; 5028 5029 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5030 struct drm_sched_job, list); 5031 if (s_job == NULL) 5032 continue; 5033 5034 /* clear job's guilty and depend the folowing step to decide the real one */ 5035 drm_sched_reset_karma(s_job); 5036 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5037 5038 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5039 if (ret == 0) { /* timeout */ 5040 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5041 ring->sched.name, s_job->id); 5042 5043 /* set guilty */ 5044 drm_sched_increase_karma(s_job); 5045 retry: 5046 /* do hw reset */ 5047 if (amdgpu_sriov_vf(adev)) { 5048 amdgpu_virt_fini_data_exchange(adev); 5049 r = amdgpu_device_reset_sriov(adev, false); 5050 if (r) 5051 adev->asic_reset_res = r; 5052 } else { 5053 clear_bit(AMDGPU_SKIP_HW_RESET, 5054 &reset_context->flags); 5055 r = amdgpu_do_asic_reset(device_list_handle, 5056 reset_context); 5057 if (r && r == -EAGAIN) 5058 goto retry; 5059 } 5060 5061 /* 5062 * add reset counter so that the following 5063 * resubmitted job could flush vmid 5064 */ 5065 atomic_inc(&adev->gpu_reset_counter); 5066 continue; 5067 } 5068 5069 /* got the hw fence, signal finished fence */ 5070 atomic_dec(ring->sched.score); 5071 dma_fence_get(&s_job->s_fence->finished); 5072 dma_fence_signal(&s_job->s_fence->finished); 5073 dma_fence_put(&s_job->s_fence->finished); 5074 5075 /* remove node from list and free the job */ 5076 spin_lock(&ring->sched.job_list_lock); 5077 list_del_init(&s_job->list); 5078 spin_unlock(&ring->sched.job_list_lock); 5079 ring->sched.ops->free_job(s_job); 5080 } 5081 } 5082 5083 /** 5084 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5085 * 5086 * @adev: amdgpu_device pointer 5087 * @job: which job trigger hang 5088 * 5089 * Attempt to reset the GPU if it has hung (all asics). 5090 * Attempt to do soft-reset or full-reset and reinitialize Asic 5091 * Returns 0 for success or an error on failure. 5092 */ 5093 5094 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5095 struct amdgpu_job *job) 5096 { 5097 struct list_head device_list, *device_list_handle = NULL; 5098 bool job_signaled = false; 5099 struct amdgpu_hive_info *hive = NULL; 5100 struct amdgpu_device *tmp_adev = NULL; 5101 int i, r = 0; 5102 bool need_emergency_restart = false; 5103 bool audio_suspended = false; 5104 int tmp_vram_lost_counter; 5105 struct amdgpu_reset_context reset_context; 5106 5107 memset(&reset_context, 0, sizeof(reset_context)); 5108 5109 /* 5110 * Special case: RAS triggered and full reset isn't supported 5111 */ 5112 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5113 5114 /* 5115 * Flush RAM to disk so that after reboot 5116 * the user can read log and see why the system rebooted. 5117 */ 5118 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5119 DRM_WARN("Emergency reboot."); 5120 5121 #ifdef notyet 5122 ksys_sync_helper(); 5123 emergency_restart(); 5124 #else 5125 panic("emergency_restart"); 5126 #endif 5127 } 5128 5129 dev_info(adev->dev, "GPU %s begin!\n", 5130 need_emergency_restart ? "jobs stop":"reset"); 5131 5132 /* 5133 * Here we trylock to avoid chain of resets executing from 5134 * either trigger by jobs on different adevs in XGMI hive or jobs on 5135 * different schedulers for same device while this TO handler is running. 5136 * We always reset all schedulers for device and all devices for XGMI 5137 * hive so that should take care of them too. 5138 */ 5139 hive = amdgpu_get_xgmi_hive(adev); 5140 if (hive) { 5141 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5142 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5143 job ? 
				 job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			if (job && job->vm)
				drm_sched_increase_karma(&job->base);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.job = job;
	reset_context.hive = hive;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/*
	 * Lock the device before we try to operate on the linked list.  If we
	 * didn't get the device lock, don't touch the linked list since others
	 * may be iterating over it.
	 */
	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
			 job ? job->base.id : -1);

		/* Even though we skipped this reset, we still need to mark the job as guilty */
		if (job && job->vm)
			drm_sched_increase_karma(&job->base);
		goto skip_recovery;
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Try to put the audio codec into the suspend state before the
		 * GPU reset starts.
		 *
		 * The power domain of the graphics device is shared with the
		 * AZ (audio) power domain, so without this we could touch the
		 * audio hardware behind the audio driver's back and trigger
		 * audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark the ASICs to be reset as untracked first, and add them
		 * back after the reset has completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
5246 * 5247 * job->base holds a reference to parent fence 5248 */ 5249 if (job && job->base.s_fence->parent && 5250 dma_fence_is_signaled(job->base.s_fence->parent)) { 5251 job_signaled = true; 5252 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5253 goto skip_hw_reset; 5254 } 5255 5256 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5257 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5258 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5259 /*TODO Should we stop ?*/ 5260 if (r) { 5261 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5262 r, adev_to_drm(tmp_adev)->unique); 5263 tmp_adev->asic_reset_res = r; 5264 } 5265 } 5266 5267 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5268 /* Actual ASIC resets if needed.*/ 5269 /* TODO Implement XGMI hive reset logic for SRIOV */ 5270 if (amdgpu_sriov_vf(adev)) { 5271 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5272 if (r) 5273 adev->asic_reset_res = r; 5274 } else { 5275 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5276 if (r && r == -EAGAIN) 5277 goto retry; 5278 } 5279 5280 skip_hw_reset: 5281 5282 /* Post ASIC reset for all devs .*/ 5283 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5284 5285 /* 5286 * Sometimes a later bad compute job can block a good gfx job as gfx 5287 * and compute ring share internal GC HW mutually. We add an additional 5288 * guilty jobs recheck step to find the real guilty job, it synchronously 5289 * submits and pends for the first job being signaled. If it gets timeout, 5290 * we identify it as a real guilty job. 5291 */ 5292 if (amdgpu_gpu_recovery == 2 && 5293 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5294 amdgpu_device_recheck_guilty_jobs( 5295 tmp_adev, device_list_handle, &reset_context); 5296 5297 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5298 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5299 5300 if (!ring || !ring->sched.thread) 5301 continue; 5302 5303 /* No point to resubmit jobs if we didn't HW reset*/ 5304 if (!tmp_adev->asic_reset_res && !job_signaled) 5305 drm_sched_resubmit_jobs(&ring->sched); 5306 5307 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5308 } 5309 5310 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5311 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5312 } 5313 5314 tmp_adev->asic_reset_res = 0; 5315 5316 if (r) { 5317 /* bad news, how to tell it to userspace ? 
			 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |=
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5414 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5415 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5416 else if (speed_cap == PCIE_SPEED_5_0GT) 5417 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5418 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5419 else 5420 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5421 } 5422 /* platform caps */ 5423 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5424 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5425 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5426 } else { 5427 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5428 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5429 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5430 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5431 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5432 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5433 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5434 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5435 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5436 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5437 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5438 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5439 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5440 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5441 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5442 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5443 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5444 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5445 else 5446 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5447 5448 } 5449 } 5450 if (adev->pm.pcie_mlw_mask == 0) { 5451 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5452 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5453 } else { 5454 switch (platform_link_width) { 5455 case PCIE_LNK_X32: 5456 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5458 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5459 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5460 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5461 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5462 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5463 break; 5464 case PCIE_LNK_X16: 5465 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5466 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5467 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5468 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5469 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5470 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5471 break; 5472 case PCIE_LNK_X12: 5473 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5474 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5475 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5476 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5477 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5478 break; 5479 case PCIE_LNK_X8: 5480 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5481 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5482 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5483 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5484 break; 5485 case PCIE_LNK_X4: 5486 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5487 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5488 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5489 break; 5490 case PCIE_LNK_X2: 5491 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5492 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5493 break; 5494 case PCIE_LNK_X1: 5495 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5496 break; 5497 default: 5498 break; 5499 } 5500 } 5501 } 5502 } 5503 5504 int amdgpu_device_baco_enter(struct drm_device *dev) 5505 { 5506 struct amdgpu_device *adev = drm_to_adev(dev); 5507 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5508 
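	/*
	 * BACO (Bus Active, Chip Off) powers the GPU down while keeping the
	 * PCIe bus interface active.  When RAS is enabled, doorbell
	 * interrupts are disabled here across the BACO sequence and turned
	 * back on in amdgpu_device_baco_exit().
	 */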
5509 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5510 return -ENOTSUPP; 5511 5512 if (ras && adev->ras_enabled && 5513 adev->nbio.funcs->enable_doorbell_interrupt) 5514 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5515 5516 return amdgpu_dpm_baco_enter(adev); 5517 } 5518 5519 int amdgpu_device_baco_exit(struct drm_device *dev) 5520 { 5521 struct amdgpu_device *adev = drm_to_adev(dev); 5522 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5523 int ret = 0; 5524 5525 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5526 return -ENOTSUPP; 5527 5528 ret = amdgpu_dpm_baco_exit(adev); 5529 if (ret) 5530 return ret; 5531 5532 if (ras && adev->ras_enabled && 5533 adev->nbio.funcs->enable_doorbell_interrupt) 5534 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5535 5536 if (amdgpu_passthrough(adev) && 5537 adev->nbio.funcs->clear_doorbell_interrupt) 5538 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5539 5540 return 0; 5541 } 5542 5543 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5544 { 5545 int i; 5546 5547 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5548 struct amdgpu_ring *ring = adev->rings[i]; 5549 5550 if (!ring || !ring->sched.thread) 5551 continue; 5552 5553 cancel_delayed_work_sync(&ring->sched.work_tdr); 5554 } 5555 } 5556 5557 /** 5558 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5559 * @pdev: PCI device struct 5560 * @state: PCI channel state 5561 * 5562 * Description: Called when a PCI error is detected. 5563 * 5564 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5565 */ 5566 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5567 { 5568 STUB(); 5569 return 0; 5570 #ifdef notyet 5571 struct drm_device *dev = pci_get_drvdata(pdev); 5572 struct amdgpu_device *adev = drm_to_adev(dev); 5573 int i; 5574 5575 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5576 5577 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5578 DRM_WARN("No support for XGMI hive yet..."); 5579 return PCI_ERS_RESULT_DISCONNECT; 5580 } 5581 5582 adev->pci_channel_state = state; 5583 5584 switch (state) { 5585 case pci_channel_io_normal: 5586 return PCI_ERS_RESULT_CAN_RECOVER; 5587 /* Fatal error, prepare for slot reset */ 5588 case pci_channel_io_frozen: 5589 /* 5590 * Cancel and wait for all TDRs in progress if failing to 5591 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5592 * 5593 * Locking adev->reset_sem will prevent any external access 5594 * to GPU during PCI error recovery 5595 */ 5596 while (!amdgpu_device_lock_adev(adev, NULL)) 5597 amdgpu_cancel_all_tdr(adev); 5598 5599 /* 5600 * Block any work scheduling as we do for regular GPU reset 5601 * for the duration of the recovery 5602 */ 5603 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5604 struct amdgpu_ring *ring = adev->rings[i]; 5605 5606 if (!ring || !ring->sched.thread) 5607 continue; 5608 5609 drm_sched_stop(&ring->sched, NULL); 5610 } 5611 atomic_inc(&adev->gpu_reset_counter); 5612 return PCI_ERS_RESULT_NEED_RESET; 5613 case pci_channel_io_perm_failure: 5614 /* Permanent error, prepare for device removal */ 5615 return PCI_ERS_RESULT_DISCONNECT; 5616 } 5617 5618 return PCI_ERS_RESULT_NEED_RESET; 5619 #endif 5620 } 5621 5622 /** 5623 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5624 * @pdev: pointer to PCI device 5625 */ 5626 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5627 { 5628 5629 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5630 5631 /* TODO - dump 
whatever for debugging purposes */ 5632 5633 /* This called only if amdgpu_pci_error_detected returns 5634 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5635 * works, no need to reset slot. 5636 */ 5637 5638 return PCI_ERS_RESULT_RECOVERED; 5639 } 5640 5641 /** 5642 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5643 * @pdev: PCI device struct 5644 * 5645 * Description: This routine is called by the pci error recovery 5646 * code after the PCI slot has been reset, just before we 5647 * should resume normal operations. 5648 */ 5649 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5650 { 5651 STUB(); 5652 return PCI_ERS_RESULT_RECOVERED; 5653 #ifdef notyet 5654 struct drm_device *dev = pci_get_drvdata(pdev); 5655 struct amdgpu_device *adev = drm_to_adev(dev); 5656 int r, i; 5657 struct amdgpu_reset_context reset_context; 5658 u32 memsize; 5659 struct list_head device_list; 5660 5661 DRM_INFO("PCI error: slot reset callback!!\n"); 5662 5663 memset(&reset_context, 0, sizeof(reset_context)); 5664 5665 INIT_LIST_HEAD(&device_list); 5666 list_add_tail(&adev->reset_list, &device_list); 5667 5668 /* wait for asic to come out of reset */ 5669 drm_msleep(500); 5670 5671 /* Restore PCI confspace */ 5672 amdgpu_device_load_pci_state(pdev); 5673 5674 /* confirm ASIC came out of reset */ 5675 for (i = 0; i < adev->usec_timeout; i++) { 5676 memsize = amdgpu_asic_get_config_memsize(adev); 5677 5678 if (memsize != 0xffffffff) 5679 break; 5680 udelay(1); 5681 } 5682 if (memsize == 0xffffffff) { 5683 r = -ETIME; 5684 goto out; 5685 } 5686 5687 reset_context.method = AMD_RESET_METHOD_NONE; 5688 reset_context.reset_req_dev = adev; 5689 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5690 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5691 5692 adev->no_hw_access = true; 5693 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5694 adev->no_hw_access = false; 5695 if (r) 5696 goto out; 5697 5698 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5699 5700 out: 5701 if (!r) { 5702 if (amdgpu_device_cache_pci_state(adev->pdev)) 5703 pci_restore_state(adev->pdev); 5704 5705 DRM_INFO("PCIe error recovery succeeded\n"); 5706 } else { 5707 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5708 amdgpu_device_unlock_adev(adev); 5709 } 5710 5711 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5712 #endif 5713 } 5714 5715 /** 5716 * amdgpu_pci_resume() - resume normal ops after PCI reset 5717 * @pdev: pointer to PCI device 5718 * 5719 * Called when the error recovery driver tells us that its 5720 * OK to resume normal operation. 
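 * Only the pci_channel_io_frozen case is handled here: the ring schedulers
 * that were stopped in amdgpu_pci_error_detected() are restarted and the
 * device lock taken there is released again.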
5721 */ 5722 void amdgpu_pci_resume(struct pci_dev *pdev) 5723 { 5724 STUB(); 5725 #ifdef notyet 5726 struct drm_device *dev = pci_get_drvdata(pdev); 5727 struct amdgpu_device *adev = drm_to_adev(dev); 5728 int i; 5729 5730 5731 DRM_INFO("PCI error: resume callback!!\n"); 5732 5733 /* Only continue execution for the case of pci_channel_io_frozen */ 5734 if (adev->pci_channel_state != pci_channel_io_frozen) 5735 return; 5736 5737 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5738 struct amdgpu_ring *ring = adev->rings[i]; 5739 5740 if (!ring || !ring->sched.thread) 5741 continue; 5742 5743 5744 drm_sched_resubmit_jobs(&ring->sched); 5745 drm_sched_start(&ring->sched, true); 5746 } 5747 5748 amdgpu_device_unlock_adev(adev); 5749 #endif 5750 } 5751 5752 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5753 { 5754 return false; 5755 #ifdef notyet 5756 struct drm_device *dev = pci_get_drvdata(pdev); 5757 struct amdgpu_device *adev = drm_to_adev(dev); 5758 int r; 5759 5760 r = pci_save_state(pdev); 5761 if (!r) { 5762 kfree(adev->pci_state); 5763 5764 adev->pci_state = pci_store_saved_state(pdev); 5765 5766 if (!adev->pci_state) { 5767 DRM_ERROR("Failed to store PCI saved state"); 5768 return false; 5769 } 5770 } else { 5771 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5772 return false; 5773 } 5774 5775 return true; 5776 #endif 5777 } 5778 5779 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5780 { 5781 STUB(); 5782 return false; 5783 #ifdef notyet 5784 struct drm_device *dev = pci_get_drvdata(pdev); 5785 struct amdgpu_device *adev = drm_to_adev(dev); 5786 int r; 5787 5788 if (!adev->pci_state) 5789 return false; 5790 5791 r = pci_load_saved_state(pdev, adev->pci_state); 5792 5793 if (!r) { 5794 pci_restore_state(pdev); 5795 } else { 5796 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5797 return false; 5798 } 5799 5800 return true; 5801 #endif 5802 } 5803 5804 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5805 struct amdgpu_ring *ring) 5806 { 5807 #ifdef CONFIG_X86_64 5808 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5809 return; 5810 #endif 5811 if (adev->gmc.xgmi.connected_to_cpu) 5812 return; 5813 5814 if (ring && ring->funcs->emit_hdp_flush) 5815 amdgpu_ring_emit_hdp_flush(ring); 5816 else 5817 amdgpu_asic_flush_hdp(adev, ring); 5818 } 5819 5820 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5821 struct amdgpu_ring *ring) 5822 { 5823 #ifdef CONFIG_X86_64 5824 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5825 return; 5826 #endif 5827 if (adev->gmc.xgmi.connected_to_cpu) 5828 return; 5829 5830 amdgpu_asic_invalidate_hdp(adev, ring); 5831 } 5832