1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/pci.h> 34 35 #include <drm/drm_atomic_helper.h> 36 #include <drm/drm_probe_helper.h> 37 #include <drm/amdgpu_drm.h> 38 #include <linux/vgaarb.h> 39 #include <linux/vga_switcheroo.h> 40 #include <linux/efi.h> 41 #include "amdgpu.h" 42 #include "amdgpu_trace.h" 43 #include "amdgpu_i2c.h" 44 #include "atom.h" 45 #include "amdgpu_atombios.h" 46 #include "amdgpu_atomfirmware.h" 47 #include "amd_pcie.h" 48 #ifdef CONFIG_DRM_AMDGPU_SI 49 #include "si.h" 50 #endif 51 #ifdef CONFIG_DRM_AMDGPU_CIK 52 #include "cik.h" 53 #endif 54 #include "vi.h" 55 #include "soc15.h" 56 #include "nv.h" 57 #include "bif/bif_4_1_d.h" 58 #include <linux/pci.h> 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 89 90 #define AMDGPU_RESUME_MS 2000 91 92 const char *amdgpu_asic_name[] = { 93 "TAHITI", 94 "PITCAIRN", 95 "VERDE", 96 "OLAND", 97 "HAINAN", 98 "BONAIRE", 99 "KAVERI", 100 "KABINI", 101 "HAWAII", 102 "MULLINS", 103 "TOPAZ", 104 "TONGA", 105 "FIJI", 106 "CARRIZO", 107 "STONEY", 108 "POLARIS10", 109 "POLARIS11", 110 "POLARIS12", 111 "VEGAM", 112 "VEGA10", 113 "VEGA12", 114 "VEGA20", 115 "RAVEN", 116 "ARCTURUS", 117 "RENOIR", 118 
"ALDEBARAN", 119 "NAVI10", 120 "CYAN_SKILLFISH", 121 "NAVI14", 122 "NAVI12", 123 "SIENNA_CICHLID", 124 "NAVY_FLOUNDER", 125 "VANGOGH", 126 "DIMGREY_CAVEFISH", 127 "BEIGE_GOBY", 128 "YELLOW_CARP", 129 "LAST", 130 }; 131 132 /** 133 * DOC: pcie_replay_count 134 * 135 * The amdgpu driver provides a sysfs API for reporting the total number 136 * of PCIe replays (NAKs) 137 * The file pcie_replay_count is used for this and returns the total 138 * number of replays as a sum of the NAKs generated and NAKs received 139 */ 140 141 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 struct drm_device *ddev = dev_get_drvdata(dev); 145 struct amdgpu_device *adev = drm_to_adev(ddev); 146 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 147 148 return sysfs_emit(buf, "%llu\n", cnt); 149 } 150 151 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 152 amdgpu_device_get_pcie_replay_count, NULL); 153 154 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 155 156 /** 157 * DOC: product_name 158 * 159 * The amdgpu driver provides a sysfs API for reporting the product name 160 * for the device 161 * The file serial_number is used for this and returns the product name 162 * as returned from the FRU. 163 * NOTE: This is only available for certain server cards 164 */ 165 166 static ssize_t amdgpu_device_get_product_name(struct device *dev, 167 struct device_attribute *attr, char *buf) 168 { 169 struct drm_device *ddev = dev_get_drvdata(dev); 170 struct amdgpu_device *adev = drm_to_adev(ddev); 171 172 return sysfs_emit(buf, "%s\n", adev->product_name); 173 } 174 175 static DEVICE_ATTR(product_name, S_IRUGO, 176 amdgpu_device_get_product_name, NULL); 177 178 /** 179 * DOC: product_number 180 * 181 * The amdgpu driver provides a sysfs API for reporting the part number 182 * for the device 183 * The file serial_number is used for this and returns the part number 184 * as returned from the FRU. 185 * NOTE: This is only available for certain server cards 186 */ 187 188 static ssize_t amdgpu_device_get_product_number(struct device *dev, 189 struct device_attribute *attr, char *buf) 190 { 191 struct drm_device *ddev = dev_get_drvdata(dev); 192 struct amdgpu_device *adev = drm_to_adev(ddev); 193 194 return sysfs_emit(buf, "%s\n", adev->product_number); 195 } 196 197 static DEVICE_ATTR(product_number, S_IRUGO, 198 amdgpu_device_get_product_number, NULL); 199 200 /** 201 * DOC: serial_number 202 * 203 * The amdgpu driver provides a sysfs API for reporting the serial number 204 * for the device 205 * The file serial_number is used for this and returns the serial number 206 * as returned from the FRU. 207 * NOTE: This is only available for certain server cards 208 */ 209 210 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 211 struct device_attribute *attr, char *buf) 212 { 213 struct drm_device *ddev = dev_get_drvdata(dev); 214 struct amdgpu_device *adev = drm_to_adev(ddev); 215 216 return sysfs_emit(buf, "%s\n", adev->serial); 217 } 218 219 static DEVICE_ATTR(serial_number, S_IRUGO, 220 amdgpu_device_get_serial_number, NULL); 221 222 /** 223 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 224 * 225 * @dev: drm_device pointer 226 * 227 * Returns true if the device is a dGPU with ATPX power control, 228 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access VRAM through MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access VRAM through the VRAM aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
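 *
 * An illustrative read-modify-write through these helpers (a sketch only;
 * "reg", "SOME_FIELD_MASK" and "new_field_value" are placeholders, not
 * symbols defined in this file):
 *
 *   u32 val = amdgpu_device_rreg(adev, reg, 0);
 *
 *   val &= ~SOME_FIELD_MASK;
 *   val |= new_field_value;
 *   amdgpu_device_wreg(adev, reg, val, 0);
 *
 * Most callers go through the RREG32()/WREG32() macros, which expand to
 * these functions.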
529 */ 530 void amdgpu_device_wreg(struct amdgpu_device *adev, 531 uint32_t reg, uint32_t v, 532 uint32_t acc_flags) 533 { 534 if (amdgpu_device_skip_hw_access(adev)) 535 return; 536 537 if ((reg * 4) < adev->rmmio_size) { 538 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 539 amdgpu_sriov_runtime(adev) && 540 down_read_trylock(&adev->reset_sem)) { 541 amdgpu_kiq_wreg(adev, reg, v); 542 up_read(&adev->reset_sem); 543 } else { 544 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 545 } 546 } else { 547 adev->pcie_wreg(adev, reg * 4, v); 548 } 549 550 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 551 } 552 553 /* 554 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 555 * 556 * this function is invoked only the debugfs register access 557 * */ 558 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 559 uint32_t reg, uint32_t v) 560 { 561 if (amdgpu_device_skip_hw_access(adev)) 562 return; 563 564 if (amdgpu_sriov_fullaccess(adev) && 565 adev->gfx.rlc.funcs && 566 adev->gfx.rlc.funcs->is_rlcg_access_range) { 567 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 568 return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0); 569 } else { 570 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 571 } 572 } 573 574 /** 575 * amdgpu_mm_rdoorbell - read a doorbell dword 576 * 577 * @adev: amdgpu_device pointer 578 * @index: doorbell index 579 * 580 * Returns the value in the doorbell aperture at the 581 * requested doorbell index (CIK). 582 */ 583 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 584 { 585 if (amdgpu_device_skip_hw_access(adev)) 586 return 0; 587 588 if (index < adev->doorbell.num_doorbells) { 589 return readl(adev->doorbell.ptr + index); 590 } else { 591 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 592 return 0; 593 } 594 } 595 596 /** 597 * amdgpu_mm_wdoorbell - write a doorbell dword 598 * 599 * @adev: amdgpu_device pointer 600 * @index: doorbell index 601 * @v: value to write 602 * 603 * Writes @v to the doorbell aperture at the 604 * requested doorbell index (CIK). 605 */ 606 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 607 { 608 if (amdgpu_device_skip_hw_access(adev)) 609 return; 610 611 if (index < adev->doorbell.num_doorbells) { 612 writel(v, adev->doorbell.ptr + index); 613 } else { 614 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 615 } 616 } 617 618 /** 619 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 620 * 621 * @adev: amdgpu_device pointer 622 * @index: doorbell index 623 * 624 * Returns the value in the doorbell aperture at the 625 * requested doorbell index (VEGA10+). 626 */ 627 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 628 { 629 if (amdgpu_device_skip_hw_access(adev)) 630 return 0; 631 632 if (index < adev->doorbell.num_doorbells) { 633 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 634 } else { 635 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 636 return 0; 637 } 638 } 639 640 /** 641 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 642 * 643 * @adev: amdgpu_device pointer 644 * @index: doorbell index 645 * @v: value to write 646 * 647 * Writes @v to the doorbell aperture at the 648 * requested doorbell index (VEGA10+). 
649 */ 650 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 651 { 652 if (amdgpu_device_skip_hw_access(adev)) 653 return; 654 655 if (index < adev->doorbell.num_doorbells) { 656 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 657 } else { 658 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 659 } 660 } 661 662 /** 663 * amdgpu_device_indirect_rreg - read an indirect register 664 * 665 * @adev: amdgpu_device pointer 666 * @pcie_index: mmio register offset 667 * @pcie_data: mmio register offset 668 * @reg_addr: indirect register address to read from 669 * 670 * Returns the value of indirect register @reg_addr 671 */ 672 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 673 u32 pcie_index, u32 pcie_data, 674 u32 reg_addr) 675 { 676 unsigned long flags; 677 u32 r; 678 void __iomem *pcie_index_offset; 679 void __iomem *pcie_data_offset; 680 681 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 682 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 683 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 684 685 writel(reg_addr, pcie_index_offset); 686 readl(pcie_index_offset); 687 r = readl(pcie_data_offset); 688 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 689 690 return r; 691 } 692 693 /** 694 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 695 * 696 * @adev: amdgpu_device pointer 697 * @pcie_index: mmio register offset 698 * @pcie_data: mmio register offset 699 * @reg_addr: indirect register address to read from 700 * 701 * Returns the value of indirect register @reg_addr 702 */ 703 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 704 u32 pcie_index, u32 pcie_data, 705 u32 reg_addr) 706 { 707 unsigned long flags; 708 u64 r; 709 void __iomem *pcie_index_offset; 710 void __iomem *pcie_data_offset; 711 712 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 713 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 714 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 715 716 /* read low 32 bits */ 717 writel(reg_addr, pcie_index_offset); 718 readl(pcie_index_offset); 719 r = readl(pcie_data_offset); 720 /* read high 32 bits */ 721 writel(reg_addr + 4, pcie_index_offset); 722 readl(pcie_index_offset); 723 r |= ((u64)readl(pcie_data_offset) << 32); 724 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 725 726 return r; 727 } 728 729 /** 730 * amdgpu_device_indirect_wreg - write an indirect register address 731 * 732 * @adev: amdgpu_device pointer 733 * @pcie_index: mmio register offset 734 * @pcie_data: mmio register offset 735 * @reg_addr: indirect register offset 736 * @reg_data: indirect register data 737 * 738 */ 739 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 740 u32 pcie_index, u32 pcie_data, 741 u32 reg_addr, u32 reg_data) 742 { 743 unsigned long flags; 744 void __iomem *pcie_index_offset; 745 void __iomem *pcie_data_offset; 746 747 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 748 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 749 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 750 751 writel(reg_addr, pcie_index_offset); 752 readl(pcie_index_offset); 753 writel(reg_data, pcie_data_offset); 754 readl(pcie_data_offset); 755 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 756 } 757 758 /** 759 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 760 * 761 * @adev: amdgpu_device pointer 762 * @pcie_index: mmio register offset 763 * @pcie_data: mmio register 
offset 764 * @reg_addr: indirect register offset 765 * @reg_data: indirect register data 766 * 767 */ 768 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 769 u32 pcie_index, u32 pcie_data, 770 u32 reg_addr, u64 reg_data) 771 { 772 unsigned long flags; 773 void __iomem *pcie_index_offset; 774 void __iomem *pcie_data_offset; 775 776 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 777 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 778 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 779 780 /* write low 32 bits */ 781 writel(reg_addr, pcie_index_offset); 782 readl(pcie_index_offset); 783 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 784 readl(pcie_data_offset); 785 /* write high 32 bits */ 786 writel(reg_addr + 4, pcie_index_offset); 787 readl(pcie_index_offset); 788 writel((u32)(reg_data >> 32), pcie_data_offset); 789 readl(pcie_data_offset); 790 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 791 } 792 793 /** 794 * amdgpu_invalid_rreg - dummy reg read function 795 * 796 * @adev: amdgpu_device pointer 797 * @reg: offset of register 798 * 799 * Dummy register read function. Used for register blocks 800 * that certain asics don't have (all asics). 801 * Returns the value in the register. 802 */ 803 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 804 { 805 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 806 BUG(); 807 return 0; 808 } 809 810 /** 811 * amdgpu_invalid_wreg - dummy reg write function 812 * 813 * @adev: amdgpu_device pointer 814 * @reg: offset of register 815 * @v: value to write to the register 816 * 817 * Dummy register read function. Used for register blocks 818 * that certain asics don't have (all asics). 819 */ 820 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 821 { 822 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 823 reg, v); 824 BUG(); 825 } 826 827 /** 828 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 829 * 830 * @adev: amdgpu_device pointer 831 * @reg: offset of register 832 * 833 * Dummy register read function. Used for register blocks 834 * that certain asics don't have (all asics). 835 * Returns the value in the register. 836 */ 837 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 838 { 839 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 840 BUG(); 841 return 0; 842 } 843 844 /** 845 * amdgpu_invalid_wreg64 - dummy reg write function 846 * 847 * @adev: amdgpu_device pointer 848 * @reg: offset of register 849 * @v: value to write to the register 850 * 851 * Dummy register read function. Used for register blocks 852 * that certain asics don't have (all asics). 853 */ 854 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 855 { 856 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 857 reg, v); 858 BUG(); 859 } 860 861 /** 862 * amdgpu_block_invalid_rreg - dummy reg read function 863 * 864 * @adev: amdgpu_device pointer 865 * @block: offset of instance 866 * @reg: offset of register 867 * 868 * Dummy register read function. Used for register blocks 869 * that certain asics don't have (all asics). 870 * Returns the value in the register. 
871 */ 872 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 873 uint32_t block, uint32_t reg) 874 { 875 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 876 reg, block); 877 BUG(); 878 return 0; 879 } 880 881 /** 882 * amdgpu_block_invalid_wreg - dummy reg write function 883 * 884 * @adev: amdgpu_device pointer 885 * @block: offset of instance 886 * @reg: offset of register 887 * @v: value to write to the register 888 * 889 * Dummy register read function. Used for register blocks 890 * that certain asics don't have (all asics). 891 */ 892 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 893 uint32_t block, 894 uint32_t reg, uint32_t v) 895 { 896 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 897 reg, block, v); 898 BUG(); 899 } 900 901 /** 902 * amdgpu_device_asic_init - Wrapper for atom asic_init 903 * 904 * @adev: amdgpu_device pointer 905 * 906 * Does any asic specific work and then calls atom asic init. 907 */ 908 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 909 { 910 amdgpu_asic_pre_asic_init(adev); 911 912 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 913 } 914 915 /** 916 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 917 * 918 * @adev: amdgpu_device pointer 919 * 920 * Allocates a scratch page of VRAM for use by various things in the 921 * driver. 922 */ 923 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 924 { 925 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 926 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 927 &adev->vram_scratch.robj, 928 &adev->vram_scratch.gpu_addr, 929 (void **)&adev->vram_scratch.ptr); 930 } 931 932 /** 933 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 934 * 935 * @adev: amdgpu_device pointer 936 * 937 * Frees the VRAM scratch page. 938 */ 939 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 940 { 941 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 942 } 943 944 /** 945 * amdgpu_device_program_register_sequence - program an array of registers. 946 * 947 * @adev: amdgpu_device pointer 948 * @registers: pointer to the register array 949 * @array_size: size of the register array 950 * 951 * Programs an array or registers with and and or masks. 952 * This is a helper for setting golden registers. 953 */ 954 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 955 const u32 *registers, 956 const u32 array_size) 957 { 958 u32 tmp, reg, and_mask, or_mask; 959 int i; 960 961 if (array_size % 3) 962 return; 963 964 for (i = 0; i < array_size; i +=3) { 965 reg = registers[i + 0]; 966 and_mask = registers[i + 1]; 967 or_mask = registers[i + 2]; 968 969 if (and_mask == 0xffffffff) { 970 tmp = or_mask; 971 } else { 972 tmp = RREG32(reg); 973 tmp &= ~and_mask; 974 if (adev->family >= AMDGPU_FAMILY_AI) 975 tmp |= (or_mask & and_mask); 976 else 977 tmp |= or_mask; 978 } 979 WREG32(reg, tmp); 980 } 981 } 982 983 /** 984 * amdgpu_device_pci_config_reset - reset the GPU 985 * 986 * @adev: amdgpu_device pointer 987 * 988 * Resets the GPU using the pci config reset sequence. 989 * Only applicable to asics prior to vega10. 
990 */ 991 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 992 { 993 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 994 } 995 996 /** 997 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 998 * 999 * @adev: amdgpu_device pointer 1000 * 1001 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1002 */ 1003 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1004 { 1005 STUB(); 1006 return -ENOSYS; 1007 #ifdef notyet 1008 return pci_reset_function(adev->pdev); 1009 #endif 1010 } 1011 1012 /* 1013 * GPU doorbell aperture helpers function. 1014 */ 1015 /** 1016 * amdgpu_device_doorbell_init - Init doorbell driver information. 1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Init doorbell driver information (CIK) 1021 * Returns 0 on success, error on failure. 1022 */ 1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1024 { 1025 1026 /* No doorbell on SI hardware generation */ 1027 if (adev->asic_type < CHIP_BONAIRE) { 1028 adev->doorbell.base = 0; 1029 adev->doorbell.size = 0; 1030 adev->doorbell.num_doorbells = 0; 1031 adev->doorbell.ptr = NULL; 1032 return 0; 1033 } 1034 1035 #ifdef __linux__ 1036 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1037 return -EINVAL; 1038 #endif 1039 1040 amdgpu_asic_init_doorbell_index(adev); 1041 1042 /* doorbell bar mapping */ 1043 #ifdef __linux__ 1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1046 #endif 1047 1048 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1049 adev->doorbell_index.max_assignment+1); 1050 if (adev->doorbell.num_doorbells == 0) 1051 return -EINVAL; 1052 1053 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1054 * paging queue doorbell use the second page. The 1055 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1056 * doorbells are in the first page. So with paging queue enabled, 1057 * the max num_doorbells should + 1 page (0x400 in dword) 1058 */ 1059 if (adev->asic_type >= CHIP_VEGA10) 1060 adev->doorbell.num_doorbells += 0x400; 1061 1062 #ifdef __linux__ 1063 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1064 adev->doorbell.num_doorbells * 1065 sizeof(u32)); 1066 if (adev->doorbell.ptr == NULL) 1067 return -ENOMEM; 1068 #endif 1069 1070 return 0; 1071 } 1072 1073 /** 1074 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1075 * 1076 * @adev: amdgpu_device pointer 1077 * 1078 * Tear down doorbell driver information (CIK) 1079 */ 1080 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1081 { 1082 #ifdef __linux__ 1083 iounmap(adev->doorbell.ptr); 1084 #else 1085 if (adev->doorbell.size > 0) 1086 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1087 adev->doorbell.size); 1088 #endif 1089 adev->doorbell.ptr = NULL; 1090 } 1091 1092 1093 1094 /* 1095 * amdgpu_device_wb_*() 1096 * Writeback is the method by which the GPU updates special pages in memory 1097 * with the status of certain GPU events (fences, ring pointers,etc.). 1098 */ 1099 1100 /** 1101 * amdgpu_device_wb_fini - Disable Writeback and free memory 1102 * 1103 * @adev: amdgpu_device pointer 1104 * 1105 * Disables Writeback and frees the Writeback memory (all asics). 1106 * Used at driver shutdown. 
1107 */ 1108 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1109 { 1110 if (adev->wb.wb_obj) { 1111 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1112 &adev->wb.gpu_addr, 1113 (void **)&adev->wb.wb); 1114 adev->wb.wb_obj = NULL; 1115 } 1116 } 1117 1118 /** 1119 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1120 * 1121 * @adev: amdgpu_device pointer 1122 * 1123 * Initializes writeback and allocates writeback memory (all asics). 1124 * Used at driver startup. 1125 * Returns 0 on success or an -error on failure. 1126 */ 1127 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1128 { 1129 int r; 1130 1131 if (adev->wb.wb_obj == NULL) { 1132 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1133 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1134 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1135 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1136 (void **)&adev->wb.wb); 1137 if (r) { 1138 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1139 return r; 1140 } 1141 1142 adev->wb.num_wb = AMDGPU_MAX_WB; 1143 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1144 1145 /* clear wb memory */ 1146 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1147 } 1148 1149 return 0; 1150 } 1151 1152 /** 1153 * amdgpu_device_wb_get - Allocate a wb entry 1154 * 1155 * @adev: amdgpu_device pointer 1156 * @wb: wb index 1157 * 1158 * Allocate a wb slot for use by the driver (all asics). 1159 * Returns 0 on success or -EINVAL on failure. 1160 */ 1161 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1162 { 1163 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1164 1165 if (offset < adev->wb.num_wb) { 1166 __set_bit(offset, adev->wb.used); 1167 *wb = offset << 3; /* convert to dw offset */ 1168 return 0; 1169 } else { 1170 return -EINVAL; 1171 } 1172 } 1173 1174 /** 1175 * amdgpu_device_wb_free - Free a wb entry 1176 * 1177 * @adev: amdgpu_device pointer 1178 * @wb: wb index 1179 * 1180 * Free a wb slot allocated for use by the driver (all asics) 1181 */ 1182 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1183 { 1184 wb >>= 3; 1185 if (wb < adev->wb.num_wb) 1186 __clear_bit(wb, adev->wb.used); 1187 } 1188 1189 /** 1190 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1191 * 1192 * @adev: amdgpu_device pointer 1193 * 1194 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1195 * to fail, but if any of the BARs is not accessible after the size we abort 1196 * driver loading by returning -ENODEV. 
1197 */ 1198 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1199 { 1200 #ifdef __linux__ 1201 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1202 struct pci_bus *root; 1203 struct resource *res; 1204 unsigned i; 1205 u16 cmd; 1206 int r; 1207 1208 /* Bypass for VF */ 1209 if (amdgpu_sriov_vf(adev)) 1210 return 0; 1211 1212 /* skip if the bios has already enabled large BAR */ 1213 if (adev->gmc.real_vram_size && 1214 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1215 return 0; 1216 1217 /* Check if the root BUS has 64bit memory resources */ 1218 root = adev->pdev->bus; 1219 while (root->parent) 1220 root = root->parent; 1221 1222 pci_bus_for_each_resource(root, res, i) { 1223 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1224 res->start > 0x100000000ull) 1225 break; 1226 } 1227 1228 /* Trying to resize is pointless without a root hub window above 4GB */ 1229 if (!res) 1230 return 0; 1231 1232 /* Limit the BAR size to what is available */ 1233 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1234 rbar_size); 1235 1236 /* Disable memory decoding while we change the BAR addresses and size */ 1237 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1238 pci_write_config_word(adev->pdev, PCI_COMMAND, 1239 cmd & ~PCI_COMMAND_MEMORY); 1240 1241 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1242 amdgpu_device_doorbell_fini(adev); 1243 if (adev->asic_type >= CHIP_BONAIRE) 1244 pci_release_resource(adev->pdev, 2); 1245 1246 pci_release_resource(adev->pdev, 0); 1247 1248 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1249 if (r == -ENOSPC) 1250 DRM_INFO("Not enough PCI address space for a large BAR."); 1251 else if (r && r != -ENOTSUPP) 1252 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1253 1254 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1255 1256 /* When the doorbell or fb BAR isn't available we have no chance of 1257 * using the device. 1258 */ 1259 r = amdgpu_device_doorbell_init(adev); 1260 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1261 return -ENODEV; 1262 1263 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1264 #endif /* __linux__ */ 1265 1266 return 0; 1267 } 1268 1269 /* 1270 * GPU helpers function. 1271 */ 1272 /** 1273 * amdgpu_device_need_post - check if the hw need post or not 1274 * 1275 * @adev: amdgpu_device pointer 1276 * 1277 * Check if the asic has been initialized (all asics) at driver startup 1278 * or post is needed if hw reset is performed. 1279 * Returns true if need or false if not. 
1280 */ 1281 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1282 { 1283 uint32_t reg; 1284 1285 if (amdgpu_sriov_vf(adev)) 1286 return false; 1287 1288 if (amdgpu_passthrough(adev)) { 1289 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1290 * some old smc fw still need driver do vPost otherwise gpu hang, while 1291 * those smc fw version above 22.15 doesn't have this flaw, so we force 1292 * vpost executed for smc version below 22.15 1293 */ 1294 if (adev->asic_type == CHIP_FIJI) { 1295 int err; 1296 uint32_t fw_ver; 1297 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1298 /* force vPost if error occured */ 1299 if (err) 1300 return true; 1301 1302 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1303 if (fw_ver < 0x00160e00) 1304 return true; 1305 } 1306 } 1307 1308 /* Don't post if we need to reset whole hive on init */ 1309 if (adev->gmc.xgmi.pending_reset) 1310 return false; 1311 1312 if (adev->has_hw_reset) { 1313 adev->has_hw_reset = false; 1314 return true; 1315 } 1316 1317 /* bios scratch used on CIK+ */ 1318 if (adev->asic_type >= CHIP_BONAIRE) 1319 return amdgpu_atombios_scratch_need_asic_init(adev); 1320 1321 /* check MEM_SIZE for older asics */ 1322 reg = amdgpu_asic_get_config_memsize(adev); 1323 1324 if ((reg != 0) && (reg != 0xffffffff)) 1325 return false; 1326 1327 return true; 1328 } 1329 1330 /** 1331 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1332 * 1333 * @adev: amdgpu_device pointer 1334 * 1335 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1336 * be set for this device. 1337 * 1338 * Returns true if it should be used or false if not. 1339 */ 1340 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1341 { 1342 switch (amdgpu_aspm) { 1343 case -1: 1344 break; 1345 case 0: 1346 return false; 1347 case 1: 1348 return true; 1349 default: 1350 return false; 1351 } 1352 return pcie_aspm_enabled(adev->pdev); 1353 } 1354 1355 /* if we get transitioned to only one device, take VGA back */ 1356 /** 1357 * amdgpu_device_vga_set_decode - enable/disable vga decode 1358 * 1359 * @pdev: PCI device pointer 1360 * @state: enable/disable vga decode 1361 * 1362 * Enable/disable vga decode (all asics). 1363 * Returns VGA resource flags. 1364 */ 1365 #ifdef notyet 1366 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1367 bool state) 1368 { 1369 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1370 amdgpu_asic_set_vga_state(adev, state); 1371 if (state) 1372 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1373 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1374 else 1375 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1376 } 1377 #endif 1378 1379 /** 1380 * amdgpu_device_check_block_size - validate the vm block size 1381 * 1382 * @adev: amdgpu_device pointer 1383 * 1384 * Validates the vm block size specified via module parameter. 1385 * The vm block size defines number of bits in page table versus page directory, 1386 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1387 * page table and the remaining bits are in the page directory. 
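 *
 * Worked example (illustrative only): with the minimum amdgpu_vm_block_size
 * of 9, one page table spans 2^9 entries of 4KB pages = 2MB of GPU virtual
 * address space, and the address bits above (12 + 9) select the page
 * directory entry.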
1388 */ 1389 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1390 { 1391 /* defines number of bits in page table versus page directory, 1392 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1393 * page table and the remaining bits are in the page directory */ 1394 if (amdgpu_vm_block_size == -1) 1395 return; 1396 1397 if (amdgpu_vm_block_size < 9) { 1398 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1399 amdgpu_vm_block_size); 1400 amdgpu_vm_block_size = -1; 1401 } 1402 } 1403 1404 /** 1405 * amdgpu_device_check_vm_size - validate the vm size 1406 * 1407 * @adev: amdgpu_device pointer 1408 * 1409 * Validates the vm size in GB specified via module parameter. 1410 * The VM size is the size of the GPU virtual memory space in GB. 1411 */ 1412 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1413 { 1414 /* no need to check the default value */ 1415 if (amdgpu_vm_size == -1) 1416 return; 1417 1418 if (amdgpu_vm_size < 1) { 1419 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1420 amdgpu_vm_size); 1421 amdgpu_vm_size = -1; 1422 } 1423 } 1424 1425 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1426 { 1427 #ifdef __linux__ 1428 struct sysinfo si; 1429 #endif 1430 bool is_os_64 = (sizeof(void *) == 8); 1431 uint64_t total_memory; 1432 uint64_t dram_size_seven_GB = 0x1B8000000; 1433 uint64_t dram_size_three_GB = 0xB8000000; 1434 1435 if (amdgpu_smu_memory_pool_size == 0) 1436 return; 1437 1438 if (!is_os_64) { 1439 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1440 goto def_value; 1441 } 1442 #ifdef __linux__ 1443 si_meminfo(&si); 1444 total_memory = (uint64_t)si.totalram * si.mem_unit; 1445 #else 1446 total_memory = ptoa(physmem); 1447 #endif 1448 1449 if ((amdgpu_smu_memory_pool_size == 1) || 1450 (amdgpu_smu_memory_pool_size == 2)) { 1451 if (total_memory < dram_size_three_GB) 1452 goto def_value1; 1453 } else if ((amdgpu_smu_memory_pool_size == 4) || 1454 (amdgpu_smu_memory_pool_size == 8)) { 1455 if (total_memory < dram_size_seven_GB) 1456 goto def_value1; 1457 } else { 1458 DRM_WARN("Smu memory pool size not supported\n"); 1459 goto def_value; 1460 } 1461 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1462 1463 return; 1464 1465 def_value1: 1466 DRM_WARN("No enough system memory\n"); 1467 def_value: 1468 adev->pm.smu_prv_buffer_size = 0; 1469 } 1470 1471 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1472 { 1473 if (!(adev->flags & AMD_IS_APU) || 1474 adev->asic_type < CHIP_RAVEN) 1475 return 0; 1476 1477 switch (adev->asic_type) { 1478 case CHIP_RAVEN: 1479 if (adev->pdev->device == 0x15dd) 1480 adev->apu_flags |= AMD_APU_IS_RAVEN; 1481 if (adev->pdev->device == 0x15d8) 1482 adev->apu_flags |= AMD_APU_IS_PICASSO; 1483 break; 1484 case CHIP_RENOIR: 1485 if ((adev->pdev->device == 0x1636) || 1486 (adev->pdev->device == 0x164c)) 1487 adev->apu_flags |= AMD_APU_IS_RENOIR; 1488 else 1489 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1490 break; 1491 case CHIP_VANGOGH: 1492 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1493 break; 1494 case CHIP_YELLOW_CARP: 1495 break; 1496 case CHIP_CYAN_SKILLFISH: 1497 if (adev->pdev->device == 0x13FE) 1498 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1499 break; 1500 default: 1501 return -EINVAL; 1502 } 1503 1504 return 0; 1505 } 1506 1507 /** 1508 * amdgpu_device_check_arguments - validate module params 1509 * 1510 * @adev: amdgpu_device pointer 1511 * 1512 * Validates certain module parameters and updates 
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

#ifdef __linux__
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asic before or after it is powered up using ACPI methods.
1582 */ 1583 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1584 enum vga_switcheroo_state state) 1585 { 1586 struct drm_device *dev = pci_get_drvdata(pdev); 1587 int r; 1588 1589 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1590 return; 1591 1592 if (state == VGA_SWITCHEROO_ON) { 1593 pr_info("switched on\n"); 1594 /* don't suspend or resume card normally */ 1595 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1596 1597 pci_set_power_state(pdev, PCI_D0); 1598 amdgpu_device_load_pci_state(pdev); 1599 r = pci_enable_device(pdev); 1600 if (r) 1601 DRM_WARN("pci_enable_device failed (%d)\n", r); 1602 amdgpu_device_resume(dev, true); 1603 1604 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1605 } else { 1606 pr_info("switched off\n"); 1607 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1608 amdgpu_device_suspend(dev, true); 1609 amdgpu_device_cache_pci_state(pdev); 1610 /* Shut down the device */ 1611 pci_disable_device(pdev); 1612 pci_set_power_state(pdev, PCI_D3cold); 1613 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1614 } 1615 } 1616 1617 /** 1618 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1619 * 1620 * @pdev: pci dev pointer 1621 * 1622 * Callback for the switcheroo driver. Check of the switcheroo 1623 * state can be changed. 1624 * Returns true if the state can be changed, false if not. 1625 */ 1626 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1627 { 1628 struct drm_device *dev = pci_get_drvdata(pdev); 1629 1630 /* 1631 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1632 * locking inversion with the driver load path. And the access here is 1633 * completely racy anyway. So don't bother with locking for now. 1634 */ 1635 return atomic_read(&dev->open_count) == 0; 1636 } 1637 #endif /* __linux__ */ 1638 1639 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1640 #ifdef notyet 1641 .set_gpu_state = amdgpu_switcheroo_set_state, 1642 .reprobe = NULL, 1643 .can_switch = amdgpu_switcheroo_can_switch, 1644 #endif 1645 }; 1646 1647 /** 1648 * amdgpu_device_ip_set_clockgating_state - set the CG state 1649 * 1650 * @dev: amdgpu_device pointer 1651 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1652 * @state: clockgating state (gate or ungate) 1653 * 1654 * Sets the requested clockgating state for all instances of 1655 * the hardware IP specified. 1656 * Returns the error code from the last instance. 1657 */ 1658 int amdgpu_device_ip_set_clockgating_state(void *dev, 1659 enum amd_ip_block_type block_type, 1660 enum amd_clockgating_state state) 1661 { 1662 struct amdgpu_device *adev = dev; 1663 int i, r = 0; 1664 1665 for (i = 0; i < adev->num_ip_blocks; i++) { 1666 if (!adev->ip_blocks[i].status.valid) 1667 continue; 1668 if (adev->ip_blocks[i].version->type != block_type) 1669 continue; 1670 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1671 continue; 1672 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1673 (void *)adev, state); 1674 if (r) 1675 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1676 adev->ip_blocks[i].version->funcs->name, r); 1677 } 1678 return r; 1679 } 1680 1681 /** 1682 * amdgpu_device_ip_set_powergating_state - set the PG state 1683 * 1684 * @dev: amdgpu_device pointer 1685 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1686 * @state: powergating state (gate or ungate) 1687 * 1688 * Sets the requested powergating state for all instances of 1689 * the hardware IP specified. 1690 * Returns the error code from the last instance. 1691 */ 1692 int amdgpu_device_ip_set_powergating_state(void *dev, 1693 enum amd_ip_block_type block_type, 1694 enum amd_powergating_state state) 1695 { 1696 struct amdgpu_device *adev = dev; 1697 int i, r = 0; 1698 1699 for (i = 0; i < adev->num_ip_blocks; i++) { 1700 if (!adev->ip_blocks[i].status.valid) 1701 continue; 1702 if (adev->ip_blocks[i].version->type != block_type) 1703 continue; 1704 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1705 continue; 1706 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1707 (void *)adev, state); 1708 if (r) 1709 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1710 adev->ip_blocks[i].version->funcs->name, r); 1711 } 1712 return r; 1713 } 1714 1715 /** 1716 * amdgpu_device_ip_get_clockgating_state - get the CG state 1717 * 1718 * @adev: amdgpu_device pointer 1719 * @flags: clockgating feature flags 1720 * 1721 * Walks the list of IPs on the device and updates the clockgating 1722 * flags for each IP. 1723 * Updates @flags with the feature flags for each hardware IP where 1724 * clockgating is enabled. 1725 */ 1726 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1727 u32 *flags) 1728 { 1729 int i; 1730 1731 for (i = 0; i < adev->num_ip_blocks; i++) { 1732 if (!adev->ip_blocks[i].status.valid) 1733 continue; 1734 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1735 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1736 } 1737 } 1738 1739 /** 1740 * amdgpu_device_ip_wait_for_idle - wait for idle 1741 * 1742 * @adev: amdgpu_device pointer 1743 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1744 * 1745 * Waits for the request hardware IP to be idle. 1746 * Returns 0 for success or a negative error code on failure. 1747 */ 1748 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1749 enum amd_ip_block_type block_type) 1750 { 1751 int i, r; 1752 1753 for (i = 0; i < adev->num_ip_blocks; i++) { 1754 if (!adev->ip_blocks[i].status.valid) 1755 continue; 1756 if (adev->ip_blocks[i].version->type == block_type) { 1757 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1758 if (r) 1759 return r; 1760 break; 1761 } 1762 } 1763 return 0; 1764 1765 } 1766 1767 /** 1768 * amdgpu_device_ip_is_idle - is the hardware IP idle 1769 * 1770 * @adev: amdgpu_device pointer 1771 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1772 * 1773 * Check if the hardware IP is idle or not. 1774 * Returns true if it the IP is idle, false if not. 1775 */ 1776 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1777 enum amd_ip_block_type block_type) 1778 { 1779 int i; 1780 1781 for (i = 0; i < adev->num_ip_blocks; i++) { 1782 if (!adev->ip_blocks[i].status.valid) 1783 continue; 1784 if (adev->ip_blocks[i].version->type == block_type) 1785 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1786 } 1787 return true; 1788 1789 } 1790 1791 /** 1792 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1793 * 1794 * @adev: amdgpu_device pointer 1795 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1796 * 1797 * Returns a pointer to the hardware IP block structure 1798 * if it exists for the asic, otherwise NULL. 
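 *
 * Illustrative use (a sketch; the IP type and the version check are
 * arbitrary placeholders, not requirements of this helper):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip && ip->version->major >= 10)
 *           handle_gfx10_or_newer(adev);
 *
 * For plain version comparisons, amdgpu_device_ip_block_version_cmp()
 * below wraps this lookup.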
1799 */ 1800 struct amdgpu_ip_block * 1801 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1802 enum amd_ip_block_type type) 1803 { 1804 int i; 1805 1806 for (i = 0; i < adev->num_ip_blocks; i++) 1807 if (adev->ip_blocks[i].version->type == type) 1808 return &adev->ip_blocks[i]; 1809 1810 return NULL; 1811 } 1812 1813 /** 1814 * amdgpu_device_ip_block_version_cmp 1815 * 1816 * @adev: amdgpu_device pointer 1817 * @type: enum amd_ip_block_type 1818 * @major: major version 1819 * @minor: minor version 1820 * 1821 * return 0 if equal or greater 1822 * return 1 if smaller or the ip_block doesn't exist 1823 */ 1824 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1825 enum amd_ip_block_type type, 1826 u32 major, u32 minor) 1827 { 1828 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1829 1830 if (ip_block && ((ip_block->version->major > major) || 1831 ((ip_block->version->major == major) && 1832 (ip_block->version->minor >= minor)))) 1833 return 0; 1834 1835 return 1; 1836 } 1837 1838 /** 1839 * amdgpu_device_ip_block_add 1840 * 1841 * @adev: amdgpu_device pointer 1842 * @ip_block_version: pointer to the IP to add 1843 * 1844 * Adds the IP block driver information to the collection of IPs 1845 * on the asic. 1846 */ 1847 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1848 const struct amdgpu_ip_block_version *ip_block_version) 1849 { 1850 if (!ip_block_version) 1851 return -EINVAL; 1852 1853 switch (ip_block_version->type) { 1854 case AMD_IP_BLOCK_TYPE_VCN: 1855 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1856 return 0; 1857 break; 1858 case AMD_IP_BLOCK_TYPE_JPEG: 1859 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1860 return 0; 1861 break; 1862 default: 1863 break; 1864 } 1865 1866 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1867 ip_block_version->funcs->name); 1868 1869 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1870 1871 return 0; 1872 } 1873 1874 /** 1875 * amdgpu_device_enable_virtual_display - enable virtual display feature 1876 * 1877 * @adev: amdgpu_device pointer 1878 * 1879 * Enabled the virtual display feature if the user has enabled it via 1880 * the module parameter virtual_display. This feature provides a virtual 1881 * display hardware on headless boards or in virtualized environments. 1882 * This function parses and validates the configuration string specified by 1883 * the user and configues the virtual display configuration (number of 1884 * virtual connectors, crtcs, etc.) specified. 
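 *
 * As parsed below, the string is a ';'-separated list of entries, each a
 * PCI address (or "all") optionally followed by ",<num_crtc>" with the crtc
 * count clamped to 1-6, e.g. (the address here is only a hypothetical
 * example):
 *
 *   amdgpu.virtual_display=0000:04:00.0,2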
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

#ifdef notyet
	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
#endif
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1958 */ 1959 if (adev->asic_type != CHIP_NAVI12) 1960 return 0; 1961 } 1962 1963 switch (adev->asic_type) { 1964 #ifdef CONFIG_DRM_AMDGPU_SI 1965 case CHIP_VERDE: 1966 case CHIP_TAHITI: 1967 case CHIP_PITCAIRN: 1968 case CHIP_OLAND: 1969 case CHIP_HAINAN: 1970 #endif 1971 #ifdef CONFIG_DRM_AMDGPU_CIK 1972 case CHIP_BONAIRE: 1973 case CHIP_HAWAII: 1974 case CHIP_KAVERI: 1975 case CHIP_KABINI: 1976 case CHIP_MULLINS: 1977 #endif 1978 case CHIP_TOPAZ: 1979 case CHIP_TONGA: 1980 case CHIP_FIJI: 1981 case CHIP_POLARIS10: 1982 case CHIP_POLARIS11: 1983 case CHIP_POLARIS12: 1984 case CHIP_VEGAM: 1985 case CHIP_CARRIZO: 1986 case CHIP_STONEY: 1987 case CHIP_VEGA20: 1988 case CHIP_ALDEBARAN: 1989 case CHIP_SIENNA_CICHLID: 1990 case CHIP_NAVY_FLOUNDER: 1991 case CHIP_DIMGREY_CAVEFISH: 1992 case CHIP_BEIGE_GOBY: 1993 default: 1994 return 0; 1995 case CHIP_VEGA10: 1996 chip_name = "vega10"; 1997 break; 1998 case CHIP_VEGA12: 1999 chip_name = "vega12"; 2000 break; 2001 case CHIP_RAVEN: 2002 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2003 chip_name = "raven2"; 2004 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2005 chip_name = "picasso"; 2006 else 2007 chip_name = "raven"; 2008 break; 2009 case CHIP_ARCTURUS: 2010 chip_name = "arcturus"; 2011 break; 2012 case CHIP_RENOIR: 2013 if (adev->apu_flags & AMD_APU_IS_RENOIR) 2014 chip_name = "renoir"; 2015 else 2016 chip_name = "green_sardine"; 2017 break; 2018 case CHIP_NAVI10: 2019 chip_name = "navi10"; 2020 break; 2021 case CHIP_NAVI14: 2022 chip_name = "navi14"; 2023 break; 2024 case CHIP_NAVI12: 2025 chip_name = "navi12"; 2026 break; 2027 case CHIP_VANGOGH: 2028 chip_name = "vangogh"; 2029 break; 2030 case CHIP_YELLOW_CARP: 2031 chip_name = "yellow_carp"; 2032 break; 2033 } 2034 2035 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2036 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2037 if (err) { 2038 dev_err(adev->dev, 2039 "Failed to load gpu_info firmware \"%s\"\n", 2040 fw_name); 2041 goto out; 2042 } 2043 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2044 if (err) { 2045 dev_err(adev->dev, 2046 "Failed to validate gpu_info firmware \"%s\"\n", 2047 fw_name); 2048 goto out; 2049 } 2050 2051 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2052 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2053 2054 switch (hdr->version_major) { 2055 case 1: 2056 { 2057 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2058 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2059 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2060 2061 /* 2062 * Should be droped when DAL no longer needs it. 
2063 */ 2064 if (adev->asic_type == CHIP_NAVI12) 2065 goto parse_soc_bounding_box; 2066 2067 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2068 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2069 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2070 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2071 adev->gfx.config.max_texture_channel_caches = 2072 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2073 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2074 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2075 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2076 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2077 adev->gfx.config.double_offchip_lds_buf = 2078 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2079 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2080 adev->gfx.cu_info.max_waves_per_simd = 2081 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2082 adev->gfx.cu_info.max_scratch_slots_per_cu = 2083 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2084 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2085 if (hdr->version_minor >= 1) { 2086 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2087 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2088 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2089 adev->gfx.config.num_sc_per_sh = 2090 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2091 adev->gfx.config.num_packer_per_sc = 2092 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2093 } 2094 2095 parse_soc_bounding_box: 2096 /* 2097 * soc bounding box info is not integrated in discovery table, 2098 * so we always need to parse it from the gpu info firmware when needed. 2099 */ 2100 if (hdr->version_minor == 2) { 2101 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2102 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2103 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2104 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2105 } 2106 break; 2107 } 2108 default: 2109 dev_err(adev->dev, 2110 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2111 err = -EINVAL; 2112 goto out; 2113 } 2114 out: 2115 return err; 2116 } 2117 2118 /** 2119 * amdgpu_device_ip_early_init - run early init for hardware IPs 2120 * 2121 * @adev: amdgpu_device pointer 2122 * 2123 * Early initialization pass for hardware IPs. The hardware IPs that make 2124 * up each asic are discovered and each IP's early_init callback is run. This 2125 * is the first stage in initializing the asic. 2126 * Returns 0 on success, negative error code on failure.
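 *
 * Worth noting for the loop below: individual blocks can also be masked
 * off with the ip_block_mask module parameter. As a hypothetical example,
 * a mask of 0xfffffffd clears bit 1, so the second IP block in the list is
 * marked invalid and its early_init callback is never run.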
2127 */ 2128 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2129 { 2130 struct drm_device *dev = adev_to_drm(adev); 2131 struct pci_dev *parent; 2132 int i, r; 2133 2134 amdgpu_device_enable_virtual_display(adev); 2135 2136 if (amdgpu_sriov_vf(adev)) { 2137 r = amdgpu_virt_request_full_gpu(adev, true); 2138 if (r) 2139 return r; 2140 } 2141 2142 switch (adev->asic_type) { 2143 #ifdef CONFIG_DRM_AMDGPU_SI 2144 case CHIP_VERDE: 2145 case CHIP_TAHITI: 2146 case CHIP_PITCAIRN: 2147 case CHIP_OLAND: 2148 case CHIP_HAINAN: 2149 adev->family = AMDGPU_FAMILY_SI; 2150 r = si_set_ip_blocks(adev); 2151 if (r) 2152 return r; 2153 break; 2154 #endif 2155 #ifdef CONFIG_DRM_AMDGPU_CIK 2156 case CHIP_BONAIRE: 2157 case CHIP_HAWAII: 2158 case CHIP_KAVERI: 2159 case CHIP_KABINI: 2160 case CHIP_MULLINS: 2161 if (adev->flags & AMD_IS_APU) 2162 adev->family = AMDGPU_FAMILY_KV; 2163 else 2164 adev->family = AMDGPU_FAMILY_CI; 2165 2166 r = cik_set_ip_blocks(adev); 2167 if (r) 2168 return r; 2169 break; 2170 #endif 2171 case CHIP_TOPAZ: 2172 case CHIP_TONGA: 2173 case CHIP_FIJI: 2174 case CHIP_POLARIS10: 2175 case CHIP_POLARIS11: 2176 case CHIP_POLARIS12: 2177 case CHIP_VEGAM: 2178 case CHIP_CARRIZO: 2179 case CHIP_STONEY: 2180 if (adev->flags & AMD_IS_APU) 2181 adev->family = AMDGPU_FAMILY_CZ; 2182 else 2183 adev->family = AMDGPU_FAMILY_VI; 2184 2185 r = vi_set_ip_blocks(adev); 2186 if (r) 2187 return r; 2188 break; 2189 case CHIP_VEGA10: 2190 case CHIP_VEGA12: 2191 case CHIP_VEGA20: 2192 case CHIP_RAVEN: 2193 case CHIP_ARCTURUS: 2194 case CHIP_RENOIR: 2195 case CHIP_ALDEBARAN: 2196 if (adev->flags & AMD_IS_APU) 2197 adev->family = AMDGPU_FAMILY_RV; 2198 else 2199 adev->family = AMDGPU_FAMILY_AI; 2200 2201 r = soc15_set_ip_blocks(adev); 2202 if (r) 2203 return r; 2204 break; 2205 case CHIP_NAVI10: 2206 case CHIP_NAVI14: 2207 case CHIP_NAVI12: 2208 case CHIP_SIENNA_CICHLID: 2209 case CHIP_NAVY_FLOUNDER: 2210 case CHIP_DIMGREY_CAVEFISH: 2211 case CHIP_BEIGE_GOBY: 2212 case CHIP_VANGOGH: 2213 case CHIP_YELLOW_CARP: 2214 case CHIP_CYAN_SKILLFISH: 2215 if (adev->asic_type == CHIP_VANGOGH) 2216 adev->family = AMDGPU_FAMILY_VGH; 2217 else if (adev->asic_type == CHIP_YELLOW_CARP) 2218 adev->family = AMDGPU_FAMILY_YC; 2219 else 2220 adev->family = AMDGPU_FAMILY_NV; 2221 2222 r = nv_set_ip_blocks(adev); 2223 if (r) 2224 return r; 2225 break; 2226 default: 2227 /* FIXME: not supported yet */ 2228 return -EINVAL; 2229 } 2230 2231 if (amdgpu_has_atpx() && 2232 (amdgpu_is_atpx_hybrid() || 2233 amdgpu_has_atpx_dgpu_power_cntl()) && 2234 ((adev->flags & AMD_IS_APU) == 0) && 2235 !pci_is_thunderbolt_attached(dev->pdev)) 2236 adev->flags |= AMD_IS_PX; 2237 2238 if (!(adev->flags & AMD_IS_APU)) { 2239 parent = pci_upstream_bridge(adev->pdev); 2240 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2241 } 2242 2243 amdgpu_amdkfd_device_probe(adev); 2244 2245 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2246 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2247 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2248 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2249 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2250 2251 for (i = 0; i < adev->num_ip_blocks; i++) { 2252 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2253 DRM_ERROR("disabled ip block: %d <%s>\n", 2254 i, adev->ip_blocks[i].version->funcs->name); 2255 adev->ip_blocks[i].status.valid = false; 2256 } else { 2257 if (adev->ip_blocks[i].version->funcs->early_init) { 2258 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2259 if (r == -ENOENT) { 2260 adev->ip_blocks[i].status.valid = false; 2261 } else if (r) { 2262 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2263 adev->ip_blocks[i].version->funcs->name, r); 2264 return r; 2265 } else { 2266 adev->ip_blocks[i].status.valid = true; 2267 } 2268 } else { 2269 adev->ip_blocks[i].status.valid = true; 2270 } 2271 } 2272 /* get the vbios after the asic_funcs are set up */ 2273 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2274 r = amdgpu_device_parse_gpu_info_fw(adev); 2275 if (r) 2276 return r; 2277 2278 /* Read BIOS */ 2279 if (!amdgpu_get_bios(adev)) 2280 return -EINVAL; 2281 2282 r = amdgpu_atombios_init(adev); 2283 if (r) { 2284 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2285 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2286 return r; 2287 } 2288 2289 /*get pf2vf msg info at it's earliest time*/ 2290 if (amdgpu_sriov_vf(adev)) 2291 amdgpu_virt_init_data_exchange(adev); 2292 2293 } 2294 } 2295 2296 adev->cg_flags &= amdgpu_cg_mask; 2297 adev->pg_flags &= amdgpu_pg_mask; 2298 2299 return 0; 2300 } 2301 2302 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2303 { 2304 int i, r; 2305 2306 for (i = 0; i < adev->num_ip_blocks; i++) { 2307 if (!adev->ip_blocks[i].status.sw) 2308 continue; 2309 if (adev->ip_blocks[i].status.hw) 2310 continue; 2311 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2312 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2313 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2314 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2315 if (r) { 2316 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2317 adev->ip_blocks[i].version->funcs->name, r); 2318 return r; 2319 } 2320 adev->ip_blocks[i].status.hw = true; 2321 } 2322 } 2323 2324 return 0; 2325 } 2326 2327 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2328 { 2329 int i, r; 2330 2331 for (i = 0; i < adev->num_ip_blocks; i++) { 2332 if (!adev->ip_blocks[i].status.sw) 2333 continue; 2334 if (adev->ip_blocks[i].status.hw) 2335 continue; 2336 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2337 if (r) { 2338 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2339 adev->ip_blocks[i].version->funcs->name, r); 2340 return r; 2341 } 2342 adev->ip_blocks[i].status.hw = true; 2343 } 2344 2345 return 0; 2346 } 2347 2348 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2349 { 2350 int r = 0; 2351 int i; 2352 uint32_t smu_version; 2353 2354 if (adev->asic_type >= CHIP_VEGA10) { 2355 for (i = 0; i < adev->num_ip_blocks; i++) { 2356 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2357 continue; 2358 2359 if 
(!adev->ip_blocks[i].status.sw) 2360 continue; 2361 2362 /* no need to do the fw loading again if already done*/ 2363 if (adev->ip_blocks[i].status.hw == true) 2364 break; 2365 2366 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2367 r = adev->ip_blocks[i].version->funcs->resume(adev); 2368 if (r) { 2369 DRM_ERROR("resume of IP block <%s> failed %d\n", 2370 adev->ip_blocks[i].version->funcs->name, r); 2371 return r; 2372 } 2373 } else { 2374 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2375 if (r) { 2376 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 return r; 2379 } 2380 } 2381 2382 adev->ip_blocks[i].status.hw = true; 2383 break; 2384 } 2385 } 2386 2387 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2388 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2389 2390 return r; 2391 } 2392 2393 /** 2394 * amdgpu_device_ip_init - run init for hardware IPs 2395 * 2396 * @adev: amdgpu_device pointer 2397 * 2398 * Main initialization pass for hardware IPs. The list of all the hardware 2399 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2400 * are run. sw_init initializes the software state associated with each IP 2401 * and hw_init initializes the hardware associated with each IP. 2402 * Returns 0 on success, negative error code on failure. 2403 */ 2404 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2405 { 2406 int i, r; 2407 2408 r = amdgpu_ras_init(adev); 2409 if (r) 2410 return r; 2411 2412 for (i = 0; i < adev->num_ip_blocks; i++) { 2413 if (!adev->ip_blocks[i].status.valid) 2414 continue; 2415 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2416 if (r) { 2417 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2418 adev->ip_blocks[i].version->funcs->name, r); 2419 goto init_failed; 2420 } 2421 adev->ip_blocks[i].status.sw = true; 2422 2423 /* need to do gmc hw init early so we can allocate gpu mem */ 2424 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2425 r = amdgpu_device_vram_scratch_init(adev); 2426 if (r) { 2427 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2428 goto init_failed; 2429 } 2430 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2431 if (r) { 2432 DRM_ERROR("hw_init %d failed %d\n", i, r); 2433 goto init_failed; 2434 } 2435 r = amdgpu_device_wb_init(adev); 2436 if (r) { 2437 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2438 goto init_failed; 2439 } 2440 adev->ip_blocks[i].status.hw = true; 2441 2442 /* right after GMC hw init, we create CSA */ 2443 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2444 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2445 AMDGPU_GEM_DOMAIN_VRAM, 2446 AMDGPU_CSA_SIZE); 2447 if (r) { 2448 DRM_ERROR("allocate CSA failed %d\n", r); 2449 goto init_failed; 2450 } 2451 } 2452 } 2453 } 2454 2455 if (amdgpu_sriov_vf(adev)) 2456 amdgpu_virt_init_data_exchange(adev); 2457 2458 r = amdgpu_ib_pool_init(adev); 2459 if (r) { 2460 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2461 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2462 goto init_failed; 2463 } 2464 2465 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2466 if (r) 2467 goto init_failed; 2468 2469 r = amdgpu_amdkfd_resume_iommu(adev); 2470 if (r) 2471 goto init_failed; 2472 2473 r = amdgpu_device_ip_hw_init_phase1(adev); 2474 if (r) 2475 goto init_failed; 2476 2477 r = amdgpu_device_fw_loading(adev); 2478 if (r) 2479 goto init_failed; 2480 2481 r = 
amdgpu_device_ip_hw_init_phase2(adev); 2482 if (r) 2483 goto init_failed; 2484 2485 /* 2486 * retired pages will be loaded from eeprom and reserved here, 2487 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2488 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2489 * for I2C communication, which is only true at this point. 2490 * 2491 * amdgpu_ras_recovery_init may fail, but the upper layers only care 2492 * about failures caused by a bad gpu state and stop the amdgpu init 2493 * process accordingly. For other failures it still releases all 2494 * the resources and prints an error message, rather than returning a 2495 * negative value to the upper level. 2496 * 2497 * Note: theoretically, this should be called before all vram allocations 2498 * to protect retired pages from being reused 2499 */ 2500 r = amdgpu_ras_recovery_init(adev); 2501 if (r) 2502 goto init_failed; 2503 2504 if (adev->gmc.xgmi.num_physical_nodes > 1) 2505 amdgpu_xgmi_add_device(adev); 2506 2507 /* Don't init kfd if the whole hive needs to be reset during init */ 2508 if (!adev->gmc.xgmi.pending_reset) 2509 amdgpu_amdkfd_device_init(adev); 2510 2511 amdgpu_fru_get_product_info(adev); 2512 2513 init_failed: 2514 if (amdgpu_sriov_vf(adev)) 2515 amdgpu_virt_release_full_gpu(adev, true); 2516 2517 return r; 2518 } 2519 2520 /** 2521 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2522 * 2523 * @adev: amdgpu_device pointer 2524 * 2525 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2526 * this function before a GPU reset. If the value is retained after a 2527 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2528 */ 2529 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2530 { 2531 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2532 } 2533 2534 /** 2535 * amdgpu_device_check_vram_lost - check if vram is valid 2536 * 2537 * @adev: amdgpu_device pointer 2538 * 2539 * Checks the reset magic value written to the gart pointer in VRAM. 2540 * The driver calls this after a GPU reset to see if the contents of 2541 * VRAM are lost or not. 2542 * Returns true if vram is lost, false if not. 2543 */ 2544 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2545 { 2546 if (memcmp(adev->gart.ptr, adev->reset_magic, 2547 AMDGPU_RESET_MAGIC_NUM)) 2548 return true; 2549 2550 if (!amdgpu_in_reset(adev)) 2551 return false; 2552 2553 /* 2554 * For all ASICs with baco/mode1 reset, the VRAM is 2555 * always assumed to be lost. 2556 */ 2557 switch (amdgpu_asic_reset_method(adev)) { 2558 case AMD_RESET_METHOD_BACO: 2559 case AMD_RESET_METHOD_MODE1: 2560 return true; 2561 default: 2562 return false; 2563 } 2564 } 2565 2566 /** 2567 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2568 * 2569 * @adev: amdgpu_device pointer 2570 * @state: clockgating state (gate or ungate) 2571 * 2572 * The list of all the hardware IPs that make up the asic is walked and the 2573 * set_clockgating_state callbacks are run. 2574 * The late initialization pass enables clockgating for hardware IPs, while 2575 * the fini or suspend passes disable it. 2576 * Returns 0 on success, negative error code on failure. 2577 */ 2578 2579 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2580 enum amd_clockgating_state state) 2581 { 2582 int i, j, r; 2583 2584 if (amdgpu_emu_mode == 1) 2585 return 0; 2586 2587 for (j = 0; j < adev->num_ip_blocks; j++) { 2588 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2589 if (!adev->ip_blocks[i].status.late_initialized) 2590 continue; 2591 /* skip CG for GFX on S0ix */ 2592 if (adev->in_s0ix && 2593 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2594 continue; 2595 /* skip CG for VCE/UVD, it's handled specially */ 2596 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2597 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2598 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2599 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2600 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2601 /* enable clockgating to save power */ 2602 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2603 state); 2604 if (r) { 2605 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2606 adev->ip_blocks[i].version->funcs->name, r); 2607 return r; 2608 } 2609 } 2610 } 2611 2612 return 0; 2613 } 2614 2615 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2616 enum amd_powergating_state state) 2617 { 2618 int i, j, r; 2619 2620 if (amdgpu_emu_mode == 1) 2621 return 0; 2622 2623 for (j = 0; j < adev->num_ip_blocks; j++) { 2624 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2625 if (!adev->ip_blocks[i].status.late_initialized) 2626 continue; 2627 /* skip PG for GFX on S0ix */ 2628 if (adev->in_s0ix && 2629 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2630 continue; 2631 /* skip CG for VCE/UVD, it's handled specially */ 2632 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2633 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2634 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2635 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2636 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2637 /* enable powergating to save power */ 2638 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2639 state); 2640 if (r) { 2641 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2642 adev->ip_blocks[i].version->funcs->name, r); 2643 return r; 2644 } 2645 } 2646 } 2647 return 0; 2648 } 2649 2650 static int amdgpu_device_enable_mgpu_fan_boost(void) 2651 { 2652 struct amdgpu_gpu_instance *gpu_ins; 2653 struct amdgpu_device *adev; 2654 int i, ret = 0; 2655 2656 mutex_lock(&mgpu_info.mutex); 2657 2658 /* 2659 * MGPU fan boost feature should be enabled 2660 * only when there are two or more dGPUs in 2661 * the system 2662 */ 2663 if (mgpu_info.num_dgpu < 2) 2664 goto out; 2665 2666 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2667 gpu_ins = &(mgpu_info.gpu_ins[i]); 2668 adev = gpu_ins->adev; 2669 if (!(adev->flags & AMD_IS_APU) && 2670 !gpu_ins->mgpu_fan_enabled) { 2671 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2672 if (ret) 2673 break; 2674 2675 gpu_ins->mgpu_fan_enabled = 1; 2676 } 2677 } 2678 2679 out: 2680 mutex_unlock(&mgpu_info.mutex); 2681 2682 return ret; 2683 } 2684 2685 /** 2686 * amdgpu_device_ip_late_init - run late init for hardware IPs 2687 * 2688 * @adev: amdgpu_device pointer 2689 * 2690 * Late initialization pass for hardware IPs. The list of all the hardware 2691 * IPs that make up the asic is walked and the late_init callbacks are run. 2692 * late_init covers any special initialization that an IP requires 2693 * after all of the have been initialized or something that needs to happen 2694 * late in the init process. 2695 * Returns 0 on success, negative error code on failure. 
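 *
 * A rough sketch of the ordering performed by the body below (error
 * handling and the RAS/XGMI specific steps omitted):
 *
 *   for each hw-initialized IP block: late_init();
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *   amdgpu_device_fill_reset_magic(adev);
 *   amdgpu_device_enable_mgpu_fan_boost();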
2696 */ 2697 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2698 { 2699 struct amdgpu_gpu_instance *gpu_instance; 2700 int i = 0, r; 2701 2702 for (i = 0; i < adev->num_ip_blocks; i++) { 2703 if (!adev->ip_blocks[i].status.hw) 2704 continue; 2705 if (adev->ip_blocks[i].version->funcs->late_init) { 2706 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2707 if (r) { 2708 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2709 adev->ip_blocks[i].version->funcs->name, r); 2710 return r; 2711 } 2712 } 2713 adev->ip_blocks[i].status.late_initialized = true; 2714 } 2715 2716 amdgpu_ras_set_error_query_ready(adev, true); 2717 2718 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2719 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2720 2721 amdgpu_device_fill_reset_magic(adev); 2722 2723 r = amdgpu_device_enable_mgpu_fan_boost(); 2724 if (r) 2725 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2726 2727 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2728 if (adev->asic_type == CHIP_ARCTURUS && 2729 amdgpu_passthrough(adev) && 2730 adev->gmc.xgmi.num_physical_nodes > 1) 2731 smu_set_light_sbr(&adev->smu, true); 2732 2733 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2734 mutex_lock(&mgpu_info.mutex); 2735 2736 /* 2737 * Reset device p-state to low as it was booted with high. 2738 * 2739 * This should be performed only after all devices from the same 2740 * hive get initialized. 2741 * 2742 * However, the number of devices in the hive is not known in advance, 2743 * as it is only counted one by one while the devices initialize. 2744 * 2745 * So we wait until all XGMI interlinked devices have initialized. 2746 * This may bring some delays as those devices may come from 2747 * different hives. But that should be OK.
2748 */ 2749 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2750 for (i = 0; i < mgpu_info.num_gpu; i++) { 2751 gpu_instance = &(mgpu_info.gpu_ins[i]); 2752 if (gpu_instance->adev->flags & AMD_IS_APU) 2753 continue; 2754 2755 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2756 AMDGPU_XGMI_PSTATE_MIN); 2757 if (r) { 2758 DRM_ERROR("pstate setting failed (%d).\n", r); 2759 break; 2760 } 2761 } 2762 } 2763 2764 mutex_unlock(&mgpu_info.mutex); 2765 } 2766 2767 return 0; 2768 } 2769 2770 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2771 { 2772 int i, r; 2773 2774 for (i = 0; i < adev->num_ip_blocks; i++) { 2775 if (!adev->ip_blocks[i].version->funcs->early_fini) 2776 continue; 2777 2778 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2779 if (r) { 2780 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2781 adev->ip_blocks[i].version->funcs->name, r); 2782 } 2783 } 2784 2785 amdgpu_amdkfd_suspend(adev, false); 2786 2787 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2788 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2789 2790 /* need to disable SMC first */ 2791 for (i = 0; i < adev->num_ip_blocks; i++) { 2792 if (!adev->ip_blocks[i].status.hw) 2793 continue; 2794 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2795 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2796 /* XXX handle errors */ 2797 if (r) { 2798 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2799 adev->ip_blocks[i].version->funcs->name, r); 2800 } 2801 adev->ip_blocks[i].status.hw = false; 2802 break; 2803 } 2804 } 2805 2806 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2807 if (!adev->ip_blocks[i].status.hw) 2808 continue; 2809 2810 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2811 /* XXX handle errors */ 2812 if (r) { 2813 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2814 adev->ip_blocks[i].version->funcs->name, r); 2815 } 2816 2817 adev->ip_blocks[i].status.hw = false; 2818 } 2819 2820 if (amdgpu_sriov_vf(adev)) { 2821 if (amdgpu_virt_release_full_gpu(adev, false)) 2822 DRM_ERROR("failed to release exclusive mode on fini\n"); 2823 } 2824 2825 return 0; 2826 } 2827 2828 /** 2829 * amdgpu_device_ip_fini - run fini for hardware IPs 2830 * 2831 * @adev: amdgpu_device pointer 2832 * 2833 * Main teardown pass for hardware IPs. The list of all the hardware 2834 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2835 * are run. hw_fini tears down the hardware associated with each IP 2836 * and sw_fini tears down any software state associated with each IP. 2837 * Returns 0 on success, negative error code on failure. 
2838 */ 2839 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2840 { 2841 int i, r; 2842 2843 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2844 amdgpu_virt_release_ras_err_handler_data(adev); 2845 2846 amdgpu_ras_pre_fini(adev); 2847 2848 if (adev->gmc.xgmi.num_physical_nodes > 1) 2849 amdgpu_xgmi_remove_device(adev); 2850 2851 amdgpu_amdkfd_device_fini_sw(adev); 2852 2853 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2854 if (!adev->ip_blocks[i].status.sw) 2855 continue; 2856 2857 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2858 amdgpu_ucode_free_bo(adev); 2859 amdgpu_free_static_csa(&adev->virt.csa_obj); 2860 amdgpu_device_wb_fini(adev); 2861 amdgpu_device_vram_scratch_fini(adev); 2862 amdgpu_ib_pool_fini(adev); 2863 } 2864 2865 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2866 /* XXX handle errors */ 2867 if (r) { 2868 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2869 adev->ip_blocks[i].version->funcs->name, r); 2870 } 2871 adev->ip_blocks[i].status.sw = false; 2872 adev->ip_blocks[i].status.valid = false; 2873 } 2874 2875 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2876 if (!adev->ip_blocks[i].status.late_initialized) 2877 continue; 2878 if (adev->ip_blocks[i].version->funcs->late_fini) 2879 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2880 adev->ip_blocks[i].status.late_initialized = false; 2881 } 2882 2883 amdgpu_ras_fini(adev); 2884 2885 return 0; 2886 } 2887 2888 /** 2889 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2890 * 2891 * @work: work_struct. 2892 */ 2893 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2894 { 2895 struct amdgpu_device *adev = 2896 container_of(work, struct amdgpu_device, delayed_init_work.work); 2897 int r; 2898 2899 r = amdgpu_ib_ring_tests(adev); 2900 if (r) 2901 DRM_ERROR("ib ring test failed (%d).\n", r); 2902 } 2903 2904 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2905 { 2906 struct amdgpu_device *adev = 2907 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2908 2909 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2910 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2911 2912 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2913 adev->gfx.gfx_off_state = true; 2914 } 2915 2916 /** 2917 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2918 * 2919 * @adev: amdgpu_device pointer 2920 * 2921 * Main suspend function for hardware IPs. The list of all the hardware 2922 * IPs that make up the asic is walked, clockgating is disabled and the 2923 * suspend callbacks are run. suspend puts the hardware and software state 2924 * in each IP into a state suitable for suspend. 2925 * Returns 0 on success, negative error code on failure. 
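 *
 * Note that phase 1 only handles the display controllers: the loop below
 * skips every block whose type is not AMD_IP_BLOCK_TYPE_DCE and leaves
 * those blocks to amdgpu_device_ip_suspend_phase2().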
2926 */ 2927 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2928 { 2929 int i, r; 2930 2931 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2932 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2933 2934 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2935 if (!adev->ip_blocks[i].status.valid) 2936 continue; 2937 2938 /* displays are handled separately */ 2939 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2940 continue; 2941 2942 /* XXX handle errors */ 2943 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2944 /* XXX handle errors */ 2945 if (r) { 2946 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2947 adev->ip_blocks[i].version->funcs->name, r); 2948 return r; 2949 } 2950 2951 adev->ip_blocks[i].status.hw = false; 2952 } 2953 2954 return 0; 2955 } 2956 2957 /** 2958 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2959 * 2960 * @adev: amdgpu_device pointer 2961 * 2962 * Main suspend function for hardware IPs. The list of all the hardware 2963 * IPs that make up the asic is walked, clockgating is disabled and the 2964 * suspend callbacks are run. suspend puts the hardware and software state 2965 * in each IP into a state suitable for suspend. 2966 * Returns 0 on success, negative error code on failure. 2967 */ 2968 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2969 { 2970 int i, r; 2971 2972 if (adev->in_s0ix) 2973 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2974 2975 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2976 if (!adev->ip_blocks[i].status.valid) 2977 continue; 2978 /* displays are handled in phase1 */ 2979 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2980 continue; 2981 /* PSP lost connection when err_event_athub occurs */ 2982 if (amdgpu_ras_intr_triggered() && 2983 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2984 adev->ip_blocks[i].status.hw = false; 2985 continue; 2986 } 2987 2988 /* skip unnecessary suspend if we do not initialize them yet */ 2989 if (adev->gmc.xgmi.pending_reset && 2990 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2991 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2992 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2994 adev->ip_blocks[i].status.hw = false; 2995 continue; 2996 } 2997 2998 /* skip suspend of gfx and psp for S0ix 2999 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3000 * like at runtime. PSP is also part of the always on hardware 3001 * so no need to suspend it. 
3002 */ 3003 if (adev->in_s0ix && 3004 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3005 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 3006 continue; 3007 3008 /* XXX handle errors */ 3009 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3010 /* XXX handle errors */ 3011 if (r) { 3012 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3013 adev->ip_blocks[i].version->funcs->name, r); 3014 } 3015 adev->ip_blocks[i].status.hw = false; 3016 /* handle putting the SMC in the appropriate state */ 3017 if(!amdgpu_sriov_vf(adev)){ 3018 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3019 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3020 if (r) { 3021 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3022 adev->mp1_state, r); 3023 return r; 3024 } 3025 } 3026 } 3027 } 3028 3029 return 0; 3030 } 3031 3032 /** 3033 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3034 * 3035 * @adev: amdgpu_device pointer 3036 * 3037 * Main suspend function for hardware IPs. The list of all the hardware 3038 * IPs that make up the asic is walked, clockgating is disabled and the 3039 * suspend callbacks are run. suspend puts the hardware and software state 3040 * in each IP into a state suitable for suspend. 3041 * Returns 0 on success, negative error code on failure. 3042 */ 3043 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3044 { 3045 int r; 3046 3047 if (amdgpu_sriov_vf(adev)) { 3048 amdgpu_virt_fini_data_exchange(adev); 3049 amdgpu_virt_request_full_gpu(adev, false); 3050 } 3051 3052 r = amdgpu_device_ip_suspend_phase1(adev); 3053 if (r) 3054 return r; 3055 r = amdgpu_device_ip_suspend_phase2(adev); 3056 3057 if (amdgpu_sriov_vf(adev)) 3058 amdgpu_virt_release_full_gpu(adev, false); 3059 3060 return r; 3061 } 3062 3063 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3064 { 3065 int i, r; 3066 3067 static enum amd_ip_block_type ip_order[] = { 3068 AMD_IP_BLOCK_TYPE_GMC, 3069 AMD_IP_BLOCK_TYPE_COMMON, 3070 AMD_IP_BLOCK_TYPE_PSP, 3071 AMD_IP_BLOCK_TYPE_IH, 3072 }; 3073 3074 for (i = 0; i < adev->num_ip_blocks; i++) { 3075 int j; 3076 struct amdgpu_ip_block *block; 3077 3078 block = &adev->ip_blocks[i]; 3079 block->status.hw = false; 3080 3081 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3082 3083 if (block->version->type != ip_order[j] || 3084 !block->status.valid) 3085 continue; 3086 3087 r = block->version->funcs->hw_init(adev); 3088 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3089 if (r) 3090 return r; 3091 block->status.hw = true; 3092 } 3093 } 3094 3095 return 0; 3096 } 3097 3098 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3099 { 3100 int i, r; 3101 3102 static enum amd_ip_block_type ip_order[] = { 3103 AMD_IP_BLOCK_TYPE_SMC, 3104 AMD_IP_BLOCK_TYPE_DCE, 3105 AMD_IP_BLOCK_TYPE_GFX, 3106 AMD_IP_BLOCK_TYPE_SDMA, 3107 AMD_IP_BLOCK_TYPE_UVD, 3108 AMD_IP_BLOCK_TYPE_VCE, 3109 AMD_IP_BLOCK_TYPE_VCN 3110 }; 3111 3112 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3113 int j; 3114 struct amdgpu_ip_block *block; 3115 3116 for (j = 0; j < adev->num_ip_blocks; j++) { 3117 block = &adev->ip_blocks[j]; 3118 3119 if (block->version->type != ip_order[i] || 3120 !block->status.valid || 3121 block->status.hw) 3122 continue; 3123 3124 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3125 r = block->version->funcs->resume(adev); 3126 else 3127 r = block->version->funcs->hw_init(adev); 3128 3129 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3130 if (r) 3131 return r; 3132 block->status.hw = true; 3133 } 3134 } 3135 3136 return 0; 3137 } 3138 3139 /** 3140 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3141 * 3142 * @adev: amdgpu_device pointer 3143 * 3144 * First resume function for hardware IPs. The list of all the hardware 3145 * IPs that make up the asic is walked and the resume callbacks are run for 3146 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3147 * after a suspend and updates the software state as necessary. This 3148 * function is also used for restoring the GPU after a GPU reset. 3149 * Returns 0 on success, negative error code on failure. 3150 */ 3151 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3152 { 3153 int i, r; 3154 3155 for (i = 0; i < adev->num_ip_blocks; i++) { 3156 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3157 continue; 3158 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3159 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3160 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3161 3162 r = adev->ip_blocks[i].version->funcs->resume(adev); 3163 if (r) { 3164 DRM_ERROR("resume of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, r); 3166 return r; 3167 } 3168 adev->ip_blocks[i].status.hw = true; 3169 } 3170 } 3171 3172 return 0; 3173 } 3174 3175 /** 3176 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3177 * 3178 * @adev: amdgpu_device pointer 3179 * 3180 * Second resume function for hardware IPs. The list of all the hardware 3181 * IPs that make up the asic is walked and the resume callbacks are run for 3182 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3183 * functional state after a suspend and updates the software state as 3184 * necessary. This function is also used for restoring the GPU after a GPU 3185 * reset. 3186 * Returns 0 on success, negative error code on failure. 3187 */ 3188 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3189 { 3190 int i, r; 3191 3192 for (i = 0; i < adev->num_ip_blocks; i++) { 3193 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3194 continue; 3195 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3196 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3197 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3198 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3199 continue; 3200 r = adev->ip_blocks[i].version->funcs->resume(adev); 3201 if (r) { 3202 DRM_ERROR("resume of IP block <%s> failed %d\n", 3203 adev->ip_blocks[i].version->funcs->name, r); 3204 return r; 3205 } 3206 adev->ip_blocks[i].status.hw = true; 3207 } 3208 3209 return 0; 3210 } 3211 3212 /** 3213 * amdgpu_device_ip_resume - run resume for hardware IPs 3214 * 3215 * @adev: amdgpu_device pointer 3216 * 3217 * Main resume function for hardware IPs. The hardware IPs 3218 * are split into two resume functions because they are 3219 * also used in recovering from a GPU reset and some additional 3220 * steps need to be taken between them. In this case (S3/S4) they are 3221 * run sequentially. 3222 * Returns 0 on success, negative error code on failure.
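 *
 * A sketch of the sequence implemented below (error handling omitted):
 *
 *   amdgpu_amdkfd_resume_iommu(adev);
 *   amdgpu_device_ip_resume_phase1(adev);   // COMMON, GMC and IH
 *   amdgpu_device_fw_loading(adev);
 *   amdgpu_device_ip_resume_phase2(adev);   // everything else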
3223 */ 3224 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3225 { 3226 int r; 3227 3228 r = amdgpu_amdkfd_resume_iommu(adev); 3229 if (r) 3230 return r; 3231 3232 r = amdgpu_device_ip_resume_phase1(adev); 3233 if (r) 3234 return r; 3235 3236 r = amdgpu_device_fw_loading(adev); 3237 if (r) 3238 return r; 3239 3240 r = amdgpu_device_ip_resume_phase2(adev); 3241 3242 return r; 3243 } 3244 3245 /** 3246 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3247 * 3248 * @adev: amdgpu_device pointer 3249 * 3250 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3251 */ 3252 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3253 { 3254 if (amdgpu_sriov_vf(adev)) { 3255 if (adev->is_atom_fw) { 3256 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3257 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3258 } else { 3259 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3260 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3261 } 3262 3263 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3264 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3265 } 3266 } 3267 3268 /** 3269 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3270 * 3271 * @asic_type: AMD asic type 3272 * 3273 * Check if there is DC (new modesetting infrastructre) support for an asic. 3274 * returns true if DC has support, false if not. 3275 */ 3276 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3277 { 3278 switch (asic_type) { 3279 #if defined(CONFIG_DRM_AMD_DC) 3280 #if defined(CONFIG_DRM_AMD_DC_SI) 3281 case CHIP_TAHITI: 3282 case CHIP_PITCAIRN: 3283 case CHIP_VERDE: 3284 case CHIP_OLAND: 3285 #endif 3286 case CHIP_BONAIRE: 3287 case CHIP_KAVERI: 3288 case CHIP_KABINI: 3289 case CHIP_MULLINS: 3290 /* 3291 * We have systems in the wild with these ASICs that require 3292 * LVDS and VGA support which is not supported with DC. 3293 * 3294 * Fallback to the non-DC driver here by default so as not to 3295 * cause regressions. 
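 *
 * DC can still be opted into on these parts, e.g. by booting with the
 * amdgpu.dc=1 module parameter; that is what the amdgpu_dc > 0 check
 * below honours (assuming the usual mapping of the dc parameter onto
 * the amdgpu_dc variable).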
3296 */ 3297 return amdgpu_dc > 0; 3298 case CHIP_HAWAII: 3299 case CHIP_CARRIZO: 3300 case CHIP_STONEY: 3301 case CHIP_POLARIS10: 3302 case CHIP_POLARIS11: 3303 case CHIP_POLARIS12: 3304 case CHIP_VEGAM: 3305 case CHIP_TONGA: 3306 case CHIP_FIJI: 3307 case CHIP_VEGA10: 3308 case CHIP_VEGA12: 3309 case CHIP_VEGA20: 3310 #if defined(CONFIG_DRM_AMD_DC_DCN) 3311 case CHIP_RAVEN: 3312 case CHIP_NAVI10: 3313 case CHIP_NAVI14: 3314 case CHIP_NAVI12: 3315 case CHIP_RENOIR: 3316 case CHIP_SIENNA_CICHLID: 3317 case CHIP_NAVY_FLOUNDER: 3318 case CHIP_DIMGREY_CAVEFISH: 3319 case CHIP_BEIGE_GOBY: 3320 case CHIP_VANGOGH: 3321 case CHIP_YELLOW_CARP: 3322 #endif 3323 return amdgpu_dc != 0; 3324 #endif 3325 default: 3326 if (amdgpu_dc > 0) 3327 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3328 "but isn't supported by ASIC, ignoring\n"); 3329 return false; 3330 } 3331 } 3332 3333 /** 3334 * amdgpu_device_has_dc_support - check if dc is supported 3335 * 3336 * @adev: amdgpu_device pointer 3337 * 3338 * Returns true for supported, false for not supported 3339 */ 3340 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3341 { 3342 if (amdgpu_sriov_vf(adev) || 3343 adev->enable_virtual_display || 3344 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3345 return false; 3346 3347 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3348 } 3349 3350 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3351 { 3352 struct amdgpu_device *adev = 3353 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3354 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3355 3356 /* It's a bug to not have a hive within this function */ 3357 if (WARN_ON(!hive)) 3358 return; 3359 3360 /* 3361 * Use task barrier to synchronize all xgmi reset works across the 3362 * hive. task_barrier_enter and task_barrier_exit will block 3363 * until all the threads running the xgmi reset works reach 3364 * those points. task_barrier_full will do both blocks. 3365 */ 3366 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3367 3368 task_barrier_enter(&hive->tb); 3369 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3370 3371 if (adev->asic_reset_res) 3372 goto fail; 3373 3374 task_barrier_exit(&hive->tb); 3375 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3376 3377 if (adev->asic_reset_res) 3378 goto fail; 3379 3380 if (adev->mmhub.ras_funcs && 3381 adev->mmhub.ras_funcs->reset_ras_error_count) 3382 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3383 } else { 3384 3385 task_barrier_full(&hive->tb); 3386 adev->asic_reset_res = amdgpu_asic_reset(adev); 3387 } 3388 3389 fail: 3390 if (adev->asic_reset_res) 3391 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3392 adev->asic_reset_res, adev_to_drm(adev)->unique); 3393 amdgpu_put_xgmi_hive(hive); 3394 } 3395 3396 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3397 { 3398 char *input = amdgpu_lockup_timeout; 3399 char *timeout_setting = NULL; 3400 int index = 0; 3401 long timeout; 3402 int ret = 0; 3403 3404 /* 3405 * By default timeout for non compute jobs is 10000 3406 * and 60000 for compute jobs. 3407 * In SR-IOV or passthrough mode, timeout for compute 3408 * jobs are 60000 by default. 3409 */ 3410 adev->gfx_timeout = msecs_to_jiffies(10000); 3411 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3412 if (amdgpu_sriov_vf(adev)) 3413 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3414 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3415 else 3416 adev->compute_timeout = msecs_to_jiffies(60000); 3417 3418 #ifdef notyet 3419 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3420 while ((timeout_setting = strsep(&input, ",")) && 3421 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3422 ret = kstrtol(timeout_setting, 0, &timeout); 3423 if (ret) 3424 return ret; 3425 3426 if (timeout == 0) { 3427 index++; 3428 continue; 3429 } else if (timeout < 0) { 3430 timeout = MAX_SCHEDULE_TIMEOUT; 3431 } else { 3432 timeout = msecs_to_jiffies(timeout); 3433 } 3434 3435 switch (index++) { 3436 case 0: 3437 adev->gfx_timeout = timeout; 3438 break; 3439 case 1: 3440 adev->compute_timeout = timeout; 3441 break; 3442 case 2: 3443 adev->sdma_timeout = timeout; 3444 break; 3445 case 3: 3446 adev->video_timeout = timeout; 3447 break; 3448 default: 3449 break; 3450 } 3451 } 3452 /* 3453 * There is only one value specified and 3454 * it should apply to all non-compute jobs. 3455 */ 3456 if (index == 1) { 3457 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3458 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3459 adev->compute_timeout = adev->gfx_timeout; 3460 } 3461 } 3462 #endif 3463 3464 return ret; 3465 } 3466 3467 static const struct attribute *amdgpu_dev_attributes[] = { 3468 &dev_attr_product_name.attr, 3469 &dev_attr_product_number.attr, 3470 &dev_attr_serial_number.attr, 3471 &dev_attr_pcie_replay_count.attr, 3472 NULL 3473 }; 3474 3475 /** 3476 * amdgpu_device_init - initialize the driver 3477 * 3478 * @adev: amdgpu_device pointer 3479 * @flags: driver flags 3480 * 3481 * Initializes the driver info and hw (all asics). 3482 * Returns 0 for success or an error on failure. 3483 * Called at driver startup. 
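 *
 * A very rough sketch of the main steps in the body below (many steps,
 * including all error handling, omitted):
 *
 *   amdgpu_device_ip_early_init(adev);
 *   amdgpu_device_doorbell_init(adev);
 *   amdgpu_device_detect_sriov_bios(adev);
 *   amdgpu_fence_driver_sw_init(adev);
 *   amdgpu_device_ip_init(adev);
 *   amdgpu_device_ip_late_init(adev);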
3484 */ 3485 int amdgpu_device_init(struct amdgpu_device *adev, 3486 uint32_t flags) 3487 { 3488 struct drm_device *ddev = adev_to_drm(adev); 3489 struct pci_dev *pdev = adev->pdev; 3490 int r, i; 3491 bool px = false; 3492 u32 max_MBps; 3493 3494 adev->shutdown = false; 3495 adev->flags = flags; 3496 3497 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3498 adev->asic_type = amdgpu_force_asic_type; 3499 else 3500 adev->asic_type = flags & AMD_ASIC_MASK; 3501 3502 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3503 if (amdgpu_emu_mode == 1) 3504 adev->usec_timeout *= 10; 3505 adev->gmc.gart_size = 512 * 1024 * 1024; 3506 adev->accel_working = false; 3507 adev->num_rings = 0; 3508 adev->mman.buffer_funcs = NULL; 3509 adev->mman.buffer_funcs_ring = NULL; 3510 adev->vm_manager.vm_pte_funcs = NULL; 3511 adev->vm_manager.vm_pte_num_scheds = 0; 3512 adev->gmc.gmc_funcs = NULL; 3513 adev->harvest_ip_mask = 0x0; 3514 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3515 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3516 3517 adev->smc_rreg = &amdgpu_invalid_rreg; 3518 adev->smc_wreg = &amdgpu_invalid_wreg; 3519 adev->pcie_rreg = &amdgpu_invalid_rreg; 3520 adev->pcie_wreg = &amdgpu_invalid_wreg; 3521 adev->pciep_rreg = &amdgpu_invalid_rreg; 3522 adev->pciep_wreg = &amdgpu_invalid_wreg; 3523 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3524 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3525 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3526 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3527 adev->didt_rreg = &amdgpu_invalid_rreg; 3528 adev->didt_wreg = &amdgpu_invalid_wreg; 3529 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3530 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3531 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3532 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3533 3534 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3535 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3536 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3537 3538 /* mutex initialization are all done here so we 3539 * can recall function without having locking issues */ 3540 rw_init(&adev->firmware.mutex, "agfw"); 3541 rw_init(&adev->pm.mutex, "agpm"); 3542 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3543 rw_init(&adev->srbm_mutex, "srbm"); 3544 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3545 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3546 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3547 rw_init(&adev->mn_lock, "agpumn"); 3548 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3549 hash_init(adev->mn_hash); 3550 atomic_set(&adev->in_gpu_reset, 0); 3551 rw_init(&adev->reset_sem, "amrs"); 3552 rw_init(&adev->psp.mutex, "agpsp"); 3553 rw_init(&adev->notifier_lock, "agnf"); 3554 3555 r = amdgpu_device_init_apu_flags(adev); 3556 if (r) 3557 return r; 3558 3559 r = amdgpu_device_check_arguments(adev); 3560 if (r) 3561 return r; 3562 3563 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3564 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3565 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3566 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3567 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3568 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3569 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3570 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3571 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3572 3573 INIT_LIST_HEAD(&adev->shadow_list); 3574 rw_init(&adev->shadow_list_lock, "sdwlst"); 3575 3576 INIT_LIST_HEAD(&adev->reset_list); 
3577 3578 INIT_DELAYED_WORK(&adev->delayed_init_work, 3579 amdgpu_device_delayed_init_work_handler); 3580 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3581 amdgpu_device_delay_enable_gfx_off); 3582 3583 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3584 3585 adev->gfx.gfx_off_req_count = 1; 3586 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3587 3588 atomic_set(&adev->throttling_logging_enabled, 1); 3589 /* 3590 * If throttling continues, logging will be performed every minute 3591 * to avoid log flooding. "-1" is subtracted since the thermal 3592 * throttling interrupt comes every second. Thus, the total logging 3593 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3594 * for throttling interrupt) = 60 seconds. 3595 */ 3596 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3597 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3598 3599 #ifdef __linux__ 3600 /* Registers mapping */ 3601 /* TODO: block userspace mapping of io register */ 3602 if (adev->asic_type >= CHIP_BONAIRE) { 3603 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3604 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3605 } else { 3606 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3607 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3608 } 3609 3610 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3611 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3612 3613 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3614 if (adev->rmmio == NULL) { 3615 return -ENOMEM; 3616 } 3617 #endif 3618 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3619 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3620 3621 /* enable PCIE atomic ops */ 3622 #ifdef notyet 3623 r = pci_enable_atomic_ops_to_root(adev->pdev, 3624 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3625 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3626 if (r) { 3627 adev->have_atomics_support = false; 3628 DRM_INFO("PCIE atomic ops is not supported\n"); 3629 } else { 3630 adev->have_atomics_support = true; 3631 } 3632 #else 3633 adev->have_atomics_support = false; 3634 #endif 3635 3636 amdgpu_device_get_pcie_info(adev); 3637 3638 if (amdgpu_mcbp) 3639 DRM_INFO("MCBP is enabled\n"); 3640 3641 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3642 adev->enable_mes = true; 3643 3644 /* detect hw virtualization here */ 3645 amdgpu_detect_virtualization(adev); 3646 3647 r = amdgpu_device_get_job_timeout_settings(adev); 3648 if (r) { 3649 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3650 return r; 3651 } 3652 3653 /* early init functions */ 3654 r = amdgpu_device_ip_early_init(adev); 3655 if (r) 3656 return r; 3657 3658 /* doorbell bar mapping and doorbell index init*/ 3659 amdgpu_device_doorbell_init(adev); 3660 3661 if (amdgpu_emu_mode == 1) { 3662 /* post the asic on emulation mode */ 3663 emu_soc_asic_init(adev); 3664 goto fence_driver_init; 3665 } 3666 3667 amdgpu_reset_init(adev); 3668 3669 /* detect if we are with an SRIOV vbios */ 3670 amdgpu_device_detect_sriov_bios(adev); 3671 3672 /* check if we need to reset the asic 3673 * E.g., driver was not cleanly unloaded previously, etc. 
3674 */ 3675 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3676 if (adev->gmc.xgmi.num_physical_nodes) { 3677 dev_info(adev->dev, "Pending hive reset.\n"); 3678 adev->gmc.xgmi.pending_reset = true; 3679 /* Only need to init necessary block for SMU to handle the reset */ 3680 for (i = 0; i < adev->num_ip_blocks; i++) { 3681 if (!adev->ip_blocks[i].status.valid) 3682 continue; 3683 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3684 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3685 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3687 DRM_DEBUG("IP %s disabled for hw_init.\n", 3688 adev->ip_blocks[i].version->funcs->name); 3689 adev->ip_blocks[i].status.hw = true; 3690 } 3691 } 3692 } else { 3693 r = amdgpu_asic_reset(adev); 3694 if (r) { 3695 dev_err(adev->dev, "asic reset on init failed\n"); 3696 goto failed; 3697 } 3698 } 3699 } 3700 3701 pci_enable_pcie_error_reporting(adev->pdev); 3702 3703 /* Post card if necessary */ 3704 if (amdgpu_device_need_post(adev)) { 3705 if (!adev->bios) { 3706 dev_err(adev->dev, "no vBIOS found\n"); 3707 r = -EINVAL; 3708 goto failed; 3709 } 3710 DRM_INFO("GPU posting now...\n"); 3711 r = amdgpu_device_asic_init(adev); 3712 if (r) { 3713 dev_err(adev->dev, "gpu post error!\n"); 3714 goto failed; 3715 } 3716 } 3717 3718 if (adev->is_atom_fw) { 3719 /* Initialize clocks */ 3720 r = amdgpu_atomfirmware_get_clock_info(adev); 3721 if (r) { 3722 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3723 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3724 goto failed; 3725 } 3726 } else { 3727 /* Initialize clocks */ 3728 r = amdgpu_atombios_get_clock_info(adev); 3729 if (r) { 3730 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3731 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3732 goto failed; 3733 } 3734 /* init i2c buses */ 3735 if (!amdgpu_device_has_dc_support(adev)) 3736 amdgpu_atombios_i2c_init(adev); 3737 } 3738 3739 fence_driver_init: 3740 /* Fence driver */ 3741 r = amdgpu_fence_driver_sw_init(adev); 3742 if (r) { 3743 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3744 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3745 goto failed; 3746 } 3747 3748 /* init the mode config */ 3749 drm_mode_config_init(adev_to_drm(adev)); 3750 3751 r = amdgpu_device_ip_init(adev); 3752 if (r) { 3753 /* failed in exclusive mode due to timeout */ 3754 if (amdgpu_sriov_vf(adev) && 3755 !amdgpu_sriov_runtime(adev) && 3756 amdgpu_virt_mmio_blocked(adev) && 3757 !amdgpu_virt_wait_reset(adev)) { 3758 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3759 /* Don't send request since VF is inactive. 
*/ 3760 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3761 adev->virt.ops = NULL; 3762 r = -EAGAIN; 3763 goto release_ras_con; 3764 } 3765 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3766 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3767 goto release_ras_con; 3768 } 3769 3770 amdgpu_fence_driver_hw_init(adev); 3771 3772 dev_info(adev->dev, 3773 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3774 adev->gfx.config.max_shader_engines, 3775 adev->gfx.config.max_sh_per_se, 3776 adev->gfx.config.max_cu_per_sh, 3777 adev->gfx.cu_info.number); 3778 3779 #ifdef __OpenBSD__ 3780 { 3781 const char *chip_name; 3782 3783 switch (adev->asic_type) { 3784 case CHIP_RAVEN: 3785 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3786 chip_name = "RAVEN2"; 3787 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3788 chip_name = "PICASSO"; 3789 else 3790 chip_name = "RAVEN"; 3791 break; 3792 case CHIP_RENOIR: 3793 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3794 chip_name = "RENOIR"; 3795 else 3796 chip_name = "GREEN_SARDINE"; 3797 break; 3798 default: 3799 chip_name = amdgpu_asic_name[adev->asic_type]; 3800 } 3801 printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname, 3802 chip_name, adev->gfx.cu_info.number, adev->rev_id); 3803 } 3804 #endif 3805 3806 adev->accel_working = true; 3807 3808 amdgpu_vm_check_compute_bug(adev); 3809 3810 /* Initialize the buffer migration limit. */ 3811 if (amdgpu_moverate >= 0) 3812 max_MBps = amdgpu_moverate; 3813 else 3814 max_MBps = 8; /* Allow 8 MB/s. */ 3815 /* Get a log2 for easy divisions. */ 3816 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3817 3818 amdgpu_fbdev_init(adev); 3819 3820 r = amdgpu_pm_sysfs_init(adev); 3821 if (r) { 3822 adev->pm_sysfs_en = false; 3823 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3824 } else 3825 adev->pm_sysfs_en = true; 3826 3827 r = amdgpu_ucode_sysfs_init(adev); 3828 if (r) { 3829 adev->ucode_sysfs_en = false; 3830 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3831 } else 3832 adev->ucode_sysfs_en = true; 3833 3834 if ((amdgpu_testing & 1)) { 3835 if (adev->accel_working) 3836 amdgpu_test_moves(adev); 3837 else 3838 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3839 } 3840 if (amdgpu_benchmarking) { 3841 if (adev->accel_working) 3842 amdgpu_benchmark(adev, amdgpu_benchmarking); 3843 else 3844 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3845 } 3846 3847 /* 3848 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3849 * Otherwise the mgpu fan boost feature will be skipped due to the 3850 * gpu instance is counted less. 3851 */ 3852 amdgpu_register_gpu_instance(adev); 3853 3854 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3855 * explicit gating rather than handling it automatically. 3856 */ 3857 if (!adev->gmc.xgmi.pending_reset) { 3858 r = amdgpu_device_ip_late_init(adev); 3859 if (r) { 3860 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3861 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3862 goto release_ras_con; 3863 } 3864 /* must succeed. 
*/ 3865 amdgpu_ras_resume(adev); 3866 queue_delayed_work(system_wq, &adev->delayed_init_work, 3867 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3868 } 3869 3870 if (amdgpu_sriov_vf(adev)) 3871 flush_delayed_work(&adev->delayed_init_work); 3872 3873 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3874 if (r) 3875 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3876 3877 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3878 r = amdgpu_pmu_init(adev); 3879 if (r) 3880 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3881 3882 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3883 if (amdgpu_device_cache_pci_state(adev->pdev)) 3884 pci_restore_state(pdev); 3885 3886 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3887 /* this will fail for cards that aren't VGA class devices, just 3888 * ignore it */ 3889 #ifdef notyet 3890 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3891 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3892 #endif 3893 3894 if (amdgpu_device_supports_px(ddev)) { 3895 px = true; 3896 vga_switcheroo_register_client(adev->pdev, 3897 &amdgpu_switcheroo_ops, px); 3898 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3899 } 3900 3901 if (adev->gmc.xgmi.pending_reset) 3902 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3903 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3904 3905 return 0; 3906 3907 release_ras_con: 3908 amdgpu_release_ras_context(adev); 3909 3910 failed: 3911 amdgpu_vf_error_trans_all(adev); 3912 3913 return r; 3914 } 3915 3916 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3917 { 3918 STUB(); 3919 #ifdef notyet 3920 /* Clear all CPU mappings pointing to this device */ 3921 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3922 #endif 3923 3924 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3925 amdgpu_device_doorbell_fini(adev); 3926 3927 #ifdef __linux__ 3928 iounmap(adev->rmmio); 3929 adev->rmmio = NULL; 3930 if (adev->mman.aper_base_kaddr) 3931 iounmap(adev->mman.aper_base_kaddr); 3932 adev->mman.aper_base_kaddr = NULL; 3933 #else 3934 if (adev->rmmio_size > 0) 3935 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 3936 adev->rmmio_size); 3937 adev->rmmio_size = 0; 3938 adev->rmmio = NULL; 3939 if (adev->mman.aper_base_kaddr) 3940 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 3941 adev->gmc.visible_vram_size); 3942 adev->mman.aper_base_kaddr = NULL; 3943 #endif 3944 3945 /* Memory manager related */ 3946 if (!adev->gmc.xgmi.connected_to_cpu) { 3947 #ifdef __linux__ 3948 arch_phys_wc_del(adev->gmc.vram_mtrr); 3949 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3950 #else 3951 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 3952 #endif 3953 } 3954 } 3955 3956 /** 3957 * amdgpu_device_fini - tear down the driver 3958 * 3959 * @adev: amdgpu_device pointer 3960 * 3961 * Tear down the driver info (all asics). 3962 * Called at driver shutdown. 
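 *
 * Teardown is split into a hardware phase and a software phase. A rough
 * sketch of the expected ordering on the unload path (illustrative only,
 * the exact call sites live in the drm/pci glue):
 *
 *   amdgpu_device_fini_hw(adev);   - quiesce the HW, unmap MMIO
 *   ...
 *   amdgpu_device_fini_sw(adev);   - free SW state once the drm device goes away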
3963 */ 3964 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3965 { 3966 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3967 flush_delayed_work(&adev->delayed_init_work); 3968 if (adev->mman.initialized) { 3969 flush_delayed_work(&adev->mman.bdev.wq); 3970 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3971 } 3972 adev->shutdown = true; 3973 3974 /* make sure IB test finished before entering exclusive mode 3975 * to avoid preemption on IB test 3976 * */ 3977 if (amdgpu_sriov_vf(adev)) { 3978 amdgpu_virt_request_full_gpu(adev, false); 3979 amdgpu_virt_fini_data_exchange(adev); 3980 } 3981 3982 /* disable all interrupts */ 3983 amdgpu_irq_disable_all(adev); 3984 if (adev->mode_info.mode_config_initialized){ 3985 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3986 drm_helper_force_disable_all(adev_to_drm(adev)); 3987 else 3988 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3989 } 3990 amdgpu_fence_driver_hw_fini(adev); 3991 3992 if (adev->pm_sysfs_en) 3993 amdgpu_pm_sysfs_fini(adev); 3994 if (adev->ucode_sysfs_en) 3995 amdgpu_ucode_sysfs_fini(adev); 3996 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3997 3998 amdgpu_fbdev_fini(adev); 3999 4000 amdgpu_irq_fini_hw(adev); 4001 4002 amdgpu_device_ip_fini_early(adev); 4003 4004 amdgpu_gart_dummy_page_fini(adev); 4005 4006 amdgpu_device_unmap_mmio(adev); 4007 } 4008 4009 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4010 { 4011 amdgpu_fence_driver_sw_fini(adev); 4012 amdgpu_device_ip_fini(adev); 4013 release_firmware(adev->firmware.gpu_info_fw); 4014 adev->firmware.gpu_info_fw = NULL; 4015 adev->accel_working = false; 4016 4017 amdgpu_reset_fini(adev); 4018 4019 /* free i2c buses */ 4020 if (!amdgpu_device_has_dc_support(adev)) 4021 amdgpu_i2c_fini(adev); 4022 4023 if (amdgpu_emu_mode != 1) 4024 amdgpu_atombios_fini(adev); 4025 4026 kfree(adev->bios); 4027 adev->bios = NULL; 4028 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4029 vga_switcheroo_unregister_client(adev->pdev); 4030 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4031 } 4032 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4033 vga_client_unregister(adev->pdev); 4034 4035 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4036 amdgpu_pmu_fini(adev); 4037 if (adev->mman.discovery_bin) 4038 amdgpu_discovery_fini(adev); 4039 4040 kfree(adev->pci_state); 4041 4042 } 4043 4044 /** 4045 * amdgpu_device_evict_resources - evict device resources 4046 * @adev: amdgpu device object 4047 * 4048 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4049 * of the vram memory type. Mainly used for evicting device resources 4050 * at suspend time. 4051 * 4052 */ 4053 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4054 { 4055 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4056 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4057 return; 4058 4059 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4060 DRM_WARN("evicting device resources failed\n"); 4061 4062 } 4063 4064 /* 4065 * Suspend & resume. 4066 */ 4067 /** 4068 * amdgpu_device_suspend - initiate device suspend 4069 * 4070 * @dev: drm dev pointer 4071 * @fbcon : notify the fbdev of suspend 4072 * 4073 * Puts the hw in the suspend state (all asics). 4074 * Returns 0 for success or an error on failure. 4075 * Called at driver suspend. 
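 *
 * Illustrative only (not part of this file): a system PM callback would
 * typically just look up the drm device and call this helper, e.g.
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }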
4076 */ 4077 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4078 { 4079 struct amdgpu_device *adev = drm_to_adev(dev); 4080 4081 if (adev->shutdown) 4082 return 0; 4083 4084 #ifdef notyet 4085 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4086 return 0; 4087 #endif 4088 4089 adev->in_suspend = true; 4090 4091 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4092 DRM_WARN("smart shift update failed\n"); 4093 4094 drm_kms_helper_poll_disable(dev); 4095 4096 if (fbcon) 4097 amdgpu_fbdev_set_suspend(adev, 1); 4098 4099 cancel_delayed_work_sync(&adev->delayed_init_work); 4100 4101 amdgpu_ras_suspend(adev); 4102 4103 amdgpu_device_ip_suspend_phase1(adev); 4104 4105 if (!adev->in_s0ix) 4106 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4107 4108 /* First evict vram memory */ 4109 amdgpu_device_evict_resources(adev); 4110 4111 amdgpu_fence_driver_hw_fini(adev); 4112 4113 amdgpu_device_ip_suspend_phase2(adev); 4114 /* This second call to evict device resources is to evict 4115 * the gart page table using the CPU. 4116 */ 4117 amdgpu_device_evict_resources(adev); 4118 4119 return 0; 4120 } 4121 4122 /** 4123 * amdgpu_device_resume - initiate device resume 4124 * 4125 * @dev: drm dev pointer 4126 * @fbcon : notify the fbdev of resume 4127 * 4128 * Bring the hw back to operating state (all asics). 4129 * Returns 0 for success or an error on failure. 4130 * Called at driver resume. 4131 */ 4132 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4133 { 4134 struct amdgpu_device *adev = drm_to_adev(dev); 4135 int r = 0; 4136 4137 #ifdef notyet 4138 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4139 return 0; 4140 #endif 4141 4142 if (adev->in_s0ix) 4143 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 4144 4145 /* post card */ 4146 if (amdgpu_device_need_post(adev)) { 4147 r = amdgpu_device_asic_init(adev); 4148 if (r) 4149 dev_err(adev->dev, "amdgpu asic init failed\n"); 4150 } 4151 4152 r = amdgpu_device_ip_resume(adev); 4153 if (r) { 4154 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4155 return r; 4156 } 4157 amdgpu_fence_driver_hw_init(adev); 4158 4159 r = amdgpu_device_ip_late_init(adev); 4160 if (r) 4161 return r; 4162 4163 queue_delayed_work(system_wq, &adev->delayed_init_work, 4164 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4165 4166 if (!adev->in_s0ix) { 4167 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4168 if (r) 4169 return r; 4170 } 4171 4172 /* Make sure IB tests flushed */ 4173 flush_delayed_work(&adev->delayed_init_work); 4174 4175 if (fbcon) 4176 amdgpu_fbdev_set_suspend(adev, 0); 4177 4178 drm_kms_helper_poll_enable(dev); 4179 4180 amdgpu_ras_resume(adev); 4181 4182 /* 4183 * Most of the connector probing functions try to acquire runtime pm 4184 * refs to ensure that the GPU is powered on when connector polling is 4185 * performed. Since we're calling this from a runtime PM callback, 4186 * trying to acquire rpm refs will cause us to deadlock. 4187 * 4188 * Since we're guaranteed to be holding the rpm lock, it's safe to 4189 * temporarily disable the rpm helpers so this doesn't deadlock us. 
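 *
 * The power.disable_depth bump below is what keeps the runtime PM helpers
 * quiet while the hotplug/HPD event is delivered.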
4190 */ 4191 #if defined(CONFIG_PM) && defined(__linux__) 4192 dev->dev->power.disable_depth++; 4193 #endif 4194 if (!amdgpu_device_has_dc_support(adev)) 4195 drm_helper_hpd_irq_event(dev); 4196 else 4197 drm_kms_helper_hotplug_event(dev); 4198 #if defined(CONFIG_PM) && defined(__linux__) 4199 dev->dev->power.disable_depth--; 4200 #endif 4201 adev->in_suspend = false; 4202 4203 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4204 DRM_WARN("smart shift update failed\n"); 4205 4206 return 0; 4207 } 4208 4209 /** 4210 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4211 * 4212 * @adev: amdgpu_device pointer 4213 * 4214 * The list of all the hardware IPs that make up the asic is walked and 4215 * the check_soft_reset callbacks are run. check_soft_reset determines 4216 * if the asic is still hung or not. 4217 * Returns true if any of the IPs are still in a hung state, false if not. 4218 */ 4219 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4220 { 4221 int i; 4222 bool asic_hang = false; 4223 4224 if (amdgpu_sriov_vf(adev)) 4225 return true; 4226 4227 if (amdgpu_asic_need_full_reset(adev)) 4228 return true; 4229 4230 for (i = 0; i < adev->num_ip_blocks; i++) { 4231 if (!adev->ip_blocks[i].status.valid) 4232 continue; 4233 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4234 adev->ip_blocks[i].status.hang = 4235 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4236 if (adev->ip_blocks[i].status.hang) { 4237 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4238 asic_hang = true; 4239 } 4240 } 4241 return asic_hang; 4242 } 4243 4244 /** 4245 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4246 * 4247 * @adev: amdgpu_device pointer 4248 * 4249 * The list of all the hardware IPs that make up the asic is walked and the 4250 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4251 * handles any IP specific hardware or software state changes that are 4252 * necessary for a soft reset to succeed. 4253 * Returns 0 on success, negative error code on failure. 4254 */ 4255 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4256 { 4257 int i, r = 0; 4258 4259 for (i = 0; i < adev->num_ip_blocks; i++) { 4260 if (!adev->ip_blocks[i].status.valid) 4261 continue; 4262 if (adev->ip_blocks[i].status.hang && 4263 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4264 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4265 if (r) 4266 return r; 4267 } 4268 } 4269 4270 return 0; 4271 } 4272 4273 /** 4274 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4275 * 4276 * @adev: amdgpu_device pointer 4277 * 4278 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4279 * reset is necessary to recover. 4280 * Returns true if a full asic reset is required, false if not. 
4281 */ 4282 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4283 { 4284 int i; 4285 4286 if (amdgpu_asic_need_full_reset(adev)) 4287 return true; 4288 4289 for (i = 0; i < adev->num_ip_blocks; i++) { 4290 if (!adev->ip_blocks[i].status.valid) 4291 continue; 4292 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4293 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4294 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4295 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4296 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4297 if (adev->ip_blocks[i].status.hang) { 4298 dev_info(adev->dev, "Some block need full reset!\n"); 4299 return true; 4300 } 4301 } 4302 } 4303 return false; 4304 } 4305 4306 /** 4307 * amdgpu_device_ip_soft_reset - do a soft reset 4308 * 4309 * @adev: amdgpu_device pointer 4310 * 4311 * The list of all the hardware IPs that make up the asic is walked and the 4312 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4313 * IP specific hardware or software state changes that are necessary to soft 4314 * reset the IP. 4315 * Returns 0 on success, negative error code on failure. 4316 */ 4317 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4318 { 4319 int i, r = 0; 4320 4321 for (i = 0; i < adev->num_ip_blocks; i++) { 4322 if (!adev->ip_blocks[i].status.valid) 4323 continue; 4324 if (adev->ip_blocks[i].status.hang && 4325 adev->ip_blocks[i].version->funcs->soft_reset) { 4326 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4327 if (r) 4328 return r; 4329 } 4330 } 4331 4332 return 0; 4333 } 4334 4335 /** 4336 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4337 * 4338 * @adev: amdgpu_device pointer 4339 * 4340 * The list of all the hardware IPs that make up the asic is walked and the 4341 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4342 * handles any IP specific hardware or software state changes that are 4343 * necessary after the IP has been soft reset. 4344 * Returns 0 on success, negative error code on failure. 4345 */ 4346 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4347 { 4348 int i, r = 0; 4349 4350 for (i = 0; i < adev->num_ip_blocks; i++) { 4351 if (!adev->ip_blocks[i].status.valid) 4352 continue; 4353 if (adev->ip_blocks[i].status.hang && 4354 adev->ip_blocks[i].version->funcs->post_soft_reset) 4355 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4356 if (r) 4357 return r; 4358 } 4359 4360 return 0; 4361 } 4362 4363 /** 4364 * amdgpu_device_recover_vram - Recover some VRAM contents 4365 * 4366 * @adev: amdgpu_device pointer 4367 * 4368 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4369 * restore things like GPUVM page tables after a GPU reset where 4370 * the contents of VRAM might be lost. 4371 * 4372 * Returns: 4373 * 0 on success, negative error code on failure. 
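 *
 * Roughly the caller pattern used by the reset paths below (see
 * amdgpu_device_reset_sriov and amdgpu_do_asic_reset):
 *
 *   if (vram_lost) {
 *           amdgpu_inc_vram_lost(adev);
 *           r = amdgpu_device_recover_vram(adev);
 *   }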
4374 */ 4375 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4376 { 4377 struct dma_fence *fence = NULL, *next = NULL; 4378 struct amdgpu_bo *shadow; 4379 struct amdgpu_bo_vm *vmbo; 4380 long r = 1, tmo; 4381 4382 if (amdgpu_sriov_runtime(adev)) 4383 tmo = msecs_to_jiffies(8000); 4384 else 4385 tmo = msecs_to_jiffies(100); 4386 4387 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4388 mutex_lock(&adev->shadow_list_lock); 4389 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4390 shadow = &vmbo->bo; 4391 /* No need to recover an evicted BO */ 4392 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4393 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4394 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4395 continue; 4396 4397 r = amdgpu_bo_restore_shadow(shadow, &next); 4398 if (r) 4399 break; 4400 4401 if (fence) { 4402 tmo = dma_fence_wait_timeout(fence, false, tmo); 4403 dma_fence_put(fence); 4404 fence = next; 4405 if (tmo == 0) { 4406 r = -ETIMEDOUT; 4407 break; 4408 } else if (tmo < 0) { 4409 r = tmo; 4410 break; 4411 } 4412 } else { 4413 fence = next; 4414 } 4415 } 4416 mutex_unlock(&adev->shadow_list_lock); 4417 4418 if (fence) 4419 tmo = dma_fence_wait_timeout(fence, false, tmo); 4420 dma_fence_put(fence); 4421 4422 if (r < 0 || tmo <= 0) { 4423 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4424 return -EIO; 4425 } 4426 4427 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4428 return 0; 4429 } 4430 4431 4432 /** 4433 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4434 * 4435 * @adev: amdgpu_device pointer 4436 * @from_hypervisor: request from hypervisor 4437 * 4438 * do VF FLR and reinitialize Asic 4439 * return 0 means succeeded otherwise failed 4440 */ 4441 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4442 bool from_hypervisor) 4443 { 4444 int r; 4445 4446 if (from_hypervisor) 4447 r = amdgpu_virt_request_full_gpu(adev, true); 4448 else 4449 r = amdgpu_virt_reset_gpu(adev); 4450 if (r) 4451 return r; 4452 4453 amdgpu_amdkfd_pre_reset(adev); 4454 4455 /* Resume IP prior to SMC */ 4456 r = amdgpu_device_ip_reinit_early_sriov(adev); 4457 if (r) 4458 goto error; 4459 4460 amdgpu_virt_init_data_exchange(adev); 4461 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4462 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4463 4464 r = amdgpu_device_fw_loading(adev); 4465 if (r) 4466 return r; 4467 4468 /* now we are okay to resume SMC/CP/SDMA */ 4469 r = amdgpu_device_ip_reinit_late_sriov(adev); 4470 if (r) 4471 goto error; 4472 4473 amdgpu_irq_gpu_reset_resume_helper(adev); 4474 r = amdgpu_ib_ring_tests(adev); 4475 amdgpu_amdkfd_post_reset(adev); 4476 4477 error: 4478 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4479 amdgpu_inc_vram_lost(adev); 4480 r = amdgpu_device_recover_vram(adev); 4481 } 4482 amdgpu_virt_release_full_gpu(adev, true); 4483 4484 return r; 4485 } 4486 4487 /** 4488 * amdgpu_device_has_job_running - check if there is any job in mirror list 4489 * 4490 * @adev: amdgpu_device pointer 4491 * 4492 * check if there is any job in mirror list 4493 */ 4494 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4495 { 4496 int i; 4497 struct drm_sched_job *job; 4498 4499 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4500 struct amdgpu_ring *ring = adev->rings[i]; 4501 4502 if (!ring || !ring->sched.thread) 4503 continue; 4504 4505 spin_lock(&ring->sched.job_list_lock); 4506 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4507 struct drm_sched_job, list); 4508 spin_unlock(&ring->sched.job_list_lock); 4509 if (job) 4510 return true; 4511 } 4512 return false; 4513 } 4514 4515 /** 4516 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4517 * 4518 * @adev: amdgpu_device pointer 4519 * 4520 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4521 * a hung GPU. 4522 */ 4523 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4524 { 4525 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4526 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4527 return false; 4528 } 4529 4530 if (amdgpu_gpu_recovery == 0) 4531 goto disabled; 4532 4533 if (amdgpu_sriov_vf(adev)) 4534 return true; 4535 4536 if (amdgpu_gpu_recovery == -1) { 4537 switch (adev->asic_type) { 4538 case CHIP_BONAIRE: 4539 case CHIP_HAWAII: 4540 case CHIP_TOPAZ: 4541 case CHIP_TONGA: 4542 case CHIP_FIJI: 4543 case CHIP_POLARIS10: 4544 case CHIP_POLARIS11: 4545 case CHIP_POLARIS12: 4546 case CHIP_VEGAM: 4547 case CHIP_VEGA20: 4548 case CHIP_VEGA10: 4549 case CHIP_VEGA12: 4550 case CHIP_RAVEN: 4551 case CHIP_ARCTURUS: 4552 case CHIP_RENOIR: 4553 case CHIP_NAVI10: 4554 case CHIP_NAVI14: 4555 case CHIP_NAVI12: 4556 case CHIP_SIENNA_CICHLID: 4557 case CHIP_NAVY_FLOUNDER: 4558 case CHIP_DIMGREY_CAVEFISH: 4559 case CHIP_BEIGE_GOBY: 4560 case CHIP_VANGOGH: 4561 case CHIP_ALDEBARAN: 4562 break; 4563 default: 4564 goto disabled; 4565 } 4566 } 4567 4568 return true; 4569 4570 disabled: 4571 dev_info(adev->dev, "GPU recovery disabled.\n"); 4572 return false; 4573 } 4574 4575 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4576 { 4577 u32 i; 4578 int ret = 0; 4579 4580 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4581 4582 dev_info(adev->dev, "GPU mode1 reset\n"); 4583 4584 /* disable BM */ 4585 pci_clear_master(adev->pdev); 4586 4587 amdgpu_device_cache_pci_state(adev->pdev); 4588 4589 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4590 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4591 ret = amdgpu_dpm_mode1_reset(adev); 4592 } else { 4593 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4594 ret = psp_gpu_reset(adev); 4595 } 4596 4597 if (ret) 4598 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4599 4600 amdgpu_device_load_pci_state(adev->pdev); 4601 4602 /* wait for asic to come out of reset */ 4603 for (i = 0; i < adev->usec_timeout; i++) { 4604 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4605 4606 if (memsize != 0xffffffff) 4607 break; 4608 udelay(1); 4609 } 4610 4611 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4612 return ret; 4613 } 4614 4615 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4616 struct amdgpu_reset_context *reset_context) 4617 { 4618 int i, j, r = 0; 4619 struct amdgpu_job *job = NULL; 4620 bool need_full_reset = 4621 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4622 4623 if (reset_context->reset_req_dev == adev) 4624 job = reset_context->job; 4625 4626 if (amdgpu_sriov_vf(adev)) { 4627 /* stop the data exchange thread */ 4628 amdgpu_virt_fini_data_exchange(adev); 4629 } 4630 4631 /* block all schedulers and reset given job's ring */ 4632 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4633 struct amdgpu_ring *ring = adev->rings[i]; 4634 4635 if (!ring || !ring->sched.thread) 4636 continue; 4637 4638 /*clear job fence from fence drv to avoid force_completion 4639 *leave NULL and vm flush fence in fence drv */ 4640 for (j = 0; j <= ring->fence_drv.num_fences_mask; 
j++) { 4641 struct dma_fence *old, **ptr; 4642 4643 ptr = &ring->fence_drv.fences[j]; 4644 old = rcu_dereference_protected(*ptr, 1); 4645 if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) { 4646 RCU_INIT_POINTER(*ptr, NULL); 4647 } 4648 } 4649 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4650 amdgpu_fence_driver_force_completion(ring); 4651 } 4652 4653 if (job && job->vm) 4654 drm_sched_increase_karma(&job->base); 4655 4656 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4657 /* If reset handler not implemented, continue; otherwise return */ 4658 if (r == -ENOSYS) 4659 r = 0; 4660 else 4661 return r; 4662 4663 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4664 if (!amdgpu_sriov_vf(adev)) { 4665 4666 if (!need_full_reset) 4667 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4668 4669 if (!need_full_reset) { 4670 amdgpu_device_ip_pre_soft_reset(adev); 4671 r = amdgpu_device_ip_soft_reset(adev); 4672 amdgpu_device_ip_post_soft_reset(adev); 4673 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4674 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4675 need_full_reset = true; 4676 } 4677 } 4678 4679 if (need_full_reset) 4680 r = amdgpu_device_ip_suspend(adev); 4681 if (need_full_reset) 4682 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4683 else 4684 clear_bit(AMDGPU_NEED_FULL_RESET, 4685 &reset_context->flags); 4686 } 4687 4688 return r; 4689 } 4690 4691 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4692 struct amdgpu_reset_context *reset_context) 4693 { 4694 struct amdgpu_device *tmp_adev = NULL; 4695 bool need_full_reset, skip_hw_reset, vram_lost = false; 4696 int r = 0; 4697 4698 /* Try reset handler method first */ 4699 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4700 reset_list); 4701 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4702 /* If reset handler not implemented, continue; otherwise return */ 4703 if (r == -ENOSYS) 4704 r = 0; 4705 else 4706 return r; 4707 4708 /* Reset handler not implemented, use the default method */ 4709 need_full_reset = 4710 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4711 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4712 4713 /* 4714 * ASIC reset has to be done on all XGMI hive nodes ASAP 4715 * to allow proper links negotiation in FW (within 1 sec) 4716 */ 4717 if (!skip_hw_reset && need_full_reset) { 4718 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4719 /* For XGMI run all resets in parallel to speed up the process */ 4720 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4721 tmp_adev->gmc.xgmi.pending_reset = false; 4722 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4723 r = -EALREADY; 4724 } else 4725 r = amdgpu_asic_reset(tmp_adev); 4726 4727 if (r) { 4728 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4729 r, adev_to_drm(tmp_adev)->unique); 4730 break; 4731 } 4732 } 4733 4734 /* For XGMI wait for all resets to complete before proceed */ 4735 if (!r) { 4736 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4737 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4738 flush_work(&tmp_adev->xgmi_reset_work); 4739 r = tmp_adev->asic_reset_res; 4740 if (r) 4741 break; 4742 } 4743 } 4744 } 4745 } 4746 4747 if (!r && amdgpu_ras_intr_triggered()) { 4748 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4749 if (tmp_adev->mmhub.ras_funcs && 
4750 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4751 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4752 } 4753 4754 amdgpu_ras_intr_cleared(); 4755 } 4756 4757 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4758 if (need_full_reset) { 4759 /* post card */ 4760 r = amdgpu_device_asic_init(tmp_adev); 4761 if (r) { 4762 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4763 } else { 4764 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4765 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4766 if (r) 4767 goto out; 4768 4769 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4770 if (r) 4771 goto out; 4772 4773 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4774 if (vram_lost) { 4775 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4776 amdgpu_inc_vram_lost(tmp_adev); 4777 } 4778 4779 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4780 if (r) 4781 goto out; 4782 4783 r = amdgpu_device_fw_loading(tmp_adev); 4784 if (r) 4785 return r; 4786 4787 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4788 if (r) 4789 goto out; 4790 4791 if (vram_lost) 4792 amdgpu_device_fill_reset_magic(tmp_adev); 4793 4794 /* 4795 * Add this ASIC as tracked as reset was already 4796 * complete successfully. 4797 */ 4798 amdgpu_register_gpu_instance(tmp_adev); 4799 4800 if (!reset_context->hive && 4801 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4802 amdgpu_xgmi_add_device(tmp_adev); 4803 4804 r = amdgpu_device_ip_late_init(tmp_adev); 4805 if (r) 4806 goto out; 4807 4808 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4809 4810 /* 4811 * The GPU enters bad state once faulty pages 4812 * by ECC has reached the threshold, and ras 4813 * recovery is scheduled next. So add one check 4814 * here to break recovery if it indeed exceeds 4815 * bad page threshold, and remind user to 4816 * retire this GPU or setting one bigger 4817 * bad_page_threshold value to fix this once 4818 * probing driver again. 4819 */ 4820 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4821 /* must succeed. 
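 * The bad-page threshold check above passed, so RAS resume is
 * expected to succeed here.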
*/
4822 amdgpu_ras_resume(tmp_adev);
4823 } else {
4824 r = -EINVAL;
4825 goto out;
4826 }
4827
4828 /* Update PSP FW topology after reset */
4829 if (reset_context->hive &&
4830 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4831 r = amdgpu_xgmi_update_topology(
4832 reset_context->hive, tmp_adev);
4833 }
4834 }
4835
4836 out:
4837 if (!r) {
4838 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4839 r = amdgpu_ib_ring_tests(tmp_adev);
4840 if (r) {
4841 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4842 need_full_reset = true;
4843 r = -EAGAIN;
4844 goto end;
4845 }
4846 }
4847
4848 if (!r)
4849 r = amdgpu_device_recover_vram(tmp_adev);
4850 else
4851 tmp_adev->asic_reset_res = r;
4852 }
4853
4854 end:
4855 if (need_full_reset)
4856 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4857 else
4858 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4859 return r;
4860 }
4861
4862 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4863 struct amdgpu_hive_info *hive)
4864 {
4865 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4866 return false;
4867
4868 if (hive) {
4869 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4870 } else {
4871 down_write(&adev->reset_sem);
4872 }
4873
4874 switch (amdgpu_asic_reset_method(adev)) {
4875 case AMD_RESET_METHOD_MODE1:
4876 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4877 break;
4878 case AMD_RESET_METHOD_MODE2:
4879 adev->mp1_state = PP_MP1_STATE_RESET;
4880 break;
4881 default:
4882 adev->mp1_state = PP_MP1_STATE_NONE;
4883 break;
4884 }
4885
4886 return true;
4887 }
4888
4889 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4890 {
4891 amdgpu_vf_error_trans_all(adev);
4892 adev->mp1_state = PP_MP1_STATE_NONE;
4893 atomic_set(&adev->in_gpu_reset, 0);
4894 up_write(&adev->reset_sem);
4895 }
4896
4897 /*
4898 * Lock a list of amdgpu devices in a hive safely. If the device is not part
4899 * of a multi-node hive, this behaves like amdgpu_device_lock_adev.
4900 *
4901 * Unlocking does not require a rollback.
4902 */
4903 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4904 {
4905 struct amdgpu_device *tmp_adev = NULL;
4906
4907 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4908 if (!hive) {
4909 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4910 return -ENODEV;
4911 }
4912 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4913 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4914 goto roll_back;
4915 }
4916 } else if (!amdgpu_device_lock_adev(adev, hive))
4917 return -EAGAIN;
4918
4919 return 0;
4920 roll_back:
4921 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4922 /*
4923 * If the lock iteration breaks in the middle of a hive,
4924 * there may be a race, or a hive device may have been
4925 * locked up independently. We may or may not be in trouble,
4926 * so try to roll back the locks already taken and emit
4927 * a warning.
4928 */
4929 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); 4930 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4931 amdgpu_device_unlock_adev(tmp_adev); 4932 } 4933 } 4934 return -EAGAIN; 4935 } 4936 4937 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4938 { 4939 STUB(); 4940 #ifdef notyet 4941 struct pci_dev *p = NULL; 4942 4943 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4944 adev->pdev->bus->number, 1); 4945 if (p) { 4946 pm_runtime_enable(&(p->dev)); 4947 pm_runtime_resume(&(p->dev)); 4948 } 4949 #endif 4950 } 4951 4952 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4953 { 4954 enum amd_reset_method reset_method; 4955 struct pci_dev *p = NULL; 4956 u64 expires; 4957 4958 /* 4959 * For now, only BACO and mode1 reset are confirmed 4960 * to suffer the audio issue without proper suspended. 4961 */ 4962 reset_method = amdgpu_asic_reset_method(adev); 4963 if ((reset_method != AMD_RESET_METHOD_BACO) && 4964 (reset_method != AMD_RESET_METHOD_MODE1)) 4965 return -EINVAL; 4966 4967 STUB(); 4968 return -ENOSYS; 4969 #ifdef notyet 4970 4971 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4972 adev->pdev->bus->number, 1); 4973 if (!p) 4974 return -ENODEV; 4975 4976 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4977 if (!expires) 4978 /* 4979 * If we cannot get the audio device autosuspend delay, 4980 * a fixed 4S interval will be used. Considering 3S is 4981 * the audio controller default autosuspend delay setting. 4982 * 4S used here is guaranteed to cover that. 4983 */ 4984 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4985 4986 while (!pm_runtime_status_suspended(&(p->dev))) { 4987 if (!pm_runtime_suspend(&(p->dev))) 4988 break; 4989 4990 if (expires < ktime_get_mono_fast_ns()) { 4991 dev_warn(adev->dev, "failed to suspend display audio\n"); 4992 /* TODO: abort the succeeding gpu reset? */ 4993 return -ETIMEDOUT; 4994 } 4995 } 4996 4997 pm_runtime_disable(&(p->dev)); 4998 4999 return 0; 5000 #endif 5001 } 5002 5003 static void amdgpu_device_recheck_guilty_jobs( 5004 struct amdgpu_device *adev, struct list_head *device_list_handle, 5005 struct amdgpu_reset_context *reset_context) 5006 { 5007 int i, r = 0; 5008 5009 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5010 struct amdgpu_ring *ring = adev->rings[i]; 5011 int ret = 0; 5012 struct drm_sched_job *s_job; 5013 5014 if (!ring || !ring->sched.thread) 5015 continue; 5016 5017 s_job = list_first_entry_or_null(&ring->sched.pending_list, 5018 struct drm_sched_job, list); 5019 if (s_job == NULL) 5020 continue; 5021 5022 /* clear job's guilty and depend the folowing step to decide the real one */ 5023 drm_sched_reset_karma(s_job); 5024 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5025 5026 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5027 if (ret == 0) { /* timeout */ 5028 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5029 ring->sched.name, s_job->id); 5030 5031 /* set guilty */ 5032 drm_sched_increase_karma(s_job); 5033 retry: 5034 /* do hw reset */ 5035 if (amdgpu_sriov_vf(adev)) { 5036 amdgpu_virt_fini_data_exchange(adev); 5037 r = amdgpu_device_reset_sriov(adev, false); 5038 if (r) 5039 adev->asic_reset_res = r; 5040 } else { 5041 clear_bit(AMDGPU_SKIP_HW_RESET, 5042 &reset_context->flags); 5043 r = amdgpu_do_asic_reset(device_list_handle, 5044 reset_context); 5045 if (r && r == -EAGAIN) 5046 goto retry; 5047 } 5048 5049 /* 5050 * add reset counter so that the following 5051 * resubmitted job could flush vmid 5052 */ 5053 atomic_inc(&adev->gpu_reset_counter); 5054 continue; 5055 } 5056 5057 /* got the hw fence, signal finished fence */ 5058 atomic_dec(ring->sched.score); 5059 dma_fence_get(&s_job->s_fence->finished); 5060 dma_fence_signal(&s_job->s_fence->finished); 5061 dma_fence_put(&s_job->s_fence->finished); 5062 5063 /* remove node from list and free the job */ 5064 spin_lock(&ring->sched.job_list_lock); 5065 list_del_init(&s_job->list); 5066 spin_unlock(&ring->sched.job_list_lock); 5067 ring->sched.ops->free_job(s_job); 5068 } 5069 } 5070 5071 /** 5072 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5073 * 5074 * @adev: amdgpu_device pointer 5075 * @job: which job trigger hang 5076 * 5077 * Attempt to reset the GPU if it has hung (all asics). 5078 * Attempt to do soft-reset or full-reset and reinitialize Asic 5079 * Returns 0 for success or an error on failure. 5080 */ 5081 5082 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5083 struct amdgpu_job *job) 5084 { 5085 struct list_head device_list, *device_list_handle = NULL; 5086 bool job_signaled = false; 5087 struct amdgpu_hive_info *hive = NULL; 5088 struct amdgpu_device *tmp_adev = NULL; 5089 int i, r = 0; 5090 bool need_emergency_restart = false; 5091 bool audio_suspended = false; 5092 int tmp_vram_lost_counter; 5093 struct amdgpu_reset_context reset_context; 5094 5095 memset(&reset_context, 0, sizeof(reset_context)); 5096 5097 /* 5098 * Special case: RAS triggered and full reset isn't supported 5099 */ 5100 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5101 5102 /* 5103 * Flush RAM to disk so that after reboot 5104 * the user can read log and see why the system rebooted. 5105 */ 5106 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5107 DRM_WARN("Emergency reboot."); 5108 5109 #ifdef notyet 5110 ksys_sync_helper(); 5111 emergency_restart(); 5112 #else 5113 panic("emergency_restart"); 5114 #endif 5115 } 5116 5117 dev_info(adev->dev, "GPU %s begin!\n", 5118 need_emergency_restart ? "jobs stop":"reset"); 5119 5120 /* 5121 * Here we trylock to avoid chain of resets executing from 5122 * either trigger by jobs on different adevs in XGMI hive or jobs on 5123 * different schedulers for same device while this TO handler is running. 5124 * We always reset all schedulers for device and all devices for XGMI 5125 * hive so that should take care of them too. 5126 */ 5127 hive = amdgpu_get_xgmi_hive(adev); 5128 if (hive) { 5129 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5130 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5131 job ? 
job->base.id : -1, hive->hive_id); 5132 amdgpu_put_xgmi_hive(hive); 5133 if (job && job->vm) 5134 drm_sched_increase_karma(&job->base); 5135 return 0; 5136 } 5137 mutex_lock(&hive->hive_lock); 5138 } 5139 5140 reset_context.method = AMD_RESET_METHOD_NONE; 5141 reset_context.reset_req_dev = adev; 5142 reset_context.job = job; 5143 reset_context.hive = hive; 5144 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5145 5146 /* 5147 * lock the device before we try to operate the linked list 5148 * if didn't get the device lock, don't touch the linked list since 5149 * others may iterating it. 5150 */ 5151 r = amdgpu_device_lock_hive_adev(adev, hive); 5152 if (r) { 5153 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 5154 job ? job->base.id : -1); 5155 5156 /* even we skipped this reset, still need to set the job to guilty */ 5157 if (job && job->vm) 5158 drm_sched_increase_karma(&job->base); 5159 goto skip_recovery; 5160 } 5161 5162 /* 5163 * Build list of devices to reset. 5164 * In case we are in XGMI hive mode, resort the device list 5165 * to put adev in the 1st position. 5166 */ 5167 INIT_LIST_HEAD(&device_list); 5168 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5169 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5170 list_add_tail(&tmp_adev->reset_list, &device_list); 5171 if (!list_is_first(&adev->reset_list, &device_list)) 5172 list_rotate_to_front(&adev->reset_list, &device_list); 5173 device_list_handle = &device_list; 5174 } else { 5175 list_add_tail(&adev->reset_list, &device_list); 5176 device_list_handle = &device_list; 5177 } 5178 5179 /* block all schedulers and reset given job's ring */ 5180 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5181 /* 5182 * Try to put the audio codec into suspend state 5183 * before gpu reset started. 5184 * 5185 * Due to the power domain of the graphics device 5186 * is shared with AZ power domain. Without this, 5187 * we may change the audio hardware from behind 5188 * the audio driver's back. That will trigger 5189 * some audio codec errors. 5190 */ 5191 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5192 audio_suspended = true; 5193 5194 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5195 5196 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5197 5198 if (!amdgpu_sriov_vf(tmp_adev)) 5199 amdgpu_amdkfd_pre_reset(tmp_adev); 5200 5201 /* 5202 * Mark these ASICs to be reseted as untracked first 5203 * And add them back after reset completed 5204 */ 5205 amdgpu_unregister_gpu_instance(tmp_adev); 5206 5207 amdgpu_fbdev_set_suspend(tmp_adev, 1); 5208 5209 /* disable ras on ALL IPs */ 5210 if (!need_emergency_restart && 5211 amdgpu_device_ip_need_full_reset(tmp_adev)) 5212 amdgpu_ras_suspend(tmp_adev); 5213 5214 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5215 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5216 5217 if (!ring || !ring->sched.thread) 5218 continue; 5219 5220 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5221 5222 if (need_emergency_restart) 5223 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5224 } 5225 atomic_inc(&tmp_adev->gpu_reset_counter); 5226 } 5227 5228 if (need_emergency_restart) 5229 goto skip_sched_resume; 5230 5231 /* 5232 * Must check guilty signal here since after this point all old 5233 * HW fences are force signaled. 
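 * If the guilty job's parent fence has already signaled, the hang
 * resolved on its own and the HW reset below can be skipped.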
5234 * 5235 * job->base holds a reference to parent fence 5236 */ 5237 if (job && job->base.s_fence->parent && 5238 dma_fence_is_signaled(job->base.s_fence->parent)) { 5239 job_signaled = true; 5240 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5241 goto skip_hw_reset; 5242 } 5243 5244 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5245 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5246 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5247 /*TODO Should we stop ?*/ 5248 if (r) { 5249 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5250 r, adev_to_drm(tmp_adev)->unique); 5251 tmp_adev->asic_reset_res = r; 5252 } 5253 } 5254 5255 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5256 /* Actual ASIC resets if needed.*/ 5257 /* TODO Implement XGMI hive reset logic for SRIOV */ 5258 if (amdgpu_sriov_vf(adev)) { 5259 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5260 if (r) 5261 adev->asic_reset_res = r; 5262 } else { 5263 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5264 if (r && r == -EAGAIN) 5265 goto retry; 5266 } 5267 5268 skip_hw_reset: 5269 5270 /* Post ASIC reset for all devs .*/ 5271 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5272 5273 /* 5274 * Sometimes a later bad compute job can block a good gfx job as gfx 5275 * and compute ring share internal GC HW mutually. We add an additional 5276 * guilty jobs recheck step to find the real guilty job, it synchronously 5277 * submits and pends for the first job being signaled. If it gets timeout, 5278 * we identify it as a real guilty job. 5279 */ 5280 if (amdgpu_gpu_recovery == 2 && 5281 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5282 amdgpu_device_recheck_guilty_jobs( 5283 tmp_adev, device_list_handle, &reset_context); 5284 5285 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5286 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5287 5288 if (!ring || !ring->sched.thread) 5289 continue; 5290 5291 /* No point to resubmit jobs if we didn't HW reset*/ 5292 if (!tmp_adev->asic_reset_res && !job_signaled) 5293 drm_sched_resubmit_jobs(&ring->sched); 5294 5295 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5296 } 5297 5298 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5299 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5300 } 5301 5302 tmp_adev->asic_reset_res = 0; 5303 5304 if (r) { 5305 /* bad news, how to tell it to userspace ? 
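 * For now the failure is only reported through the kernel log and the
 * VF error ring (amdgpu_vf_error_put) below.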
*/ 5306 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5307 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5308 } else { 5309 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5310 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5311 DRM_WARN("smart shift update failed\n"); 5312 } 5313 } 5314 5315 skip_sched_resume: 5316 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5317 /* unlock kfd: SRIOV would do it separately */ 5318 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5319 amdgpu_amdkfd_post_reset(tmp_adev); 5320 5321 /* kfd_post_reset will do nothing if kfd device is not initialized, 5322 * need to bring up kfd here if it's not be initialized before 5323 */ 5324 if (!adev->kfd.init_complete) 5325 amdgpu_amdkfd_device_init(adev); 5326 5327 if (audio_suspended) 5328 amdgpu_device_resume_display_audio(tmp_adev); 5329 amdgpu_device_unlock_adev(tmp_adev); 5330 } 5331 5332 skip_recovery: 5333 if (hive) { 5334 atomic_set(&hive->in_reset, 0); 5335 mutex_unlock(&hive->hive_lock); 5336 amdgpu_put_xgmi_hive(hive); 5337 } 5338 5339 if (r && r != -EAGAIN) 5340 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5341 return r; 5342 } 5343 5344 /** 5345 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5346 * 5347 * @adev: amdgpu_device pointer 5348 * 5349 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5350 * and lanes) of the slot the device is in. Handles APUs and 5351 * virtualized environments where PCIE config space may not be available. 5352 */ 5353 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5354 { 5355 struct pci_dev *pdev; 5356 enum pci_bus_speed speed_cap, platform_speed_cap; 5357 enum pcie_link_width platform_link_width; 5358 5359 if (amdgpu_pcie_gen_cap) 5360 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5361 5362 if (amdgpu_pcie_lane_cap) 5363 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5364 5365 /* covers APUs as well */ 5366 if (pci_is_root_bus(adev->pdev->bus)) { 5367 if (adev->pm.pcie_gen_mask == 0) 5368 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5369 if (adev->pm.pcie_mlw_mask == 0) 5370 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5371 return; 5372 } 5373 5374 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5375 return; 5376 5377 pcie_bandwidth_available(adev->pdev, NULL, 5378 &platform_speed_cap, &platform_link_width); 5379 5380 if (adev->pm.pcie_gen_mask == 0) { 5381 /* asic caps */ 5382 pdev = adev->pdev; 5383 speed_cap = pcie_get_speed_cap(pdev); 5384 if (speed_cap == PCI_SPEED_UNKNOWN) { 5385 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5386 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5387 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5388 } else { 5389 if (speed_cap == PCIE_SPEED_32_0GT) 5390 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5391 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5392 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5393 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5394 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5395 else if (speed_cap == PCIE_SPEED_16_0GT) 5396 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5397 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5398 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5399 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5400 else if (speed_cap == PCIE_SPEED_8_0GT) 5401 adev->pm.pcie_gen_mask |= 
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5402 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5403 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5404 else if (speed_cap == PCIE_SPEED_5_0GT) 5405 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5406 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5407 else 5408 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5409 } 5410 /* platform caps */ 5411 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5412 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5413 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5414 } else { 5415 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5416 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5417 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5418 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5419 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5420 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5421 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5422 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5423 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5424 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5425 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5426 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5427 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5428 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5429 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5430 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5431 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5432 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5433 else 5434 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5435 5436 } 5437 } 5438 if (adev->pm.pcie_mlw_mask == 0) { 5439 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5440 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5441 } else { 5442 switch (platform_link_width) { 5443 case PCIE_LNK_X32: 5444 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5447 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5450 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5451 break; 5452 case PCIE_LNK_X16: 5453 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5454 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5456 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5458 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5459 break; 5460 case PCIE_LNK_X12: 5461 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5462 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5463 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5464 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5465 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5466 break; 5467 case PCIE_LNK_X8: 5468 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5469 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5470 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5471 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5472 break; 5473 case PCIE_LNK_X4: 5474 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5475 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5476 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5477 break; 5478 case PCIE_LNK_X2: 5479 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5480 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5481 break; 5482 case PCIE_LNK_X1: 5483 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5484 break; 5485 default: 5486 break; 5487 } 5488 } 5489 } 5490 } 5491 5492 int amdgpu_device_baco_enter(struct drm_device *dev) 5493 { 5494 struct amdgpu_device *adev = drm_to_adev(dev); 5495 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5496 
5497 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5498 return -ENOTSUPP; 5499 5500 if (ras && adev->ras_enabled && 5501 adev->nbio.funcs->enable_doorbell_interrupt) 5502 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5503 5504 return amdgpu_dpm_baco_enter(adev); 5505 } 5506 5507 int amdgpu_device_baco_exit(struct drm_device *dev) 5508 { 5509 struct amdgpu_device *adev = drm_to_adev(dev); 5510 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5511 int ret = 0; 5512 5513 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5514 return -ENOTSUPP; 5515 5516 ret = amdgpu_dpm_baco_exit(adev); 5517 if (ret) 5518 return ret; 5519 5520 if (ras && adev->ras_enabled && 5521 adev->nbio.funcs->enable_doorbell_interrupt) 5522 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5523 5524 if (amdgpu_passthrough(adev) && 5525 adev->nbio.funcs->clear_doorbell_interrupt) 5526 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5527 5528 return 0; 5529 } 5530 5531 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5532 { 5533 int i; 5534 5535 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5536 struct amdgpu_ring *ring = adev->rings[i]; 5537 5538 if (!ring || !ring->sched.thread) 5539 continue; 5540 5541 cancel_delayed_work_sync(&ring->sched.work_tdr); 5542 } 5543 } 5544 5545 /** 5546 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5547 * @pdev: PCI device struct 5548 * @state: PCI channel state 5549 * 5550 * Description: Called when a PCI error is detected. 5551 * 5552 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5553 */ 5554 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5555 { 5556 STUB(); 5557 return 0; 5558 #ifdef notyet 5559 struct drm_device *dev = pci_get_drvdata(pdev); 5560 struct amdgpu_device *adev = drm_to_adev(dev); 5561 int i; 5562 5563 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5564 5565 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5566 DRM_WARN("No support for XGMI hive yet..."); 5567 return PCI_ERS_RESULT_DISCONNECT; 5568 } 5569 5570 adev->pci_channel_state = state; 5571 5572 switch (state) { 5573 case pci_channel_io_normal: 5574 return PCI_ERS_RESULT_CAN_RECOVER; 5575 /* Fatal error, prepare for slot reset */ 5576 case pci_channel_io_frozen: 5577 /* 5578 * Cancel and wait for all TDRs in progress if failing to 5579 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5580 * 5581 * Locking adev->reset_sem will prevent any external access 5582 * to GPU during PCI error recovery 5583 */ 5584 while (!amdgpu_device_lock_adev(adev, NULL)) 5585 amdgpu_cancel_all_tdr(adev); 5586 5587 /* 5588 * Block any work scheduling as we do for regular GPU reset 5589 * for the duration of the recovery 5590 */ 5591 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5592 struct amdgpu_ring *ring = adev->rings[i]; 5593 5594 if (!ring || !ring->sched.thread) 5595 continue; 5596 5597 drm_sched_stop(&ring->sched, NULL); 5598 } 5599 atomic_inc(&adev->gpu_reset_counter); 5600 return PCI_ERS_RESULT_NEED_RESET; 5601 case pci_channel_io_perm_failure: 5602 /* Permanent error, prepare for device removal */ 5603 return PCI_ERS_RESULT_DISCONNECT; 5604 } 5605 5606 return PCI_ERS_RESULT_NEED_RESET; 5607 #endif 5608 } 5609 5610 /** 5611 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5612 * @pdev: pointer to PCI device 5613 */ 5614 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5615 { 5616 5617 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5618 5619 /* TODO - dump 
whatever for debugging purposes */ 5620 5621 /* This called only if amdgpu_pci_error_detected returns 5622 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5623 * works, no need to reset slot. 5624 */ 5625 5626 return PCI_ERS_RESULT_RECOVERED; 5627 } 5628 5629 /** 5630 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5631 * @pdev: PCI device struct 5632 * 5633 * Description: This routine is called by the pci error recovery 5634 * code after the PCI slot has been reset, just before we 5635 * should resume normal operations. 5636 */ 5637 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5638 { 5639 STUB(); 5640 return PCI_ERS_RESULT_RECOVERED; 5641 #ifdef notyet 5642 struct drm_device *dev = pci_get_drvdata(pdev); 5643 struct amdgpu_device *adev = drm_to_adev(dev); 5644 int r, i; 5645 struct amdgpu_reset_context reset_context; 5646 u32 memsize; 5647 struct list_head device_list; 5648 5649 DRM_INFO("PCI error: slot reset callback!!\n"); 5650 5651 memset(&reset_context, 0, sizeof(reset_context)); 5652 5653 INIT_LIST_HEAD(&device_list); 5654 list_add_tail(&adev->reset_list, &device_list); 5655 5656 /* wait for asic to come out of reset */ 5657 drm_msleep(500); 5658 5659 /* Restore PCI confspace */ 5660 amdgpu_device_load_pci_state(pdev); 5661 5662 /* confirm ASIC came out of reset */ 5663 for (i = 0; i < adev->usec_timeout; i++) { 5664 memsize = amdgpu_asic_get_config_memsize(adev); 5665 5666 if (memsize != 0xffffffff) 5667 break; 5668 udelay(1); 5669 } 5670 if (memsize == 0xffffffff) { 5671 r = -ETIME; 5672 goto out; 5673 } 5674 5675 reset_context.method = AMD_RESET_METHOD_NONE; 5676 reset_context.reset_req_dev = adev; 5677 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5678 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5679 5680 adev->no_hw_access = true; 5681 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5682 adev->no_hw_access = false; 5683 if (r) 5684 goto out; 5685 5686 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5687 5688 out: 5689 if (!r) { 5690 if (amdgpu_device_cache_pci_state(adev->pdev)) 5691 pci_restore_state(adev->pdev); 5692 5693 DRM_INFO("PCIe error recovery succeeded\n"); 5694 } else { 5695 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5696 amdgpu_device_unlock_adev(adev); 5697 } 5698 5699 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5700 #endif 5701 } 5702 5703 /** 5704 * amdgpu_pci_resume() - resume normal ops after PCI reset 5705 * @pdev: pointer to PCI device 5706 * 5707 * Called when the error recovery driver tells us that its 5708 * OK to resume normal operation. 
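 *
 * These PCI error callbacks are hooked up through a struct
 * pci_error_handlers in the pci driver registration, roughly:
 *
 *   .error_detected = amdgpu_pci_error_detected,
 *   .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *   .slot_reset     = amdgpu_pci_slot_reset,
 *   .resume         = amdgpu_pci_resume,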
5709 */ 5710 void amdgpu_pci_resume(struct pci_dev *pdev) 5711 { 5712 STUB(); 5713 #ifdef notyet 5714 struct drm_device *dev = pci_get_drvdata(pdev); 5715 struct amdgpu_device *adev = drm_to_adev(dev); 5716 int i; 5717 5718 5719 DRM_INFO("PCI error: resume callback!!\n"); 5720 5721 /* Only continue execution for the case of pci_channel_io_frozen */ 5722 if (adev->pci_channel_state != pci_channel_io_frozen) 5723 return; 5724 5725 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5726 struct amdgpu_ring *ring = adev->rings[i]; 5727 5728 if (!ring || !ring->sched.thread) 5729 continue; 5730 5731 5732 drm_sched_resubmit_jobs(&ring->sched); 5733 drm_sched_start(&ring->sched, true); 5734 } 5735 5736 amdgpu_device_unlock_adev(adev); 5737 #endif 5738 } 5739 5740 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5741 { 5742 return false; 5743 #ifdef notyet 5744 struct drm_device *dev = pci_get_drvdata(pdev); 5745 struct amdgpu_device *adev = drm_to_adev(dev); 5746 int r; 5747 5748 r = pci_save_state(pdev); 5749 if (!r) { 5750 kfree(adev->pci_state); 5751 5752 adev->pci_state = pci_store_saved_state(pdev); 5753 5754 if (!adev->pci_state) { 5755 DRM_ERROR("Failed to store PCI saved state"); 5756 return false; 5757 } 5758 } else { 5759 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5760 return false; 5761 } 5762 5763 return true; 5764 #endif 5765 } 5766 5767 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5768 { 5769 STUB(); 5770 return false; 5771 #ifdef notyet 5772 struct drm_device *dev = pci_get_drvdata(pdev); 5773 struct amdgpu_device *adev = drm_to_adev(dev); 5774 int r; 5775 5776 if (!adev->pci_state) 5777 return false; 5778 5779 r = pci_load_saved_state(pdev, adev->pci_state); 5780 5781 if (!r) { 5782 pci_restore_state(pdev); 5783 } else { 5784 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5785 return false; 5786 } 5787 5788 return true; 5789 #endif 5790 } 5791 5792 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5793 struct amdgpu_ring *ring) 5794 { 5795 #ifdef CONFIG_X86_64 5796 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5797 return; 5798 #endif 5799 if (adev->gmc.xgmi.connected_to_cpu) 5800 return; 5801 5802 if (ring && ring->funcs->emit_hdp_flush) 5803 amdgpu_ring_emit_hdp_flush(ring); 5804 else 5805 amdgpu_asic_flush_hdp(adev, ring); 5806 } 5807 5808 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5809 struct amdgpu_ring *ring) 5810 { 5811 #ifdef CONFIG_X86_64 5812 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5813 return; 5814 #endif 5815 if (adev->gmc.xgmi.connected_to_cpu) 5816 return; 5817 5818 amdgpu_asic_invalidate_hdp(adev, ring); 5819 } 5820
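/*
 * Usage note for the HDP helpers above (a rough sketch, not a new API):
 * callers flush the HDP write cache after the CPU has written data the GPU
 * is about to consume, and invalidate the HDP read cache before the CPU
 * reads back data the GPU has produced, e.g.
 *
 *   amdgpu_device_flush_hdp(adev, ring);        ring may be NULL outside a ring context
 *   ...kick off GPU work and wait for it...
 *   amdgpu_device_invalidate_hdp(adev, ring);
 */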