1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/pci.h> 34 35 #include <drm/drm_atomic_helper.h> 36 #include <drm/drm_probe_helper.h> 37 #include <drm/amdgpu_drm.h> 38 #include <linux/vgaarb.h> 39 #include <linux/vga_switcheroo.h> 40 #include <linux/efi.h> 41 #include "amdgpu.h" 42 #include "amdgpu_trace.h" 43 #include "amdgpu_i2c.h" 44 #include "atom.h" 45 #include "amdgpu_atombios.h" 46 #include "amdgpu_atomfirmware.h" 47 #include "amd_pcie.h" 48 #ifdef CONFIG_DRM_AMDGPU_SI 49 #include "si.h" 50 #endif 51 #ifdef CONFIG_DRM_AMDGPU_CIK 52 #include "cik.h" 53 #endif 54 #include "vi.h" 55 #include "soc15.h" 56 #include "nv.h" 57 #include "bif/bif_4_1_d.h" 58 #include <linux/pci.h> 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 89 90 #define AMDGPU_RESUME_MS 2000 91 92 const char *amdgpu_asic_name[] = { 93 "TAHITI", 94 "PITCAIRN", 95 "VERDE", 96 "OLAND", 97 "HAINAN", 98 "BONAIRE", 99 "KAVERI", 100 "KABINI", 101 "HAWAII", 102 "MULLINS", 103 "TOPAZ", 104 "TONGA", 105 "FIJI", 106 "CARRIZO", 107 "STONEY", 108 "POLARIS10", 109 "POLARIS11", 110 "POLARIS12", 111 "VEGAM", 112 "VEGA10", 113 "VEGA12", 114 "VEGA20", 115 "RAVEN", 116 "ARCTURUS", 117 "RENOIR", 118 
"ALDEBARAN", 119 "NAVI10", 120 "CYAN_SKILLFISH", 121 "NAVI14", 122 "NAVI12", 123 "SIENNA_CICHLID", 124 "NAVY_FLOUNDER", 125 "VANGOGH", 126 "DIMGREY_CAVEFISH", 127 "BEIGE_GOBY", 128 "YELLOW_CARP", 129 "LAST", 130 }; 131 132 /** 133 * DOC: pcie_replay_count 134 * 135 * The amdgpu driver provides a sysfs API for reporting the total number 136 * of PCIe replays (NAKs) 137 * The file pcie_replay_count is used for this and returns the total 138 * number of replays as a sum of the NAKs generated and NAKs received 139 */ 140 141 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 struct drm_device *ddev = dev_get_drvdata(dev); 145 struct amdgpu_device *adev = drm_to_adev(ddev); 146 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 147 148 return sysfs_emit(buf, "%llu\n", cnt); 149 } 150 151 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 152 amdgpu_device_get_pcie_replay_count, NULL); 153 154 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 155 156 /** 157 * DOC: product_name 158 * 159 * The amdgpu driver provides a sysfs API for reporting the product name 160 * for the device 161 * The file serial_number is used for this and returns the product name 162 * as returned from the FRU. 163 * NOTE: This is only available for certain server cards 164 */ 165 166 static ssize_t amdgpu_device_get_product_name(struct device *dev, 167 struct device_attribute *attr, char *buf) 168 { 169 struct drm_device *ddev = dev_get_drvdata(dev); 170 struct amdgpu_device *adev = drm_to_adev(ddev); 171 172 return sysfs_emit(buf, "%s\n", adev->product_name); 173 } 174 175 static DEVICE_ATTR(product_name, S_IRUGO, 176 amdgpu_device_get_product_name, NULL); 177 178 /** 179 * DOC: product_number 180 * 181 * The amdgpu driver provides a sysfs API for reporting the part number 182 * for the device 183 * The file serial_number is used for this and returns the part number 184 * as returned from the FRU. 185 * NOTE: This is only available for certain server cards 186 */ 187 188 static ssize_t amdgpu_device_get_product_number(struct device *dev, 189 struct device_attribute *attr, char *buf) 190 { 191 struct drm_device *ddev = dev_get_drvdata(dev); 192 struct amdgpu_device *adev = drm_to_adev(ddev); 193 194 return sysfs_emit(buf, "%s\n", adev->product_number); 195 } 196 197 static DEVICE_ATTR(product_number, S_IRUGO, 198 amdgpu_device_get_product_number, NULL); 199 200 /** 201 * DOC: serial_number 202 * 203 * The amdgpu driver provides a sysfs API for reporting the serial number 204 * for the device 205 * The file serial_number is used for this and returns the serial number 206 * as returned from the FRU. 207 * NOTE: This is only available for certain server cards 208 */ 209 210 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 211 struct device_attribute *attr, char *buf) 212 { 213 struct drm_device *ddev = dev_get_drvdata(dev); 214 struct amdgpu_device *adev = drm_to_adev(ddev); 215 216 return sysfs_emit(buf, "%s\n", adev->serial); 217 } 218 219 static DEVICE_ATTR(serial_number, S_IRUGO, 220 amdgpu_device_get_serial_number, NULL); 221 222 /** 223 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 224 * 225 * @dev: drm_device pointer 226 * 227 * Returns true if the device is a dGPU with ATPX power control, 228 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
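 *
 * Only the CPU-visible part of VRAM can be reached through the aperture;
 * callers normally use the amdgpu_device_vram_access() wrapper below, which
 * falls back to MM_INDEX/MM_DATA for whatever this helper could not copy.
 * A minimal sketch (illustrative only; "adev" and a 4-byte aligned
 * "vram_offset" are assumed to be valid):
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);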
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
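 *
 * A minimal sketch (illustrative only; "adev" and "reg" are assumed valid).
 * Passing AMDGPU_REGS_NO_KIQ in @acc_flags skips the KIQ path and forces a
 * direct MMIO read even when running as an SR-IOV guest at runtime:
 *
 *   uint32_t val = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);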
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
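 *
 * A minimal read-modify-write sketch built from the two helpers
 * (illustrative only; "adev", "reg", MASK and VAL are hypothetical):
 *
 *   uint32_t tmp = amdgpu_device_rreg(adev, reg, 0);
 *
 *   tmp = (tmp & ~MASK) | (VAL & MASK);
 *   amdgpu_device_wreg(adev, reg, tmp, 0);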
529 */ 530 void amdgpu_device_wreg(struct amdgpu_device *adev, 531 uint32_t reg, uint32_t v, 532 uint32_t acc_flags) 533 { 534 if (amdgpu_device_skip_hw_access(adev)) 535 return; 536 537 if ((reg * 4) < adev->rmmio_size) { 538 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 539 amdgpu_sriov_runtime(adev) && 540 down_read_trylock(&adev->reset_sem)) { 541 amdgpu_kiq_wreg(adev, reg, v); 542 up_read(&adev->reset_sem); 543 } else { 544 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 545 } 546 } else { 547 adev->pcie_wreg(adev, reg * 4, v); 548 } 549 550 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 551 } 552 553 /* 554 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 555 * 556 * this function is invoked only the debugfs register access 557 * */ 558 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 559 uint32_t reg, uint32_t v) 560 { 561 if (amdgpu_device_skip_hw_access(adev)) 562 return; 563 564 if (amdgpu_sriov_fullaccess(adev) && 565 adev->gfx.rlc.funcs && 566 adev->gfx.rlc.funcs->is_rlcg_access_range) { 567 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 568 return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0); 569 } else { 570 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 571 } 572 } 573 574 /** 575 * amdgpu_mm_rdoorbell - read a doorbell dword 576 * 577 * @adev: amdgpu_device pointer 578 * @index: doorbell index 579 * 580 * Returns the value in the doorbell aperture at the 581 * requested doorbell index (CIK). 582 */ 583 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 584 { 585 if (amdgpu_device_skip_hw_access(adev)) 586 return 0; 587 588 if (index < adev->doorbell.num_doorbells) { 589 return readl(adev->doorbell.ptr + index); 590 } else { 591 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 592 return 0; 593 } 594 } 595 596 /** 597 * amdgpu_mm_wdoorbell - write a doorbell dword 598 * 599 * @adev: amdgpu_device pointer 600 * @index: doorbell index 601 * @v: value to write 602 * 603 * Writes @v to the doorbell aperture at the 604 * requested doorbell index (CIK). 605 */ 606 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 607 { 608 if (amdgpu_device_skip_hw_access(adev)) 609 return; 610 611 if (index < adev->doorbell.num_doorbells) { 612 writel(v, adev->doorbell.ptr + index); 613 } else { 614 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 615 } 616 } 617 618 /** 619 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 620 * 621 * @adev: amdgpu_device pointer 622 * @index: doorbell index 623 * 624 * Returns the value in the doorbell aperture at the 625 * requested doorbell index (VEGA10+). 626 */ 627 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 628 { 629 if (amdgpu_device_skip_hw_access(adev)) 630 return 0; 631 632 if (index < adev->doorbell.num_doorbells) { 633 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 634 } else { 635 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 636 return 0; 637 } 638 } 639 640 /** 641 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 642 * 643 * @adev: amdgpu_device pointer 644 * @index: doorbell index 645 * @v: value to write 646 * 647 * Writes @v to the doorbell aperture at the 648 * requested doorbell index (VEGA10+). 
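 *
 * A minimal sketch (illustrative only; "adev" is assumed valid and "ring"
 * is a hypothetical ring that uses a 64-bit doorbell): publish the ring's
 * write pointer through its doorbell:
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);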
649 */ 650 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 651 { 652 if (amdgpu_device_skip_hw_access(adev)) 653 return; 654 655 if (index < adev->doorbell.num_doorbells) { 656 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 657 } else { 658 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 659 } 660 } 661 662 /** 663 * amdgpu_device_indirect_rreg - read an indirect register 664 * 665 * @adev: amdgpu_device pointer 666 * @pcie_index: mmio register offset 667 * @pcie_data: mmio register offset 668 * @reg_addr: indirect register address to read from 669 * 670 * Returns the value of indirect register @reg_addr 671 */ 672 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 673 u32 pcie_index, u32 pcie_data, 674 u32 reg_addr) 675 { 676 unsigned long flags; 677 u32 r; 678 void __iomem *pcie_index_offset; 679 void __iomem *pcie_data_offset; 680 681 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 682 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 683 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 684 685 writel(reg_addr, pcie_index_offset); 686 readl(pcie_index_offset); 687 r = readl(pcie_data_offset); 688 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 689 690 return r; 691 } 692 693 /** 694 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 695 * 696 * @adev: amdgpu_device pointer 697 * @pcie_index: mmio register offset 698 * @pcie_data: mmio register offset 699 * @reg_addr: indirect register address to read from 700 * 701 * Returns the value of indirect register @reg_addr 702 */ 703 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 704 u32 pcie_index, u32 pcie_data, 705 u32 reg_addr) 706 { 707 unsigned long flags; 708 u64 r; 709 void __iomem *pcie_index_offset; 710 void __iomem *pcie_data_offset; 711 712 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 713 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 714 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 715 716 /* read low 32 bits */ 717 writel(reg_addr, pcie_index_offset); 718 readl(pcie_index_offset); 719 r = readl(pcie_data_offset); 720 /* read high 32 bits */ 721 writel(reg_addr + 4, pcie_index_offset); 722 readl(pcie_index_offset); 723 r |= ((u64)readl(pcie_data_offset) << 32); 724 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 725 726 return r; 727 } 728 729 /** 730 * amdgpu_device_indirect_wreg - write an indirect register address 731 * 732 * @adev: amdgpu_device pointer 733 * @pcie_index: mmio register offset 734 * @pcie_data: mmio register offset 735 * @reg_addr: indirect register offset 736 * @reg_data: indirect register data 737 * 738 */ 739 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 740 u32 pcie_index, u32 pcie_data, 741 u32 reg_addr, u32 reg_data) 742 { 743 unsigned long flags; 744 void __iomem *pcie_index_offset; 745 void __iomem *pcie_data_offset; 746 747 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 748 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 749 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 750 751 writel(reg_addr, pcie_index_offset); 752 readl(pcie_index_offset); 753 writel(reg_data, pcie_data_offset); 754 readl(pcie_data_offset); 755 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 756 } 757 758 /** 759 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 760 * 761 * @adev: amdgpu_device pointer 762 * @pcie_index: mmio register offset 763 * @pcie_data: mmio register 
offset 764 * @reg_addr: indirect register offset 765 * @reg_data: indirect register data 766 * 767 */ 768 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 769 u32 pcie_index, u32 pcie_data, 770 u32 reg_addr, u64 reg_data) 771 { 772 unsigned long flags; 773 void __iomem *pcie_index_offset; 774 void __iomem *pcie_data_offset; 775 776 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 777 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 778 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 779 780 /* write low 32 bits */ 781 writel(reg_addr, pcie_index_offset); 782 readl(pcie_index_offset); 783 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 784 readl(pcie_data_offset); 785 /* write high 32 bits */ 786 writel(reg_addr + 4, pcie_index_offset); 787 readl(pcie_index_offset); 788 writel((u32)(reg_data >> 32), pcie_data_offset); 789 readl(pcie_data_offset); 790 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 791 } 792 793 /** 794 * amdgpu_invalid_rreg - dummy reg read function 795 * 796 * @adev: amdgpu_device pointer 797 * @reg: offset of register 798 * 799 * Dummy register read function. Used for register blocks 800 * that certain asics don't have (all asics). 801 * Returns the value in the register. 802 */ 803 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 804 { 805 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 806 BUG(); 807 return 0; 808 } 809 810 /** 811 * amdgpu_invalid_wreg - dummy reg write function 812 * 813 * @adev: amdgpu_device pointer 814 * @reg: offset of register 815 * @v: value to write to the register 816 * 817 * Dummy register read function. Used for register blocks 818 * that certain asics don't have (all asics). 819 */ 820 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 821 { 822 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 823 reg, v); 824 BUG(); 825 } 826 827 /** 828 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 829 * 830 * @adev: amdgpu_device pointer 831 * @reg: offset of register 832 * 833 * Dummy register read function. Used for register blocks 834 * that certain asics don't have (all asics). 835 * Returns the value in the register. 836 */ 837 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 838 { 839 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 840 BUG(); 841 return 0; 842 } 843 844 /** 845 * amdgpu_invalid_wreg64 - dummy reg write function 846 * 847 * @adev: amdgpu_device pointer 848 * @reg: offset of register 849 * @v: value to write to the register 850 * 851 * Dummy register read function. Used for register blocks 852 * that certain asics don't have (all asics). 853 */ 854 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 855 { 856 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 857 reg, v); 858 BUG(); 859 } 860 861 /** 862 * amdgpu_block_invalid_rreg - dummy reg read function 863 * 864 * @adev: amdgpu_device pointer 865 * @block: offset of instance 866 * @reg: offset of register 867 * 868 * Dummy register read function. Used for register blocks 869 * that certain asics don't have (all asics). 870 * Returns the value in the register. 
871 */ 872 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 873 uint32_t block, uint32_t reg) 874 { 875 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 876 reg, block); 877 BUG(); 878 return 0; 879 } 880 881 /** 882 * amdgpu_block_invalid_wreg - dummy reg write function 883 * 884 * @adev: amdgpu_device pointer 885 * @block: offset of instance 886 * @reg: offset of register 887 * @v: value to write to the register 888 * 889 * Dummy register read function. Used for register blocks 890 * that certain asics don't have (all asics). 891 */ 892 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 893 uint32_t block, 894 uint32_t reg, uint32_t v) 895 { 896 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 897 reg, block, v); 898 BUG(); 899 } 900 901 /** 902 * amdgpu_device_asic_init - Wrapper for atom asic_init 903 * 904 * @adev: amdgpu_device pointer 905 * 906 * Does any asic specific work and then calls atom asic init. 907 */ 908 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 909 { 910 amdgpu_asic_pre_asic_init(adev); 911 912 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 913 } 914 915 /** 916 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 917 * 918 * @adev: amdgpu_device pointer 919 * 920 * Allocates a scratch page of VRAM for use by various things in the 921 * driver. 922 */ 923 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 924 { 925 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 926 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 927 &adev->vram_scratch.robj, 928 &adev->vram_scratch.gpu_addr, 929 (void **)&adev->vram_scratch.ptr); 930 } 931 932 /** 933 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 934 * 935 * @adev: amdgpu_device pointer 936 * 937 * Frees the VRAM scratch page. 938 */ 939 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 940 { 941 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 942 } 943 944 /** 945 * amdgpu_device_program_register_sequence - program an array of registers. 946 * 947 * @adev: amdgpu_device pointer 948 * @registers: pointer to the register array 949 * @array_size: size of the register array 950 * 951 * Programs an array or registers with and and or masks. 952 * This is a helper for setting golden registers. 953 */ 954 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 955 const u32 *registers, 956 const u32 array_size) 957 { 958 u32 tmp, reg, and_mask, or_mask; 959 int i; 960 961 if (array_size % 3) 962 return; 963 964 for (i = 0; i < array_size; i +=3) { 965 reg = registers[i + 0]; 966 and_mask = registers[i + 1]; 967 or_mask = registers[i + 2]; 968 969 if (and_mask == 0xffffffff) { 970 tmp = or_mask; 971 } else { 972 tmp = RREG32(reg); 973 tmp &= ~and_mask; 974 if (adev->family >= AMDGPU_FAMILY_AI) 975 tmp |= (or_mask & and_mask); 976 else 977 tmp |= or_mask; 978 } 979 WREG32(reg, tmp); 980 } 981 } 982 983 /** 984 * amdgpu_device_pci_config_reset - reset the GPU 985 * 986 * @adev: amdgpu_device pointer 987 * 988 * Resets the GPU using the pci config reset sequence. 989 * Only applicable to asics prior to vega10. 
990 */ 991 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 992 { 993 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 994 } 995 996 /** 997 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 998 * 999 * @adev: amdgpu_device pointer 1000 * 1001 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1002 */ 1003 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1004 { 1005 STUB(); 1006 return -ENOSYS; 1007 #ifdef notyet 1008 return pci_reset_function(adev->pdev); 1009 #endif 1010 } 1011 1012 /* 1013 * GPU doorbell aperture helpers function. 1014 */ 1015 /** 1016 * amdgpu_device_doorbell_init - Init doorbell driver information. 1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Init doorbell driver information (CIK) 1021 * Returns 0 on success, error on failure. 1022 */ 1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1024 { 1025 1026 /* No doorbell on SI hardware generation */ 1027 if (adev->asic_type < CHIP_BONAIRE) { 1028 adev->doorbell.base = 0; 1029 adev->doorbell.size = 0; 1030 adev->doorbell.num_doorbells = 0; 1031 adev->doorbell.ptr = NULL; 1032 return 0; 1033 } 1034 1035 #ifdef __linux__ 1036 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1037 return -EINVAL; 1038 #endif 1039 1040 amdgpu_asic_init_doorbell_index(adev); 1041 1042 /* doorbell bar mapping */ 1043 #ifdef __linux__ 1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1046 #endif 1047 1048 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1049 adev->doorbell_index.max_assignment+1); 1050 if (adev->doorbell.num_doorbells == 0) 1051 return -EINVAL; 1052 1053 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1054 * paging queue doorbell use the second page. The 1055 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1056 * doorbells are in the first page. So with paging queue enabled, 1057 * the max num_doorbells should + 1 page (0x400 in dword) 1058 */ 1059 if (adev->asic_type >= CHIP_VEGA10) 1060 adev->doorbell.num_doorbells += 0x400; 1061 1062 #ifdef __linux__ 1063 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1064 adev->doorbell.num_doorbells * 1065 sizeof(u32)); 1066 if (adev->doorbell.ptr == NULL) 1067 return -ENOMEM; 1068 #endif 1069 1070 return 0; 1071 } 1072 1073 /** 1074 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1075 * 1076 * @adev: amdgpu_device pointer 1077 * 1078 * Tear down doorbell driver information (CIK) 1079 */ 1080 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1081 { 1082 #ifdef __linux__ 1083 iounmap(adev->doorbell.ptr); 1084 #else 1085 if (adev->doorbell.size > 0) 1086 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1087 adev->doorbell.size); 1088 #endif 1089 adev->doorbell.ptr = NULL; 1090 } 1091 1092 1093 1094 /* 1095 * amdgpu_device_wb_*() 1096 * Writeback is the method by which the GPU updates special pages in memory 1097 * with the status of certain GPU events (fences, ring pointers,etc.). 1098 */ 1099 1100 /** 1101 * amdgpu_device_wb_fini - Disable Writeback and free memory 1102 * 1103 * @adev: amdgpu_device pointer 1104 * 1105 * Disables Writeback and frees the Writeback memory (all asics). 1106 * Used at driver shutdown. 
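 *
 * Writeback slots themselves are handed out with amdgpu_device_wb_get() and
 * returned with amdgpu_device_wb_free() (both below). A minimal sketch
 * (illustrative only; "adev" assumed valid, error handling elided):
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           read the status dword via adev->wb.wb[wb],
 *           or point the GPU at adev->wb.gpu_addr + wb * 4;
 *           amdgpu_device_wb_free(adev, wb);
 *   }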
1107 */ 1108 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1109 { 1110 if (adev->wb.wb_obj) { 1111 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1112 &adev->wb.gpu_addr, 1113 (void **)&adev->wb.wb); 1114 adev->wb.wb_obj = NULL; 1115 } 1116 } 1117 1118 /** 1119 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1120 * 1121 * @adev: amdgpu_device pointer 1122 * 1123 * Initializes writeback and allocates writeback memory (all asics). 1124 * Used at driver startup. 1125 * Returns 0 on success or an -error on failure. 1126 */ 1127 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1128 { 1129 int r; 1130 1131 if (adev->wb.wb_obj == NULL) { 1132 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1133 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1134 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1135 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1136 (void **)&adev->wb.wb); 1137 if (r) { 1138 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1139 return r; 1140 } 1141 1142 adev->wb.num_wb = AMDGPU_MAX_WB; 1143 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1144 1145 /* clear wb memory */ 1146 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1147 } 1148 1149 return 0; 1150 } 1151 1152 /** 1153 * amdgpu_device_wb_get - Allocate a wb entry 1154 * 1155 * @adev: amdgpu_device pointer 1156 * @wb: wb index 1157 * 1158 * Allocate a wb slot for use by the driver (all asics). 1159 * Returns 0 on success or -EINVAL on failure. 1160 */ 1161 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1162 { 1163 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1164 1165 if (offset < adev->wb.num_wb) { 1166 __set_bit(offset, adev->wb.used); 1167 *wb = offset << 3; /* convert to dw offset */ 1168 return 0; 1169 } else { 1170 return -EINVAL; 1171 } 1172 } 1173 1174 /** 1175 * amdgpu_device_wb_free - Free a wb entry 1176 * 1177 * @adev: amdgpu_device pointer 1178 * @wb: wb index 1179 * 1180 * Free a wb slot allocated for use by the driver (all asics) 1181 */ 1182 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1183 { 1184 wb >>= 3; 1185 if (wb < adev->wb.num_wb) 1186 __clear_bit(wb, adev->wb.used); 1187 } 1188 1189 /** 1190 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1191 * 1192 * @adev: amdgpu_device pointer 1193 * 1194 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1195 * to fail, but if any of the BARs is not accessible after the size we abort 1196 * driver loading by returning -ENODEV. 
1197 */ 1198 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1199 { 1200 #ifdef __linux__ 1201 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1202 struct pci_bus *root; 1203 struct resource *res; 1204 unsigned i; 1205 u16 cmd; 1206 int r; 1207 1208 /* Bypass for VF */ 1209 if (amdgpu_sriov_vf(adev)) 1210 return 0; 1211 1212 /* skip if the bios has already enabled large BAR */ 1213 if (adev->gmc.real_vram_size && 1214 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1215 return 0; 1216 1217 /* Check if the root BUS has 64bit memory resources */ 1218 root = adev->pdev->bus; 1219 while (root->parent) 1220 root = root->parent; 1221 1222 pci_bus_for_each_resource(root, res, i) { 1223 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1224 res->start > 0x100000000ull) 1225 break; 1226 } 1227 1228 /* Trying to resize is pointless without a root hub window above 4GB */ 1229 if (!res) 1230 return 0; 1231 1232 /* Limit the BAR size to what is available */ 1233 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1234 rbar_size); 1235 1236 /* Disable memory decoding while we change the BAR addresses and size */ 1237 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1238 pci_write_config_word(adev->pdev, PCI_COMMAND, 1239 cmd & ~PCI_COMMAND_MEMORY); 1240 1241 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1242 amdgpu_device_doorbell_fini(adev); 1243 if (adev->asic_type >= CHIP_BONAIRE) 1244 pci_release_resource(adev->pdev, 2); 1245 1246 pci_release_resource(adev->pdev, 0); 1247 1248 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1249 if (r == -ENOSPC) 1250 DRM_INFO("Not enough PCI address space for a large BAR."); 1251 else if (r && r != -ENOTSUPP) 1252 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1253 1254 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1255 1256 /* When the doorbell or fb BAR isn't available we have no chance of 1257 * using the device. 1258 */ 1259 r = amdgpu_device_doorbell_init(adev); 1260 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1261 return -ENODEV; 1262 1263 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1264 #endif /* __linux__ */ 1265 1266 return 0; 1267 } 1268 1269 /* 1270 * GPU helpers function. 1271 */ 1272 /** 1273 * amdgpu_device_need_post - check if the hw need post or not 1274 * 1275 * @adev: amdgpu_device pointer 1276 * 1277 * Check if the asic has been initialized (all asics) at driver startup 1278 * or post is needed if hw reset is performed. 1279 * Returns true if need or false if not. 
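 *
 * A minimal sketch of how this is typically used during init (illustrative
 * only; "adev" assumed valid, error handling elided): post the card through
 * the ATOM BIOS only when required:
 *
 *   if (amdgpu_device_need_post(adev))
 *           amdgpu_device_asic_init(adev);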
1280 */ 1281 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1282 { 1283 uint32_t reg; 1284 1285 if (amdgpu_sriov_vf(adev)) 1286 return false; 1287 1288 if (amdgpu_passthrough(adev)) { 1289 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1290 * some old smc fw still need driver do vPost otherwise gpu hang, while 1291 * those smc fw version above 22.15 doesn't have this flaw, so we force 1292 * vpost executed for smc version below 22.15 1293 */ 1294 if (adev->asic_type == CHIP_FIJI) { 1295 int err; 1296 uint32_t fw_ver; 1297 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1298 /* force vPost if error occured */ 1299 if (err) 1300 return true; 1301 1302 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1303 if (fw_ver < 0x00160e00) 1304 return true; 1305 } 1306 } 1307 1308 /* Don't post if we need to reset whole hive on init */ 1309 if (adev->gmc.xgmi.pending_reset) 1310 return false; 1311 1312 if (adev->has_hw_reset) { 1313 adev->has_hw_reset = false; 1314 return true; 1315 } 1316 1317 /* bios scratch used on CIK+ */ 1318 if (adev->asic_type >= CHIP_BONAIRE) 1319 return amdgpu_atombios_scratch_need_asic_init(adev); 1320 1321 /* check MEM_SIZE for older asics */ 1322 reg = amdgpu_asic_get_config_memsize(adev); 1323 1324 if ((reg != 0) && (reg != 0xffffffff)) 1325 return false; 1326 1327 return true; 1328 } 1329 1330 /* if we get transitioned to only one device, take VGA back */ 1331 /** 1332 * amdgpu_device_vga_set_decode - enable/disable vga decode 1333 * 1334 * @pdev: PCI device pointer 1335 * @state: enable/disable vga decode 1336 * 1337 * Enable/disable vga decode (all asics). 1338 * Returns VGA resource flags. 1339 */ 1340 #ifdef notyet 1341 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1342 bool state) 1343 { 1344 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1345 amdgpu_asic_set_vga_state(adev, state); 1346 if (state) 1347 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1348 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1349 else 1350 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1351 } 1352 #endif 1353 1354 /** 1355 * amdgpu_device_check_block_size - validate the vm block size 1356 * 1357 * @adev: amdgpu_device pointer 1358 * 1359 * Validates the vm block size specified via module parameter. 1360 * The vm block size defines number of bits in page table versus page directory, 1361 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1362 * page table and the remaining bits are in the page directory. 1363 */ 1364 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1365 { 1366 /* defines number of bits in page table versus page directory, 1367 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1368 * page table and the remaining bits are in the page directory */ 1369 if (amdgpu_vm_block_size == -1) 1370 return; 1371 1372 if (amdgpu_vm_block_size < 9) { 1373 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1374 amdgpu_vm_block_size); 1375 amdgpu_vm_block_size = -1; 1376 } 1377 } 1378 1379 /** 1380 * amdgpu_device_check_vm_size - validate the vm size 1381 * 1382 * @adev: amdgpu_device pointer 1383 * 1384 * Validates the vm size in GB specified via module parameter. 1385 * The VM size is the size of the GPU virtual memory space in GB. 
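 *
 * For example (illustrative value), booting with amdgpu.vm_size=256
 * requests a 256 GB GPU virtual address space per VM, while anything below
 * 1 GB is rejected here and amdgpu_vm_size falls back to -1 (the default).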
1386 */ 1387 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1388 { 1389 /* no need to check the default value */ 1390 if (amdgpu_vm_size == -1) 1391 return; 1392 1393 if (amdgpu_vm_size < 1) { 1394 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1395 amdgpu_vm_size); 1396 amdgpu_vm_size = -1; 1397 } 1398 } 1399 1400 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1401 { 1402 #ifdef __linux__ 1403 struct sysinfo si; 1404 #endif 1405 bool is_os_64 = (sizeof(void *) == 8); 1406 uint64_t total_memory; 1407 uint64_t dram_size_seven_GB = 0x1B8000000; 1408 uint64_t dram_size_three_GB = 0xB8000000; 1409 1410 if (amdgpu_smu_memory_pool_size == 0) 1411 return; 1412 1413 if (!is_os_64) { 1414 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1415 goto def_value; 1416 } 1417 #ifdef __linux__ 1418 si_meminfo(&si); 1419 total_memory = (uint64_t)si.totalram * si.mem_unit; 1420 #else 1421 total_memory = ptoa(physmem); 1422 #endif 1423 1424 if ((amdgpu_smu_memory_pool_size == 1) || 1425 (amdgpu_smu_memory_pool_size == 2)) { 1426 if (total_memory < dram_size_three_GB) 1427 goto def_value1; 1428 } else if ((amdgpu_smu_memory_pool_size == 4) || 1429 (amdgpu_smu_memory_pool_size == 8)) { 1430 if (total_memory < dram_size_seven_GB) 1431 goto def_value1; 1432 } else { 1433 DRM_WARN("Smu memory pool size not supported\n"); 1434 goto def_value; 1435 } 1436 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1437 1438 return; 1439 1440 def_value1: 1441 DRM_WARN("No enough system memory\n"); 1442 def_value: 1443 adev->pm.smu_prv_buffer_size = 0; 1444 } 1445 1446 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1447 { 1448 if (!(adev->flags & AMD_IS_APU) || 1449 adev->asic_type < CHIP_RAVEN) 1450 return 0; 1451 1452 switch (adev->asic_type) { 1453 case CHIP_RAVEN: 1454 if (adev->pdev->device == 0x15dd) 1455 adev->apu_flags |= AMD_APU_IS_RAVEN; 1456 if (adev->pdev->device == 0x15d8) 1457 adev->apu_flags |= AMD_APU_IS_PICASSO; 1458 break; 1459 case CHIP_RENOIR: 1460 if ((adev->pdev->device == 0x1636) || 1461 (adev->pdev->device == 0x164c)) 1462 adev->apu_flags |= AMD_APU_IS_RENOIR; 1463 else 1464 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1465 break; 1466 case CHIP_VANGOGH: 1467 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1468 break; 1469 case CHIP_YELLOW_CARP: 1470 break; 1471 case CHIP_CYAN_SKILLFISH: 1472 if (adev->pdev->device == 0x13FE) 1473 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1474 break; 1475 default: 1476 return -EINVAL; 1477 } 1478 1479 return 0; 1480 } 1481 1482 /** 1483 * amdgpu_device_check_arguments - validate module params 1484 * 1485 * @adev: amdgpu_device pointer 1486 * 1487 * Validates certain module parameters and updates 1488 * the associated values used by the driver (all asics). 
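 *
 * For example (illustrative values), amdgpu.sched_jobs=6 is rounded up to
 * the next power of two (8) and anything below 4 is raised to 4, while an
 * amdgpu.vm_fragment_size outside the 4..9 range is reset to the default.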
1489 */ 1490 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1491 { 1492 if (amdgpu_sched_jobs < 4) { 1493 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1494 amdgpu_sched_jobs); 1495 amdgpu_sched_jobs = 4; 1496 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1497 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1498 amdgpu_sched_jobs); 1499 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1500 } 1501 1502 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1503 /* gart size must be greater or equal to 32M */ 1504 dev_warn(adev->dev, "gart size (%d) too small\n", 1505 amdgpu_gart_size); 1506 amdgpu_gart_size = -1; 1507 } 1508 1509 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1510 /* gtt size must be greater or equal to 32M */ 1511 dev_warn(adev->dev, "gtt size (%d) too small\n", 1512 amdgpu_gtt_size); 1513 amdgpu_gtt_size = -1; 1514 } 1515 1516 /* valid range is between 4 and 9 inclusive */ 1517 if (amdgpu_vm_fragment_size != -1 && 1518 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1519 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1520 amdgpu_vm_fragment_size = -1; 1521 } 1522 1523 if (amdgpu_sched_hw_submission < 2) { 1524 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1525 amdgpu_sched_hw_submission); 1526 amdgpu_sched_hw_submission = 2; 1527 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1528 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1529 amdgpu_sched_hw_submission); 1530 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1531 } 1532 1533 amdgpu_device_check_smu_prv_buffer_size(adev); 1534 1535 amdgpu_device_check_vm_size(adev); 1536 1537 amdgpu_device_check_block_size(adev); 1538 1539 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1540 1541 amdgpu_gmc_tmz_set(adev); 1542 1543 amdgpu_gmc_noretry_set(adev); 1544 1545 return 0; 1546 } 1547 1548 #ifdef __linux__ 1549 /** 1550 * amdgpu_switcheroo_set_state - set switcheroo state 1551 * 1552 * @pdev: pci dev pointer 1553 * @state: vga_switcheroo state 1554 * 1555 * Callback for the switcheroo driver. Suspends or resumes the 1556 * the asics before or after it is powered up using ACPI methods. 
1557 */ 1558 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1559 enum vga_switcheroo_state state) 1560 { 1561 struct drm_device *dev = pci_get_drvdata(pdev); 1562 int r; 1563 1564 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1565 return; 1566 1567 if (state == VGA_SWITCHEROO_ON) { 1568 pr_info("switched on\n"); 1569 /* don't suspend or resume card normally */ 1570 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1571 1572 pci_set_power_state(pdev, PCI_D0); 1573 amdgpu_device_load_pci_state(pdev); 1574 r = pci_enable_device(pdev); 1575 if (r) 1576 DRM_WARN("pci_enable_device failed (%d)\n", r); 1577 amdgpu_device_resume(dev, true); 1578 1579 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1580 } else { 1581 pr_info("switched off\n"); 1582 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1583 amdgpu_device_suspend(dev, true); 1584 amdgpu_device_cache_pci_state(pdev); 1585 /* Shut down the device */ 1586 pci_disable_device(pdev); 1587 pci_set_power_state(pdev, PCI_D3cold); 1588 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1589 } 1590 } 1591 1592 /** 1593 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1594 * 1595 * @pdev: pci dev pointer 1596 * 1597 * Callback for the switcheroo driver. Check of the switcheroo 1598 * state can be changed. 1599 * Returns true if the state can be changed, false if not. 1600 */ 1601 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1602 { 1603 struct drm_device *dev = pci_get_drvdata(pdev); 1604 1605 /* 1606 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1607 * locking inversion with the driver load path. And the access here is 1608 * completely racy anyway. So don't bother with locking for now. 1609 */ 1610 return atomic_read(&dev->open_count) == 0; 1611 } 1612 #endif /* __linux__ */ 1613 1614 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1615 #ifdef notyet 1616 .set_gpu_state = amdgpu_switcheroo_set_state, 1617 .reprobe = NULL, 1618 .can_switch = amdgpu_switcheroo_can_switch, 1619 #endif 1620 }; 1621 1622 /** 1623 * amdgpu_device_ip_set_clockgating_state - set the CG state 1624 * 1625 * @dev: amdgpu_device pointer 1626 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1627 * @state: clockgating state (gate or ungate) 1628 * 1629 * Sets the requested clockgating state for all instances of 1630 * the hardware IP specified. 1631 * Returns the error code from the last instance. 1632 */ 1633 int amdgpu_device_ip_set_clockgating_state(void *dev, 1634 enum amd_ip_block_type block_type, 1635 enum amd_clockgating_state state) 1636 { 1637 struct amdgpu_device *adev = dev; 1638 int i, r = 0; 1639 1640 for (i = 0; i < adev->num_ip_blocks; i++) { 1641 if (!adev->ip_blocks[i].status.valid) 1642 continue; 1643 if (adev->ip_blocks[i].version->type != block_type) 1644 continue; 1645 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1646 continue; 1647 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1648 (void *)adev, state); 1649 if (r) 1650 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1651 adev->ip_blocks[i].version->funcs->name, r); 1652 } 1653 return r; 1654 } 1655 1656 /** 1657 * amdgpu_device_ip_set_powergating_state - set the PG state 1658 * 1659 * @dev: amdgpu_device pointer 1660 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1661 * @state: powergating state (gate or ungate) 1662 * 1663 * Sets the requested powergating state for all instances of 1664 * the hardware IP specified. 1665 * Returns the error code from the last instance. 1666 */ 1667 int amdgpu_device_ip_set_powergating_state(void *dev, 1668 enum amd_ip_block_type block_type, 1669 enum amd_powergating_state state) 1670 { 1671 struct amdgpu_device *adev = dev; 1672 int i, r = 0; 1673 1674 for (i = 0; i < adev->num_ip_blocks; i++) { 1675 if (!adev->ip_blocks[i].status.valid) 1676 continue; 1677 if (adev->ip_blocks[i].version->type != block_type) 1678 continue; 1679 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1680 continue; 1681 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1682 (void *)adev, state); 1683 if (r) 1684 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1685 adev->ip_blocks[i].version->funcs->name, r); 1686 } 1687 return r; 1688 } 1689 1690 /** 1691 * amdgpu_device_ip_get_clockgating_state - get the CG state 1692 * 1693 * @adev: amdgpu_device pointer 1694 * @flags: clockgating feature flags 1695 * 1696 * Walks the list of IPs on the device and updates the clockgating 1697 * flags for each IP. 1698 * Updates @flags with the feature flags for each hardware IP where 1699 * clockgating is enabled. 1700 */ 1701 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1702 u32 *flags) 1703 { 1704 int i; 1705 1706 for (i = 0; i < adev->num_ip_blocks; i++) { 1707 if (!adev->ip_blocks[i].status.valid) 1708 continue; 1709 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1710 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1711 } 1712 } 1713 1714 /** 1715 * amdgpu_device_ip_wait_for_idle - wait for idle 1716 * 1717 * @adev: amdgpu_device pointer 1718 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1719 * 1720 * Waits for the request hardware IP to be idle. 1721 * Returns 0 for success or a negative error code on failure. 1722 */ 1723 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1724 enum amd_ip_block_type block_type) 1725 { 1726 int i, r; 1727 1728 for (i = 0; i < adev->num_ip_blocks; i++) { 1729 if (!adev->ip_blocks[i].status.valid) 1730 continue; 1731 if (adev->ip_blocks[i].version->type == block_type) { 1732 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1733 if (r) 1734 return r; 1735 break; 1736 } 1737 } 1738 return 0; 1739 1740 } 1741 1742 /** 1743 * amdgpu_device_ip_is_idle - is the hardware IP idle 1744 * 1745 * @adev: amdgpu_device pointer 1746 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1747 * 1748 * Check if the hardware IP is idle or not. 1749 * Returns true if it the IP is idle, false if not. 1750 */ 1751 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1752 enum amd_ip_block_type block_type) 1753 { 1754 int i; 1755 1756 for (i = 0; i < adev->num_ip_blocks; i++) { 1757 if (!adev->ip_blocks[i].status.valid) 1758 continue; 1759 if (adev->ip_blocks[i].version->type == block_type) 1760 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1761 } 1762 return true; 1763 1764 } 1765 1766 /** 1767 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1768 * 1769 * @adev: amdgpu_device pointer 1770 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1771 * 1772 * Returns a pointer to the hardware IP block structure 1773 * if it exists for the asic, otherwise NULL. 
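 *
 * A minimal sketch (illustrative only; "adev" assumed valid): look up the
 * GFX IP block and branch on its major version; the
 * amdgpu_device_ip_block_version_cmp() helper below wraps this pattern:
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip_block && ip_block->version->major >= 9)
 *           treat the asic as GFX9 or newer;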
1774 */ 1775 struct amdgpu_ip_block * 1776 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1777 enum amd_ip_block_type type) 1778 { 1779 int i; 1780 1781 for (i = 0; i < adev->num_ip_blocks; i++) 1782 if (adev->ip_blocks[i].version->type == type) 1783 return &adev->ip_blocks[i]; 1784 1785 return NULL; 1786 } 1787 1788 /** 1789 * amdgpu_device_ip_block_version_cmp 1790 * 1791 * @adev: amdgpu_device pointer 1792 * @type: enum amd_ip_block_type 1793 * @major: major version 1794 * @minor: minor version 1795 * 1796 * return 0 if equal or greater 1797 * return 1 if smaller or the ip_block doesn't exist 1798 */ 1799 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1800 enum amd_ip_block_type type, 1801 u32 major, u32 minor) 1802 { 1803 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1804 1805 if (ip_block && ((ip_block->version->major > major) || 1806 ((ip_block->version->major == major) && 1807 (ip_block->version->minor >= minor)))) 1808 return 0; 1809 1810 return 1; 1811 } 1812 1813 /** 1814 * amdgpu_device_ip_block_add 1815 * 1816 * @adev: amdgpu_device pointer 1817 * @ip_block_version: pointer to the IP to add 1818 * 1819 * Adds the IP block driver information to the collection of IPs 1820 * on the asic. 1821 */ 1822 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1823 const struct amdgpu_ip_block_version *ip_block_version) 1824 { 1825 if (!ip_block_version) 1826 return -EINVAL; 1827 1828 switch (ip_block_version->type) { 1829 case AMD_IP_BLOCK_TYPE_VCN: 1830 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1831 return 0; 1832 break; 1833 case AMD_IP_BLOCK_TYPE_JPEG: 1834 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1835 return 0; 1836 break; 1837 default: 1838 break; 1839 } 1840 1841 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1842 ip_block_version->funcs->name); 1843 1844 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1845 1846 return 0; 1847 } 1848 1849 /** 1850 * amdgpu_device_enable_virtual_display - enable virtual display feature 1851 * 1852 * @adev: amdgpu_device pointer 1853 * 1854 * Enabled the virtual display feature if the user has enabled it via 1855 * the module parameter virtual_display. This feature provides a virtual 1856 * display hardware on headless boards or in virtualized environments. 1857 * This function parses and validates the configuration string specified by 1858 * the user and configues the virtual display configuration (number of 1859 * virtual connectors, crtcs, etc.) specified. 
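 *
 * The parameter is a semicolon separated list of entries of the form
 * "<pci address>[,<number of crtcs>]", where the address may also be the
 * literal "all" and the crtc count is clamped to 1..6 (default 1).
 * For example (illustrative address):
 *
 *   amdgpu.virtual_display=0000:01:00.0,2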
1860 */ 1861 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1862 { 1863 adev->enable_virtual_display = false; 1864 1865 #ifdef notyet 1866 if (amdgpu_virtual_display) { 1867 const char *pci_address_name = pci_name(adev->pdev); 1868 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1869 1870 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1871 pciaddstr_tmp = pciaddstr; 1872 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1873 pciaddname = strsep(&pciaddname_tmp, ","); 1874 if (!strcmp("all", pciaddname) 1875 || !strcmp(pci_address_name, pciaddname)) { 1876 long num_crtc; 1877 int res = -1; 1878 1879 adev->enable_virtual_display = true; 1880 1881 if (pciaddname_tmp) 1882 res = kstrtol(pciaddname_tmp, 10, 1883 &num_crtc); 1884 1885 if (!res) { 1886 if (num_crtc < 1) 1887 num_crtc = 1; 1888 if (num_crtc > 6) 1889 num_crtc = 6; 1890 adev->mode_info.num_crtc = num_crtc; 1891 } else { 1892 adev->mode_info.num_crtc = 1; 1893 } 1894 break; 1895 } 1896 } 1897 1898 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1899 amdgpu_virtual_display, pci_address_name, 1900 adev->enable_virtual_display, adev->mode_info.num_crtc); 1901 1902 kfree(pciaddstr); 1903 } 1904 #endif 1905 } 1906 1907 /** 1908 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1909 * 1910 * @adev: amdgpu_device pointer 1911 * 1912 * Parses the asic configuration parameters specified in the gpu info 1913 * firmware and makes them availale to the driver for use in configuring 1914 * the asic. 1915 * Returns 0 on success, -EINVAL on failure. 1916 */ 1917 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1918 { 1919 const char *chip_name; 1920 char fw_name[40]; 1921 int err; 1922 const struct gpu_info_firmware_header_v1_0 *hdr; 1923 1924 adev->firmware.gpu_info_fw = NULL; 1925 1926 if (adev->mman.discovery_bin) { 1927 amdgpu_discovery_get_gfx_info(adev); 1928 1929 /* 1930 * FIXME: The bounding box is still needed by Navi12, so 1931 * temporarily read it from gpu_info firmware. Should be droped 1932 * when DAL no longer needs it. 
1933 */ 1934 if (adev->asic_type != CHIP_NAVI12) 1935 return 0; 1936 } 1937 1938 switch (adev->asic_type) { 1939 #ifdef CONFIG_DRM_AMDGPU_SI 1940 case CHIP_VERDE: 1941 case CHIP_TAHITI: 1942 case CHIP_PITCAIRN: 1943 case CHIP_OLAND: 1944 case CHIP_HAINAN: 1945 #endif 1946 #ifdef CONFIG_DRM_AMDGPU_CIK 1947 case CHIP_BONAIRE: 1948 case CHIP_HAWAII: 1949 case CHIP_KAVERI: 1950 case CHIP_KABINI: 1951 case CHIP_MULLINS: 1952 #endif 1953 case CHIP_TOPAZ: 1954 case CHIP_TONGA: 1955 case CHIP_FIJI: 1956 case CHIP_POLARIS10: 1957 case CHIP_POLARIS11: 1958 case CHIP_POLARIS12: 1959 case CHIP_VEGAM: 1960 case CHIP_CARRIZO: 1961 case CHIP_STONEY: 1962 case CHIP_VEGA20: 1963 case CHIP_ALDEBARAN: 1964 case CHIP_SIENNA_CICHLID: 1965 case CHIP_NAVY_FLOUNDER: 1966 case CHIP_DIMGREY_CAVEFISH: 1967 case CHIP_BEIGE_GOBY: 1968 default: 1969 return 0; 1970 case CHIP_VEGA10: 1971 chip_name = "vega10"; 1972 break; 1973 case CHIP_VEGA12: 1974 chip_name = "vega12"; 1975 break; 1976 case CHIP_RAVEN: 1977 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1978 chip_name = "raven2"; 1979 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1980 chip_name = "picasso"; 1981 else 1982 chip_name = "raven"; 1983 break; 1984 case CHIP_ARCTURUS: 1985 chip_name = "arcturus"; 1986 break; 1987 case CHIP_RENOIR: 1988 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1989 chip_name = "renoir"; 1990 else 1991 chip_name = "green_sardine"; 1992 break; 1993 case CHIP_NAVI10: 1994 chip_name = "navi10"; 1995 break; 1996 case CHIP_NAVI14: 1997 chip_name = "navi14"; 1998 break; 1999 case CHIP_NAVI12: 2000 chip_name = "navi12"; 2001 break; 2002 case CHIP_VANGOGH: 2003 chip_name = "vangogh"; 2004 break; 2005 case CHIP_YELLOW_CARP: 2006 chip_name = "yellow_carp"; 2007 break; 2008 } 2009 2010 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2011 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2012 if (err) { 2013 dev_err(adev->dev, 2014 "Failed to load gpu_info firmware \"%s\"\n", 2015 fw_name); 2016 goto out; 2017 } 2018 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2019 if (err) { 2020 dev_err(adev->dev, 2021 "Failed to validate gpu_info firmware \"%s\"\n", 2022 fw_name); 2023 goto out; 2024 } 2025 2026 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2027 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2028 2029 switch (hdr->version_major) { 2030 case 1: 2031 { 2032 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2033 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2034 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2035 2036 /* 2037 * Should be droped when DAL no longer needs it. 
2038 */ 2039 if (adev->asic_type == CHIP_NAVI12) 2040 goto parse_soc_bounding_box; 2041 2042 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2043 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2044 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2045 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2046 adev->gfx.config.max_texture_channel_caches = 2047 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2048 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2049 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2050 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2051 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2052 adev->gfx.config.double_offchip_lds_buf = 2053 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2054 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2055 adev->gfx.cu_info.max_waves_per_simd = 2056 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2057 adev->gfx.cu_info.max_scratch_slots_per_cu = 2058 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2059 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2060 if (hdr->version_minor >= 1) { 2061 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2062 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2063 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2064 adev->gfx.config.num_sc_per_sh = 2065 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2066 adev->gfx.config.num_packer_per_sc = 2067 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2068 } 2069 2070 parse_soc_bounding_box: 2071 /* 2072 * soc bounding box info is not integrated in disocovery table, 2073 * we always need to parse it from gpu info firmware if needed. 2074 */ 2075 if (hdr->version_minor == 2) { 2076 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2077 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2078 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2079 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2080 } 2081 break; 2082 } 2083 default: 2084 dev_err(adev->dev, 2085 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2086 err = -EINVAL; 2087 goto out; 2088 } 2089 out: 2090 return err; 2091 } 2092 2093 /** 2094 * amdgpu_device_ip_early_init - run early init for hardware IPs 2095 * 2096 * @adev: amdgpu_device pointer 2097 * 2098 * Early initialization pass for hardware IPs. The hardware IPs that make 2099 * up each asic are discovered each IP's early_init callback is run. This 2100 * is the first stage in initializing the asic. 2101 * Returns 0 on success, negative error code on failure. 
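 *
 * A minimal sketch of the shape of an early_init callback (a hypothetical
 * "foo" block, for illustration only; real callbacks typically validate
 * their configuration and fill in function pointers here):
 *
 *   static int foo_early_init(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *           adev->foo.funcs = &foo_funcs;
 *           return 0;
 *   }
 *
 * Returning -ENOENT from early_init marks the block as not present; it is
 * skipped rather than treated as a fatal error.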
2102 */ 2103 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2104 { 2105 struct drm_device *dev = adev_to_drm(adev); 2106 struct pci_dev *parent; 2107 int i, r; 2108 2109 amdgpu_device_enable_virtual_display(adev); 2110 2111 if (amdgpu_sriov_vf(adev)) { 2112 r = amdgpu_virt_request_full_gpu(adev, true); 2113 if (r) 2114 return r; 2115 } 2116 2117 switch (adev->asic_type) { 2118 #ifdef CONFIG_DRM_AMDGPU_SI 2119 case CHIP_VERDE: 2120 case CHIP_TAHITI: 2121 case CHIP_PITCAIRN: 2122 case CHIP_OLAND: 2123 case CHIP_HAINAN: 2124 adev->family = AMDGPU_FAMILY_SI; 2125 r = si_set_ip_blocks(adev); 2126 if (r) 2127 return r; 2128 break; 2129 #endif 2130 #ifdef CONFIG_DRM_AMDGPU_CIK 2131 case CHIP_BONAIRE: 2132 case CHIP_HAWAII: 2133 case CHIP_KAVERI: 2134 case CHIP_KABINI: 2135 case CHIP_MULLINS: 2136 if (adev->flags & AMD_IS_APU) 2137 adev->family = AMDGPU_FAMILY_KV; 2138 else 2139 adev->family = AMDGPU_FAMILY_CI; 2140 2141 r = cik_set_ip_blocks(adev); 2142 if (r) 2143 return r; 2144 break; 2145 #endif 2146 case CHIP_TOPAZ: 2147 case CHIP_TONGA: 2148 case CHIP_FIJI: 2149 case CHIP_POLARIS10: 2150 case CHIP_POLARIS11: 2151 case CHIP_POLARIS12: 2152 case CHIP_VEGAM: 2153 case CHIP_CARRIZO: 2154 case CHIP_STONEY: 2155 if (adev->flags & AMD_IS_APU) 2156 adev->family = AMDGPU_FAMILY_CZ; 2157 else 2158 adev->family = AMDGPU_FAMILY_VI; 2159 2160 r = vi_set_ip_blocks(adev); 2161 if (r) 2162 return r; 2163 break; 2164 case CHIP_VEGA10: 2165 case CHIP_VEGA12: 2166 case CHIP_VEGA20: 2167 case CHIP_RAVEN: 2168 case CHIP_ARCTURUS: 2169 case CHIP_RENOIR: 2170 case CHIP_ALDEBARAN: 2171 if (adev->flags & AMD_IS_APU) 2172 adev->family = AMDGPU_FAMILY_RV; 2173 else 2174 adev->family = AMDGPU_FAMILY_AI; 2175 2176 r = soc15_set_ip_blocks(adev); 2177 if (r) 2178 return r; 2179 break; 2180 case CHIP_NAVI10: 2181 case CHIP_NAVI14: 2182 case CHIP_NAVI12: 2183 case CHIP_SIENNA_CICHLID: 2184 case CHIP_NAVY_FLOUNDER: 2185 case CHIP_DIMGREY_CAVEFISH: 2186 case CHIP_BEIGE_GOBY: 2187 case CHIP_VANGOGH: 2188 case CHIP_YELLOW_CARP: 2189 case CHIP_CYAN_SKILLFISH: 2190 if (adev->asic_type == CHIP_VANGOGH) 2191 adev->family = AMDGPU_FAMILY_VGH; 2192 else if (adev->asic_type == CHIP_YELLOW_CARP) 2193 adev->family = AMDGPU_FAMILY_YC; 2194 else 2195 adev->family = AMDGPU_FAMILY_NV; 2196 2197 r = nv_set_ip_blocks(adev); 2198 if (r) 2199 return r; 2200 break; 2201 default: 2202 /* FIXME: not supported yet */ 2203 return -EINVAL; 2204 } 2205 2206 if (amdgpu_has_atpx() && 2207 (amdgpu_is_atpx_hybrid() || 2208 amdgpu_has_atpx_dgpu_power_cntl()) && 2209 ((adev->flags & AMD_IS_APU) == 0) && 2210 !pci_is_thunderbolt_attached(dev->pdev)) 2211 adev->flags |= AMD_IS_PX; 2212 2213 if (!(adev->flags & AMD_IS_APU)) { 2214 parent = pci_upstream_bridge(adev->pdev); 2215 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2216 } 2217 2218 amdgpu_amdkfd_device_probe(adev); 2219 2220 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2221 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2222 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2223 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2224 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2225 2226 for (i = 0; i < adev->num_ip_blocks; i++) { 2227 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2228 DRM_ERROR("disabled ip block: %d <%s>\n", 2229 i, adev->ip_blocks[i].version->funcs->name); 2230 adev->ip_blocks[i].status.valid = false; 2231 } else { 2232 if (adev->ip_blocks[i].version->funcs->early_init) { 2233 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2234 if (r == -ENOENT) { 2235 adev->ip_blocks[i].status.valid = false; 2236 } else if (r) { 2237 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2238 adev->ip_blocks[i].version->funcs->name, r); 2239 return r; 2240 } else { 2241 adev->ip_blocks[i].status.valid = true; 2242 } 2243 } else { 2244 adev->ip_blocks[i].status.valid = true; 2245 } 2246 } 2247 /* get the vbios after the asic_funcs are set up */ 2248 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2249 r = amdgpu_device_parse_gpu_info_fw(adev); 2250 if (r) 2251 return r; 2252 2253 /* Read BIOS */ 2254 if (!amdgpu_get_bios(adev)) 2255 return -EINVAL; 2256 2257 r = amdgpu_atombios_init(adev); 2258 if (r) { 2259 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2260 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2261 return r; 2262 } 2263 2264 /*get pf2vf msg info at it's earliest time*/ 2265 if (amdgpu_sriov_vf(adev)) 2266 amdgpu_virt_init_data_exchange(adev); 2267 2268 } 2269 } 2270 2271 adev->cg_flags &= amdgpu_cg_mask; 2272 adev->pg_flags &= amdgpu_pg_mask; 2273 2274 return 0; 2275 } 2276 2277 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2278 { 2279 int i, r; 2280 2281 for (i = 0; i < adev->num_ip_blocks; i++) { 2282 if (!adev->ip_blocks[i].status.sw) 2283 continue; 2284 if (adev->ip_blocks[i].status.hw) 2285 continue; 2286 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2287 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2288 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2289 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2290 if (r) { 2291 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2292 adev->ip_blocks[i].version->funcs->name, r); 2293 return r; 2294 } 2295 adev->ip_blocks[i].status.hw = true; 2296 } 2297 } 2298 2299 return 0; 2300 } 2301 2302 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2303 { 2304 int i, r; 2305 2306 for (i = 0; i < adev->num_ip_blocks; i++) { 2307 if (!adev->ip_blocks[i].status.sw) 2308 continue; 2309 if (adev->ip_blocks[i].status.hw) 2310 continue; 2311 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2312 if (r) { 2313 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2314 adev->ip_blocks[i].version->funcs->name, r); 2315 return r; 2316 } 2317 adev->ip_blocks[i].status.hw = true; 2318 } 2319 2320 return 0; 2321 } 2322 2323 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2324 { 2325 int r = 0; 2326 int i; 2327 uint32_t smu_version; 2328 2329 if (adev->asic_type >= CHIP_VEGA10) { 2330 for (i = 0; i < adev->num_ip_blocks; i++) { 2331 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2332 continue; 2333 2334 if 
(!adev->ip_blocks[i].status.sw) 2335 continue; 2336 2337 /* no need to do the fw loading again if already done*/ 2338 if (adev->ip_blocks[i].status.hw == true) 2339 break; 2340 2341 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2342 r = adev->ip_blocks[i].version->funcs->resume(adev); 2343 if (r) { 2344 DRM_ERROR("resume of IP block <%s> failed %d\n", 2345 adev->ip_blocks[i].version->funcs->name, r); 2346 return r; 2347 } 2348 } else { 2349 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2350 if (r) { 2351 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2352 adev->ip_blocks[i].version->funcs->name, r); 2353 return r; 2354 } 2355 } 2356 2357 adev->ip_blocks[i].status.hw = true; 2358 break; 2359 } 2360 } 2361 2362 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2363 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2364 2365 return r; 2366 } 2367 2368 /** 2369 * amdgpu_device_ip_init - run init for hardware IPs 2370 * 2371 * @adev: amdgpu_device pointer 2372 * 2373 * Main initialization pass for hardware IPs. The list of all the hardware 2374 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2375 * are run. sw_init initializes the software state associated with each IP 2376 * and hw_init initializes the hardware associated with each IP. 2377 * Returns 0 on success, negative error code on failure. 2378 */ 2379 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2380 { 2381 int i, r; 2382 2383 r = amdgpu_ras_init(adev); 2384 if (r) 2385 return r; 2386 2387 for (i = 0; i < adev->num_ip_blocks; i++) { 2388 if (!adev->ip_blocks[i].status.valid) 2389 continue; 2390 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2391 if (r) { 2392 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2393 adev->ip_blocks[i].version->funcs->name, r); 2394 goto init_failed; 2395 } 2396 adev->ip_blocks[i].status.sw = true; 2397 2398 /* need to do gmc hw init early so we can allocate gpu mem */ 2399 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2400 r = amdgpu_device_vram_scratch_init(adev); 2401 if (r) { 2402 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2403 goto init_failed; 2404 } 2405 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2406 if (r) { 2407 DRM_ERROR("hw_init %d failed %d\n", i, r); 2408 goto init_failed; 2409 } 2410 r = amdgpu_device_wb_init(adev); 2411 if (r) { 2412 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2413 goto init_failed; 2414 } 2415 adev->ip_blocks[i].status.hw = true; 2416 2417 /* right after GMC hw init, we create CSA */ 2418 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2419 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2420 AMDGPU_GEM_DOMAIN_VRAM, 2421 AMDGPU_CSA_SIZE); 2422 if (r) { 2423 DRM_ERROR("allocate CSA failed %d\n", r); 2424 goto init_failed; 2425 } 2426 } 2427 } 2428 } 2429 2430 if (amdgpu_sriov_vf(adev)) 2431 amdgpu_virt_init_data_exchange(adev); 2432 2433 r = amdgpu_ib_pool_init(adev); 2434 if (r) { 2435 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2436 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2437 goto init_failed; 2438 } 2439 2440 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2441 if (r) 2442 goto init_failed; 2443 2444 r = amdgpu_amdkfd_resume_iommu(adev); 2445 if (r) 2446 goto init_failed; 2447 2448 r = amdgpu_device_ip_hw_init_phase1(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 r = amdgpu_device_fw_loading(adev); 2453 if (r) 2454 goto init_failed; 2455 2456 r = 
amdgpu_device_ip_hw_init_phase2(adev); 2457 if (r) 2458 goto init_failed; 2459 2460 /* 2461 * retired pages will be loaded from eeprom and reserved here, 2462 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2463 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2464 * for I2C communication which only true at this point. 2465 * 2466 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2467 * failure from bad gpu situation and stop amdgpu init process 2468 * accordingly. For other failed cases, it will still release all 2469 * the resource and print error message, rather than returning one 2470 * negative value to upper level. 2471 * 2472 * Note: theoretically, this should be called before all vram allocations 2473 * to protect retired page from abusing 2474 */ 2475 r = amdgpu_ras_recovery_init(adev); 2476 if (r) 2477 goto init_failed; 2478 2479 if (adev->gmc.xgmi.num_physical_nodes > 1) 2480 amdgpu_xgmi_add_device(adev); 2481 2482 /* Don't init kfd if whole hive need to be reset during init */ 2483 if (!adev->gmc.xgmi.pending_reset) 2484 amdgpu_amdkfd_device_init(adev); 2485 2486 amdgpu_fru_get_product_info(adev); 2487 2488 init_failed: 2489 if (amdgpu_sriov_vf(adev)) 2490 amdgpu_virt_release_full_gpu(adev, true); 2491 2492 return r; 2493 } 2494 2495 /** 2496 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2497 * 2498 * @adev: amdgpu_device pointer 2499 * 2500 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2501 * this function before a GPU reset. If the value is retained after a 2502 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2503 */ 2504 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2505 { 2506 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2507 } 2508 2509 /** 2510 * amdgpu_device_check_vram_lost - check if vram is valid 2511 * 2512 * @adev: amdgpu_device pointer 2513 * 2514 * Checks the reset magic value written to the gart pointer in VRAM. 2515 * The driver calls this after a GPU reset to see if the contents of 2516 * VRAM is lost or now. 2517 * returns true if vram is lost, false if not. 2518 */ 2519 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2520 { 2521 if (memcmp(adev->gart.ptr, adev->reset_magic, 2522 AMDGPU_RESET_MAGIC_NUM)) 2523 return true; 2524 2525 if (!amdgpu_in_reset(adev)) 2526 return false; 2527 2528 /* 2529 * For all ASICs with baco/mode1 reset, the VRAM is 2530 * always assumed to be lost. 2531 */ 2532 switch (amdgpu_asic_reset_method(adev)) { 2533 case AMD_RESET_METHOD_BACO: 2534 case AMD_RESET_METHOD_MODE1: 2535 return true; 2536 default: 2537 return false; 2538 } 2539 } 2540 2541 /** 2542 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2543 * 2544 * @adev: amdgpu_device pointer 2545 * @state: clockgating state (gate or ungate) 2546 * 2547 * The list of all the hardware IPs that make up the asic is walked and the 2548 * set_clockgating_state callbacks are run. 2549 * Late initialization pass enabling clockgating for hardware IPs. 2550 * Fini or suspend, pass disabling clockgating for hardware IPs. 2551 * Returns 0 on success, negative error code on failure. 2552 */ 2553 2554 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2555 enum amd_clockgating_state state) 2556 { 2557 int i, j, r; 2558 2559 if (amdgpu_emu_mode == 1) 2560 return 0; 2561 2562 for (j = 0; j < adev->num_ip_blocks; j++) { 2563 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2564 if (!adev->ip_blocks[i].status.late_initialized) 2565 continue; 2566 /* skip CG for GFX on S0ix */ 2567 if (adev->in_s0ix && 2568 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2569 continue; 2570 /* skip CG for VCE/UVD, it's handled specially */ 2571 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2572 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2573 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2574 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2575 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2576 /* enable clockgating to save power */ 2577 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2578 state); 2579 if (r) { 2580 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2581 adev->ip_blocks[i].version->funcs->name, r); 2582 return r; 2583 } 2584 } 2585 } 2586 2587 return 0; 2588 } 2589 2590 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2591 enum amd_powergating_state state) 2592 { 2593 int i, j, r; 2594 2595 if (amdgpu_emu_mode == 1) 2596 return 0; 2597 2598 for (j = 0; j < adev->num_ip_blocks; j++) { 2599 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2600 if (!adev->ip_blocks[i].status.late_initialized) 2601 continue; 2602 /* skip PG for GFX on S0ix */ 2603 if (adev->in_s0ix && 2604 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2605 continue; 2606 /* skip CG for VCE/UVD, it's handled specially */ 2607 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2608 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2609 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2610 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2611 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2612 /* enable powergating to save power */ 2613 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2614 state); 2615 if (r) { 2616 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2617 adev->ip_blocks[i].version->funcs->name, r); 2618 return r; 2619 } 2620 } 2621 } 2622 return 0; 2623 } 2624 2625 static int amdgpu_device_enable_mgpu_fan_boost(void) 2626 { 2627 struct amdgpu_gpu_instance *gpu_ins; 2628 struct amdgpu_device *adev; 2629 int i, ret = 0; 2630 2631 mutex_lock(&mgpu_info.mutex); 2632 2633 /* 2634 * MGPU fan boost feature should be enabled 2635 * only when there are two or more dGPUs in 2636 * the system 2637 */ 2638 if (mgpu_info.num_dgpu < 2) 2639 goto out; 2640 2641 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2642 gpu_ins = &(mgpu_info.gpu_ins[i]); 2643 adev = gpu_ins->adev; 2644 if (!(adev->flags & AMD_IS_APU) && 2645 !gpu_ins->mgpu_fan_enabled) { 2646 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2647 if (ret) 2648 break; 2649 2650 gpu_ins->mgpu_fan_enabled = 1; 2651 } 2652 } 2653 2654 out: 2655 mutex_unlock(&mgpu_info.mutex); 2656 2657 return ret; 2658 } 2659 2660 /** 2661 * amdgpu_device_ip_late_init - run late init for hardware IPs 2662 * 2663 * @adev: amdgpu_device pointer 2664 * 2665 * Late initialization pass for hardware IPs. The list of all the hardware 2666 * IPs that make up the asic is walked and the late_init callbacks are run. 2667 * late_init covers any special initialization that an IP requires 2668 * after all of the have been initialized or something that needs to happen 2669 * late in the init process. 2670 * Returns 0 on success, negative error code on failure. 
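 *
 * After the per-IP late_init callbacks have run, this function also
 * enables clock- and powergating and records the VRAM "reset magic":
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *   amdgpu_device_fill_reset_magic(adev);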
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
	if (adev->asic_type == CHIP_ARCTURUS &&
	    amdgpu_passthrough(adev) &&
	    adev->gmc.xgmi.num_physical_nodes > 1)
		smu_set_light_sbr(&adev->smu, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted high.
		 *
		 * This should be performed only after all devices from the
		 * same hive have been initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is only counted up as each device initializes.
		 *
		 * So wait until all XGMI-interlinked devices have initialized.
		 * This may add some delay, since those devices may come from
		 * different hives. But that should be OK.
2723 */ 2724 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2725 for (i = 0; i < mgpu_info.num_gpu; i++) { 2726 gpu_instance = &(mgpu_info.gpu_ins[i]); 2727 if (gpu_instance->adev->flags & AMD_IS_APU) 2728 continue; 2729 2730 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2731 AMDGPU_XGMI_PSTATE_MIN); 2732 if (r) { 2733 DRM_ERROR("pstate setting failed (%d).\n", r); 2734 break; 2735 } 2736 } 2737 } 2738 2739 mutex_unlock(&mgpu_info.mutex); 2740 } 2741 2742 return 0; 2743 } 2744 2745 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2746 { 2747 int i, r; 2748 2749 for (i = 0; i < adev->num_ip_blocks; i++) { 2750 if (!adev->ip_blocks[i].version->funcs->early_fini) 2751 continue; 2752 2753 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2754 if (r) { 2755 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2756 adev->ip_blocks[i].version->funcs->name, r); 2757 } 2758 } 2759 2760 amdgpu_amdkfd_suspend(adev, false); 2761 2762 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2763 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2764 2765 /* need to disable SMC first */ 2766 for (i = 0; i < adev->num_ip_blocks; i++) { 2767 if (!adev->ip_blocks[i].status.hw) 2768 continue; 2769 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2770 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2771 /* XXX handle errors */ 2772 if (r) { 2773 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2774 adev->ip_blocks[i].version->funcs->name, r); 2775 } 2776 adev->ip_blocks[i].status.hw = false; 2777 break; 2778 } 2779 } 2780 2781 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2782 if (!adev->ip_blocks[i].status.hw) 2783 continue; 2784 2785 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2786 /* XXX handle errors */ 2787 if (r) { 2788 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2789 adev->ip_blocks[i].version->funcs->name, r); 2790 } 2791 2792 adev->ip_blocks[i].status.hw = false; 2793 } 2794 2795 if (amdgpu_sriov_vf(adev)) { 2796 if (amdgpu_virt_release_full_gpu(adev, false)) 2797 DRM_ERROR("failed to release exclusive mode on fini\n"); 2798 } 2799 2800 return 0; 2801 } 2802 2803 /** 2804 * amdgpu_device_ip_fini - run fini for hardware IPs 2805 * 2806 * @adev: amdgpu_device pointer 2807 * 2808 * Main teardown pass for hardware IPs. The list of all the hardware 2809 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2810 * are run. hw_fini tears down the hardware associated with each IP 2811 * and sw_fini tears down any software state associated with each IP. 2812 * Returns 0 on success, negative error code on failure. 
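 *
 * Teardown walks the IP list in the reverse of initialization order:
 *
 *   for (i = adev->num_ip_blocks - 1; i >= 0; i--)
 *           adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
 *
 * so the blocks brought up first are torn down last. When the GMC block's
 * turn comes, the ucode BO, CSA, writeback, VRAM scratch and IB pool
 * resources allocated during init are released as well.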
2813 */ 2814 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2815 { 2816 int i, r; 2817 2818 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2819 amdgpu_virt_release_ras_err_handler_data(adev); 2820 2821 amdgpu_ras_pre_fini(adev); 2822 2823 if (adev->gmc.xgmi.num_physical_nodes > 1) 2824 amdgpu_xgmi_remove_device(adev); 2825 2826 amdgpu_amdkfd_device_fini_sw(adev); 2827 2828 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2829 if (!adev->ip_blocks[i].status.sw) 2830 continue; 2831 2832 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2833 amdgpu_ucode_free_bo(adev); 2834 amdgpu_free_static_csa(&adev->virt.csa_obj); 2835 amdgpu_device_wb_fini(adev); 2836 amdgpu_device_vram_scratch_fini(adev); 2837 amdgpu_ib_pool_fini(adev); 2838 } 2839 2840 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2841 /* XXX handle errors */ 2842 if (r) { 2843 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2844 adev->ip_blocks[i].version->funcs->name, r); 2845 } 2846 adev->ip_blocks[i].status.sw = false; 2847 adev->ip_blocks[i].status.valid = false; 2848 } 2849 2850 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2851 if (!adev->ip_blocks[i].status.late_initialized) 2852 continue; 2853 if (adev->ip_blocks[i].version->funcs->late_fini) 2854 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2855 adev->ip_blocks[i].status.late_initialized = false; 2856 } 2857 2858 amdgpu_ras_fini(adev); 2859 2860 return 0; 2861 } 2862 2863 /** 2864 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2865 * 2866 * @work: work_struct. 2867 */ 2868 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2869 { 2870 struct amdgpu_device *adev = 2871 container_of(work, struct amdgpu_device, delayed_init_work.work); 2872 int r; 2873 2874 r = amdgpu_ib_ring_tests(adev); 2875 if (r) 2876 DRM_ERROR("ib ring test failed (%d).\n", r); 2877 } 2878 2879 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2880 { 2881 struct amdgpu_device *adev = 2882 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2883 2884 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2885 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2886 2887 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2888 adev->gfx.gfx_off_state = true; 2889 } 2890 2891 /** 2892 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2893 * 2894 * @adev: amdgpu_device pointer 2895 * 2896 * Main suspend function for hardware IPs. The list of all the hardware 2897 * IPs that make up the asic is walked, clockgating is disabled and the 2898 * suspend callbacks are run. suspend puts the hardware and software state 2899 * in each IP into a state suitable for suspend. 2900 * Returns 0 on success, negative error code on failure. 
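 *
 * Phase 1 only suspends the display (DCE) blocks; every other IP is left
 * to amdgpu_device_ip_suspend_phase2(). Callers normally run both, as
 * amdgpu_device_ip_suspend() does:
 *
 *   r = amdgpu_device_ip_suspend_phase1(adev);
 *   if (r)
 *           return r;
 *   r = amdgpu_device_ip_suspend_phase2(adev);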
2901 */ 2902 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2903 { 2904 int i, r; 2905 2906 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2907 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2908 2909 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2910 if (!adev->ip_blocks[i].status.valid) 2911 continue; 2912 2913 /* displays are handled separately */ 2914 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2915 continue; 2916 2917 /* XXX handle errors */ 2918 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2919 /* XXX handle errors */ 2920 if (r) { 2921 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2922 adev->ip_blocks[i].version->funcs->name, r); 2923 return r; 2924 } 2925 2926 adev->ip_blocks[i].status.hw = false; 2927 } 2928 2929 return 0; 2930 } 2931 2932 /** 2933 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2934 * 2935 * @adev: amdgpu_device pointer 2936 * 2937 * Main suspend function for hardware IPs. The list of all the hardware 2938 * IPs that make up the asic is walked, clockgating is disabled and the 2939 * suspend callbacks are run. suspend puts the hardware and software state 2940 * in each IP into a state suitable for suspend. 2941 * Returns 0 on success, negative error code on failure. 2942 */ 2943 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2944 { 2945 int i, r; 2946 2947 if (adev->in_s0ix) 2948 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2949 2950 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2951 if (!adev->ip_blocks[i].status.valid) 2952 continue; 2953 /* displays are handled in phase1 */ 2954 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2955 continue; 2956 /* PSP lost connection when err_event_athub occurs */ 2957 if (amdgpu_ras_intr_triggered() && 2958 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2959 adev->ip_blocks[i].status.hw = false; 2960 continue; 2961 } 2962 2963 /* skip unnecessary suspend if we do not initialize them yet */ 2964 if (adev->gmc.xgmi.pending_reset && 2965 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2966 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2968 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2969 adev->ip_blocks[i].status.hw = false; 2970 continue; 2971 } 2972 2973 /* skip suspend of gfx and psp for S0ix 2974 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2975 * like at runtime. PSP is also part of the always on hardware 2976 * so no need to suspend it. 
2977 */ 2978 if (adev->in_s0ix && 2979 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2981 continue; 2982 2983 /* XXX handle errors */ 2984 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2985 /* XXX handle errors */ 2986 if (r) { 2987 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2988 adev->ip_blocks[i].version->funcs->name, r); 2989 } 2990 adev->ip_blocks[i].status.hw = false; 2991 /* handle putting the SMC in the appropriate state */ 2992 if(!amdgpu_sriov_vf(adev)){ 2993 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2994 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2995 if (r) { 2996 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2997 adev->mp1_state, r); 2998 return r; 2999 } 3000 } 3001 } 3002 } 3003 3004 return 0; 3005 } 3006 3007 /** 3008 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3009 * 3010 * @adev: amdgpu_device pointer 3011 * 3012 * Main suspend function for hardware IPs. The list of all the hardware 3013 * IPs that make up the asic is walked, clockgating is disabled and the 3014 * suspend callbacks are run. suspend puts the hardware and software state 3015 * in each IP into a state suitable for suspend. 3016 * Returns 0 on success, negative error code on failure. 3017 */ 3018 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3019 { 3020 int r; 3021 3022 if (amdgpu_sriov_vf(adev)) { 3023 amdgpu_virt_fini_data_exchange(adev); 3024 amdgpu_virt_request_full_gpu(adev, false); 3025 } 3026 3027 r = amdgpu_device_ip_suspend_phase1(adev); 3028 if (r) 3029 return r; 3030 r = amdgpu_device_ip_suspend_phase2(adev); 3031 3032 if (amdgpu_sriov_vf(adev)) 3033 amdgpu_virt_release_full_gpu(adev, false); 3034 3035 return r; 3036 } 3037 3038 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3039 { 3040 int i, r; 3041 3042 static enum amd_ip_block_type ip_order[] = { 3043 AMD_IP_BLOCK_TYPE_GMC, 3044 AMD_IP_BLOCK_TYPE_COMMON, 3045 AMD_IP_BLOCK_TYPE_PSP, 3046 AMD_IP_BLOCK_TYPE_IH, 3047 }; 3048 3049 for (i = 0; i < adev->num_ip_blocks; i++) { 3050 int j; 3051 struct amdgpu_ip_block *block; 3052 3053 block = &adev->ip_blocks[i]; 3054 block->status.hw = false; 3055 3056 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3057 3058 if (block->version->type != ip_order[j] || 3059 !block->status.valid) 3060 continue; 3061 3062 r = block->version->funcs->hw_init(adev); 3063 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3064 if (r) 3065 return r; 3066 block->status.hw = true; 3067 } 3068 } 3069 3070 return 0; 3071 } 3072 3073 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3074 { 3075 int i, r; 3076 3077 static enum amd_ip_block_type ip_order[] = { 3078 AMD_IP_BLOCK_TYPE_SMC, 3079 AMD_IP_BLOCK_TYPE_DCE, 3080 AMD_IP_BLOCK_TYPE_GFX, 3081 AMD_IP_BLOCK_TYPE_SDMA, 3082 AMD_IP_BLOCK_TYPE_UVD, 3083 AMD_IP_BLOCK_TYPE_VCE, 3084 AMD_IP_BLOCK_TYPE_VCN 3085 }; 3086 3087 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3088 int j; 3089 struct amdgpu_ip_block *block; 3090 3091 for (j = 0; j < adev->num_ip_blocks; j++) { 3092 block = &adev->ip_blocks[j]; 3093 3094 if (block->version->type != ip_order[i] || 3095 !block->status.valid || 3096 block->status.hw) 3097 continue; 3098 3099 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3100 r = block->version->funcs->resume(adev); 3101 else 3102 r = block->version->funcs->hw_init(adev); 3103 3104 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3105 if (r) 3106 return r; 3107 block->status.hw = true; 3108 } 3109 } 3110 3111 return 0; 3112 } 3113 3114 /** 3115 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3116 * 3117 * @adev: amdgpu_device pointer 3118 * 3119 * First resume function for hardware IPs. The list of all the hardware 3120 * IPs that make up the asic is walked and the resume callbacks are run for 3121 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3122 * after a suspend and updates the software state as necessary. This 3123 * function is also used for restoring the GPU after a GPU reset. 3124 * Returns 0 on success, negative error code on failure. 3125 */ 3126 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3127 { 3128 int i, r; 3129 3130 for (i = 0; i < adev->num_ip_blocks; i++) { 3131 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3132 continue; 3133 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3135 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3136 3137 r = adev->ip_blocks[i].version->funcs->resume(adev); 3138 if (r) { 3139 DRM_ERROR("resume of IP block <%s> failed %d\n", 3140 adev->ip_blocks[i].version->funcs->name, r); 3141 return r; 3142 } 3143 adev->ip_blocks[i].status.hw = true; 3144 } 3145 } 3146 3147 return 0; 3148 } 3149 3150 /** 3151 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3152 * 3153 * @adev: amdgpu_device pointer 3154 * 3155 * First resume function for hardware IPs. The list of all the hardware 3156 * IPs that make up the asic is walked and the resume callbacks are run for 3157 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3158 * functional state after a suspend and updates the software state as 3159 * necessary. This function is also used for restoring the GPU after a GPU 3160 * reset. 3161 * Returns 0 on success, negative error code on failure. 3162 */ 3163 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3164 { 3165 int i, r; 3166 3167 for (i = 0; i < adev->num_ip_blocks; i++) { 3168 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3169 continue; 3170 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3172 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3173 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3174 continue; 3175 r = adev->ip_blocks[i].version->funcs->resume(adev); 3176 if (r) { 3177 DRM_ERROR("resume of IP block <%s> failed %d\n", 3178 adev->ip_blocks[i].version->funcs->name, r); 3179 return r; 3180 } 3181 adev->ip_blocks[i].status.hw = true; 3182 } 3183 3184 return 0; 3185 } 3186 3187 /** 3188 * amdgpu_device_ip_resume - run resume for hardware IPs 3189 * 3190 * @adev: amdgpu_device pointer 3191 * 3192 * Main resume function for hardware IPs. The hardware IPs 3193 * are split into two resume functions because they are 3194 * are also used in in recovering from a GPU reset and some additional 3195 * steps need to be take between them. In this case (S3/S4) they are 3196 * run sequentially. 3197 * Returns 0 on success, negative error code on failure. 
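 *
 * The sequence, as implemented below, is roughly:
 *
 *   amdgpu_amdkfd_resume_iommu(adev);
 *   amdgpu_device_ip_resume_phase1(adev);    (COMMON, GMC and IH)
 *   amdgpu_device_fw_loading(adev);          (PSP/SMU firmware)
 *   amdgpu_device_ip_resume_phase2(adev);    (all remaining blocks)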
3198 */ 3199 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3200 { 3201 int r; 3202 3203 r = amdgpu_amdkfd_resume_iommu(adev); 3204 if (r) 3205 return r; 3206 3207 r = amdgpu_device_ip_resume_phase1(adev); 3208 if (r) 3209 return r; 3210 3211 r = amdgpu_device_fw_loading(adev); 3212 if (r) 3213 return r; 3214 3215 r = amdgpu_device_ip_resume_phase2(adev); 3216 3217 return r; 3218 } 3219 3220 /** 3221 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3222 * 3223 * @adev: amdgpu_device pointer 3224 * 3225 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3226 */ 3227 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3228 { 3229 if (amdgpu_sriov_vf(adev)) { 3230 if (adev->is_atom_fw) { 3231 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3232 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3233 } else { 3234 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3235 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3236 } 3237 3238 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3239 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3240 } 3241 } 3242 3243 /** 3244 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3245 * 3246 * @asic_type: AMD asic type 3247 * 3248 * Check if there is DC (new modesetting infrastructre) support for an asic. 3249 * returns true if DC has support, false if not. 3250 */ 3251 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3252 { 3253 switch (asic_type) { 3254 #if defined(CONFIG_DRM_AMD_DC) 3255 #if defined(CONFIG_DRM_AMD_DC_SI) 3256 case CHIP_TAHITI: 3257 case CHIP_PITCAIRN: 3258 case CHIP_VERDE: 3259 case CHIP_OLAND: 3260 #endif 3261 case CHIP_BONAIRE: 3262 case CHIP_KAVERI: 3263 case CHIP_KABINI: 3264 case CHIP_MULLINS: 3265 /* 3266 * We have systems in the wild with these ASICs that require 3267 * LVDS and VGA support which is not supported with DC. 3268 * 3269 * Fallback to the non-DC driver here by default so as not to 3270 * cause regressions. 
3271 */ 3272 return amdgpu_dc > 0; 3273 case CHIP_HAWAII: 3274 case CHIP_CARRIZO: 3275 case CHIP_STONEY: 3276 case CHIP_POLARIS10: 3277 case CHIP_POLARIS11: 3278 case CHIP_POLARIS12: 3279 case CHIP_VEGAM: 3280 case CHIP_TONGA: 3281 case CHIP_FIJI: 3282 case CHIP_VEGA10: 3283 case CHIP_VEGA12: 3284 case CHIP_VEGA20: 3285 #if defined(CONFIG_DRM_AMD_DC_DCN) 3286 case CHIP_RAVEN: 3287 case CHIP_NAVI10: 3288 case CHIP_NAVI14: 3289 case CHIP_NAVI12: 3290 case CHIP_RENOIR: 3291 case CHIP_SIENNA_CICHLID: 3292 case CHIP_NAVY_FLOUNDER: 3293 case CHIP_DIMGREY_CAVEFISH: 3294 case CHIP_BEIGE_GOBY: 3295 case CHIP_VANGOGH: 3296 case CHIP_YELLOW_CARP: 3297 #endif 3298 return amdgpu_dc != 0; 3299 #endif 3300 default: 3301 if (amdgpu_dc > 0) 3302 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3303 "but isn't supported by ASIC, ignoring\n"); 3304 return false; 3305 } 3306 } 3307 3308 /** 3309 * amdgpu_device_has_dc_support - check if dc is supported 3310 * 3311 * @adev: amdgpu_device pointer 3312 * 3313 * Returns true for supported, false for not supported 3314 */ 3315 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3316 { 3317 if (amdgpu_sriov_vf(adev) || 3318 adev->enable_virtual_display || 3319 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3320 return false; 3321 3322 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3323 } 3324 3325 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3326 { 3327 struct amdgpu_device *adev = 3328 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3329 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3330 3331 /* It's a bug to not have a hive within this function */ 3332 if (WARN_ON(!hive)) 3333 return; 3334 3335 /* 3336 * Use task barrier to synchronize all xgmi reset works across the 3337 * hive. task_barrier_enter and task_barrier_exit will block 3338 * until all the threads running the xgmi reset works reach 3339 * those points. task_barrier_full will do both blocks. 3340 */ 3341 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3342 3343 task_barrier_enter(&hive->tb); 3344 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3345 3346 if (adev->asic_reset_res) 3347 goto fail; 3348 3349 task_barrier_exit(&hive->tb); 3350 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3351 3352 if (adev->asic_reset_res) 3353 goto fail; 3354 3355 if (adev->mmhub.ras_funcs && 3356 adev->mmhub.ras_funcs->reset_ras_error_count) 3357 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3358 } else { 3359 3360 task_barrier_full(&hive->tb); 3361 adev->asic_reset_res = amdgpu_asic_reset(adev); 3362 } 3363 3364 fail: 3365 if (adev->asic_reset_res) 3366 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3367 adev->asic_reset_res, adev_to_drm(adev)->unique); 3368 amdgpu_put_xgmi_hive(hive); 3369 } 3370 3371 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3372 { 3373 char *input = amdgpu_lockup_timeout; 3374 char *timeout_setting = NULL; 3375 int index = 0; 3376 long timeout; 3377 int ret = 0; 3378 3379 /* 3380 * By default timeout for non compute jobs is 10000 3381 * and 60000 for compute jobs. 3382 * In SR-IOV or passthrough mode, timeout for compute 3383 * jobs are 60000 by default. 3384 */ 3385 adev->gfx_timeout = msecs_to_jiffies(10000); 3386 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3387 if (amdgpu_sriov_vf(adev)) 3388 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3389 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3390 else 3391 adev->compute_timeout = msecs_to_jiffies(60000); 3392 3393 #ifdef notyet 3394 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3395 while ((timeout_setting = strsep(&input, ",")) && 3396 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3397 ret = kstrtol(timeout_setting, 0, &timeout); 3398 if (ret) 3399 return ret; 3400 3401 if (timeout == 0) { 3402 index++; 3403 continue; 3404 } else if (timeout < 0) { 3405 timeout = MAX_SCHEDULE_TIMEOUT; 3406 } else { 3407 timeout = msecs_to_jiffies(timeout); 3408 } 3409 3410 switch (index++) { 3411 case 0: 3412 adev->gfx_timeout = timeout; 3413 break; 3414 case 1: 3415 adev->compute_timeout = timeout; 3416 break; 3417 case 2: 3418 adev->sdma_timeout = timeout; 3419 break; 3420 case 3: 3421 adev->video_timeout = timeout; 3422 break; 3423 default: 3424 break; 3425 } 3426 } 3427 /* 3428 * There is only one value specified and 3429 * it should apply to all non-compute jobs. 3430 */ 3431 if (index == 1) { 3432 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3433 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3434 adev->compute_timeout = adev->gfx_timeout; 3435 } 3436 } 3437 #endif 3438 3439 return ret; 3440 } 3441 3442 static const struct attribute *amdgpu_dev_attributes[] = { 3443 &dev_attr_product_name.attr, 3444 &dev_attr_product_number.attr, 3445 &dev_attr_serial_number.attr, 3446 &dev_attr_pcie_replay_count.attr, 3447 NULL 3448 }; 3449 3450 /** 3451 * amdgpu_device_init - initialize the driver 3452 * 3453 * @adev: amdgpu_device pointer 3454 * @flags: driver flags 3455 * 3456 * Initializes the driver info and hw (all asics). 3457 * Returns 0 for success or an error on failure. 3458 * Called at driver startup. 
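 *
 * Abridged flow of the function body below:
 *
 *   amdgpu_device_ip_early_init(adev);    (discover IPs, read the vbios)
 *   amdgpu_device_ip_init(adev);          (sw_init and hw_init of all IPs)
 *   amdgpu_fence_driver_hw_init(adev);
 *   amdgpu_device_ip_late_init(adev);     (clock/powergating; IB tests run
 *                                          later from delayed work)
 *
 * plus lock/sysfs setup, MMIO mapping and, when needed, an ASIC reset or
 * vBIOS post before the IP blocks are initialized.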
3459 */ 3460 int amdgpu_device_init(struct amdgpu_device *adev, 3461 uint32_t flags) 3462 { 3463 struct drm_device *ddev = adev_to_drm(adev); 3464 struct pci_dev *pdev = adev->pdev; 3465 int r, i; 3466 bool px = false; 3467 u32 max_MBps; 3468 3469 adev->shutdown = false; 3470 adev->flags = flags; 3471 3472 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3473 adev->asic_type = amdgpu_force_asic_type; 3474 else 3475 adev->asic_type = flags & AMD_ASIC_MASK; 3476 3477 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3478 if (amdgpu_emu_mode == 1) 3479 adev->usec_timeout *= 10; 3480 adev->gmc.gart_size = 512 * 1024 * 1024; 3481 adev->accel_working = false; 3482 adev->num_rings = 0; 3483 adev->mman.buffer_funcs = NULL; 3484 adev->mman.buffer_funcs_ring = NULL; 3485 adev->vm_manager.vm_pte_funcs = NULL; 3486 adev->vm_manager.vm_pte_num_scheds = 0; 3487 adev->gmc.gmc_funcs = NULL; 3488 adev->harvest_ip_mask = 0x0; 3489 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3490 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3491 3492 adev->smc_rreg = &amdgpu_invalid_rreg; 3493 adev->smc_wreg = &amdgpu_invalid_wreg; 3494 adev->pcie_rreg = &amdgpu_invalid_rreg; 3495 adev->pcie_wreg = &amdgpu_invalid_wreg; 3496 adev->pciep_rreg = &amdgpu_invalid_rreg; 3497 adev->pciep_wreg = &amdgpu_invalid_wreg; 3498 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3499 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3500 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3501 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3502 adev->didt_rreg = &amdgpu_invalid_rreg; 3503 adev->didt_wreg = &amdgpu_invalid_wreg; 3504 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3505 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3506 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3507 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3508 3509 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3510 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3511 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3512 3513 /* mutex initialization are all done here so we 3514 * can recall function without having locking issues */ 3515 rw_init(&adev->firmware.mutex, "agfw"); 3516 rw_init(&adev->pm.mutex, "agpm"); 3517 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3518 rw_init(&adev->srbm_mutex, "srbm"); 3519 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3520 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3521 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3522 rw_init(&adev->mn_lock, "agpumn"); 3523 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3524 hash_init(adev->mn_hash); 3525 atomic_set(&adev->in_gpu_reset, 0); 3526 rw_init(&adev->reset_sem, "amrs"); 3527 rw_init(&adev->psp.mutex, "agpsp"); 3528 rw_init(&adev->notifier_lock, "agnf"); 3529 3530 r = amdgpu_device_init_apu_flags(adev); 3531 if (r) 3532 return r; 3533 3534 r = amdgpu_device_check_arguments(adev); 3535 if (r) 3536 return r; 3537 3538 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3539 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3540 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3541 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3542 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3543 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3544 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3545 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3546 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3547 3548 INIT_LIST_HEAD(&adev->shadow_list); 3549 rw_init(&adev->shadow_list_lock, "sdwlst"); 3550 3551 INIT_LIST_HEAD(&adev->reset_list); 
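	/*
	 * Note: the rw_init()/mtx_init() calls above are the OpenBSD
	 * equivalents of the mutex_init()/spin_lock_init() calls in the
	 * upstream Linux driver; the extra arguments give each lock a name
	 * and, for mtx_init(), an interrupt protection level (IPL).
	 */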
3552 3553 INIT_DELAYED_WORK(&adev->delayed_init_work, 3554 amdgpu_device_delayed_init_work_handler); 3555 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3556 amdgpu_device_delay_enable_gfx_off); 3557 3558 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3559 3560 adev->gfx.gfx_off_req_count = 1; 3561 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3562 3563 atomic_set(&adev->throttling_logging_enabled, 1); 3564 /* 3565 * If throttling continues, logging will be performed every minute 3566 * to avoid log flooding. "-1" is subtracted since the thermal 3567 * throttling interrupt comes every second. Thus, the total logging 3568 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3569 * for throttling interrupt) = 60 seconds. 3570 */ 3571 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3572 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3573 3574 #ifdef __linux__ 3575 /* Registers mapping */ 3576 /* TODO: block userspace mapping of io register */ 3577 if (adev->asic_type >= CHIP_BONAIRE) { 3578 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3579 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3580 } else { 3581 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3582 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3583 } 3584 3585 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3586 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3587 3588 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3589 if (adev->rmmio == NULL) { 3590 return -ENOMEM; 3591 } 3592 #endif 3593 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3594 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3595 3596 /* enable PCIE atomic ops */ 3597 #ifdef notyet 3598 r = pci_enable_atomic_ops_to_root(adev->pdev, 3599 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3600 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3601 if (r) { 3602 adev->have_atomics_support = false; 3603 DRM_INFO("PCIE atomic ops is not supported\n"); 3604 } else { 3605 adev->have_atomics_support = true; 3606 } 3607 #else 3608 adev->have_atomics_support = false; 3609 #endif 3610 3611 amdgpu_device_get_pcie_info(adev); 3612 3613 if (amdgpu_mcbp) 3614 DRM_INFO("MCBP is enabled\n"); 3615 3616 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3617 adev->enable_mes = true; 3618 3619 /* detect hw virtualization here */ 3620 amdgpu_detect_virtualization(adev); 3621 3622 r = amdgpu_device_get_job_timeout_settings(adev); 3623 if (r) { 3624 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3625 return r; 3626 } 3627 3628 /* early init functions */ 3629 r = amdgpu_device_ip_early_init(adev); 3630 if (r) 3631 return r; 3632 3633 /* doorbell bar mapping and doorbell index init*/ 3634 amdgpu_device_doorbell_init(adev); 3635 3636 if (amdgpu_emu_mode == 1) { 3637 /* post the asic on emulation mode */ 3638 emu_soc_asic_init(adev); 3639 goto fence_driver_init; 3640 } 3641 3642 amdgpu_reset_init(adev); 3643 3644 /* detect if we are with an SRIOV vbios */ 3645 amdgpu_device_detect_sriov_bios(adev); 3646 3647 /* check if we need to reset the asic 3648 * E.g., driver was not cleanly unloaded previously, etc. 
3649 */ 3650 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3651 if (adev->gmc.xgmi.num_physical_nodes) { 3652 dev_info(adev->dev, "Pending hive reset.\n"); 3653 adev->gmc.xgmi.pending_reset = true; 3654 /* Only need to init necessary block for SMU to handle the reset */ 3655 for (i = 0; i < adev->num_ip_blocks; i++) { 3656 if (!adev->ip_blocks[i].status.valid) 3657 continue; 3658 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3659 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3660 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3661 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3662 DRM_DEBUG("IP %s disabled for hw_init.\n", 3663 adev->ip_blocks[i].version->funcs->name); 3664 adev->ip_blocks[i].status.hw = true; 3665 } 3666 } 3667 } else { 3668 r = amdgpu_asic_reset(adev); 3669 if (r) { 3670 dev_err(adev->dev, "asic reset on init failed\n"); 3671 goto failed; 3672 } 3673 } 3674 } 3675 3676 pci_enable_pcie_error_reporting(adev->pdev); 3677 3678 /* Post card if necessary */ 3679 if (amdgpu_device_need_post(adev)) { 3680 if (!adev->bios) { 3681 dev_err(adev->dev, "no vBIOS found\n"); 3682 r = -EINVAL; 3683 goto failed; 3684 } 3685 DRM_INFO("GPU posting now...\n"); 3686 r = amdgpu_device_asic_init(adev); 3687 if (r) { 3688 dev_err(adev->dev, "gpu post error!\n"); 3689 goto failed; 3690 } 3691 } 3692 3693 if (adev->is_atom_fw) { 3694 /* Initialize clocks */ 3695 r = amdgpu_atomfirmware_get_clock_info(adev); 3696 if (r) { 3697 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3698 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3699 goto failed; 3700 } 3701 } else { 3702 /* Initialize clocks */ 3703 r = amdgpu_atombios_get_clock_info(adev); 3704 if (r) { 3705 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3706 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3707 goto failed; 3708 } 3709 /* init i2c buses */ 3710 if (!amdgpu_device_has_dc_support(adev)) 3711 amdgpu_atombios_i2c_init(adev); 3712 } 3713 3714 fence_driver_init: 3715 /* Fence driver */ 3716 r = amdgpu_fence_driver_sw_init(adev); 3717 if (r) { 3718 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3719 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3720 goto failed; 3721 } 3722 3723 /* init the mode config */ 3724 drm_mode_config_init(adev_to_drm(adev)); 3725 3726 r = amdgpu_device_ip_init(adev); 3727 if (r) { 3728 /* failed in exclusive mode due to timeout */ 3729 if (amdgpu_sriov_vf(adev) && 3730 !amdgpu_sriov_runtime(adev) && 3731 amdgpu_virt_mmio_blocked(adev) && 3732 !amdgpu_virt_wait_reset(adev)) { 3733 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3734 /* Don't send request since VF is inactive. 
*/ 3735 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3736 adev->virt.ops = NULL; 3737 r = -EAGAIN; 3738 goto release_ras_con; 3739 } 3740 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3741 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3742 goto release_ras_con; 3743 } 3744 3745 amdgpu_fence_driver_hw_init(adev); 3746 3747 dev_info(adev->dev, 3748 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3749 adev->gfx.config.max_shader_engines, 3750 adev->gfx.config.max_sh_per_se, 3751 adev->gfx.config.max_cu_per_sh, 3752 adev->gfx.cu_info.number); 3753 3754 #ifdef __OpenBSD__ 3755 { 3756 const char *chip_name; 3757 3758 switch (adev->asic_type) { 3759 case CHIP_RAVEN: 3760 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3761 chip_name = "RAVEN2"; 3762 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3763 chip_name = "PICASSO"; 3764 else 3765 chip_name = "RAVEN"; 3766 break; 3767 case CHIP_RENOIR: 3768 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3769 chip_name = "RENOIR"; 3770 else 3771 chip_name = "GREEN_SARDINE"; 3772 break; 3773 default: 3774 chip_name = amdgpu_asic_name[adev->asic_type]; 3775 } 3776 printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname, 3777 chip_name, adev->gfx.cu_info.number, adev->rev_id); 3778 } 3779 #endif 3780 3781 adev->accel_working = true; 3782 3783 amdgpu_vm_check_compute_bug(adev); 3784 3785 /* Initialize the buffer migration limit. */ 3786 if (amdgpu_moverate >= 0) 3787 max_MBps = amdgpu_moverate; 3788 else 3789 max_MBps = 8; /* Allow 8 MB/s. */ 3790 /* Get a log2 for easy divisions. */ 3791 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3792 3793 amdgpu_fbdev_init(adev); 3794 3795 r = amdgpu_pm_sysfs_init(adev); 3796 if (r) { 3797 adev->pm_sysfs_en = false; 3798 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3799 } else 3800 adev->pm_sysfs_en = true; 3801 3802 r = amdgpu_ucode_sysfs_init(adev); 3803 if (r) { 3804 adev->ucode_sysfs_en = false; 3805 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3806 } else 3807 adev->ucode_sysfs_en = true; 3808 3809 if ((amdgpu_testing & 1)) { 3810 if (adev->accel_working) 3811 amdgpu_test_moves(adev); 3812 else 3813 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3814 } 3815 if (amdgpu_benchmarking) { 3816 if (adev->accel_working) 3817 amdgpu_benchmark(adev, amdgpu_benchmarking); 3818 else 3819 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3820 } 3821 3822 /* 3823 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3824 * Otherwise the mgpu fan boost feature will be skipped due to the 3825 * gpu instance is counted less. 3826 */ 3827 amdgpu_register_gpu_instance(adev); 3828 3829 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3830 * explicit gating rather than handling it automatically. 3831 */ 3832 if (!adev->gmc.xgmi.pending_reset) { 3833 r = amdgpu_device_ip_late_init(adev); 3834 if (r) { 3835 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3836 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3837 goto release_ras_con; 3838 } 3839 /* must succeed. 
 */
3840 amdgpu_ras_resume(adev);
3841 queue_delayed_work(system_wq, &adev->delayed_init_work,
3842 msecs_to_jiffies(AMDGPU_RESUME_MS));
3843 }
3844
3845 if (amdgpu_sriov_vf(adev))
3846 flush_delayed_work(&adev->delayed_init_work);
3847
3848 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3849 if (r)
3850 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3851
3852 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3853 r = amdgpu_pmu_init(adev);
3854 if (r)
3855 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
 }
3856
3857 /* Keep a cached copy of the PCI config space at hand so it can be restored after a sudden PCI error */
3858 if (amdgpu_device_cache_pci_state(adev->pdev))
3859 pci_restore_state(pdev);
3860
3861 /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3862 /* this will fail for cards that aren't VGA class devices, just
3863 * ignore it */
3864 #ifdef notyet
3865 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3866 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3867 #endif
3868
3869 if (amdgpu_device_supports_px(ddev)) {
3870 px = true;
3871 vga_switcheroo_register_client(adev->pdev,
3872 &amdgpu_switcheroo_ops, px);
3873 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3874 }
3875
3876 if (adev->gmc.xgmi.pending_reset)
3877 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3878 msecs_to_jiffies(AMDGPU_RESUME_MS));
3879
3880 return 0;
3881
3882 release_ras_con:
3883 amdgpu_release_ras_context(adev);
3884
3885 failed:
3886 amdgpu_vf_error_trans_all(adev);
3887
3888 return r;
3889 }
3890
3891 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3892 {
3893 STUB();
3894 #ifdef notyet
3895 /* Clear all CPU mappings pointing to this device */
3896 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3897 #endif
3898
3899 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3900 amdgpu_device_doorbell_fini(adev);
3901
3902 #ifdef __linux__
3903 iounmap(adev->rmmio);
3904 adev->rmmio = NULL;
3905 if (adev->mman.aper_base_kaddr)
3906 iounmap(adev->mman.aper_base_kaddr);
3907 adev->mman.aper_base_kaddr = NULL;
3908 #else
3909 if (adev->rmmio_size > 0)
3910 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
3911 adev->rmmio_size);
3912 adev->rmmio_size = 0;
3913 adev->rmmio = NULL;
3914 if (adev->mman.aper_base_kaddr)
3915 bus_space_unmap(adev->memt, adev->mman.aper_bsh,
3916 adev->gmc.visible_vram_size);
3917 adev->mman.aper_base_kaddr = NULL;
3918 #endif
3919
3920 /* Memory manager related */
3921 if (!adev->gmc.xgmi.connected_to_cpu) {
3922 #ifdef __linux__
3923 arch_phys_wc_del(adev->gmc.vram_mtrr);
3924 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3925 #else
3926 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
3927 #endif
3928 }
3929 }
3930
3931 /**
3932 * amdgpu_device_fini_hw - tear down the driver
3933 *
3934 * @adev: amdgpu_device pointer
3935 *
3936 * Tear down the driver info (all asics).
3937 * Called at driver shutdown.
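 *
 * This is the hardware half of the teardown: delayed work is flushed,
 * interrupts are disabled, the displays are shut down, fence and IRQ
 * hardware state is torn down and the MMIO mappings are released.
 * The software half is amdgpu_device_fini_sw().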
3938 */ 3939 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3940 { 3941 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3942 flush_delayed_work(&adev->delayed_init_work); 3943 if (adev->mman.initialized) { 3944 flush_delayed_work(&adev->mman.bdev.wq); 3945 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3946 } 3947 adev->shutdown = true; 3948 3949 /* make sure IB test finished before entering exclusive mode 3950 * to avoid preemption on IB test 3951 * */ 3952 if (amdgpu_sriov_vf(adev)) { 3953 amdgpu_virt_request_full_gpu(adev, false); 3954 amdgpu_virt_fini_data_exchange(adev); 3955 } 3956 3957 /* disable all interrupts */ 3958 amdgpu_irq_disable_all(adev); 3959 if (adev->mode_info.mode_config_initialized){ 3960 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3961 drm_helper_force_disable_all(adev_to_drm(adev)); 3962 else 3963 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3964 } 3965 amdgpu_fence_driver_hw_fini(adev); 3966 3967 if (adev->pm_sysfs_en) 3968 amdgpu_pm_sysfs_fini(adev); 3969 if (adev->ucode_sysfs_en) 3970 amdgpu_ucode_sysfs_fini(adev); 3971 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3972 3973 amdgpu_fbdev_fini(adev); 3974 3975 amdgpu_irq_fini_hw(adev); 3976 3977 amdgpu_device_ip_fini_early(adev); 3978 3979 amdgpu_gart_dummy_page_fini(adev); 3980 3981 amdgpu_device_unmap_mmio(adev); 3982 } 3983 3984 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3985 { 3986 amdgpu_fence_driver_sw_fini(adev); 3987 amdgpu_device_ip_fini(adev); 3988 release_firmware(adev->firmware.gpu_info_fw); 3989 adev->firmware.gpu_info_fw = NULL; 3990 adev->accel_working = false; 3991 3992 amdgpu_reset_fini(adev); 3993 3994 /* free i2c buses */ 3995 if (!amdgpu_device_has_dc_support(adev)) 3996 amdgpu_i2c_fini(adev); 3997 3998 if (amdgpu_emu_mode != 1) 3999 amdgpu_atombios_fini(adev); 4000 4001 kfree(adev->bios); 4002 adev->bios = NULL; 4003 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4004 vga_switcheroo_unregister_client(adev->pdev); 4005 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4006 } 4007 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4008 vga_client_unregister(adev->pdev); 4009 4010 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4011 amdgpu_pmu_fini(adev); 4012 if (adev->mman.discovery_bin) 4013 amdgpu_discovery_fini(adev); 4014 4015 kfree(adev->pci_state); 4016 4017 } 4018 4019 4020 /* 4021 * Suspend & resume. 4022 */ 4023 /** 4024 * amdgpu_device_suspend - initiate device suspend 4025 * 4026 * @dev: drm dev pointer 4027 * @fbcon : notify the fbdev of suspend 4028 * 4029 * Puts the hw in the suspend state (all asics). 4030 * Returns 0 for success or an error on failure. 4031 * Called at driver suspend. 
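 *
 * Suspend runs in two IP phases with KFD suspended in between (skipped
 * for S0ix), and VRAM is evicted twice: once before phase 2 and once
 * afterwards to move the GART page table out using the CPU.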
4032 */ 4033 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4034 { 4035 struct amdgpu_device *adev = drm_to_adev(dev); 4036 4037 if (adev->shutdown) 4038 return 0; 4039 4040 #ifdef notyet 4041 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4042 return 0; 4043 #endif 4044 4045 adev->in_suspend = true; 4046 4047 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4048 DRM_WARN("smart shift update failed\n"); 4049 4050 drm_kms_helper_poll_disable(dev); 4051 4052 if (fbcon) 4053 amdgpu_fbdev_set_suspend(adev, 1); 4054 4055 cancel_delayed_work_sync(&adev->delayed_init_work); 4056 4057 amdgpu_ras_suspend(adev); 4058 4059 amdgpu_device_ip_suspend_phase1(adev); 4060 4061 if (!adev->in_s0ix) 4062 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4063 4064 /* evict vram memory */ 4065 amdgpu_bo_evict_vram(adev); 4066 4067 amdgpu_fence_driver_hw_fini(adev); 4068 4069 amdgpu_device_ip_suspend_phase2(adev); 4070 /* evict remaining vram memory 4071 * This second call to evict vram is to evict the gart page table 4072 * using the CPU. 4073 */ 4074 amdgpu_bo_evict_vram(adev); 4075 4076 return 0; 4077 } 4078 4079 /** 4080 * amdgpu_device_resume - initiate device resume 4081 * 4082 * @dev: drm dev pointer 4083 * @fbcon : notify the fbdev of resume 4084 * 4085 * Bring the hw back to operating state (all asics). 4086 * Returns 0 for success or an error on failure. 4087 * Called at driver resume. 4088 */ 4089 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4090 { 4091 struct amdgpu_device *adev = drm_to_adev(dev); 4092 int r = 0; 4093 4094 #ifdef notyet 4095 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4096 return 0; 4097 #endif 4098 4099 if (adev->in_s0ix) 4100 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 4101 4102 /* post card */ 4103 if (amdgpu_device_need_post(adev)) { 4104 r = amdgpu_device_asic_init(adev); 4105 if (r) 4106 dev_err(adev->dev, "amdgpu asic init failed\n"); 4107 } 4108 4109 r = amdgpu_device_ip_resume(adev); 4110 if (r) { 4111 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4112 return r; 4113 } 4114 amdgpu_fence_driver_hw_init(adev); 4115 4116 r = amdgpu_device_ip_late_init(adev); 4117 if (r) 4118 return r; 4119 4120 queue_delayed_work(system_wq, &adev->delayed_init_work, 4121 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4122 4123 if (!adev->in_s0ix) { 4124 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4125 if (r) 4126 return r; 4127 } 4128 4129 /* Make sure IB tests flushed */ 4130 flush_delayed_work(&adev->delayed_init_work); 4131 4132 if (fbcon) 4133 amdgpu_fbdev_set_suspend(adev, 0); 4134 4135 drm_kms_helper_poll_enable(dev); 4136 4137 amdgpu_ras_resume(adev); 4138 4139 /* 4140 * Most of the connector probing functions try to acquire runtime pm 4141 * refs to ensure that the GPU is powered on when connector polling is 4142 * performed. Since we're calling this from a runtime PM callback, 4143 * trying to acquire rpm refs will cause us to deadlock. 4144 * 4145 * Since we're guaranteed to be holding the rpm lock, it's safe to 4146 * temporarily disable the rpm helpers so this doesn't deadlock us. 
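 * The disable_depth increment/decrement below brackets only the hotplug
 * event, so the runtime PM helpers are out of the way for just that call.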
4147 */ 4148 #if defined(CONFIG_PM) && defined(__linux__) 4149 dev->dev->power.disable_depth++; 4150 #endif 4151 if (!amdgpu_device_has_dc_support(adev)) 4152 drm_helper_hpd_irq_event(dev); 4153 else 4154 drm_kms_helper_hotplug_event(dev); 4155 #if defined(CONFIG_PM) && defined(__linux__) 4156 dev->dev->power.disable_depth--; 4157 #endif 4158 adev->in_suspend = false; 4159 4160 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4161 DRM_WARN("smart shift update failed\n"); 4162 4163 return 0; 4164 } 4165 4166 /** 4167 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4168 * 4169 * @adev: amdgpu_device pointer 4170 * 4171 * The list of all the hardware IPs that make up the asic is walked and 4172 * the check_soft_reset callbacks are run. check_soft_reset determines 4173 * if the asic is still hung or not. 4174 * Returns true if any of the IPs are still in a hung state, false if not. 4175 */ 4176 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4177 { 4178 int i; 4179 bool asic_hang = false; 4180 4181 if (amdgpu_sriov_vf(adev)) 4182 return true; 4183 4184 if (amdgpu_asic_need_full_reset(adev)) 4185 return true; 4186 4187 for (i = 0; i < adev->num_ip_blocks; i++) { 4188 if (!adev->ip_blocks[i].status.valid) 4189 continue; 4190 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4191 adev->ip_blocks[i].status.hang = 4192 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4193 if (adev->ip_blocks[i].status.hang) { 4194 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4195 asic_hang = true; 4196 } 4197 } 4198 return asic_hang; 4199 } 4200 4201 /** 4202 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4203 * 4204 * @adev: amdgpu_device pointer 4205 * 4206 * The list of all the hardware IPs that make up the asic is walked and the 4207 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4208 * handles any IP specific hardware or software state changes that are 4209 * necessary for a soft reset to succeed. 4210 * Returns 0 on success, negative error code on failure. 4211 */ 4212 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4213 { 4214 int i, r = 0; 4215 4216 for (i = 0; i < adev->num_ip_blocks; i++) { 4217 if (!adev->ip_blocks[i].status.valid) 4218 continue; 4219 if (adev->ip_blocks[i].status.hang && 4220 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4221 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4222 if (r) 4223 return r; 4224 } 4225 } 4226 4227 return 0; 4228 } 4229 4230 /** 4231 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4232 * 4233 * @adev: amdgpu_device pointer 4234 * 4235 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4236 * reset is necessary to recover. 4237 * Returns true if a full asic reset is required, false if not. 
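 * GMC, SMC, ACP, DCE and PSP are treated as such blocks: a hang in any
 * of them forces a full reset.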
4238 */ 4239 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4240 { 4241 int i; 4242 4243 if (amdgpu_asic_need_full_reset(adev)) 4244 return true; 4245 4246 for (i = 0; i < adev->num_ip_blocks; i++) { 4247 if (!adev->ip_blocks[i].status.valid) 4248 continue; 4249 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4250 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4251 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4252 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4253 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4254 if (adev->ip_blocks[i].status.hang) { 4255 dev_info(adev->dev, "Some block need full reset!\n"); 4256 return true; 4257 } 4258 } 4259 } 4260 return false; 4261 } 4262 4263 /** 4264 * amdgpu_device_ip_soft_reset - do a soft reset 4265 * 4266 * @adev: amdgpu_device pointer 4267 * 4268 * The list of all the hardware IPs that make up the asic is walked and the 4269 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4270 * IP specific hardware or software state changes that are necessary to soft 4271 * reset the IP. 4272 * Returns 0 on success, negative error code on failure. 4273 */ 4274 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4275 { 4276 int i, r = 0; 4277 4278 for (i = 0; i < adev->num_ip_blocks; i++) { 4279 if (!adev->ip_blocks[i].status.valid) 4280 continue; 4281 if (adev->ip_blocks[i].status.hang && 4282 adev->ip_blocks[i].version->funcs->soft_reset) { 4283 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4284 if (r) 4285 return r; 4286 } 4287 } 4288 4289 return 0; 4290 } 4291 4292 /** 4293 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4294 * 4295 * @adev: amdgpu_device pointer 4296 * 4297 * The list of all the hardware IPs that make up the asic is walked and the 4298 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4299 * handles any IP specific hardware or software state changes that are 4300 * necessary after the IP has been soft reset. 4301 * Returns 0 on success, negative error code on failure. 4302 */ 4303 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4304 { 4305 int i, r = 0; 4306 4307 for (i = 0; i < adev->num_ip_blocks; i++) { 4308 if (!adev->ip_blocks[i].status.valid) 4309 continue; 4310 if (adev->ip_blocks[i].status.hang && 4311 adev->ip_blocks[i].version->funcs->post_soft_reset) 4312 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4313 if (r) 4314 return r; 4315 } 4316 4317 return 0; 4318 } 4319 4320 /** 4321 * amdgpu_device_recover_vram - Recover some VRAM contents 4322 * 4323 * @adev: amdgpu_device pointer 4324 * 4325 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4326 * restore things like GPUVM page tables after a GPU reset where 4327 * the contents of VRAM might be lost. 4328 * 4329 * Returns: 4330 * 0 on success, negative error code on failure. 
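 * The shadow list is walked under shadow_list_lock, evicted shadow BOs
 * are skipped, and each restore fence is waited on with a timeout
 * (longer under SR-IOV runtime).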
4331 */ 4332 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4333 { 4334 struct dma_fence *fence = NULL, *next = NULL; 4335 struct amdgpu_bo *shadow; 4336 struct amdgpu_bo_vm *vmbo; 4337 long r = 1, tmo; 4338 4339 if (amdgpu_sriov_runtime(adev)) 4340 tmo = msecs_to_jiffies(8000); 4341 else 4342 tmo = msecs_to_jiffies(100); 4343 4344 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4345 mutex_lock(&adev->shadow_list_lock); 4346 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4347 shadow = &vmbo->bo; 4348 /* No need to recover an evicted BO */ 4349 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4350 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4351 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4352 continue; 4353 4354 r = amdgpu_bo_restore_shadow(shadow, &next); 4355 if (r) 4356 break; 4357 4358 if (fence) { 4359 tmo = dma_fence_wait_timeout(fence, false, tmo); 4360 dma_fence_put(fence); 4361 fence = next; 4362 if (tmo == 0) { 4363 r = -ETIMEDOUT; 4364 break; 4365 } else if (tmo < 0) { 4366 r = tmo; 4367 break; 4368 } 4369 } else { 4370 fence = next; 4371 } 4372 } 4373 mutex_unlock(&adev->shadow_list_lock); 4374 4375 if (fence) 4376 tmo = dma_fence_wait_timeout(fence, false, tmo); 4377 dma_fence_put(fence); 4378 4379 if (r < 0 || tmo <= 0) { 4380 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4381 return -EIO; 4382 } 4383 4384 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4385 return 0; 4386 } 4387 4388 4389 /** 4390 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4391 * 4392 * @adev: amdgpu_device pointer 4393 * @from_hypervisor: request from hypervisor 4394 * 4395 * do VF FLR and reinitialize Asic 4396 * return 0 means succeeded otherwise failed 4397 */ 4398 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4399 bool from_hypervisor) 4400 { 4401 int r; 4402 4403 if (from_hypervisor) 4404 r = amdgpu_virt_request_full_gpu(adev, true); 4405 else 4406 r = amdgpu_virt_reset_gpu(adev); 4407 if (r) 4408 return r; 4409 4410 amdgpu_amdkfd_pre_reset(adev); 4411 4412 /* Resume IP prior to SMC */ 4413 r = amdgpu_device_ip_reinit_early_sriov(adev); 4414 if (r) 4415 goto error; 4416 4417 amdgpu_virt_init_data_exchange(adev); 4418 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4419 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4420 4421 r = amdgpu_device_fw_loading(adev); 4422 if (r) 4423 return r; 4424 4425 /* now we are okay to resume SMC/CP/SDMA */ 4426 r = amdgpu_device_ip_reinit_late_sriov(adev); 4427 if (r) 4428 goto error; 4429 4430 amdgpu_irq_gpu_reset_resume_helper(adev); 4431 r = amdgpu_ib_ring_tests(adev); 4432 amdgpu_amdkfd_post_reset(adev); 4433 4434 error: 4435 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4436 amdgpu_inc_vram_lost(adev); 4437 r = amdgpu_device_recover_vram(adev); 4438 } 4439 amdgpu_virt_release_full_gpu(adev, true); 4440 4441 return r; 4442 } 4443 4444 /** 4445 * amdgpu_device_has_job_running - check if there is any job in mirror list 4446 * 4447 * @adev: amdgpu_device pointer 4448 * 4449 * check if there is any job in mirror list 4450 */ 4451 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4452 { 4453 int i; 4454 struct drm_sched_job *job; 4455 4456 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4457 struct amdgpu_ring *ring = adev->rings[i]; 4458 4459 if (!ring || !ring->sched.thread) 4460 continue; 4461 4462 spin_lock(&ring->sched.job_list_lock); 4463 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4464 struct drm_sched_job, list); 4465 spin_unlock(&ring->sched.job_list_lock); 4466 if (job) 4467 return true; 4468 } 4469 return false; 4470 } 4471 4472 /** 4473 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4474 * 4475 * @adev: amdgpu_device pointer 4476 * 4477 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4478 * a hung GPU. 4479 */ 4480 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4481 { 4482 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4483 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4484 return false; 4485 } 4486 4487 if (amdgpu_gpu_recovery == 0) 4488 goto disabled; 4489 4490 if (amdgpu_sriov_vf(adev)) 4491 return true; 4492 4493 if (amdgpu_gpu_recovery == -1) { 4494 switch (adev->asic_type) { 4495 case CHIP_BONAIRE: 4496 case CHIP_HAWAII: 4497 case CHIP_TOPAZ: 4498 case CHIP_TONGA: 4499 case CHIP_FIJI: 4500 case CHIP_POLARIS10: 4501 case CHIP_POLARIS11: 4502 case CHIP_POLARIS12: 4503 case CHIP_VEGAM: 4504 case CHIP_VEGA20: 4505 case CHIP_VEGA10: 4506 case CHIP_VEGA12: 4507 case CHIP_RAVEN: 4508 case CHIP_ARCTURUS: 4509 case CHIP_RENOIR: 4510 case CHIP_NAVI10: 4511 case CHIP_NAVI14: 4512 case CHIP_NAVI12: 4513 case CHIP_SIENNA_CICHLID: 4514 case CHIP_NAVY_FLOUNDER: 4515 case CHIP_DIMGREY_CAVEFISH: 4516 case CHIP_BEIGE_GOBY: 4517 case CHIP_VANGOGH: 4518 case CHIP_ALDEBARAN: 4519 break; 4520 default: 4521 goto disabled; 4522 } 4523 } 4524 4525 return true; 4526 4527 disabled: 4528 dev_info(adev->dev, "GPU recovery disabled.\n"); 4529 return false; 4530 } 4531 4532 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4533 { 4534 u32 i; 4535 int ret = 0; 4536 4537 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4538 4539 dev_info(adev->dev, "GPU mode1 reset\n"); 4540 4541 /* disable BM */ 4542 pci_clear_master(adev->pdev); 4543 4544 amdgpu_device_cache_pci_state(adev->pdev); 4545 4546 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4547 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4548 ret = amdgpu_dpm_mode1_reset(adev); 4549 } else { 4550 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4551 ret = psp_gpu_reset(adev); 4552 } 4553 4554 if (ret) 4555 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4556 4557 amdgpu_device_load_pci_state(adev->pdev); 4558 4559 /* wait for asic to come out of reset */ 4560 for (i = 0; i < adev->usec_timeout; i++) { 4561 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4562 4563 if (memsize != 0xffffffff) 4564 break; 4565 udelay(1); 4566 } 4567 4568 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4569 return ret; 4570 } 4571 4572 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4573 struct amdgpu_reset_context *reset_context) 4574 { 4575 int i, j, r = 0; 4576 struct amdgpu_job *job = NULL; 4577 bool need_full_reset = 4578 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4579 4580 if (reset_context->reset_req_dev == adev) 4581 job = reset_context->job; 4582 4583 if (amdgpu_sriov_vf(adev)) { 4584 /* stop the data exchange thread */ 4585 amdgpu_virt_fini_data_exchange(adev); 4586 } 4587 4588 /* block all schedulers and reset given job's ring */ 4589 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4590 struct amdgpu_ring *ring = adev->rings[i]; 4591 4592 if (!ring || !ring->sched.thread) 4593 continue; 4594 4595 /*clear job fence from fence drv to avoid force_completion 4596 *leave NULL and vm flush fence in fence drv */ 4597 for (j = 0; j <= ring->fence_drv.num_fences_mask; 
j++) { 4598 struct dma_fence *old, **ptr; 4599 4600 ptr = &ring->fence_drv.fences[j]; 4601 old = rcu_dereference_protected(*ptr, 1); 4602 if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) { 4603 RCU_INIT_POINTER(*ptr, NULL); 4604 } 4605 } 4606 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4607 amdgpu_fence_driver_force_completion(ring); 4608 } 4609 4610 if (job && job->vm) 4611 drm_sched_increase_karma(&job->base); 4612 4613 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4614 /* If reset handler not implemented, continue; otherwise return */ 4615 if (r == -ENOSYS) 4616 r = 0; 4617 else 4618 return r; 4619 4620 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4621 if (!amdgpu_sriov_vf(adev)) { 4622 4623 if (!need_full_reset) 4624 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4625 4626 if (!need_full_reset) { 4627 amdgpu_device_ip_pre_soft_reset(adev); 4628 r = amdgpu_device_ip_soft_reset(adev); 4629 amdgpu_device_ip_post_soft_reset(adev); 4630 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4631 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4632 need_full_reset = true; 4633 } 4634 } 4635 4636 if (need_full_reset) 4637 r = amdgpu_device_ip_suspend(adev); 4638 if (need_full_reset) 4639 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4640 else 4641 clear_bit(AMDGPU_NEED_FULL_RESET, 4642 &reset_context->flags); 4643 } 4644 4645 return r; 4646 } 4647 4648 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4649 struct amdgpu_reset_context *reset_context) 4650 { 4651 struct amdgpu_device *tmp_adev = NULL; 4652 bool need_full_reset, skip_hw_reset, vram_lost = false; 4653 int r = 0; 4654 4655 /* Try reset handler method first */ 4656 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4657 reset_list); 4658 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4659 /* If reset handler not implemented, continue; otherwise return */ 4660 if (r == -ENOSYS) 4661 r = 0; 4662 else 4663 return r; 4664 4665 /* Reset handler not implemented, use the default method */ 4666 need_full_reset = 4667 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4668 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4669 4670 /* 4671 * ASIC reset has to be done on all XGMI hive nodes ASAP 4672 * to allow proper links negotiation in FW (within 1 sec) 4673 */ 4674 if (!skip_hw_reset && need_full_reset) { 4675 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4676 /* For XGMI run all resets in parallel to speed up the process */ 4677 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4678 tmp_adev->gmc.xgmi.pending_reset = false; 4679 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4680 r = -EALREADY; 4681 } else 4682 r = amdgpu_asic_reset(tmp_adev); 4683 4684 if (r) { 4685 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4686 r, adev_to_drm(tmp_adev)->unique); 4687 break; 4688 } 4689 } 4690 4691 /* For XGMI wait for all resets to complete before proceed */ 4692 if (!r) { 4693 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4694 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4695 flush_work(&tmp_adev->xgmi_reset_work); 4696 r = tmp_adev->asic_reset_res; 4697 if (r) 4698 break; 4699 } 4700 } 4701 } 4702 } 4703 4704 if (!r && amdgpu_ras_intr_triggered()) { 4705 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4706 if (tmp_adev->mmhub.ras_funcs && 
4707 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4708 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4709 }
4710
4711 amdgpu_ras_intr_cleared();
4712 }
4713
4714 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4715 if (need_full_reset) {
4716 /* post card */
4717 r = amdgpu_device_asic_init(tmp_adev);
4718 if (r) {
4719 dev_warn(tmp_adev->dev, "asic atom init failed!");
4720 } else {
4721 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4722 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4723 if (r)
4724 goto out;
4725
4726 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4727 if (r)
4728 goto out;
4729
4730 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4731 if (vram_lost) {
4732 DRM_INFO("VRAM is lost due to GPU reset!\n");
4733 amdgpu_inc_vram_lost(tmp_adev);
4734 }
4735
4736 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4737 if (r)
4738 goto out;
4739
4740 r = amdgpu_device_fw_loading(tmp_adev);
4741 if (r)
4742 return r;
4743
4744 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4745 if (r)
4746 goto out;
4747
4748 if (vram_lost)
4749 amdgpu_device_fill_reset_magic(tmp_adev);
4750
4751 /*
4752 * Add this ASIC back as tracked since its reset
4753 * has already completed successfully.
4754 */
4755 amdgpu_register_gpu_instance(tmp_adev);
4756
4757 if (!reset_context->hive &&
4758 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4759 amdgpu_xgmi_add_device(tmp_adev);
4760
4761 r = amdgpu_device_ip_late_init(tmp_adev);
4762 if (r)
4763 goto out;
4764
4765 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4766
4767 /*
4768 * The GPU enters a bad state once the number of
4769 * faulty pages reported by ECC reaches the
4770 * threshold, and RAS recovery is scheduled next.
4771 * So check here and abort recovery if the bad
4772 * page threshold has indeed been exceeded,
4773 * reminding the user to either retire this GPU
4774 * or raise the bad_page_threshold value the next
4775 * time the driver is probed.
4776 */
4777 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4778 /* must succeed.
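 * (the bad page threshold was not exceeded, so RAS can be brought back up)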
 */
4779 amdgpu_ras_resume(tmp_adev);
4780 } else {
4781 r = -EINVAL;
4782 goto out;
4783 }
4784
4785 /* Update PSP FW topology after reset */
4786 if (reset_context->hive &&
4787 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4788 r = amdgpu_xgmi_update_topology(
4789 reset_context->hive, tmp_adev);
4790 }
4791 }
4792
4793 out:
4794 if (!r) {
4795 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4796 r = amdgpu_ib_ring_tests(tmp_adev);
4797 if (r) {
4798 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4799 need_full_reset = true;
4800 r = -EAGAIN;
4801 goto end;
4802 }
4803 }
4804
4805 if (!r)
4806 r = amdgpu_device_recover_vram(tmp_adev);
4807 else
4808 tmp_adev->asic_reset_res = r;
4809 }
4810
4811 end:
4812 if (need_full_reset)
4813 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4814 else
4815 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4816 return r;
4817 }
4818
4819 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4820 struct amdgpu_hive_info *hive)
4821 {
4822 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4823 return false;
4824
4825 if (hive) {
4826 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4827 } else {
4828 down_write(&adev->reset_sem);
4829 }
4830
4831 switch (amdgpu_asic_reset_method(adev)) {
4832 case AMD_RESET_METHOD_MODE1:
4833 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4834 break;
4835 case AMD_RESET_METHOD_MODE2:
4836 adev->mp1_state = PP_MP1_STATE_RESET;
4837 break;
4838 default:
4839 adev->mp1_state = PP_MP1_STATE_NONE;
4840 break;
4841 }
4842
4843 return true;
4844 }
4845
4846 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4847 {
4848 amdgpu_vf_error_trans_all(adev);
4849 adev->mp1_state = PP_MP1_STATE_NONE;
4850 atomic_set(&adev->in_gpu_reset, 0);
4851 up_write(&adev->reset_sem);
4852 }
4853
4854 /*
4855 * Lock a list of amdgpu devices in a hive safely. For a device that is
4856 * not part of a multi-node hive this behaves like amdgpu_device_lock_adev().
4857 *
4858 * Unlocking does not require a rollback.
4859 */
4860 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4861 {
4862 struct amdgpu_device *tmp_adev = NULL;
4863
4864 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4865 if (!hive) {
4866 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4867 return -ENODEV;
4868 }
4869 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4870 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4871 goto roll_back;
4872 }
4873 } else if (!amdgpu_device_lock_adev(adev, hive))
4874 return -EAGAIN;
4875
4876 return 0;
4877 roll_back:
4878 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4879 /*
4880 * If the locking iteration breaks in the middle of a hive,
4881 * it may mean there is a race, or that a hive device got
4882 * locked up independently.
4883 * We may or may not be in trouble, so roll back the locks
4884 * taken so far and print a warning.
4885 */
4886 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4887 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4888 amdgpu_device_unlock_adev(tmp_adev);
4889 }
4890 }
4891 return -EAGAIN;
4892 }
4893
4894 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4895 {
4896 STUB();
4897 #ifdef notyet
4898 struct pci_dev *p = NULL;
4899
4900 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4901 adev->pdev->bus->number, 1);
4902 if (p) {
4903 pm_runtime_enable(&(p->dev));
4904 pm_runtime_resume(&(p->dev));
4905 }
4906 #endif
4907 }
4908
4909 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4910 {
4911 enum amd_reset_method reset_method;
4912 struct pci_dev *p = NULL;
4913 u64 expires;
4914
4915 /*
4916 * For now, only BACO and mode1 reset are confirmed to suffer
4917 * from the audio issue when the audio device is not properly suspended.
4918 */
4919 reset_method = amdgpu_asic_reset_method(adev);
4920 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4921 (reset_method != AMD_RESET_METHOD_MODE1))
4922 return -EINVAL;
4923
4924 STUB();
4925 return -ENOSYS;
4926 #ifdef notyet
4927
4928 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4929 adev->pdev->bus->number, 1);
4930 if (!p)
4931 return -ENODEV;
4932
4933 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4934 if (!expires)
4935 /*
4936 * If we cannot get the audio device autosuspend delay,
4937 * a fixed 4s interval is used. Since 3s is the audio
4938 * controller's default autosuspend delay setting, the
4939 * 4s used here is guaranteed to cover it.
4940 */
4941 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4942
4943 while (!pm_runtime_status_suspended(&(p->dev))) {
4944 if (!pm_runtime_suspend(&(p->dev)))
4945 break;
4946
4947 if (expires < ktime_get_mono_fast_ns()) {
4948 dev_warn(adev->dev, "failed to suspend display audio\n");
4949 /* TODO: abort the succeeding gpu reset? */
4950 return -ETIMEDOUT;
4951 }
4952 }
4953
4954 pm_runtime_disable(&(p->dev));
4955
4956 return 0;
4957 #endif
4958 }
4959
4960 static void amdgpu_device_recheck_guilty_jobs(
4961 struct amdgpu_device *adev, struct list_head *device_list_handle,
4962 struct amdgpu_reset_context *reset_context)
4963 {
4964 int i, r = 0;
4965
4966 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4967 struct amdgpu_ring *ring = adev->rings[i];
4968 int ret = 0;
4969 struct drm_sched_job *s_job;
4970
4971 if (!ring || !ring->sched.thread)
4972 continue;
4973
4974 s_job = list_first_entry_or_null(&ring->sched.pending_list,
4975 struct drm_sched_job, list);
4976 if (s_job == NULL)
4977 continue;
4978
4979 /* clear the job's guilty flag and rely on the following step to determine the real culprit */
4980 drm_sched_reset_karma(s_job);
4981 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4982
4983 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4984 if (ret == 0) { /* timeout */
4985 DRM_ERROR("Found the real bad job!
ring:%s, job_id:%llx\n", 4986 ring->sched.name, s_job->id); 4987 4988 /* set guilty */ 4989 drm_sched_increase_karma(s_job); 4990 retry: 4991 /* do hw reset */ 4992 if (amdgpu_sriov_vf(adev)) { 4993 amdgpu_virt_fini_data_exchange(adev); 4994 r = amdgpu_device_reset_sriov(adev, false); 4995 if (r) 4996 adev->asic_reset_res = r; 4997 } else { 4998 clear_bit(AMDGPU_SKIP_HW_RESET, 4999 &reset_context->flags); 5000 r = amdgpu_do_asic_reset(device_list_handle, 5001 reset_context); 5002 if (r && r == -EAGAIN) 5003 goto retry; 5004 } 5005 5006 /* 5007 * add reset counter so that the following 5008 * resubmitted job could flush vmid 5009 */ 5010 atomic_inc(&adev->gpu_reset_counter); 5011 continue; 5012 } 5013 5014 /* got the hw fence, signal finished fence */ 5015 atomic_dec(ring->sched.score); 5016 dma_fence_get(&s_job->s_fence->finished); 5017 dma_fence_signal(&s_job->s_fence->finished); 5018 dma_fence_put(&s_job->s_fence->finished); 5019 5020 /* remove node from list and free the job */ 5021 spin_lock(&ring->sched.job_list_lock); 5022 list_del_init(&s_job->list); 5023 spin_unlock(&ring->sched.job_list_lock); 5024 ring->sched.ops->free_job(s_job); 5025 } 5026 } 5027 5028 /** 5029 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5030 * 5031 * @adev: amdgpu_device pointer 5032 * @job: which job trigger hang 5033 * 5034 * Attempt to reset the GPU if it has hung (all asics). 5035 * Attempt to do soft-reset or full-reset and reinitialize Asic 5036 * Returns 0 for success or an error on failure. 5037 */ 5038 5039 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5040 struct amdgpu_job *job) 5041 { 5042 struct list_head device_list, *device_list_handle = NULL; 5043 bool job_signaled = false; 5044 struct amdgpu_hive_info *hive = NULL; 5045 struct amdgpu_device *tmp_adev = NULL; 5046 int i, r = 0; 5047 bool need_emergency_restart = false; 5048 bool audio_suspended = false; 5049 int tmp_vram_lost_counter; 5050 struct amdgpu_reset_context reset_context; 5051 5052 memset(&reset_context, 0, sizeof(reset_context)); 5053 5054 /* 5055 * Special case: RAS triggered and full reset isn't supported 5056 */ 5057 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5058 5059 /* 5060 * Flush RAM to disk so that after reboot 5061 * the user can read log and see why the system rebooted. 5062 */ 5063 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5064 DRM_WARN("Emergency reboot."); 5065 5066 #ifdef notyet 5067 ksys_sync_helper(); 5068 emergency_restart(); 5069 #else 5070 panic("emergency_restart"); 5071 #endif 5072 } 5073 5074 dev_info(adev->dev, "GPU %s begin!\n", 5075 need_emergency_restart ? "jobs stop":"reset"); 5076 5077 /* 5078 * Here we trylock to avoid chain of resets executing from 5079 * either trigger by jobs on different adevs in XGMI hive or jobs on 5080 * different schedulers for same device while this TO handler is running. 5081 * We always reset all schedulers for device and all devices for XGMI 5082 * hive so that should take care of them too. 5083 */ 5084 hive = amdgpu_get_xgmi_hive(adev); 5085 if (hive) { 5086 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5087 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5088 job ? 
job->base.id : -1, hive->hive_id);
5089 amdgpu_put_xgmi_hive(hive);
5090 if (job && job->vm)
5091 drm_sched_increase_karma(&job->base);
5092 return 0;
5093 }
5094 mutex_lock(&hive->hive_lock);
5095 }
5096
5097 reset_context.method = AMD_RESET_METHOD_NONE;
5098 reset_context.reset_req_dev = adev;
5099 reset_context.job = job;
5100 reset_context.hive = hive;
5101 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5102
5103 /*
5104 * Lock the device before we try to operate on the linked list.
5105 * If we didn't get the device lock, don't touch the linked list
5106 * since others may be iterating over it.
5107 */
5108 r = amdgpu_device_lock_hive_adev(adev, hive);
5109 if (r) {
5110 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
5111 job ? job->base.id : -1);
5112
5113 /* even though we skipped this reset, we still need to mark the job as guilty */
5114 if (job && job->vm)
5115 drm_sched_increase_karma(&job->base);
5116 goto skip_recovery;
5117 }
5118
5119 /*
5120 * Build the list of devices to reset.
5121 * In case we are in XGMI hive mode, resort the device list
5122 * to put adev in the 1st position.
5123 */
5124 INIT_LIST_HEAD(&device_list);
5125 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5126 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5127 list_add_tail(&tmp_adev->reset_list, &device_list);
5128 if (!list_is_first(&adev->reset_list, &device_list))
5129 list_rotate_to_front(&adev->reset_list, &device_list);
5130 device_list_handle = &device_list;
5131 } else {
5132 list_add_tail(&adev->reset_list, &device_list);
5133 device_list_handle = &device_list;
5134 }
5135
5136 /* block all schedulers and reset given job's ring */
5137 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5138 /*
5139 * Try to put the audio codec into the suspend state
5140 * before the gpu reset starts.
5141 *
5142 * The power domain of the graphics device is shared
5143 * with the AZ (audio) power domain, so without this
5144 * we may change the audio hardware from behind the
5145 * audio driver's back, which triggers audio codec
5146 * errors.
5147 */
5148 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5149 audio_suspended = true;
5150
5151 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5152
5153 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5154
5155 if (!amdgpu_sriov_vf(tmp_adev))
5156 amdgpu_amdkfd_pre_reset(tmp_adev);
5157
5158 /*
5159 * Mark the ASICs to be reset as untracked first,
5160 * and add them back after the reset completes.
5161 */
5162 amdgpu_unregister_gpu_instance(tmp_adev);
5163
5164 amdgpu_fbdev_set_suspend(tmp_adev, 1);
5165
5166 /* disable ras on ALL IPs */
5167 if (!need_emergency_restart &&
5168 amdgpu_device_ip_need_full_reset(tmp_adev))
5169 amdgpu_ras_suspend(tmp_adev);
5170
5171 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5172 struct amdgpu_ring *ring = tmp_adev->rings[i];
5173
5174 if (!ring || !ring->sched.thread)
5175 continue;
5176
5177 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5178
5179 if (need_emergency_restart)
5180 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5181 }
5182 atomic_inc(&tmp_adev->gpu_reset_counter);
5183 }
5184
5185 if (need_emergency_restart)
5186 goto skip_sched_resume;
5187
5188 /*
5189 * Must check guilty signal here since after this point all old
5190 * HW fences are force signaled.
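 * (amdgpu_device_pre_asic_reset() below will force-complete them)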
5191 *
5192 * job->base holds a reference to parent fence
5193 */
5194 if (job && job->base.s_fence->parent &&
5195 dma_fence_is_signaled(job->base.s_fence->parent)) {
5196 job_signaled = true;
5197 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5198 goto skip_hw_reset;
5199 }
5200
5201 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5202 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5203 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5204 /* TODO: should we stop? */
5205 if (r) {
5206 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5207 r, adev_to_drm(tmp_adev)->unique);
5208 tmp_adev->asic_reset_res = r;
5209 }
5210 }
5211
5212 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5213 /* Actual ASIC resets if needed. */
5214 /* TODO Implement XGMI hive reset logic for SRIOV */
5215 if (amdgpu_sriov_vf(adev)) {
5216 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5217 if (r)
5218 adev->asic_reset_res = r;
5219 } else {
5220 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5221 if (r && r == -EAGAIN)
5222 goto retry;
5223 }
5224
5225 skip_hw_reset:
5226
5227 /* Post ASIC reset for all devs. */
5228 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5229
5230 /*
5231 * Sometimes a later bad compute job can block a good gfx job
5232 * because the gfx and compute rings share internal GC hardware.
5233 * Add an additional guilty-job recheck step to find the real
5234 * culprit: it synchronously resubmits and waits for the first
5235 * job to signal; if that wait times out, the job is identified
5236 * as the real guilty one.
5237 */
5237 if (amdgpu_gpu_recovery == 2 &&
5238 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5239 amdgpu_device_recheck_guilty_jobs(
5240 tmp_adev, device_list_handle, &reset_context);
5241
5242 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5243 struct amdgpu_ring *ring = tmp_adev->rings[i];
5244
5245 if (!ring || !ring->sched.thread)
5246 continue;
5247
5248 /* No point in resubmitting jobs if we didn't HW reset */
5249 if (!tmp_adev->asic_reset_res && !job_signaled)
5250 drm_sched_resubmit_jobs(&ring->sched);
5251
5252 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5253 }
5254
5255 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5256 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5257 }
5258
5259 tmp_adev->asic_reset_res = 0;
5260
5261 if (r) {
5262 /* bad news, how do we report this to userspace? */
5263 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5264 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5265 } else {
5266 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5267 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5268 DRM_WARN("smart shift update failed\n");
5269 }
5270 }
5271
5272 skip_sched_resume:
5273 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5274 /* unlock kfd: SRIOV would do it separately */
5275 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5276 amdgpu_amdkfd_post_reset(tmp_adev);
5277
5278 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5279 * so bring up kfd here if it was not initialized before
5280 */
5281 if (!adev->kfd.init_complete)
5282 amdgpu_amdkfd_device_init(adev);
5283
5284 if (audio_suspended)
5285 amdgpu_device_resume_display_audio(tmp_adev);
5286 amdgpu_device_unlock_adev(tmp_adev);
5287 }
5288
5289 skip_recovery:
5290 if (hive) {
5291 atomic_set(&hive->in_reset, 0);
5292 mutex_unlock(&hive->hive_lock);
5293 amdgpu_put_xgmi_hive(hive);
5294 }
5295
5296 if (r && r != -EAGAIN)
5297 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5298 return r;
5299 }
5300
5301 /**
5302 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5303 *
5304 * @adev: amdgpu_device pointer
5305 *
5306 * Fetches and stores in the driver the PCIE capabilities (gen speed
5307 * and lanes) of the slot the device is in. Handles APUs and
5308 * virtualized environments where PCIE config space may not be available.
5309 */
5310 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5311 {
5312 struct pci_dev *pdev;
5313 enum pci_bus_speed speed_cap, platform_speed_cap;
5314 enum pcie_link_width platform_link_width;
5315
5316 if (amdgpu_pcie_gen_cap)
5317 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5318
5319 if (amdgpu_pcie_lane_cap)
5320 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5321
5322 /* covers APUs as well */
5323 if (pci_is_root_bus(adev->pdev->bus)) {
5324 if (adev->pm.pcie_gen_mask == 0)
5325 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5326 if (adev->pm.pcie_mlw_mask == 0)
5327 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5328 return;
5329 }
5330
5331 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5332 return;
5333
5334 pcie_bandwidth_available(adev->pdev, NULL,
5335 &platform_speed_cap, &platform_link_width);
5336
5337 if (adev->pm.pcie_gen_mask == 0) {
5338 /* asic caps */
5339 pdev = adev->pdev;
5340 speed_cap = pcie_get_speed_cap(pdev);
5341 if (speed_cap == PCI_SPEED_UNKNOWN) {
5342 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5343 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5344 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5345 } else {
5346 if (speed_cap == PCIE_SPEED_32_0GT)
5347 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5348 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5349 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5350 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5351 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5352 else if (speed_cap == PCIE_SPEED_16_0GT)
5353 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5354 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5355 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5356 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5357 else if (speed_cap == PCIE_SPEED_8_0GT)
5358 adev->pm.pcie_gen_mask |=
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5359 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5360 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5361 else if (speed_cap == PCIE_SPEED_5_0GT) 5362 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5363 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5364 else 5365 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5366 } 5367 /* platform caps */ 5368 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5369 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5370 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5371 } else { 5372 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5373 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5374 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5375 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5376 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5377 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5378 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5379 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5380 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5381 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5382 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5383 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5384 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5385 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5386 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5387 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5388 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5389 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5390 else 5391 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5392 5393 } 5394 } 5395 if (adev->pm.pcie_mlw_mask == 0) { 5396 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5397 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5398 } else { 5399 switch (platform_link_width) { 5400 case PCIE_LNK_X32: 5401 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5402 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5403 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5404 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5405 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5406 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5407 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5408 break; 5409 case PCIE_LNK_X16: 5410 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5411 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5412 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5413 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5414 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5415 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5416 break; 5417 case PCIE_LNK_X12: 5418 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5419 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5420 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5423 break; 5424 case PCIE_LNK_X8: 5425 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5426 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5427 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5428 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5429 break; 5430 case PCIE_LNK_X4: 5431 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5432 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5433 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5434 break; 5435 case PCIE_LNK_X2: 5436 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5438 break; 5439 case PCIE_LNK_X1: 5440 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5441 break; 5442 default: 5443 break; 5444 } 5445 } 5446 } 5447 } 5448 5449 int amdgpu_device_baco_enter(struct drm_device *dev) 5450 { 5451 struct amdgpu_device *adev = drm_to_adev(dev); 5452 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5453 
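 /* With RAS enabled, doorbell interrupts are turned off below for the
  * duration of BACO and re-enabled in amdgpu_device_baco_exit(). */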
5454 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5455 return -ENOTSUPP; 5456 5457 if (ras && adev->ras_enabled && 5458 adev->nbio.funcs->enable_doorbell_interrupt) 5459 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5460 5461 return amdgpu_dpm_baco_enter(adev); 5462 } 5463 5464 int amdgpu_device_baco_exit(struct drm_device *dev) 5465 { 5466 struct amdgpu_device *adev = drm_to_adev(dev); 5467 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5468 int ret = 0; 5469 5470 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5471 return -ENOTSUPP; 5472 5473 ret = amdgpu_dpm_baco_exit(adev); 5474 if (ret) 5475 return ret; 5476 5477 if (ras && adev->ras_enabled && 5478 adev->nbio.funcs->enable_doorbell_interrupt) 5479 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5480 5481 if (amdgpu_passthrough(adev) && 5482 adev->nbio.funcs->clear_doorbell_interrupt) 5483 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5484 5485 return 0; 5486 } 5487 5488 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5489 { 5490 int i; 5491 5492 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5493 struct amdgpu_ring *ring = adev->rings[i]; 5494 5495 if (!ring || !ring->sched.thread) 5496 continue; 5497 5498 cancel_delayed_work_sync(&ring->sched.work_tdr); 5499 } 5500 } 5501 5502 /** 5503 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5504 * @pdev: PCI device struct 5505 * @state: PCI channel state 5506 * 5507 * Description: Called when a PCI error is detected. 5508 * 5509 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5510 */ 5511 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5512 { 5513 STUB(); 5514 return 0; 5515 #ifdef notyet 5516 struct drm_device *dev = pci_get_drvdata(pdev); 5517 struct amdgpu_device *adev = drm_to_adev(dev); 5518 int i; 5519 5520 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5521 5522 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5523 DRM_WARN("No support for XGMI hive yet..."); 5524 return PCI_ERS_RESULT_DISCONNECT; 5525 } 5526 5527 adev->pci_channel_state = state; 5528 5529 switch (state) { 5530 case pci_channel_io_normal: 5531 return PCI_ERS_RESULT_CAN_RECOVER; 5532 /* Fatal error, prepare for slot reset */ 5533 case pci_channel_io_frozen: 5534 /* 5535 * Cancel and wait for all TDRs in progress if failing to 5536 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5537 * 5538 * Locking adev->reset_sem will prevent any external access 5539 * to GPU during PCI error recovery 5540 */ 5541 while (!amdgpu_device_lock_adev(adev, NULL)) 5542 amdgpu_cancel_all_tdr(adev); 5543 5544 /* 5545 * Block any work scheduling as we do for regular GPU reset 5546 * for the duration of the recovery 5547 */ 5548 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5549 struct amdgpu_ring *ring = adev->rings[i]; 5550 5551 if (!ring || !ring->sched.thread) 5552 continue; 5553 5554 drm_sched_stop(&ring->sched, NULL); 5555 } 5556 atomic_inc(&adev->gpu_reset_counter); 5557 return PCI_ERS_RESULT_NEED_RESET; 5558 case pci_channel_io_perm_failure: 5559 /* Permanent error, prepare for device removal */ 5560 return PCI_ERS_RESULT_DISCONNECT; 5561 } 5562 5563 return PCI_ERS_RESULT_NEED_RESET; 5564 #endif 5565 } 5566 5567 /** 5568 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5569 * @pdev: pointer to PCI device 5570 */ 5571 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5572 { 5573 5574 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5575 5576 /* TODO - dump 
whatever for debugging purposes */ 5577 5578 /* This called only if amdgpu_pci_error_detected returns 5579 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5580 * works, no need to reset slot. 5581 */ 5582 5583 return PCI_ERS_RESULT_RECOVERED; 5584 } 5585 5586 /** 5587 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5588 * @pdev: PCI device struct 5589 * 5590 * Description: This routine is called by the pci error recovery 5591 * code after the PCI slot has been reset, just before we 5592 * should resume normal operations. 5593 */ 5594 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5595 { 5596 STUB(); 5597 return PCI_ERS_RESULT_RECOVERED; 5598 #ifdef notyet 5599 struct drm_device *dev = pci_get_drvdata(pdev); 5600 struct amdgpu_device *adev = drm_to_adev(dev); 5601 int r, i; 5602 struct amdgpu_reset_context reset_context; 5603 u32 memsize; 5604 struct list_head device_list; 5605 5606 DRM_INFO("PCI error: slot reset callback!!\n"); 5607 5608 memset(&reset_context, 0, sizeof(reset_context)); 5609 5610 INIT_LIST_HEAD(&device_list); 5611 list_add_tail(&adev->reset_list, &device_list); 5612 5613 /* wait for asic to come out of reset */ 5614 drm_msleep(500); 5615 5616 /* Restore PCI confspace */ 5617 amdgpu_device_load_pci_state(pdev); 5618 5619 /* confirm ASIC came out of reset */ 5620 for (i = 0; i < adev->usec_timeout; i++) { 5621 memsize = amdgpu_asic_get_config_memsize(adev); 5622 5623 if (memsize != 0xffffffff) 5624 break; 5625 udelay(1); 5626 } 5627 if (memsize == 0xffffffff) { 5628 r = -ETIME; 5629 goto out; 5630 } 5631 5632 reset_context.method = AMD_RESET_METHOD_NONE; 5633 reset_context.reset_req_dev = adev; 5634 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5635 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5636 5637 adev->no_hw_access = true; 5638 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5639 adev->no_hw_access = false; 5640 if (r) 5641 goto out; 5642 5643 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5644 5645 out: 5646 if (!r) { 5647 if (amdgpu_device_cache_pci_state(adev->pdev)) 5648 pci_restore_state(adev->pdev); 5649 5650 DRM_INFO("PCIe error recovery succeeded\n"); 5651 } else { 5652 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5653 amdgpu_device_unlock_adev(adev); 5654 } 5655 5656 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5657 #endif 5658 } 5659 5660 /** 5661 * amdgpu_pci_resume() - resume normal ops after PCI reset 5662 * @pdev: pointer to PCI device 5663 * 5664 * Called when the error recovery driver tells us that its 5665 * OK to resume normal operation. 
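 *
 * On Linux this restarts the schedulers that were stopped in
 * amdgpu_pci_error_detected() and releases the reset lock taken there;
 * it only acts when the channel state was pci_channel_io_frozen. The
 * OpenBSD port currently stubs this out.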
5666 */ 5667 void amdgpu_pci_resume(struct pci_dev *pdev) 5668 { 5669 STUB(); 5670 #ifdef notyet 5671 struct drm_device *dev = pci_get_drvdata(pdev); 5672 struct amdgpu_device *adev = drm_to_adev(dev); 5673 int i; 5674 5675 5676 DRM_INFO("PCI error: resume callback!!\n"); 5677 5678 /* Only continue execution for the case of pci_channel_io_frozen */ 5679 if (adev->pci_channel_state != pci_channel_io_frozen) 5680 return; 5681 5682 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5683 struct amdgpu_ring *ring = adev->rings[i]; 5684 5685 if (!ring || !ring->sched.thread) 5686 continue; 5687 5688 5689 drm_sched_resubmit_jobs(&ring->sched); 5690 drm_sched_start(&ring->sched, true); 5691 } 5692 5693 amdgpu_device_unlock_adev(adev); 5694 #endif 5695 } 5696 5697 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5698 { 5699 return false; 5700 #ifdef notyet 5701 struct drm_device *dev = pci_get_drvdata(pdev); 5702 struct amdgpu_device *adev = drm_to_adev(dev); 5703 int r; 5704 5705 r = pci_save_state(pdev); 5706 if (!r) { 5707 kfree(adev->pci_state); 5708 5709 adev->pci_state = pci_store_saved_state(pdev); 5710 5711 if (!adev->pci_state) { 5712 DRM_ERROR("Failed to store PCI saved state"); 5713 return false; 5714 } 5715 } else { 5716 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5717 return false; 5718 } 5719 5720 return true; 5721 #endif 5722 } 5723 5724 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5725 { 5726 STUB(); 5727 return false; 5728 #ifdef notyet 5729 struct drm_device *dev = pci_get_drvdata(pdev); 5730 struct amdgpu_device *adev = drm_to_adev(dev); 5731 int r; 5732 5733 if (!adev->pci_state) 5734 return false; 5735 5736 r = pci_load_saved_state(pdev, adev->pci_state); 5737 5738 if (!r) { 5739 pci_restore_state(pdev); 5740 } else { 5741 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5742 return false; 5743 } 5744 5745 return true; 5746 #endif 5747 } 5748 5749 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5750 struct amdgpu_ring *ring) 5751 { 5752 #ifdef CONFIG_X86_64 5753 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5754 return; 5755 #endif 5756 if (adev->gmc.xgmi.connected_to_cpu) 5757 return; 5758 5759 if (ring && ring->funcs->emit_hdp_flush) 5760 amdgpu_ring_emit_hdp_flush(ring); 5761 else 5762 amdgpu_asic_flush_hdp(adev, ring); 5763 } 5764 5765 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5766 struct amdgpu_ring *ring) 5767 { 5768 #ifdef CONFIG_X86_64 5769 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5770 return; 5771 #endif 5772 if (adev->gmc.xgmi.connected_to_cpu) 5773 return; 5774 5775 amdgpu_asic_invalidate_hdp(adev, ring); 5776 } 5777