1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/pci.h> 34 35 #include <drm/drm_atomic_helper.h> 36 #include <drm/drm_probe_helper.h> 37 #include <drm/amdgpu_drm.h> 38 #include <linux/vgaarb.h> 39 #include <linux/vga_switcheroo.h> 40 #include <linux/efi.h> 41 #include "amdgpu.h" 42 #include "amdgpu_trace.h" 43 #include "amdgpu_i2c.h" 44 #include "atom.h" 45 #include "amdgpu_atombios.h" 46 #include "amdgpu_atomfirmware.h" 47 #include "amd_pcie.h" 48 #ifdef CONFIG_DRM_AMDGPU_SI 49 #include "si.h" 50 #endif 51 #ifdef CONFIG_DRM_AMDGPU_CIK 52 #include "cik.h" 53 #endif 54 #include "vi.h" 55 #include "soc15.h" 56 #include "nv.h" 57 #include "bif/bif_4_1_d.h" 58 #include <linux/pci.h> 59 #include <linux/firmware.h> 60 #include "amdgpu_vf_error.h" 61 62 #include "amdgpu_amdkfd.h" 63 #include "amdgpu_pm.h" 64 65 #include "amdgpu_xgmi.h" 66 #include "amdgpu_ras.h" 67 #include "amdgpu_pmu.h" 68 #include "amdgpu_fru_eeprom.h" 69 #include "amdgpu_reset.h" 70 71 #include <linux/suspend.h> 72 #include <drm/task_barrier.h> 73 #include <linux/pm_runtime.h> 74 75 #include <drm/drm_drv.h> 76 77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 88 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 89 90 #define AMDGPU_RESUME_MS 2000 91 92 const char *amdgpu_asic_name[] = { 93 "TAHITI", 94 "PITCAIRN", 95 "VERDE", 96 "OLAND", 97 "HAINAN", 98 "BONAIRE", 99 "KAVERI", 100 "KABINI", 101 "HAWAII", 102 "MULLINS", 103 "TOPAZ", 104 "TONGA", 105 "FIJI", 106 "CARRIZO", 107 "STONEY", 108 "POLARIS10", 109 "POLARIS11", 110 "POLARIS12", 111 "VEGAM", 112 "VEGA10", 113 "VEGA12", 114 "VEGA20", 115 "RAVEN", 116 "ARCTURUS", 117 "RENOIR", 118 
"ALDEBARAN", 119 "NAVI10", 120 "CYAN_SKILLFISH", 121 "NAVI14", 122 "NAVI12", 123 "SIENNA_CICHLID", 124 "NAVY_FLOUNDER", 125 "VANGOGH", 126 "DIMGREY_CAVEFISH", 127 "BEIGE_GOBY", 128 "YELLOW_CARP", 129 "LAST", 130 }; 131 132 /** 133 * DOC: pcie_replay_count 134 * 135 * The amdgpu driver provides a sysfs API for reporting the total number 136 * of PCIe replays (NAKs) 137 * The file pcie_replay_count is used for this and returns the total 138 * number of replays as a sum of the NAKs generated and NAKs received 139 */ 140 141 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 142 struct device_attribute *attr, char *buf) 143 { 144 struct drm_device *ddev = dev_get_drvdata(dev); 145 struct amdgpu_device *adev = drm_to_adev(ddev); 146 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 147 148 return sysfs_emit(buf, "%llu\n", cnt); 149 } 150 151 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 152 amdgpu_device_get_pcie_replay_count, NULL); 153 154 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 155 156 /** 157 * DOC: product_name 158 * 159 * The amdgpu driver provides a sysfs API for reporting the product name 160 * for the device 161 * The file serial_number is used for this and returns the product name 162 * as returned from the FRU. 163 * NOTE: This is only available for certain server cards 164 */ 165 166 static ssize_t amdgpu_device_get_product_name(struct device *dev, 167 struct device_attribute *attr, char *buf) 168 { 169 struct drm_device *ddev = dev_get_drvdata(dev); 170 struct amdgpu_device *adev = drm_to_adev(ddev); 171 172 return sysfs_emit(buf, "%s\n", adev->product_name); 173 } 174 175 static DEVICE_ATTR(product_name, S_IRUGO, 176 amdgpu_device_get_product_name, NULL); 177 178 /** 179 * DOC: product_number 180 * 181 * The amdgpu driver provides a sysfs API for reporting the part number 182 * for the device 183 * The file serial_number is used for this and returns the part number 184 * as returned from the FRU. 185 * NOTE: This is only available for certain server cards 186 */ 187 188 static ssize_t amdgpu_device_get_product_number(struct device *dev, 189 struct device_attribute *attr, char *buf) 190 { 191 struct drm_device *ddev = dev_get_drvdata(dev); 192 struct amdgpu_device *adev = drm_to_adev(ddev); 193 194 return sysfs_emit(buf, "%s\n", adev->product_number); 195 } 196 197 static DEVICE_ATTR(product_number, S_IRUGO, 198 amdgpu_device_get_product_number, NULL); 199 200 /** 201 * DOC: serial_number 202 * 203 * The amdgpu driver provides a sysfs API for reporting the serial number 204 * for the device 205 * The file serial_number is used for this and returns the serial number 206 * as returned from the FRU. 207 * NOTE: This is only available for certain server cards 208 */ 209 210 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 211 struct device_attribute *attr, char *buf) 212 { 213 struct drm_device *ddev = dev_get_drvdata(dev); 214 struct amdgpu_device *adev = drm_to_adev(ddev); 215 216 return sysfs_emit(buf, "%s\n", adev->serial); 217 } 218 219 static DEVICE_ATTR(serial_number, S_IRUGO, 220 amdgpu_device_get_serial_number, NULL); 221 222 /** 223 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 224 * 225 * @dev: drm_device pointer 226 * 227 * Returns true if the device is a dGPU with ATPX power control, 228 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
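 *
 * Illustrative sketch (not part of the original source): callers that want
 * the whole buffer transferred regardless of BAR visibility normally use
 * amdgpu_device_vram_access(), which tries this aperture path first and
 * falls back to MM_INDEX/MM_DATA; vram_pos below is a made-up, 4-byte
 * aligned VRAM offset:
 *
 *   uint32_t tmp[4];
 *
 *   amdgpu_device_vram_access(adev, vram_pos, tmp, sizeof(tmp), false);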
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
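 *
 * Minimal usage sketch (illustrative only): the RREG32()/RREG32_NO_KIQ()
 * macros expand to calls of this form:
 *
 *   u32 val = amdgpu_device_rreg(adev, reg, 0);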
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
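 *
 * Minimal usage sketch (illustrative only): the WREG32()/WREG32_NO_KIQ()
 * macros expand to calls of this form:
 *
 *   amdgpu_device_wreg(adev, reg, val, 0);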
529 */ 530 void amdgpu_device_wreg(struct amdgpu_device *adev, 531 uint32_t reg, uint32_t v, 532 uint32_t acc_flags) 533 { 534 if (amdgpu_device_skip_hw_access(adev)) 535 return; 536 537 if ((reg * 4) < adev->rmmio_size) { 538 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 539 amdgpu_sriov_runtime(adev) && 540 down_read_trylock(&adev->reset_sem)) { 541 amdgpu_kiq_wreg(adev, reg, v); 542 up_read(&adev->reset_sem); 543 } else { 544 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 545 } 546 } else { 547 adev->pcie_wreg(adev, reg * 4, v); 548 } 549 550 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 551 } 552 553 /* 554 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 555 * 556 * this function is invoked only the debugfs register access 557 * */ 558 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 559 uint32_t reg, uint32_t v) 560 { 561 if (amdgpu_device_skip_hw_access(adev)) 562 return; 563 564 if (amdgpu_sriov_fullaccess(adev) && 565 adev->gfx.rlc.funcs && 566 adev->gfx.rlc.funcs->is_rlcg_access_range) { 567 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 568 return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0); 569 } else { 570 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 571 } 572 } 573 574 /** 575 * amdgpu_mm_rdoorbell - read a doorbell dword 576 * 577 * @adev: amdgpu_device pointer 578 * @index: doorbell index 579 * 580 * Returns the value in the doorbell aperture at the 581 * requested doorbell index (CIK). 582 */ 583 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 584 { 585 if (amdgpu_device_skip_hw_access(adev)) 586 return 0; 587 588 if (index < adev->doorbell.num_doorbells) { 589 return readl(adev->doorbell.ptr + index); 590 } else { 591 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 592 return 0; 593 } 594 } 595 596 /** 597 * amdgpu_mm_wdoorbell - write a doorbell dword 598 * 599 * @adev: amdgpu_device pointer 600 * @index: doorbell index 601 * @v: value to write 602 * 603 * Writes @v to the doorbell aperture at the 604 * requested doorbell index (CIK). 605 */ 606 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 607 { 608 if (amdgpu_device_skip_hw_access(adev)) 609 return; 610 611 if (index < adev->doorbell.num_doorbells) { 612 writel(v, adev->doorbell.ptr + index); 613 } else { 614 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 615 } 616 } 617 618 /** 619 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 620 * 621 * @adev: amdgpu_device pointer 622 * @index: doorbell index 623 * 624 * Returns the value in the doorbell aperture at the 625 * requested doorbell index (VEGA10+). 626 */ 627 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 628 { 629 if (amdgpu_device_skip_hw_access(adev)) 630 return 0; 631 632 if (index < adev->doorbell.num_doorbells) { 633 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 634 } else { 635 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 636 return 0; 637 } 638 } 639 640 /** 641 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 642 * 643 * @adev: amdgpu_device pointer 644 * @index: doorbell index 645 * @v: value to write 646 * 647 * Writes @v to the doorbell aperture at the 648 * requested doorbell index (VEGA10+). 
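 *
 * Illustrative sketch (not from the original source): ring code typically
 * publishes its write pointer through the WDOORBELL64() wrapper, which
 * ends up here, e.g.
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);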
649 */ 650 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 651 { 652 if (amdgpu_device_skip_hw_access(adev)) 653 return; 654 655 if (index < adev->doorbell.num_doorbells) { 656 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 657 } else { 658 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 659 } 660 } 661 662 /** 663 * amdgpu_device_indirect_rreg - read an indirect register 664 * 665 * @adev: amdgpu_device pointer 666 * @pcie_index: mmio register offset 667 * @pcie_data: mmio register offset 668 * @reg_addr: indirect register address to read from 669 * 670 * Returns the value of indirect register @reg_addr 671 */ 672 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 673 u32 pcie_index, u32 pcie_data, 674 u32 reg_addr) 675 { 676 unsigned long flags; 677 u32 r; 678 void __iomem *pcie_index_offset; 679 void __iomem *pcie_data_offset; 680 681 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 682 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 683 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 684 685 writel(reg_addr, pcie_index_offset); 686 readl(pcie_index_offset); 687 r = readl(pcie_data_offset); 688 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 689 690 return r; 691 } 692 693 /** 694 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 695 * 696 * @adev: amdgpu_device pointer 697 * @pcie_index: mmio register offset 698 * @pcie_data: mmio register offset 699 * @reg_addr: indirect register address to read from 700 * 701 * Returns the value of indirect register @reg_addr 702 */ 703 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 704 u32 pcie_index, u32 pcie_data, 705 u32 reg_addr) 706 { 707 unsigned long flags; 708 u64 r; 709 void __iomem *pcie_index_offset; 710 void __iomem *pcie_data_offset; 711 712 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 713 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 714 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 715 716 /* read low 32 bits */ 717 writel(reg_addr, pcie_index_offset); 718 readl(pcie_index_offset); 719 r = readl(pcie_data_offset); 720 /* read high 32 bits */ 721 writel(reg_addr + 4, pcie_index_offset); 722 readl(pcie_index_offset); 723 r |= ((u64)readl(pcie_data_offset) << 32); 724 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 725 726 return r; 727 } 728 729 /** 730 * amdgpu_device_indirect_wreg - write an indirect register address 731 * 732 * @adev: amdgpu_device pointer 733 * @pcie_index: mmio register offset 734 * @pcie_data: mmio register offset 735 * @reg_addr: indirect register offset 736 * @reg_data: indirect register data 737 * 738 */ 739 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 740 u32 pcie_index, u32 pcie_data, 741 u32 reg_addr, u32 reg_data) 742 { 743 unsigned long flags; 744 void __iomem *pcie_index_offset; 745 void __iomem *pcie_data_offset; 746 747 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 748 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 749 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 750 751 writel(reg_addr, pcie_index_offset); 752 readl(pcie_index_offset); 753 writel(reg_data, pcie_data_offset); 754 readl(pcie_data_offset); 755 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 756 } 757 758 /** 759 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 760 * 761 * @adev: amdgpu_device pointer 762 * @pcie_index: mmio register offset 763 * @pcie_data: mmio register 
offset 764 * @reg_addr: indirect register offset 765 * @reg_data: indirect register data 766 * 767 */ 768 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 769 u32 pcie_index, u32 pcie_data, 770 u32 reg_addr, u64 reg_data) 771 { 772 unsigned long flags; 773 void __iomem *pcie_index_offset; 774 void __iomem *pcie_data_offset; 775 776 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 777 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 778 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 779 780 /* write low 32 bits */ 781 writel(reg_addr, pcie_index_offset); 782 readl(pcie_index_offset); 783 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 784 readl(pcie_data_offset); 785 /* write high 32 bits */ 786 writel(reg_addr + 4, pcie_index_offset); 787 readl(pcie_index_offset); 788 writel((u32)(reg_data >> 32), pcie_data_offset); 789 readl(pcie_data_offset); 790 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 791 } 792 793 /** 794 * amdgpu_invalid_rreg - dummy reg read function 795 * 796 * @adev: amdgpu_device pointer 797 * @reg: offset of register 798 * 799 * Dummy register read function. Used for register blocks 800 * that certain asics don't have (all asics). 801 * Returns the value in the register. 802 */ 803 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 804 { 805 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 806 BUG(); 807 return 0; 808 } 809 810 /** 811 * amdgpu_invalid_wreg - dummy reg write function 812 * 813 * @adev: amdgpu_device pointer 814 * @reg: offset of register 815 * @v: value to write to the register 816 * 817 * Dummy register read function. Used for register blocks 818 * that certain asics don't have (all asics). 819 */ 820 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 821 { 822 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 823 reg, v); 824 BUG(); 825 } 826 827 /** 828 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 829 * 830 * @adev: amdgpu_device pointer 831 * @reg: offset of register 832 * 833 * Dummy register read function. Used for register blocks 834 * that certain asics don't have (all asics). 835 * Returns the value in the register. 836 */ 837 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 838 { 839 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 840 BUG(); 841 return 0; 842 } 843 844 /** 845 * amdgpu_invalid_wreg64 - dummy reg write function 846 * 847 * @adev: amdgpu_device pointer 848 * @reg: offset of register 849 * @v: value to write to the register 850 * 851 * Dummy register read function. Used for register blocks 852 * that certain asics don't have (all asics). 853 */ 854 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 855 { 856 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 857 reg, v); 858 BUG(); 859 } 860 861 /** 862 * amdgpu_block_invalid_rreg - dummy reg read function 863 * 864 * @adev: amdgpu_device pointer 865 * @block: offset of instance 866 * @reg: offset of register 867 * 868 * Dummy register read function. Used for register blocks 869 * that certain asics don't have (all asics). 870 * Returns the value in the register. 
871 */ 872 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 873 uint32_t block, uint32_t reg) 874 { 875 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 876 reg, block); 877 BUG(); 878 return 0; 879 } 880 881 /** 882 * amdgpu_block_invalid_wreg - dummy reg write function 883 * 884 * @adev: amdgpu_device pointer 885 * @block: offset of instance 886 * @reg: offset of register 887 * @v: value to write to the register 888 * 889 * Dummy register read function. Used for register blocks 890 * that certain asics don't have (all asics). 891 */ 892 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 893 uint32_t block, 894 uint32_t reg, uint32_t v) 895 { 896 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 897 reg, block, v); 898 BUG(); 899 } 900 901 /** 902 * amdgpu_device_asic_init - Wrapper for atom asic_init 903 * 904 * @adev: amdgpu_device pointer 905 * 906 * Does any asic specific work and then calls atom asic init. 907 */ 908 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 909 { 910 amdgpu_asic_pre_asic_init(adev); 911 912 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 913 } 914 915 /** 916 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 917 * 918 * @adev: amdgpu_device pointer 919 * 920 * Allocates a scratch page of VRAM for use by various things in the 921 * driver. 922 */ 923 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 924 { 925 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 926 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 927 &adev->vram_scratch.robj, 928 &adev->vram_scratch.gpu_addr, 929 (void **)&adev->vram_scratch.ptr); 930 } 931 932 /** 933 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 934 * 935 * @adev: amdgpu_device pointer 936 * 937 * Frees the VRAM scratch page. 938 */ 939 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 940 { 941 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 942 } 943 944 /** 945 * amdgpu_device_program_register_sequence - program an array of registers. 946 * 947 * @adev: amdgpu_device pointer 948 * @registers: pointer to the register array 949 * @array_size: size of the register array 950 * 951 * Programs an array or registers with and and or masks. 952 * This is a helper for setting golden registers. 953 */ 954 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 955 const u32 *registers, 956 const u32 array_size) 957 { 958 u32 tmp, reg, and_mask, or_mask; 959 int i; 960 961 if (array_size % 3) 962 return; 963 964 for (i = 0; i < array_size; i +=3) { 965 reg = registers[i + 0]; 966 and_mask = registers[i + 1]; 967 or_mask = registers[i + 2]; 968 969 if (and_mask == 0xffffffff) { 970 tmp = or_mask; 971 } else { 972 tmp = RREG32(reg); 973 tmp &= ~and_mask; 974 if (adev->family >= AMDGPU_FAMILY_AI) 975 tmp |= (or_mask & and_mask); 976 else 977 tmp |= or_mask; 978 } 979 WREG32(reg, tmp); 980 } 981 } 982 983 /** 984 * amdgpu_device_pci_config_reset - reset the GPU 985 * 986 * @adev: amdgpu_device pointer 987 * 988 * Resets the GPU using the pci config reset sequence. 989 * Only applicable to asics prior to vega10. 
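 *
 * Illustrative example for amdgpu_device_program_register_sequence() above
 * (the register names and values here are made up): the first triple fully
 * replaces the register because its and_mask is 0xffffffff, while the
 * second read-modify-writes only the low nibble:
 *
 *   static const u32 golden_settings[] = {
 *           mmREG_A, 0xffffffff, 0x00000001,
 *           mmREG_B, 0x0000000f, 0x00000002,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings,
 *                                           ARRAY_SIZE(golden_settings));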
990 */ 991 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 992 { 993 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 994 } 995 996 /** 997 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 998 * 999 * @adev: amdgpu_device pointer 1000 * 1001 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1002 */ 1003 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1004 { 1005 STUB(); 1006 return -ENOSYS; 1007 #ifdef notyet 1008 return pci_reset_function(adev->pdev); 1009 #endif 1010 } 1011 1012 /* 1013 * GPU doorbell aperture helpers function. 1014 */ 1015 /** 1016 * amdgpu_device_doorbell_init - Init doorbell driver information. 1017 * 1018 * @adev: amdgpu_device pointer 1019 * 1020 * Init doorbell driver information (CIK) 1021 * Returns 0 on success, error on failure. 1022 */ 1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1024 { 1025 1026 /* No doorbell on SI hardware generation */ 1027 if (adev->asic_type < CHIP_BONAIRE) { 1028 adev->doorbell.base = 0; 1029 adev->doorbell.size = 0; 1030 adev->doorbell.num_doorbells = 0; 1031 adev->doorbell.ptr = NULL; 1032 return 0; 1033 } 1034 1035 #ifdef __linux__ 1036 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1037 return -EINVAL; 1038 #endif 1039 1040 amdgpu_asic_init_doorbell_index(adev); 1041 1042 /* doorbell bar mapping */ 1043 #ifdef __linux__ 1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1046 #endif 1047 1048 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1049 adev->doorbell_index.max_assignment+1); 1050 if (adev->doorbell.num_doorbells == 0) 1051 return -EINVAL; 1052 1053 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1054 * paging queue doorbell use the second page. The 1055 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1056 * doorbells are in the first page. So with paging queue enabled, 1057 * the max num_doorbells should + 1 page (0x400 in dword) 1058 */ 1059 if (adev->asic_type >= CHIP_VEGA10) 1060 adev->doorbell.num_doorbells += 0x400; 1061 1062 #ifdef __linux__ 1063 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1064 adev->doorbell.num_doorbells * 1065 sizeof(u32)); 1066 if (adev->doorbell.ptr == NULL) 1067 return -ENOMEM; 1068 #endif 1069 1070 return 0; 1071 } 1072 1073 /** 1074 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1075 * 1076 * @adev: amdgpu_device pointer 1077 * 1078 * Tear down doorbell driver information (CIK) 1079 */ 1080 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1081 { 1082 #ifdef __linux__ 1083 iounmap(adev->doorbell.ptr); 1084 #else 1085 if (adev->doorbell.size > 0) 1086 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1087 adev->doorbell.size); 1088 #endif 1089 adev->doorbell.ptr = NULL; 1090 } 1091 1092 1093 1094 /* 1095 * amdgpu_device_wb_*() 1096 * Writeback is the method by which the GPU updates special pages in memory 1097 * with the status of certain GPU events (fences, ring pointers,etc.). 1098 */ 1099 1100 /** 1101 * amdgpu_device_wb_fini - Disable Writeback and free memory 1102 * 1103 * @adev: amdgpu_device pointer 1104 * 1105 * Disables Writeback and frees the Writeback memory (all asics). 1106 * Used at driver shutdown. 
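 *
 * Illustrative sketch of the writeback API described above (not from the
 * original source): a user allocates a slot, points the hardware at the
 * GPU address of that slot and reads the CPU copy, then releases it:
 *
 *   u32 wb_index;
 *
 *   r = amdgpu_device_wb_get(adev, &wb_index);
 *   gpu_addr = adev->wb.gpu_addr + (wb_index * 4);
 *   cpu_val  = adev->wb.wb[wb_index];
 *   amdgpu_device_wb_free(adev, wb_index);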
1107 */ 1108 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1109 { 1110 if (adev->wb.wb_obj) { 1111 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1112 &adev->wb.gpu_addr, 1113 (void **)&adev->wb.wb); 1114 adev->wb.wb_obj = NULL; 1115 } 1116 } 1117 1118 /** 1119 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1120 * 1121 * @adev: amdgpu_device pointer 1122 * 1123 * Initializes writeback and allocates writeback memory (all asics). 1124 * Used at driver startup. 1125 * Returns 0 on success or an -error on failure. 1126 */ 1127 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1128 { 1129 int r; 1130 1131 if (adev->wb.wb_obj == NULL) { 1132 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1133 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1134 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1135 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1136 (void **)&adev->wb.wb); 1137 if (r) { 1138 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1139 return r; 1140 } 1141 1142 adev->wb.num_wb = AMDGPU_MAX_WB; 1143 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1144 1145 /* clear wb memory */ 1146 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1147 } 1148 1149 return 0; 1150 } 1151 1152 /** 1153 * amdgpu_device_wb_get - Allocate a wb entry 1154 * 1155 * @adev: amdgpu_device pointer 1156 * @wb: wb index 1157 * 1158 * Allocate a wb slot for use by the driver (all asics). 1159 * Returns 0 on success or -EINVAL on failure. 1160 */ 1161 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1162 { 1163 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1164 1165 if (offset < adev->wb.num_wb) { 1166 __set_bit(offset, adev->wb.used); 1167 *wb = offset << 3; /* convert to dw offset */ 1168 return 0; 1169 } else { 1170 return -EINVAL; 1171 } 1172 } 1173 1174 /** 1175 * amdgpu_device_wb_free - Free a wb entry 1176 * 1177 * @adev: amdgpu_device pointer 1178 * @wb: wb index 1179 * 1180 * Free a wb slot allocated for use by the driver (all asics) 1181 */ 1182 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1183 { 1184 wb >>= 3; 1185 if (wb < adev->wb.num_wb) 1186 __clear_bit(wb, adev->wb.used); 1187 } 1188 1189 /** 1190 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1191 * 1192 * @adev: amdgpu_device pointer 1193 * 1194 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1195 * to fail, but if any of the BARs is not accessible after the size we abort 1196 * driver loading by returning -ENODEV. 
1197 */ 1198 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1199 { 1200 #ifdef __linux__ 1201 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1202 struct pci_bus *root; 1203 struct resource *res; 1204 unsigned i; 1205 u16 cmd; 1206 int r; 1207 1208 /* Bypass for VF */ 1209 if (amdgpu_sriov_vf(adev)) 1210 return 0; 1211 1212 /* skip if the bios has already enabled large BAR */ 1213 if (adev->gmc.real_vram_size && 1214 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1215 return 0; 1216 1217 /* Check if the root BUS has 64bit memory resources */ 1218 root = adev->pdev->bus; 1219 while (root->parent) 1220 root = root->parent; 1221 1222 pci_bus_for_each_resource(root, res, i) { 1223 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1224 res->start > 0x100000000ull) 1225 break; 1226 } 1227 1228 /* Trying to resize is pointless without a root hub window above 4GB */ 1229 if (!res) 1230 return 0; 1231 1232 /* Limit the BAR size to what is available */ 1233 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1234 rbar_size); 1235 1236 /* Disable memory decoding while we change the BAR addresses and size */ 1237 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1238 pci_write_config_word(adev->pdev, PCI_COMMAND, 1239 cmd & ~PCI_COMMAND_MEMORY); 1240 1241 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1242 amdgpu_device_doorbell_fini(adev); 1243 if (adev->asic_type >= CHIP_BONAIRE) 1244 pci_release_resource(adev->pdev, 2); 1245 1246 pci_release_resource(adev->pdev, 0); 1247 1248 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1249 if (r == -ENOSPC) 1250 DRM_INFO("Not enough PCI address space for a large BAR."); 1251 else if (r && r != -ENOTSUPP) 1252 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1253 1254 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1255 1256 /* When the doorbell or fb BAR isn't available we have no chance of 1257 * using the device. 1258 */ 1259 r = amdgpu_device_doorbell_init(adev); 1260 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1261 return -ENODEV; 1262 1263 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1264 #endif /* __linux__ */ 1265 1266 return 0; 1267 } 1268 1269 /* 1270 * GPU helpers function. 1271 */ 1272 /** 1273 * amdgpu_device_need_post - check if the hw need post or not 1274 * 1275 * @adev: amdgpu_device pointer 1276 * 1277 * Check if the asic has been initialized (all asics) at driver startup 1278 * or post is needed if hw reset is performed. 1279 * Returns true if need or false if not. 
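 *
 * Illustrative sketch of how this check is typically consumed during
 * device init (not from the original source):
 *
 *   if (amdgpu_device_need_post(adev))
 *           amdgpu_device_asic_init(adev);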
1280 */ 1281 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1282 { 1283 uint32_t reg; 1284 1285 if (amdgpu_sriov_vf(adev)) 1286 return false; 1287 1288 if (amdgpu_passthrough(adev)) { 1289 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1290 * some old smc fw still need driver do vPost otherwise gpu hang, while 1291 * those smc fw version above 22.15 doesn't have this flaw, so we force 1292 * vpost executed for smc version below 22.15 1293 */ 1294 if (adev->asic_type == CHIP_FIJI) { 1295 int err; 1296 uint32_t fw_ver; 1297 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1298 /* force vPost if error occured */ 1299 if (err) 1300 return true; 1301 1302 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1303 if (fw_ver < 0x00160e00) 1304 return true; 1305 } 1306 } 1307 1308 /* Don't post if we need to reset whole hive on init */ 1309 if (adev->gmc.xgmi.pending_reset) 1310 return false; 1311 1312 if (adev->has_hw_reset) { 1313 adev->has_hw_reset = false; 1314 return true; 1315 } 1316 1317 /* bios scratch used on CIK+ */ 1318 if (adev->asic_type >= CHIP_BONAIRE) 1319 return amdgpu_atombios_scratch_need_asic_init(adev); 1320 1321 /* check MEM_SIZE for older asics */ 1322 reg = amdgpu_asic_get_config_memsize(adev); 1323 1324 if ((reg != 0) && (reg != 0xffffffff)) 1325 return false; 1326 1327 return true; 1328 } 1329 1330 /* if we get transitioned to only one device, take VGA back */ 1331 /** 1332 * amdgpu_device_vga_set_decode - enable/disable vga decode 1333 * 1334 * @pdev: PCI device pointer 1335 * @state: enable/disable vga decode 1336 * 1337 * Enable/disable vga decode (all asics). 1338 * Returns VGA resource flags. 1339 */ 1340 #ifdef notyet 1341 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1342 bool state) 1343 { 1344 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1345 amdgpu_asic_set_vga_state(adev, state); 1346 if (state) 1347 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1348 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1349 else 1350 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1351 } 1352 #endif 1353 1354 /** 1355 * amdgpu_device_check_block_size - validate the vm block size 1356 * 1357 * @adev: amdgpu_device pointer 1358 * 1359 * Validates the vm block size specified via module parameter. 1360 * The vm block size defines number of bits in page table versus page directory, 1361 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1362 * page table and the remaining bits are in the page directory. 1363 */ 1364 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1365 { 1366 /* defines number of bits in page table versus page directory, 1367 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1368 * page table and the remaining bits are in the page directory */ 1369 if (amdgpu_vm_block_size == -1) 1370 return; 1371 1372 if (amdgpu_vm_block_size < 9) { 1373 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1374 amdgpu_vm_block_size); 1375 amdgpu_vm_block_size = -1; 1376 } 1377 } 1378 1379 /** 1380 * amdgpu_device_check_vm_size - validate the vm size 1381 * 1382 * @adev: amdgpu_device pointer 1383 * 1384 * Validates the vm size in GB specified via module parameter. 1385 * The VM size is the size of the GPU virtual memory space in GB. 
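 *
 * Worked example for the block-size check above (illustrative): with 4KB
 * pages there are 12 offset bits, so amdgpu_vm_block_size=9 gives 2^9 = 512
 * page-table entries per block and leaves the remaining virtual-address
 * bits for the page directory; values below 9 are rejected by
 * amdgpu_device_check_block_size().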
1386 */ 1387 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1388 { 1389 /* no need to check the default value */ 1390 if (amdgpu_vm_size == -1) 1391 return; 1392 1393 if (amdgpu_vm_size < 1) { 1394 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1395 amdgpu_vm_size); 1396 amdgpu_vm_size = -1; 1397 } 1398 } 1399 1400 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1401 { 1402 #ifdef __linux__ 1403 struct sysinfo si; 1404 #endif 1405 bool is_os_64 = (sizeof(void *) == 8); 1406 uint64_t total_memory; 1407 uint64_t dram_size_seven_GB = 0x1B8000000; 1408 uint64_t dram_size_three_GB = 0xB8000000; 1409 1410 if (amdgpu_smu_memory_pool_size == 0) 1411 return; 1412 1413 if (!is_os_64) { 1414 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1415 goto def_value; 1416 } 1417 #ifdef __linux__ 1418 si_meminfo(&si); 1419 total_memory = (uint64_t)si.totalram * si.mem_unit; 1420 #else 1421 total_memory = ptoa(physmem); 1422 #endif 1423 1424 if ((amdgpu_smu_memory_pool_size == 1) || 1425 (amdgpu_smu_memory_pool_size == 2)) { 1426 if (total_memory < dram_size_three_GB) 1427 goto def_value1; 1428 } else if ((amdgpu_smu_memory_pool_size == 4) || 1429 (amdgpu_smu_memory_pool_size == 8)) { 1430 if (total_memory < dram_size_seven_GB) 1431 goto def_value1; 1432 } else { 1433 DRM_WARN("Smu memory pool size not supported\n"); 1434 goto def_value; 1435 } 1436 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1437 1438 return; 1439 1440 def_value1: 1441 DRM_WARN("No enough system memory\n"); 1442 def_value: 1443 adev->pm.smu_prv_buffer_size = 0; 1444 } 1445 1446 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1447 { 1448 if (!(adev->flags & AMD_IS_APU) || 1449 adev->asic_type < CHIP_RAVEN) 1450 return 0; 1451 1452 switch (adev->asic_type) { 1453 case CHIP_RAVEN: 1454 if (adev->pdev->device == 0x15dd) 1455 adev->apu_flags |= AMD_APU_IS_RAVEN; 1456 if (adev->pdev->device == 0x15d8) 1457 adev->apu_flags |= AMD_APU_IS_PICASSO; 1458 break; 1459 case CHIP_RENOIR: 1460 if ((adev->pdev->device == 0x1636) || 1461 (adev->pdev->device == 0x164c)) 1462 adev->apu_flags |= AMD_APU_IS_RENOIR; 1463 else 1464 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1465 break; 1466 case CHIP_VANGOGH: 1467 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1468 break; 1469 case CHIP_YELLOW_CARP: 1470 break; 1471 case CHIP_CYAN_SKILLFISH: 1472 if (adev->pdev->device == 0x13FE) 1473 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1474 break; 1475 default: 1476 return -EINVAL; 1477 } 1478 1479 return 0; 1480 } 1481 1482 /** 1483 * amdgpu_device_check_arguments - validate module params 1484 * 1485 * @adev: amdgpu_device pointer 1486 * 1487 * Validates certain module parameters and updates 1488 * the associated values used by the driver (all asics). 
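 *
 * For example (illustrative): amdgpu_sched_jobs=5 is rounded up to 8 and
 * amdgpu_sched_jobs=2 is raised to the minimum of 4 by the checks below.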
1489 */ 1490 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1491 { 1492 if (amdgpu_sched_jobs < 4) { 1493 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1494 amdgpu_sched_jobs); 1495 amdgpu_sched_jobs = 4; 1496 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1497 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1498 amdgpu_sched_jobs); 1499 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1500 } 1501 1502 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1503 /* gart size must be greater or equal to 32M */ 1504 dev_warn(adev->dev, "gart size (%d) too small\n", 1505 amdgpu_gart_size); 1506 amdgpu_gart_size = -1; 1507 } 1508 1509 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1510 /* gtt size must be greater or equal to 32M */ 1511 dev_warn(adev->dev, "gtt size (%d) too small\n", 1512 amdgpu_gtt_size); 1513 amdgpu_gtt_size = -1; 1514 } 1515 1516 /* valid range is between 4 and 9 inclusive */ 1517 if (amdgpu_vm_fragment_size != -1 && 1518 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1519 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1520 amdgpu_vm_fragment_size = -1; 1521 } 1522 1523 if (amdgpu_sched_hw_submission < 2) { 1524 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1525 amdgpu_sched_hw_submission); 1526 amdgpu_sched_hw_submission = 2; 1527 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1528 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1529 amdgpu_sched_hw_submission); 1530 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1531 } 1532 1533 amdgpu_device_check_smu_prv_buffer_size(adev); 1534 1535 amdgpu_device_check_vm_size(adev); 1536 1537 amdgpu_device_check_block_size(adev); 1538 1539 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1540 1541 amdgpu_gmc_tmz_set(adev); 1542 1543 amdgpu_gmc_noretry_set(adev); 1544 1545 return 0; 1546 } 1547 1548 #ifdef __linux__ 1549 /** 1550 * amdgpu_switcheroo_set_state - set switcheroo state 1551 * 1552 * @pdev: pci dev pointer 1553 * @state: vga_switcheroo state 1554 * 1555 * Callback for the switcheroo driver. Suspends or resumes the 1556 * the asics before or after it is powered up using ACPI methods. 
1557 */ 1558 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1559 enum vga_switcheroo_state state) 1560 { 1561 struct drm_device *dev = pci_get_drvdata(pdev); 1562 int r; 1563 1564 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1565 return; 1566 1567 if (state == VGA_SWITCHEROO_ON) { 1568 pr_info("switched on\n"); 1569 /* don't suspend or resume card normally */ 1570 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1571 1572 pci_set_power_state(pdev, PCI_D0); 1573 amdgpu_device_load_pci_state(pdev); 1574 r = pci_enable_device(pdev); 1575 if (r) 1576 DRM_WARN("pci_enable_device failed (%d)\n", r); 1577 amdgpu_device_resume(dev, true); 1578 1579 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1580 } else { 1581 pr_info("switched off\n"); 1582 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1583 amdgpu_device_suspend(dev, true); 1584 amdgpu_device_cache_pci_state(pdev); 1585 /* Shut down the device */ 1586 pci_disable_device(pdev); 1587 pci_set_power_state(pdev, PCI_D3cold); 1588 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1589 } 1590 } 1591 1592 /** 1593 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1594 * 1595 * @pdev: pci dev pointer 1596 * 1597 * Callback for the switcheroo driver. Check of the switcheroo 1598 * state can be changed. 1599 * Returns true if the state can be changed, false if not. 1600 */ 1601 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1602 { 1603 struct drm_device *dev = pci_get_drvdata(pdev); 1604 1605 /* 1606 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1607 * locking inversion with the driver load path. And the access here is 1608 * completely racy anyway. So don't bother with locking for now. 1609 */ 1610 return atomic_read(&dev->open_count) == 0; 1611 } 1612 #endif /* __linux__ */ 1613 1614 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1615 #ifdef notyet 1616 .set_gpu_state = amdgpu_switcheroo_set_state, 1617 .reprobe = NULL, 1618 .can_switch = amdgpu_switcheroo_can_switch, 1619 #endif 1620 }; 1621 1622 /** 1623 * amdgpu_device_ip_set_clockgating_state - set the CG state 1624 * 1625 * @dev: amdgpu_device pointer 1626 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1627 * @state: clockgating state (gate or ungate) 1628 * 1629 * Sets the requested clockgating state for all instances of 1630 * the hardware IP specified. 1631 * Returns the error code from the last instance. 1632 */ 1633 int amdgpu_device_ip_set_clockgating_state(void *dev, 1634 enum amd_ip_block_type block_type, 1635 enum amd_clockgating_state state) 1636 { 1637 struct amdgpu_device *adev = dev; 1638 int i, r = 0; 1639 1640 for (i = 0; i < adev->num_ip_blocks; i++) { 1641 if (!adev->ip_blocks[i].status.valid) 1642 continue; 1643 if (adev->ip_blocks[i].version->type != block_type) 1644 continue; 1645 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1646 continue; 1647 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1648 (void *)adev, state); 1649 if (r) 1650 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1651 adev->ip_blocks[i].version->funcs->name, r); 1652 } 1653 return r; 1654 } 1655 1656 /** 1657 * amdgpu_device_ip_set_powergating_state - set the PG state 1658 * 1659 * @dev: amdgpu_device pointer 1660 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1661 * @state: powergating state (gate or ungate) 1662 * 1663 * Sets the requested powergating state for all instances of 1664 * the hardware IP specified. 1665 * Returns the error code from the last instance. 1666 */ 1667 int amdgpu_device_ip_set_powergating_state(void *dev, 1668 enum amd_ip_block_type block_type, 1669 enum amd_powergating_state state) 1670 { 1671 struct amdgpu_device *adev = dev; 1672 int i, r = 0; 1673 1674 for (i = 0; i < adev->num_ip_blocks; i++) { 1675 if (!adev->ip_blocks[i].status.valid) 1676 continue; 1677 if (adev->ip_blocks[i].version->type != block_type) 1678 continue; 1679 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1680 continue; 1681 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1682 (void *)adev, state); 1683 if (r) 1684 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1685 adev->ip_blocks[i].version->funcs->name, r); 1686 } 1687 return r; 1688 } 1689 1690 /** 1691 * amdgpu_device_ip_get_clockgating_state - get the CG state 1692 * 1693 * @adev: amdgpu_device pointer 1694 * @flags: clockgating feature flags 1695 * 1696 * Walks the list of IPs on the device and updates the clockgating 1697 * flags for each IP. 1698 * Updates @flags with the feature flags for each hardware IP where 1699 * clockgating is enabled. 1700 */ 1701 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1702 u32 *flags) 1703 { 1704 int i; 1705 1706 for (i = 0; i < adev->num_ip_blocks; i++) { 1707 if (!adev->ip_blocks[i].status.valid) 1708 continue; 1709 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1710 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1711 } 1712 } 1713 1714 /** 1715 * amdgpu_device_ip_wait_for_idle - wait for idle 1716 * 1717 * @adev: amdgpu_device pointer 1718 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1719 * 1720 * Waits for the request hardware IP to be idle. 1721 * Returns 0 for success or a negative error code on failure. 1722 */ 1723 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1724 enum amd_ip_block_type block_type) 1725 { 1726 int i, r; 1727 1728 for (i = 0; i < adev->num_ip_blocks; i++) { 1729 if (!adev->ip_blocks[i].status.valid) 1730 continue; 1731 if (adev->ip_blocks[i].version->type == block_type) { 1732 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1733 if (r) 1734 return r; 1735 break; 1736 } 1737 } 1738 return 0; 1739 1740 } 1741 1742 /** 1743 * amdgpu_device_ip_is_idle - is the hardware IP idle 1744 * 1745 * @adev: amdgpu_device pointer 1746 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1747 * 1748 * Check if the hardware IP is idle or not. 1749 * Returns true if it the IP is idle, false if not. 1750 */ 1751 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1752 enum amd_ip_block_type block_type) 1753 { 1754 int i; 1755 1756 for (i = 0; i < adev->num_ip_blocks; i++) { 1757 if (!adev->ip_blocks[i].status.valid) 1758 continue; 1759 if (adev->ip_blocks[i].version->type == block_type) 1760 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1761 } 1762 return true; 1763 1764 } 1765 1766 /** 1767 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1768 * 1769 * @adev: amdgpu_device pointer 1770 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1771 * 1772 * Returns a pointer to the hardware IP block structure 1773 * if it exists for the asic, otherwise NULL. 
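 *
 * Illustrative usage sketch (not from the original source):
 *
 *   struct amdgpu_ip_block *ip_block;
 *
 *   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->version->major >= 9)
 *           take the gfx9-and-newer path;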
1774 */ 1775 struct amdgpu_ip_block * 1776 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1777 enum amd_ip_block_type type) 1778 { 1779 int i; 1780 1781 for (i = 0; i < adev->num_ip_blocks; i++) 1782 if (adev->ip_blocks[i].version->type == type) 1783 return &adev->ip_blocks[i]; 1784 1785 return NULL; 1786 } 1787 1788 /** 1789 * amdgpu_device_ip_block_version_cmp 1790 * 1791 * @adev: amdgpu_device pointer 1792 * @type: enum amd_ip_block_type 1793 * @major: major version 1794 * @minor: minor version 1795 * 1796 * return 0 if equal or greater 1797 * return 1 if smaller or the ip_block doesn't exist 1798 */ 1799 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1800 enum amd_ip_block_type type, 1801 u32 major, u32 minor) 1802 { 1803 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1804 1805 if (ip_block && ((ip_block->version->major > major) || 1806 ((ip_block->version->major == major) && 1807 (ip_block->version->minor >= minor)))) 1808 return 0; 1809 1810 return 1; 1811 } 1812 1813 /** 1814 * amdgpu_device_ip_block_add 1815 * 1816 * @adev: amdgpu_device pointer 1817 * @ip_block_version: pointer to the IP to add 1818 * 1819 * Adds the IP block driver information to the collection of IPs 1820 * on the asic. 1821 */ 1822 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1823 const struct amdgpu_ip_block_version *ip_block_version) 1824 { 1825 if (!ip_block_version) 1826 return -EINVAL; 1827 1828 switch (ip_block_version->type) { 1829 case AMD_IP_BLOCK_TYPE_VCN: 1830 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1831 return 0; 1832 break; 1833 case AMD_IP_BLOCK_TYPE_JPEG: 1834 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1835 return 0; 1836 break; 1837 default: 1838 break; 1839 } 1840 1841 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1842 ip_block_version->funcs->name); 1843 1844 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1845 1846 return 0; 1847 } 1848 1849 /** 1850 * amdgpu_device_enable_virtual_display - enable virtual display feature 1851 * 1852 * @adev: amdgpu_device pointer 1853 * 1854 * Enabled the virtual display feature if the user has enabled it via 1855 * the module parameter virtual_display. This feature provides a virtual 1856 * display hardware on headless boards or in virtualized environments. 1857 * This function parses and validates the configuration string specified by 1858 * the user and configues the virtual display configuration (number of 1859 * virtual connectors, crtcs, etc.) specified. 
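 *
 * Illustrative example of the string format parsed below (the PCI address
 * is made up): virtual_display=0000:03:00.0,2 enables the feature for that
 * device with two virtual crtcs; entries are separated by ';' and the
 * keyword "all" matches any device.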
1860 */ 1861 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1862 { 1863 adev->enable_virtual_display = false; 1864 1865 #ifdef notyet 1866 if (amdgpu_virtual_display) { 1867 const char *pci_address_name = pci_name(adev->pdev); 1868 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1869 1870 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1871 pciaddstr_tmp = pciaddstr; 1872 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1873 pciaddname = strsep(&pciaddname_tmp, ","); 1874 if (!strcmp("all", pciaddname) 1875 || !strcmp(pci_address_name, pciaddname)) { 1876 long num_crtc; 1877 int res = -1; 1878 1879 adev->enable_virtual_display = true; 1880 1881 if (pciaddname_tmp) 1882 res = kstrtol(pciaddname_tmp, 10, 1883 &num_crtc); 1884 1885 if (!res) { 1886 if (num_crtc < 1) 1887 num_crtc = 1; 1888 if (num_crtc > 6) 1889 num_crtc = 6; 1890 adev->mode_info.num_crtc = num_crtc; 1891 } else { 1892 adev->mode_info.num_crtc = 1; 1893 } 1894 break; 1895 } 1896 } 1897 1898 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1899 amdgpu_virtual_display, pci_address_name, 1900 adev->enable_virtual_display, adev->mode_info.num_crtc); 1901 1902 kfree(pciaddstr); 1903 } 1904 #endif 1905 } 1906 1907 /** 1908 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1909 * 1910 * @adev: amdgpu_device pointer 1911 * 1912 * Parses the asic configuration parameters specified in the gpu info 1913 * firmware and makes them availale to the driver for use in configuring 1914 * the asic. 1915 * Returns 0 on success, -EINVAL on failure. 1916 */ 1917 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1918 { 1919 const char *chip_name; 1920 char fw_name[40]; 1921 int err; 1922 const struct gpu_info_firmware_header_v1_0 *hdr; 1923 1924 adev->firmware.gpu_info_fw = NULL; 1925 1926 if (adev->mman.discovery_bin) { 1927 amdgpu_discovery_get_gfx_info(adev); 1928 1929 /* 1930 * FIXME: The bounding box is still needed by Navi12, so 1931 * temporarily read it from gpu_info firmware. Should be droped 1932 * when DAL no longer needs it. 
1933 */ 1934 if (adev->asic_type != CHIP_NAVI12) 1935 return 0; 1936 } 1937 1938 switch (adev->asic_type) { 1939 #ifdef CONFIG_DRM_AMDGPU_SI 1940 case CHIP_VERDE: 1941 case CHIP_TAHITI: 1942 case CHIP_PITCAIRN: 1943 case CHIP_OLAND: 1944 case CHIP_HAINAN: 1945 #endif 1946 #ifdef CONFIG_DRM_AMDGPU_CIK 1947 case CHIP_BONAIRE: 1948 case CHIP_HAWAII: 1949 case CHIP_KAVERI: 1950 case CHIP_KABINI: 1951 case CHIP_MULLINS: 1952 #endif 1953 case CHIP_TOPAZ: 1954 case CHIP_TONGA: 1955 case CHIP_FIJI: 1956 case CHIP_POLARIS10: 1957 case CHIP_POLARIS11: 1958 case CHIP_POLARIS12: 1959 case CHIP_VEGAM: 1960 case CHIP_CARRIZO: 1961 case CHIP_STONEY: 1962 case CHIP_VEGA20: 1963 case CHIP_ALDEBARAN: 1964 case CHIP_SIENNA_CICHLID: 1965 case CHIP_NAVY_FLOUNDER: 1966 case CHIP_DIMGREY_CAVEFISH: 1967 case CHIP_BEIGE_GOBY: 1968 default: 1969 return 0; 1970 case CHIP_VEGA10: 1971 chip_name = "vega10"; 1972 break; 1973 case CHIP_VEGA12: 1974 chip_name = "vega12"; 1975 break; 1976 case CHIP_RAVEN: 1977 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1978 chip_name = "raven2"; 1979 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1980 chip_name = "picasso"; 1981 else 1982 chip_name = "raven"; 1983 break; 1984 case CHIP_ARCTURUS: 1985 chip_name = "arcturus"; 1986 break; 1987 case CHIP_RENOIR: 1988 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1989 chip_name = "renoir"; 1990 else 1991 chip_name = "green_sardine"; 1992 break; 1993 case CHIP_NAVI10: 1994 chip_name = "navi10"; 1995 break; 1996 case CHIP_NAVI14: 1997 chip_name = "navi14"; 1998 break; 1999 case CHIP_NAVI12: 2000 chip_name = "navi12"; 2001 break; 2002 case CHIP_VANGOGH: 2003 chip_name = "vangogh"; 2004 break; 2005 case CHIP_YELLOW_CARP: 2006 chip_name = "yellow_carp"; 2007 break; 2008 } 2009 2010 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2011 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2012 if (err) { 2013 dev_err(adev->dev, 2014 "Failed to load gpu_info firmware \"%s\"\n", 2015 fw_name); 2016 goto out; 2017 } 2018 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2019 if (err) { 2020 dev_err(adev->dev, 2021 "Failed to validate gpu_info firmware \"%s\"\n", 2022 fw_name); 2023 goto out; 2024 } 2025 2026 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2027 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2028 2029 switch (hdr->version_major) { 2030 case 1: 2031 { 2032 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2033 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2034 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2035 2036 /* 2037 * Should be droped when DAL no longer needs it. 
2038 */ 2039 if (adev->asic_type == CHIP_NAVI12) 2040 goto parse_soc_bounding_box; 2041 2042 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2043 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2044 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2045 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2046 adev->gfx.config.max_texture_channel_caches = 2047 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2048 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2049 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2050 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2051 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2052 adev->gfx.config.double_offchip_lds_buf = 2053 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2054 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2055 adev->gfx.cu_info.max_waves_per_simd = 2056 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2057 adev->gfx.cu_info.max_scratch_slots_per_cu = 2058 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2059 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2060 if (hdr->version_minor >= 1) { 2061 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2062 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2063 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2064 adev->gfx.config.num_sc_per_sh = 2065 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2066 adev->gfx.config.num_packer_per_sc = 2067 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2068 } 2069 2070 parse_soc_bounding_box: 2071 /* 2072 * The SoC bounding box info is not integrated in the discovery table, 2073 * so it always has to be parsed from the gpu info firmware when needed. 2074 */ 2075 if (hdr->version_minor == 2) { 2076 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2077 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2078 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2079 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2080 } 2081 break; 2082 } 2083 default: 2084 dev_err(adev->dev, 2085 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2086 err = -EINVAL; 2087 goto out; 2088 } 2089 out: 2090 return err; 2091 } 2092 2093 /** 2094 * amdgpu_device_ip_early_init - run early init for hardware IPs 2095 * 2096 * @adev: amdgpu_device pointer 2097 * 2098 * Early initialization pass for hardware IPs. The hardware IPs that make 2099 * up each asic are discovered and each IP's early_init callback is run. This 2100 * is the first stage in initializing the asic. 2101 * Returns 0 on success, negative error code on failure.
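 * This stage selects the ASIC family, registers the IP blocks for it, runs
 * each block's early_init callback, parses the gpu_info firmware and reads
 * the vbios once the common block is up, and applies the ip_block_mask,
 * cg_mask and pg_mask options.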
2102 */ 2103 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2104 { 2105 struct drm_device *dev = adev_to_drm(adev); 2106 struct pci_dev *parent; 2107 int i, r; 2108 2109 amdgpu_device_enable_virtual_display(adev); 2110 2111 if (amdgpu_sriov_vf(adev)) { 2112 r = amdgpu_virt_request_full_gpu(adev, true); 2113 if (r) 2114 return r; 2115 } 2116 2117 switch (adev->asic_type) { 2118 #ifdef CONFIG_DRM_AMDGPU_SI 2119 case CHIP_VERDE: 2120 case CHIP_TAHITI: 2121 case CHIP_PITCAIRN: 2122 case CHIP_OLAND: 2123 case CHIP_HAINAN: 2124 adev->family = AMDGPU_FAMILY_SI; 2125 r = si_set_ip_blocks(adev); 2126 if (r) 2127 return r; 2128 break; 2129 #endif 2130 #ifdef CONFIG_DRM_AMDGPU_CIK 2131 case CHIP_BONAIRE: 2132 case CHIP_HAWAII: 2133 case CHIP_KAVERI: 2134 case CHIP_KABINI: 2135 case CHIP_MULLINS: 2136 if (adev->flags & AMD_IS_APU) 2137 adev->family = AMDGPU_FAMILY_KV; 2138 else 2139 adev->family = AMDGPU_FAMILY_CI; 2140 2141 r = cik_set_ip_blocks(adev); 2142 if (r) 2143 return r; 2144 break; 2145 #endif 2146 case CHIP_TOPAZ: 2147 case CHIP_TONGA: 2148 case CHIP_FIJI: 2149 case CHIP_POLARIS10: 2150 case CHIP_POLARIS11: 2151 case CHIP_POLARIS12: 2152 case CHIP_VEGAM: 2153 case CHIP_CARRIZO: 2154 case CHIP_STONEY: 2155 if (adev->flags & AMD_IS_APU) 2156 adev->family = AMDGPU_FAMILY_CZ; 2157 else 2158 adev->family = AMDGPU_FAMILY_VI; 2159 2160 r = vi_set_ip_blocks(adev); 2161 if (r) 2162 return r; 2163 break; 2164 case CHIP_VEGA10: 2165 case CHIP_VEGA12: 2166 case CHIP_VEGA20: 2167 case CHIP_RAVEN: 2168 case CHIP_ARCTURUS: 2169 case CHIP_RENOIR: 2170 case CHIP_ALDEBARAN: 2171 if (adev->flags & AMD_IS_APU) 2172 adev->family = AMDGPU_FAMILY_RV; 2173 else 2174 adev->family = AMDGPU_FAMILY_AI; 2175 2176 r = soc15_set_ip_blocks(adev); 2177 if (r) 2178 return r; 2179 break; 2180 case CHIP_NAVI10: 2181 case CHIP_NAVI14: 2182 case CHIP_NAVI12: 2183 case CHIP_SIENNA_CICHLID: 2184 case CHIP_NAVY_FLOUNDER: 2185 case CHIP_DIMGREY_CAVEFISH: 2186 case CHIP_BEIGE_GOBY: 2187 case CHIP_VANGOGH: 2188 case CHIP_YELLOW_CARP: 2189 case CHIP_CYAN_SKILLFISH: 2190 if (adev->asic_type == CHIP_VANGOGH) 2191 adev->family = AMDGPU_FAMILY_VGH; 2192 else if (adev->asic_type == CHIP_YELLOW_CARP) 2193 adev->family = AMDGPU_FAMILY_YC; 2194 else 2195 adev->family = AMDGPU_FAMILY_NV; 2196 2197 r = nv_set_ip_blocks(adev); 2198 if (r) 2199 return r; 2200 break; 2201 default: 2202 /* FIXME: not supported yet */ 2203 return -EINVAL; 2204 } 2205 2206 if (amdgpu_has_atpx() && 2207 (amdgpu_is_atpx_hybrid() || 2208 amdgpu_has_atpx_dgpu_power_cntl()) && 2209 ((adev->flags & AMD_IS_APU) == 0) && 2210 !pci_is_thunderbolt_attached(dev->pdev)) 2211 adev->flags |= AMD_IS_PX; 2212 2213 if (!(adev->flags & AMD_IS_APU)) { 2214 parent = pci_upstream_bridge(adev->pdev); 2215 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2216 } 2217 2218 amdgpu_amdkfd_device_probe(adev); 2219 2220 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2221 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2222 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2223 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2224 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2225 2226 for (i = 0; i < adev->num_ip_blocks; i++) { 2227 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2228 DRM_ERROR("disabled ip block: %d <%s>\n", 2229 i, adev->ip_blocks[i].version->funcs->name); 2230 adev->ip_blocks[i].status.valid = false; 2231 } else { 2232 if (adev->ip_blocks[i].version->funcs->early_init) { 2233 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2234 if (r == -ENOENT) { 2235 adev->ip_blocks[i].status.valid = false; 2236 } else if (r) { 2237 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2238 adev->ip_blocks[i].version->funcs->name, r); 2239 return r; 2240 } else { 2241 adev->ip_blocks[i].status.valid = true; 2242 } 2243 } else { 2244 adev->ip_blocks[i].status.valid = true; 2245 } 2246 } 2247 /* get the vbios after the asic_funcs are set up */ 2248 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2249 r = amdgpu_device_parse_gpu_info_fw(adev); 2250 if (r) 2251 return r; 2252 2253 /* Read BIOS */ 2254 if (!amdgpu_get_bios(adev)) 2255 return -EINVAL; 2256 2257 r = amdgpu_atombios_init(adev); 2258 if (r) { 2259 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2260 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2261 return r; 2262 } 2263 2264 /*get pf2vf msg info at it's earliest time*/ 2265 if (amdgpu_sriov_vf(adev)) 2266 amdgpu_virt_init_data_exchange(adev); 2267 2268 } 2269 } 2270 2271 adev->cg_flags &= amdgpu_cg_mask; 2272 adev->pg_flags &= amdgpu_pg_mask; 2273 2274 return 0; 2275 } 2276 2277 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2278 { 2279 int i, r; 2280 2281 for (i = 0; i < adev->num_ip_blocks; i++) { 2282 if (!adev->ip_blocks[i].status.sw) 2283 continue; 2284 if (adev->ip_blocks[i].status.hw) 2285 continue; 2286 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2287 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2288 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2289 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2290 if (r) { 2291 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2292 adev->ip_blocks[i].version->funcs->name, r); 2293 return r; 2294 } 2295 adev->ip_blocks[i].status.hw = true; 2296 } 2297 } 2298 2299 return 0; 2300 } 2301 2302 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2303 { 2304 int i, r; 2305 2306 for (i = 0; i < adev->num_ip_blocks; i++) { 2307 if (!adev->ip_blocks[i].status.sw) 2308 continue; 2309 if (adev->ip_blocks[i].status.hw) 2310 continue; 2311 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2312 if (r) { 2313 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2314 adev->ip_blocks[i].version->funcs->name, r); 2315 return r; 2316 } 2317 adev->ip_blocks[i].status.hw = true; 2318 } 2319 2320 return 0; 2321 } 2322 2323 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2324 { 2325 int r = 0; 2326 int i; 2327 uint32_t smu_version; 2328 2329 if (adev->asic_type >= CHIP_VEGA10) { 2330 for (i = 0; i < adev->num_ip_blocks; i++) { 2331 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2332 continue; 2333 2334 if 
(!adev->ip_blocks[i].status.sw) 2335 continue; 2336 2337 /* no need to do the fw loading again if already done*/ 2338 if (adev->ip_blocks[i].status.hw == true) 2339 break; 2340 2341 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2342 r = adev->ip_blocks[i].version->funcs->resume(adev); 2343 if (r) { 2344 DRM_ERROR("resume of IP block <%s> failed %d\n", 2345 adev->ip_blocks[i].version->funcs->name, r); 2346 return r; 2347 } 2348 } else { 2349 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2350 if (r) { 2351 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2352 adev->ip_blocks[i].version->funcs->name, r); 2353 return r; 2354 } 2355 } 2356 2357 adev->ip_blocks[i].status.hw = true; 2358 break; 2359 } 2360 } 2361 2362 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2363 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2364 2365 return r; 2366 } 2367 2368 /** 2369 * amdgpu_device_ip_init - run init for hardware IPs 2370 * 2371 * @adev: amdgpu_device pointer 2372 * 2373 * Main initialization pass for hardware IPs. The list of all the hardware 2374 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2375 * are run. sw_init initializes the software state associated with each IP 2376 * and hw_init initializes the hardware associated with each IP. 2377 * Returns 0 on success, negative error code on failure. 2378 */ 2379 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2380 { 2381 int i, r; 2382 2383 r = amdgpu_ras_init(adev); 2384 if (r) 2385 return r; 2386 2387 for (i = 0; i < adev->num_ip_blocks; i++) { 2388 if (!adev->ip_blocks[i].status.valid) 2389 continue; 2390 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2391 if (r) { 2392 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2393 adev->ip_blocks[i].version->funcs->name, r); 2394 goto init_failed; 2395 } 2396 adev->ip_blocks[i].status.sw = true; 2397 2398 /* need to do gmc hw init early so we can allocate gpu mem */ 2399 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2400 r = amdgpu_device_vram_scratch_init(adev); 2401 if (r) { 2402 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2403 goto init_failed; 2404 } 2405 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2406 if (r) { 2407 DRM_ERROR("hw_init %d failed %d\n", i, r); 2408 goto init_failed; 2409 } 2410 r = amdgpu_device_wb_init(adev); 2411 if (r) { 2412 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2413 goto init_failed; 2414 } 2415 adev->ip_blocks[i].status.hw = true; 2416 2417 /* right after GMC hw init, we create CSA */ 2418 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2419 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2420 AMDGPU_GEM_DOMAIN_VRAM, 2421 AMDGPU_CSA_SIZE); 2422 if (r) { 2423 DRM_ERROR("allocate CSA failed %d\n", r); 2424 goto init_failed; 2425 } 2426 } 2427 } 2428 } 2429 2430 if (amdgpu_sriov_vf(adev)) 2431 amdgpu_virt_init_data_exchange(adev); 2432 2433 r = amdgpu_ib_pool_init(adev); 2434 if (r) { 2435 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2436 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2437 goto init_failed; 2438 } 2439 2440 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2441 if (r) 2442 goto init_failed; 2443 2444 r = amdgpu_amdkfd_resume_iommu(adev); 2445 if (r) 2446 goto init_failed; 2447 2448 r = amdgpu_device_ip_hw_init_phase1(adev); 2449 if (r) 2450 goto init_failed; 2451 2452 r = amdgpu_device_fw_loading(adev); 2453 if (r) 2454 goto init_failed; 2455 2456 r = 
amdgpu_device_ip_hw_init_phase2(adev); 2457 if (r) 2458 goto init_failed; 2459 2460 /* 2461 * retired pages will be loaded from eeprom and reserved here, 2462 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2463 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2464 * for I2C communication, which is only true at this point. 2465 * 2466 * amdgpu_ras_recovery_init may fail, but the upper level only cares 2467 * about failures caused by a bad GPU state and stops the amdgpu init 2468 * process accordingly. For other failures, it still releases all 2469 * the resources and prints an error message rather than returning a 2470 * negative value to the upper level. 2471 * 2472 * Note: theoretically, this should be called before all vram allocations 2473 * to keep retired pages from being allocated and reused. 2474 */ 2475 r = amdgpu_ras_recovery_init(adev); 2476 if (r) 2477 goto init_failed; 2478 2479 if (adev->gmc.xgmi.num_physical_nodes > 1) 2480 amdgpu_xgmi_add_device(adev); 2481 2482 /* Don't init kfd if the whole hive needs to be reset during init */ 2483 if (!adev->gmc.xgmi.pending_reset) 2484 amdgpu_amdkfd_device_init(adev); 2485 2486 amdgpu_fru_get_product_info(adev); 2487 2488 init_failed: 2489 if (amdgpu_sriov_vf(adev)) 2490 amdgpu_virt_release_full_gpu(adev, true); 2491 2492 return r; 2493 } 2494 2495 /** 2496 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2497 * 2498 * @adev: amdgpu_device pointer 2499 * 2500 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2501 * this function before a GPU reset. If the value is retained after a 2502 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2503 */ 2504 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2505 { 2506 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2507 } 2508 2509 /** 2510 * amdgpu_device_check_vram_lost - check if vram is valid 2511 * 2512 * @adev: amdgpu_device pointer 2513 * 2514 * Checks the reset magic value written to the gart pointer in VRAM. 2515 * The driver calls this after a GPU reset to see if the contents of 2516 * VRAM are lost or not. 2517 * Returns true if vram is lost, false if not. 2518 */ 2519 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2520 { 2521 if (memcmp(adev->gart.ptr, adev->reset_magic, 2522 AMDGPU_RESET_MAGIC_NUM)) 2523 return true; 2524 2525 if (!amdgpu_in_reset(adev)) 2526 return false; 2527 2528 /* 2529 * For all ASICs with baco/mode1 reset, the VRAM is 2530 * always assumed to be lost. 2531 */ 2532 switch (amdgpu_asic_reset_method(adev)) { 2533 case AMD_RESET_METHOD_BACO: 2534 case AMD_RESET_METHOD_MODE1: 2535 return true; 2536 default: 2537 return false; 2538 } 2539 } 2540 2541 /** 2542 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2543 * 2544 * @adev: amdgpu_device pointer 2545 * @state: clockgating state (gate or ungate) 2546 * 2547 * The list of all the hardware IPs that make up the asic is walked and the 2548 * set_clockgating_state callbacks are run. During late init this enables 2549 * clockgating for the hardware IPs; during fini or suspend it disables 2550 * clockgating for the hardware IPs. 2551 * Returns 0 on success, negative error code on failure. 2552 */ 2553 2554 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2555 enum amd_clockgating_state state) 2556 { 2557 int i, j, r; 2558 2559 if (amdgpu_emu_mode == 1) 2560 return 0; 2561 2562 for (j = 0; j < adev->num_ip_blocks; j++) { 2563 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2564 if (!adev->ip_blocks[i].status.late_initialized) 2565 continue; 2566 /* skip CG for GFX on S0ix */ 2567 if (adev->in_s0ix && 2568 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2569 continue; 2570 /* skip CG for VCE/UVD, it's handled specially */ 2571 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2572 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2573 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2574 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2575 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2576 /* enable clockgating to save power */ 2577 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2578 state); 2579 if (r) { 2580 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2581 adev->ip_blocks[i].version->funcs->name, r); 2582 return r; 2583 } 2584 } 2585 } 2586 2587 return 0; 2588 } 2589 2590 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2591 enum amd_powergating_state state) 2592 { 2593 int i, j, r; 2594 2595 if (amdgpu_emu_mode == 1) 2596 return 0; 2597 2598 for (j = 0; j < adev->num_ip_blocks; j++) { 2599 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2600 if (!adev->ip_blocks[i].status.late_initialized) 2601 continue; 2602 /* skip PG for GFX on S0ix */ 2603 if (adev->in_s0ix && 2604 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2605 continue; 2606 /* skip PG for VCE/UVD, it's handled specially */ 2607 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2608 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2609 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2610 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2611 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2612 /* enable powergating to save power */ 2613 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2614 state); 2615 if (r) { 2616 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2617 adev->ip_blocks[i].version->funcs->name, r); 2618 return r; 2619 } 2620 } 2621 } 2622 return 0; 2623 } 2624 2625 static int amdgpu_device_enable_mgpu_fan_boost(void) 2626 { 2627 struct amdgpu_gpu_instance *gpu_ins; 2628 struct amdgpu_device *adev; 2629 int i, ret = 0; 2630 2631 mutex_lock(&mgpu_info.mutex); 2632 2633 /* 2634 * MGPU fan boost feature should be enabled 2635 * only when there are two or more dGPUs in 2636 * the system 2637 */ 2638 if (mgpu_info.num_dgpu < 2) 2639 goto out; 2640 2641 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2642 gpu_ins = &(mgpu_info.gpu_ins[i]); 2643 adev = gpu_ins->adev; 2644 if (!(adev->flags & AMD_IS_APU) && 2645 !gpu_ins->mgpu_fan_enabled) { 2646 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2647 if (ret) 2648 break; 2649 2650 gpu_ins->mgpu_fan_enabled = 1; 2651 } 2652 } 2653 2654 out: 2655 mutex_unlock(&mgpu_info.mutex); 2656 2657 return ret; 2658 } 2659 2660 /** 2661 * amdgpu_device_ip_late_init - run late init for hardware IPs 2662 * 2663 * @adev: amdgpu_device pointer 2664 * 2665 * Late initialization pass for hardware IPs. The list of all the hardware 2666 * IPs that make up the asic is walked and the late_init callbacks are run. 2667 * late_init covers any special initialization that an IP requires 2668 * after all of them have been initialized or something that needs to happen 2669 * late in the init process. 2670 * Returns 0 on success, negative error code on failure.
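 * Besides running the per-IP callbacks, this pass also enables clockgating
 * and powergating, records the VRAM reset magic, enables the mGPU fan boost
 * and, for XGMI hives, lowers the device p-state once all nodes are up.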
2671 */ 2672 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2673 { 2674 struct amdgpu_gpu_instance *gpu_instance; 2675 int i = 0, r; 2676 2677 for (i = 0; i < adev->num_ip_blocks; i++) { 2678 if (!adev->ip_blocks[i].status.hw) 2679 continue; 2680 if (adev->ip_blocks[i].version->funcs->late_init) { 2681 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2682 if (r) { 2683 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2684 adev->ip_blocks[i].version->funcs->name, r); 2685 return r; 2686 } 2687 } 2688 adev->ip_blocks[i].status.late_initialized = true; 2689 } 2690 2691 amdgpu_ras_set_error_query_ready(adev, true); 2692 2693 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2694 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2695 2696 amdgpu_device_fill_reset_magic(adev); 2697 2698 r = amdgpu_device_enable_mgpu_fan_boost(); 2699 if (r) 2700 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2701 2702 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2703 if (adev->asic_type == CHIP_ARCTURUS && 2704 amdgpu_passthrough(adev) && 2705 adev->gmc.xgmi.num_physical_nodes > 1) 2706 smu_set_light_sbr(&adev->smu, true); 2707 2708 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2709 mutex_lock(&mgpu_info.mutex); 2710 2711 /* 2712 * Reset the device p-state to low, as it was booted with high. 2713 * 2714 * This should be performed only after all devices from the same 2715 * hive get initialized. 2716 * 2717 * However, the number of devices in a hive is not known in advance; 2718 * it is counted one by one as the devices initialize. 2719 * 2720 * So we wait until all XGMI interlinked devices are initialized. 2721 * This may bring some delays as those devices may come from 2722 * different hives. But that should be OK.
2723 */ 2724 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2725 for (i = 0; i < mgpu_info.num_gpu; i++) { 2726 gpu_instance = &(mgpu_info.gpu_ins[i]); 2727 if (gpu_instance->adev->flags & AMD_IS_APU) 2728 continue; 2729 2730 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2731 AMDGPU_XGMI_PSTATE_MIN); 2732 if (r) { 2733 DRM_ERROR("pstate setting failed (%d).\n", r); 2734 break; 2735 } 2736 } 2737 } 2738 2739 mutex_unlock(&mgpu_info.mutex); 2740 } 2741 2742 return 0; 2743 } 2744 2745 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2746 { 2747 int i, r; 2748 2749 for (i = 0; i < adev->num_ip_blocks; i++) { 2750 if (!adev->ip_blocks[i].version->funcs->early_fini) 2751 continue; 2752 2753 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2754 if (r) { 2755 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2756 adev->ip_blocks[i].version->funcs->name, r); 2757 } 2758 } 2759 2760 amdgpu_amdkfd_suspend(adev, false); 2761 2762 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2763 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2764 2765 /* need to disable SMC first */ 2766 for (i = 0; i < adev->num_ip_blocks; i++) { 2767 if (!adev->ip_blocks[i].status.hw) 2768 continue; 2769 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2770 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2771 /* XXX handle errors */ 2772 if (r) { 2773 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2774 adev->ip_blocks[i].version->funcs->name, r); 2775 } 2776 adev->ip_blocks[i].status.hw = false; 2777 break; 2778 } 2779 } 2780 2781 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2782 if (!adev->ip_blocks[i].status.hw) 2783 continue; 2784 2785 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2786 /* XXX handle errors */ 2787 if (r) { 2788 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2789 adev->ip_blocks[i].version->funcs->name, r); 2790 } 2791 2792 adev->ip_blocks[i].status.hw = false; 2793 } 2794 2795 if (amdgpu_sriov_vf(adev)) { 2796 if (amdgpu_virt_release_full_gpu(adev, false)) 2797 DRM_ERROR("failed to release exclusive mode on fini\n"); 2798 } 2799 2800 return 0; 2801 } 2802 2803 /** 2804 * amdgpu_device_ip_fini - run fini for hardware IPs 2805 * 2806 * @adev: amdgpu_device pointer 2807 * 2808 * Main teardown pass for hardware IPs. The list of all the hardware 2809 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2810 * are run. hw_fini tears down the hardware associated with each IP 2811 * and sw_fini tears down any software state associated with each IP. 2812 * Returns 0 on success, negative error code on failure. 
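 * Teardown walks the IP blocks in the reverse order of initialization:
 * sw_fini runs from the last block to the first, followed by late_fini for
 * any block that was late-initialized.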
2813 */ 2814 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2815 { 2816 int i, r; 2817 2818 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2819 amdgpu_virt_release_ras_err_handler_data(adev); 2820 2821 amdgpu_ras_pre_fini(adev); 2822 2823 if (adev->gmc.xgmi.num_physical_nodes > 1) 2824 amdgpu_xgmi_remove_device(adev); 2825 2826 amdgpu_amdkfd_device_fini_sw(adev); 2827 2828 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2829 if (!adev->ip_blocks[i].status.sw) 2830 continue; 2831 2832 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2833 amdgpu_ucode_free_bo(adev); 2834 amdgpu_free_static_csa(&adev->virt.csa_obj); 2835 amdgpu_device_wb_fini(adev); 2836 amdgpu_device_vram_scratch_fini(adev); 2837 amdgpu_ib_pool_fini(adev); 2838 } 2839 2840 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2841 /* XXX handle errors */ 2842 if (r) { 2843 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2844 adev->ip_blocks[i].version->funcs->name, r); 2845 } 2846 adev->ip_blocks[i].status.sw = false; 2847 adev->ip_blocks[i].status.valid = false; 2848 } 2849 2850 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2851 if (!adev->ip_blocks[i].status.late_initialized) 2852 continue; 2853 if (adev->ip_blocks[i].version->funcs->late_fini) 2854 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2855 adev->ip_blocks[i].status.late_initialized = false; 2856 } 2857 2858 amdgpu_ras_fini(adev); 2859 2860 return 0; 2861 } 2862 2863 /** 2864 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2865 * 2866 * @work: work_struct. 2867 */ 2868 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2869 { 2870 struct amdgpu_device *adev = 2871 container_of(work, struct amdgpu_device, delayed_init_work.work); 2872 int r; 2873 2874 r = amdgpu_ib_ring_tests(adev); 2875 if (r) 2876 DRM_ERROR("ib ring test failed (%d).\n", r); 2877 } 2878 2879 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2880 { 2881 struct amdgpu_device *adev = 2882 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2883 2884 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2885 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2886 2887 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2888 adev->gfx.gfx_off_state = true; 2889 } 2890 2891 /** 2892 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2893 * 2894 * @adev: amdgpu_device pointer 2895 * 2896 * Main suspend function for hardware IPs. The list of all the hardware 2897 * IPs that make up the asic is walked, clockgating is disabled and the 2898 * suspend callbacks are run. suspend puts the hardware and software state 2899 * in each IP into a state suitable for suspend. 2900 * Returns 0 on success, negative error code on failure. 
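 * Phase 1 only suspends the display (DCE) hardware; every other IP block is
 * left untouched here and suspended later in phase 2.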
2901 */ 2902 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2903 { 2904 int i, r; 2905 2906 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2907 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2908 2909 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2910 if (!adev->ip_blocks[i].status.valid) 2911 continue; 2912 2913 /* displays are handled separately */ 2914 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2915 continue; 2916 2917 /* XXX handle errors */ 2918 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2919 /* XXX handle errors */ 2920 if (r) { 2921 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2922 adev->ip_blocks[i].version->funcs->name, r); 2923 return r; 2924 } 2925 2926 adev->ip_blocks[i].status.hw = false; 2927 } 2928 2929 return 0; 2930 } 2931 2932 /** 2933 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2934 * 2935 * @adev: amdgpu_device pointer 2936 * 2937 * Main suspend function for hardware IPs. The list of all the hardware 2938 * IPs that make up the asic is walked, clockgating is disabled and the 2939 * suspend callbacks are run. suspend puts the hardware and software state 2940 * in each IP into a state suitable for suspend. 2941 * Returns 0 on success, negative error code on failure. 2942 */ 2943 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2944 { 2945 int i, r; 2946 2947 if (adev->in_s0ix) 2948 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2949 2950 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2951 if (!adev->ip_blocks[i].status.valid) 2952 continue; 2953 /* displays are handled in phase1 */ 2954 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2955 continue; 2956 /* PSP lost connection when err_event_athub occurs */ 2957 if (amdgpu_ras_intr_triggered() && 2958 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2959 adev->ip_blocks[i].status.hw = false; 2960 continue; 2961 } 2962 2963 /* skip unnecessary suspend if we do not initialize them yet */ 2964 if (adev->gmc.xgmi.pending_reset && 2965 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2966 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2968 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2969 adev->ip_blocks[i].status.hw = false; 2970 continue; 2971 } 2972 2973 /* skip suspend of gfx and psp for S0ix 2974 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2975 * like at runtime. PSP is also part of the always on hardware 2976 * so no need to suspend it. 
2977 */ 2978 if (adev->in_s0ix && 2979 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2981 continue; 2982 2983 /* XXX handle errors */ 2984 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2985 /* XXX handle errors */ 2986 if (r) { 2987 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2988 adev->ip_blocks[i].version->funcs->name, r); 2989 } 2990 adev->ip_blocks[i].status.hw = false; 2991 /* handle putting the SMC in the appropriate state */ 2992 if(!amdgpu_sriov_vf(adev)){ 2993 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2994 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2995 if (r) { 2996 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2997 adev->mp1_state, r); 2998 return r; 2999 } 3000 } 3001 } 3002 } 3003 3004 return 0; 3005 } 3006 3007 /** 3008 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3009 * 3010 * @adev: amdgpu_device pointer 3011 * 3012 * Main suspend function for hardware IPs. The list of all the hardware 3013 * IPs that make up the asic is walked, clockgating is disabled and the 3014 * suspend callbacks are run. suspend puts the hardware and software state 3015 * in each IP into a state suitable for suspend. 3016 * Returns 0 on success, negative error code on failure. 3017 */ 3018 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3019 { 3020 int r; 3021 3022 if (amdgpu_sriov_vf(adev)) { 3023 amdgpu_virt_fini_data_exchange(adev); 3024 amdgpu_virt_request_full_gpu(adev, false); 3025 } 3026 3027 r = amdgpu_device_ip_suspend_phase1(adev); 3028 if (r) 3029 return r; 3030 r = amdgpu_device_ip_suspend_phase2(adev); 3031 3032 if (amdgpu_sriov_vf(adev)) 3033 amdgpu_virt_release_full_gpu(adev, false); 3034 3035 return r; 3036 } 3037 3038 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3039 { 3040 int i, r; 3041 3042 static enum amd_ip_block_type ip_order[] = { 3043 AMD_IP_BLOCK_TYPE_GMC, 3044 AMD_IP_BLOCK_TYPE_COMMON, 3045 AMD_IP_BLOCK_TYPE_PSP, 3046 AMD_IP_BLOCK_TYPE_IH, 3047 }; 3048 3049 for (i = 0; i < adev->num_ip_blocks; i++) { 3050 int j; 3051 struct amdgpu_ip_block *block; 3052 3053 block = &adev->ip_blocks[i]; 3054 block->status.hw = false; 3055 3056 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3057 3058 if (block->version->type != ip_order[j] || 3059 !block->status.valid) 3060 continue; 3061 3062 r = block->version->funcs->hw_init(adev); 3063 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3064 if (r) 3065 return r; 3066 block->status.hw = true; 3067 } 3068 } 3069 3070 return 0; 3071 } 3072 3073 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3074 { 3075 int i, r; 3076 3077 static enum amd_ip_block_type ip_order[] = { 3078 AMD_IP_BLOCK_TYPE_SMC, 3079 AMD_IP_BLOCK_TYPE_DCE, 3080 AMD_IP_BLOCK_TYPE_GFX, 3081 AMD_IP_BLOCK_TYPE_SDMA, 3082 AMD_IP_BLOCK_TYPE_UVD, 3083 AMD_IP_BLOCK_TYPE_VCE, 3084 AMD_IP_BLOCK_TYPE_VCN 3085 }; 3086 3087 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3088 int j; 3089 struct amdgpu_ip_block *block; 3090 3091 for (j = 0; j < adev->num_ip_blocks; j++) { 3092 block = &adev->ip_blocks[j]; 3093 3094 if (block->version->type != ip_order[i] || 3095 !block->status.valid || 3096 block->status.hw) 3097 continue; 3098 3099 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3100 r = block->version->funcs->resume(adev); 3101 else 3102 r = block->version->funcs->hw_init(adev); 3103 3104 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3105 if (r) 3106 return r; 3107 block->status.hw = true; 3108 } 3109 } 3110 3111 return 0; 3112 } 3113 3114 /** 3115 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3116 * 3117 * @adev: amdgpu_device pointer 3118 * 3119 * First resume function for hardware IPs. The list of all the hardware 3120 * IPs that make up the asic is walked and the resume callbacks are run for 3121 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3122 * after a suspend and updates the software state as necessary. This 3123 * function is also used for restoring the GPU after a GPU reset. 3124 * Returns 0 on success, negative error code on failure. 3125 */ 3126 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3127 { 3128 int i, r; 3129 3130 for (i = 0; i < adev->num_ip_blocks; i++) { 3131 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3132 continue; 3133 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3135 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3136 3137 r = adev->ip_blocks[i].version->funcs->resume(adev); 3138 if (r) { 3139 DRM_ERROR("resume of IP block <%s> failed %d\n", 3140 adev->ip_blocks[i].version->funcs->name, r); 3141 return r; 3142 } 3143 adev->ip_blocks[i].status.hw = true; 3144 } 3145 } 3146 3147 return 0; 3148 } 3149 3150 /** 3151 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3152 * 3153 * @adev: amdgpu_device pointer 3154 * 3155 * Second resume function for hardware IPs. The list of all the hardware 3156 * IPs that make up the asic is walked and the resume callbacks are run for 3157 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 3158 * functional state after a suspend and updates the software state as 3159 * necessary. This function is also used for restoring the GPU after a GPU 3160 * reset. 3161 * Returns 0 on success, negative error code on failure. 3162 */ 3163 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3164 { 3165 int i, r; 3166 3167 for (i = 0; i < adev->num_ip_blocks; i++) { 3168 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3169 continue; 3170 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3172 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3173 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3174 continue; 3175 r = adev->ip_blocks[i].version->funcs->resume(adev); 3176 if (r) { 3177 DRM_ERROR("resume of IP block <%s> failed %d\n", 3178 adev->ip_blocks[i].version->funcs->name, r); 3179 return r; 3180 } 3181 adev->ip_blocks[i].status.hw = true; 3182 } 3183 3184 return 0; 3185 } 3186 3187 /** 3188 * amdgpu_device_ip_resume - run resume for hardware IPs 3189 * 3190 * @adev: amdgpu_device pointer 3191 * 3192 * Main resume function for hardware IPs. The hardware IPs 3193 * are split into two resume functions because they are 3194 * also used in recovering from a GPU reset and some additional 3195 * steps need to be taken between them. In this case (S3/S4) they are 3196 * run sequentially. 3197 * Returns 0 on success, negative error code on failure.
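 * The split is: phase 1 resumes COMMON, GMC and IH, firmware loading then
 * brings up PSP and the SMU firmware, and phase 2 resumes the remaining
 * IP blocks.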
3198 */ 3199 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3200 { 3201 int r; 3202 3203 r = amdgpu_amdkfd_resume_iommu(adev); 3204 if (r) 3205 return r; 3206 3207 r = amdgpu_device_ip_resume_phase1(adev); 3208 if (r) 3209 return r; 3210 3211 r = amdgpu_device_fw_loading(adev); 3212 if (r) 3213 return r; 3214 3215 r = amdgpu_device_ip_resume_phase2(adev); 3216 3217 return r; 3218 } 3219 3220 /** 3221 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3222 * 3223 * @adev: amdgpu_device pointer 3224 * 3225 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3226 */ 3227 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3228 { 3229 if (amdgpu_sriov_vf(adev)) { 3230 if (adev->is_atom_fw) { 3231 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3232 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3233 } else { 3234 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3235 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3236 } 3237 3238 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3239 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3240 } 3241 } 3242 3243 /** 3244 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3245 * 3246 * @asic_type: AMD asic type 3247 * 3248 * Check if there is DC (the new modesetting infrastructure) support for an asic. 3249 * Returns true if DC has support, false if not. 3250 */ 3251 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3252 { 3253 switch (asic_type) { 3254 #if defined(CONFIG_DRM_AMD_DC) 3255 #if defined(CONFIG_DRM_AMD_DC_SI) 3256 case CHIP_TAHITI: 3257 case CHIP_PITCAIRN: 3258 case CHIP_VERDE: 3259 case CHIP_OLAND: 3260 #endif 3261 case CHIP_BONAIRE: 3262 case CHIP_KAVERI: 3263 case CHIP_KABINI: 3264 case CHIP_MULLINS: 3265 /* 3266 * We have systems in the wild with these ASICs that require 3267 * LVDS and VGA support, which DC does not provide. 3268 * 3269 * Fallback to the non-DC driver here by default so as not to 3270 * cause regressions.
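 * DC can still be requested explicitly on these ASICs; the check below
 * only selects it when the amdgpu_dc option is set to a positive value.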
3271 */ 3272 return amdgpu_dc > 0; 3273 case CHIP_HAWAII: 3274 case CHIP_CARRIZO: 3275 case CHIP_STONEY: 3276 case CHIP_POLARIS10: 3277 case CHIP_POLARIS11: 3278 case CHIP_POLARIS12: 3279 case CHIP_VEGAM: 3280 case CHIP_TONGA: 3281 case CHIP_FIJI: 3282 case CHIP_VEGA10: 3283 case CHIP_VEGA12: 3284 case CHIP_VEGA20: 3285 #if defined(CONFIG_DRM_AMD_DC_DCN) 3286 case CHIP_RAVEN: 3287 case CHIP_NAVI10: 3288 case CHIP_NAVI14: 3289 case CHIP_NAVI12: 3290 case CHIP_RENOIR: 3291 case CHIP_SIENNA_CICHLID: 3292 case CHIP_NAVY_FLOUNDER: 3293 case CHIP_DIMGREY_CAVEFISH: 3294 case CHIP_BEIGE_GOBY: 3295 case CHIP_VANGOGH: 3296 case CHIP_YELLOW_CARP: 3297 #endif 3298 return amdgpu_dc != 0; 3299 #endif 3300 default: 3301 if (amdgpu_dc > 0) 3302 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3303 "but isn't supported by ASIC, ignoring\n"); 3304 return false; 3305 } 3306 } 3307 3308 /** 3309 * amdgpu_device_has_dc_support - check if dc is supported 3310 * 3311 * @adev: amdgpu_device pointer 3312 * 3313 * Returns true for supported, false for not supported 3314 */ 3315 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3316 { 3317 if (amdgpu_sriov_vf(adev) || 3318 adev->enable_virtual_display || 3319 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3320 return false; 3321 3322 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3323 } 3324 3325 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3326 { 3327 struct amdgpu_device *adev = 3328 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3329 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3330 3331 /* It's a bug to not have a hive within this function */ 3332 if (WARN_ON(!hive)) 3333 return; 3334 3335 /* 3336 * Use task barrier to synchronize all xgmi reset works across the 3337 * hive. task_barrier_enter and task_barrier_exit will block 3338 * until all the threads running the xgmi reset works reach 3339 * those points. task_barrier_full will do both blocks. 3340 */ 3341 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3342 3343 task_barrier_enter(&hive->tb); 3344 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3345 3346 if (adev->asic_reset_res) 3347 goto fail; 3348 3349 task_barrier_exit(&hive->tb); 3350 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3351 3352 if (adev->asic_reset_res) 3353 goto fail; 3354 3355 if (adev->mmhub.ras_funcs && 3356 adev->mmhub.ras_funcs->reset_ras_error_count) 3357 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3358 } else { 3359 3360 task_barrier_full(&hive->tb); 3361 adev->asic_reset_res = amdgpu_asic_reset(adev); 3362 } 3363 3364 fail: 3365 if (adev->asic_reset_res) 3366 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3367 adev->asic_reset_res, adev_to_drm(adev)->unique); 3368 amdgpu_put_xgmi_hive(hive); 3369 } 3370 3371 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3372 { 3373 char *input = amdgpu_lockup_timeout; 3374 char *timeout_setting = NULL; 3375 int index = 0; 3376 long timeout; 3377 int ret = 0; 3378 3379 /* 3380 * By default timeout for non compute jobs is 10000 3381 * and 60000 for compute jobs. 3382 * In SR-IOV or passthrough mode, timeout for compute 3383 * jobs are 60000 by default. 3384 */ 3385 adev->gfx_timeout = msecs_to_jiffies(10000); 3386 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3387 if (amdgpu_sriov_vf(adev)) 3388 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3389 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3390 else 3391 adev->compute_timeout = msecs_to_jiffies(60000); 3392 3393 #ifdef notyet 3394 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3395 while ((timeout_setting = strsep(&input, ",")) && 3396 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3397 ret = kstrtol(timeout_setting, 0, &timeout); 3398 if (ret) 3399 return ret; 3400 3401 if (timeout == 0) { 3402 index++; 3403 continue; 3404 } else if (timeout < 0) { 3405 timeout = MAX_SCHEDULE_TIMEOUT; 3406 } else { 3407 timeout = msecs_to_jiffies(timeout); 3408 } 3409 3410 switch (index++) { 3411 case 0: 3412 adev->gfx_timeout = timeout; 3413 break; 3414 case 1: 3415 adev->compute_timeout = timeout; 3416 break; 3417 case 2: 3418 adev->sdma_timeout = timeout; 3419 break; 3420 case 3: 3421 adev->video_timeout = timeout; 3422 break; 3423 default: 3424 break; 3425 } 3426 } 3427 /* 3428 * There is only one value specified and 3429 * it should apply to all non-compute jobs. 3430 */ 3431 if (index == 1) { 3432 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3433 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3434 adev->compute_timeout = adev->gfx_timeout; 3435 } 3436 } 3437 #endif 3438 3439 return ret; 3440 } 3441 3442 static const struct attribute *amdgpu_dev_attributes[] = { 3443 &dev_attr_product_name.attr, 3444 &dev_attr_product_number.attr, 3445 &dev_attr_serial_number.attr, 3446 &dev_attr_pcie_replay_count.attr, 3447 NULL 3448 }; 3449 3450 /** 3451 * amdgpu_device_init - initialize the driver 3452 * 3453 * @adev: amdgpu_device pointer 3454 * @flags: driver flags 3455 * 3456 * Initializes the driver info and hw (all asics). 3457 * Returns 0 for success or an error on failure. 3458 * Called at driver startup. 
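 * Initialization proceeds roughly as follows: early IP init (which also
 * reads the vbios and gpu_info firmware), an optional ASIC reset and post,
 * atombios clock setup, fence driver and mode config init, full IP init,
 * and finally late IP init plus deferred IB ring tests.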
3459 */ 3460 int amdgpu_device_init(struct amdgpu_device *adev, 3461 uint32_t flags) 3462 { 3463 struct drm_device *ddev = adev_to_drm(adev); 3464 struct pci_dev *pdev = adev->pdev; 3465 int r, i; 3466 bool px = false; 3467 u32 max_MBps; 3468 3469 adev->shutdown = false; 3470 adev->flags = flags; 3471 3472 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3473 adev->asic_type = amdgpu_force_asic_type; 3474 else 3475 adev->asic_type = flags & AMD_ASIC_MASK; 3476 3477 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3478 if (amdgpu_emu_mode == 1) 3479 adev->usec_timeout *= 10; 3480 adev->gmc.gart_size = 512 * 1024 * 1024; 3481 adev->accel_working = false; 3482 adev->num_rings = 0; 3483 adev->mman.buffer_funcs = NULL; 3484 adev->mman.buffer_funcs_ring = NULL; 3485 adev->vm_manager.vm_pte_funcs = NULL; 3486 adev->vm_manager.vm_pte_num_scheds = 0; 3487 adev->gmc.gmc_funcs = NULL; 3488 adev->harvest_ip_mask = 0x0; 3489 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3490 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3491 3492 adev->smc_rreg = &amdgpu_invalid_rreg; 3493 adev->smc_wreg = &amdgpu_invalid_wreg; 3494 adev->pcie_rreg = &amdgpu_invalid_rreg; 3495 adev->pcie_wreg = &amdgpu_invalid_wreg; 3496 adev->pciep_rreg = &amdgpu_invalid_rreg; 3497 adev->pciep_wreg = &amdgpu_invalid_wreg; 3498 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3499 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3500 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3501 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3502 adev->didt_rreg = &amdgpu_invalid_rreg; 3503 adev->didt_wreg = &amdgpu_invalid_wreg; 3504 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3505 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3506 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3507 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3508 3509 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3510 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3511 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3512 3513 /* mutex initialization are all done here so we 3514 * can recall function without having locking issues */ 3515 rw_init(&adev->firmware.mutex, "agfw"); 3516 rw_init(&adev->pm.mutex, "agpm"); 3517 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3518 rw_init(&adev->srbm_mutex, "srbm"); 3519 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3520 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3521 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3522 rw_init(&adev->mn_lock, "agpumn"); 3523 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3524 hash_init(adev->mn_hash); 3525 atomic_set(&adev->in_gpu_reset, 0); 3526 rw_init(&adev->reset_sem, "amrs"); 3527 rw_init(&adev->psp.mutex, "agpsp"); 3528 rw_init(&adev->notifier_lock, "agnf"); 3529 3530 r = amdgpu_device_init_apu_flags(adev); 3531 if (r) 3532 return r; 3533 3534 r = amdgpu_device_check_arguments(adev); 3535 if (r) 3536 return r; 3537 3538 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3539 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3540 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3541 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3542 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3543 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3544 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3545 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3546 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3547 3548 INIT_LIST_HEAD(&adev->shadow_list); 3549 rw_init(&adev->shadow_list_lock, "sdwlst"); 3550 3551 INIT_LIST_HEAD(&adev->reset_list); 
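	/* Deferred work: IB ring tests after init/resume and delayed GFXOFF enablement. */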
3552 3553 INIT_DELAYED_WORK(&adev->delayed_init_work, 3554 amdgpu_device_delayed_init_work_handler); 3555 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3556 amdgpu_device_delay_enable_gfx_off); 3557 3558 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3559 3560 adev->gfx.gfx_off_req_count = 1; 3561 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3562 3563 atomic_set(&adev->throttling_logging_enabled, 1); 3564 /* 3565 * If throttling continues, logging will be performed every minute 3566 * to avoid log flooding. "-1" is subtracted since the thermal 3567 * throttling interrupt comes every second. Thus, the total logging 3568 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3569 * for throttling interrupt) = 60 seconds. 3570 */ 3571 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3572 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3573 3574 #ifdef __linux__ 3575 /* Registers mapping */ 3576 /* TODO: block userspace mapping of io register */ 3577 if (adev->asic_type >= CHIP_BONAIRE) { 3578 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3579 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3580 } else { 3581 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3582 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3583 } 3584 3585 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3586 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3587 3588 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3589 if (adev->rmmio == NULL) { 3590 return -ENOMEM; 3591 } 3592 #endif 3593 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3594 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3595 3596 /* enable PCIE atomic ops */ 3597 #ifdef notyet 3598 r = pci_enable_atomic_ops_to_root(adev->pdev, 3599 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3600 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3601 if (r) { 3602 adev->have_atomics_support = false; 3603 DRM_INFO("PCIE atomic ops is not supported\n"); 3604 } else { 3605 adev->have_atomics_support = true; 3606 } 3607 #else 3608 adev->have_atomics_support = false; 3609 #endif 3610 3611 amdgpu_device_get_pcie_info(adev); 3612 3613 if (amdgpu_mcbp) 3614 DRM_INFO("MCBP is enabled\n"); 3615 3616 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3617 adev->enable_mes = true; 3618 3619 /* detect hw virtualization here */ 3620 amdgpu_detect_virtualization(adev); 3621 3622 r = amdgpu_device_get_job_timeout_settings(adev); 3623 if (r) { 3624 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3625 return r; 3626 } 3627 3628 /* early init functions */ 3629 r = amdgpu_device_ip_early_init(adev); 3630 if (r) 3631 return r; 3632 3633 /* doorbell bar mapping and doorbell index init*/ 3634 amdgpu_device_doorbell_init(adev); 3635 3636 if (amdgpu_emu_mode == 1) { 3637 /* post the asic on emulation mode */ 3638 emu_soc_asic_init(adev); 3639 goto fence_driver_init; 3640 } 3641 3642 amdgpu_reset_init(adev); 3643 3644 /* detect if we are with an SRIOV vbios */ 3645 amdgpu_device_detect_sriov_bios(adev); 3646 3647 /* check if we need to reset the asic 3648 * E.g., driver was not cleanly unloaded previously, etc. 
3649 */ 3650 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3651 if (adev->gmc.xgmi.num_physical_nodes) { 3652 dev_info(adev->dev, "Pending hive reset.\n"); 3653 adev->gmc.xgmi.pending_reset = true; 3654 /* Only need to init necessary block for SMU to handle the reset */ 3655 for (i = 0; i < adev->num_ip_blocks; i++) { 3656 if (!adev->ip_blocks[i].status.valid) 3657 continue; 3658 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3659 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3660 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3661 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3662 DRM_DEBUG("IP %s disabled for hw_init.\n", 3663 adev->ip_blocks[i].version->funcs->name); 3664 adev->ip_blocks[i].status.hw = true; 3665 } 3666 } 3667 } else { 3668 r = amdgpu_asic_reset(adev); 3669 if (r) { 3670 dev_err(adev->dev, "asic reset on init failed\n"); 3671 goto failed; 3672 } 3673 } 3674 } 3675 3676 pci_enable_pcie_error_reporting(adev->pdev); 3677 3678 /* Post card if necessary */ 3679 if (amdgpu_device_need_post(adev)) { 3680 if (!adev->bios) { 3681 dev_err(adev->dev, "no vBIOS found\n"); 3682 r = -EINVAL; 3683 goto failed; 3684 } 3685 DRM_INFO("GPU posting now...\n"); 3686 r = amdgpu_device_asic_init(adev); 3687 if (r) { 3688 dev_err(adev->dev, "gpu post error!\n"); 3689 goto failed; 3690 } 3691 } 3692 3693 if (adev->is_atom_fw) { 3694 /* Initialize clocks */ 3695 r = amdgpu_atomfirmware_get_clock_info(adev); 3696 if (r) { 3697 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3698 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3699 goto failed; 3700 } 3701 } else { 3702 /* Initialize clocks */ 3703 r = amdgpu_atombios_get_clock_info(adev); 3704 if (r) { 3705 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3706 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3707 goto failed; 3708 } 3709 /* init i2c buses */ 3710 if (!amdgpu_device_has_dc_support(adev)) 3711 amdgpu_atombios_i2c_init(adev); 3712 } 3713 3714 fence_driver_init: 3715 /* Fence driver */ 3716 r = amdgpu_fence_driver_sw_init(adev); 3717 if (r) { 3718 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3719 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3720 goto failed; 3721 } 3722 3723 /* init the mode config */ 3724 drm_mode_config_init(adev_to_drm(adev)); 3725 3726 r = amdgpu_device_ip_init(adev); 3727 if (r) { 3728 /* failed in exclusive mode due to timeout */ 3729 if (amdgpu_sriov_vf(adev) && 3730 !amdgpu_sriov_runtime(adev) && 3731 amdgpu_virt_mmio_blocked(adev) && 3732 !amdgpu_virt_wait_reset(adev)) { 3733 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3734 /* Don't send request since VF is inactive. 
*/ 3735 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3736 adev->virt.ops = NULL; 3737 r = -EAGAIN; 3738 goto release_ras_con; 3739 } 3740 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3741 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3742 goto release_ras_con; 3743 } 3744 3745 amdgpu_fence_driver_hw_init(adev); 3746 3747 dev_info(adev->dev, 3748 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3749 adev->gfx.config.max_shader_engines, 3750 adev->gfx.config.max_sh_per_se, 3751 adev->gfx.config.max_cu_per_sh, 3752 adev->gfx.cu_info.number); 3753 3754 #ifdef __OpenBSD__ 3755 { 3756 const char *chip_name; 3757 3758 switch (adev->asic_type) { 3759 case CHIP_RAVEN: 3760 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3761 chip_name = "RAVEN2"; 3762 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3763 chip_name = "PICASSO"; 3764 else 3765 chip_name = "RAVEN"; 3766 break; 3767 case CHIP_RENOIR: 3768 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3769 chip_name = "RENOIR"; 3770 else 3771 chip_name = "GREEN_SARDINE"; 3772 break; 3773 default: 3774 chip_name = amdgpu_asic_name[adev->asic_type]; 3775 } 3776 printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname, 3777 chip_name, adev->gfx.cu_info.number, adev->rev_id); 3778 } 3779 #endif 3780 3781 adev->accel_working = true; 3782 3783 amdgpu_vm_check_compute_bug(adev); 3784 3785 /* Initialize the buffer migration limit. */ 3786 if (amdgpu_moverate >= 0) 3787 max_MBps = amdgpu_moverate; 3788 else 3789 max_MBps = 8; /* Allow 8 MB/s. */ 3790 /* Get a log2 for easy divisions. */ 3791 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3792 3793 amdgpu_fbdev_init(adev); 3794 3795 r = amdgpu_pm_sysfs_init(adev); 3796 if (r) { 3797 adev->pm_sysfs_en = false; 3798 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3799 } else 3800 adev->pm_sysfs_en = true; 3801 3802 r = amdgpu_ucode_sysfs_init(adev); 3803 if (r) { 3804 adev->ucode_sysfs_en = false; 3805 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3806 } else 3807 adev->ucode_sysfs_en = true; 3808 3809 if ((amdgpu_testing & 1)) { 3810 if (adev->accel_working) 3811 amdgpu_test_moves(adev); 3812 else 3813 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3814 } 3815 if (amdgpu_benchmarking) { 3816 if (adev->accel_working) 3817 amdgpu_benchmark(adev, amdgpu_benchmarking); 3818 else 3819 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3820 } 3821 3822 /* 3823 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3824 * Otherwise the mgpu fan boost feature will be skipped due to the 3825 * gpu instance is counted less. 3826 */ 3827 amdgpu_register_gpu_instance(adev); 3828 3829 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3830 * explicit gating rather than handling it automatically. 3831 */ 3832 if (!adev->gmc.xgmi.pending_reset) { 3833 r = amdgpu_device_ip_late_init(adev); 3834 if (r) { 3835 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3836 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3837 goto release_ras_con; 3838 } 3839 /* must succeed. 
*/ 3840 amdgpu_ras_resume(adev); 3841 queue_delayed_work(system_wq, &adev->delayed_init_work, 3842 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3843 } 3844 3845 if (amdgpu_sriov_vf(adev)) 3846 flush_delayed_work(&adev->delayed_init_work); 3847 3848 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3849 if (r) 3850 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3851 3852 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3853 r = amdgpu_pmu_init(adev); 3854 if (r) 3855 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3856 3857 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3858 if (amdgpu_device_cache_pci_state(adev->pdev)) 3859 pci_restore_state(pdev); 3860 3861 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3862 /* this will fail for cards that aren't VGA class devices, just 3863 * ignore it */ 3864 #ifdef notyet 3865 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3866 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3867 #endif 3868 3869 if (amdgpu_device_supports_px(ddev)) { 3870 px = true; 3871 vga_switcheroo_register_client(adev->pdev, 3872 &amdgpu_switcheroo_ops, px); 3873 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3874 } 3875 3876 if (adev->gmc.xgmi.pending_reset) 3877 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3878 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3879 3880 return 0; 3881 3882 release_ras_con: 3883 amdgpu_release_ras_context(adev); 3884 3885 failed: 3886 amdgpu_vf_error_trans_all(adev); 3887 3888 return r; 3889 } 3890 3891 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3892 { 3893 STUB(); 3894 #ifdef notyet 3895 /* Clear all CPU mappings pointing to this device */ 3896 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3897 #endif 3898 3899 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3900 amdgpu_device_doorbell_fini(adev); 3901 3902 #ifdef __linux__ 3903 iounmap(adev->rmmio); 3904 adev->rmmio = NULL; 3905 if (adev->mman.aper_base_kaddr) 3906 iounmap(adev->mman.aper_base_kaddr); 3907 adev->mman.aper_base_kaddr = NULL; 3908 #else 3909 if (adev->rmmio_size > 0) 3910 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 3911 adev->rmmio_size); 3912 adev->rmmio_size = 0; 3913 adev->rmmio = NULL; 3914 if (adev->mman.aper_base_kaddr) 3915 bus_space_unmap(adev->memt, adev->mman.aper_bsh, 3916 adev->gmc.visible_vram_size); 3917 adev->mman.aper_base_kaddr = NULL; 3918 #endif 3919 3920 /* Memory manager related */ 3921 if (!adev->gmc.xgmi.connected_to_cpu) { 3922 #ifdef __linux__ 3923 arch_phys_wc_del(adev->gmc.vram_mtrr); 3924 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3925 #else 3926 drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC); 3927 #endif 3928 } 3929 } 3930 3931 /** 3932 * amdgpu_device_fini - tear down the driver 3933 * 3934 * @adev: amdgpu_device pointer 3935 * 3936 * Tear down the driver info (all asics). 3937 * Called at driver shutdown. 
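 * Teardown is split into two phases: amdgpu_device_fini_hw() below quiesces
 * the hardware (interrupts, fences, MMIO mappings), while
 * amdgpu_device_fini_sw() frees the remaining software state.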
3938 */ 3939 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3940 { 3941 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3942 flush_delayed_work(&adev->delayed_init_work); 3943 if (adev->mman.initialized) { 3944 flush_delayed_work(&adev->mman.bdev.wq); 3945 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3946 } 3947 adev->shutdown = true; 3948 3949 /* make sure IB test finished before entering exclusive mode 3950 * to avoid preemption on IB test 3951 * */ 3952 if (amdgpu_sriov_vf(adev)) { 3953 amdgpu_virt_request_full_gpu(adev, false); 3954 amdgpu_virt_fini_data_exchange(adev); 3955 } 3956 3957 /* disable all interrupts */ 3958 amdgpu_irq_disable_all(adev); 3959 if (adev->mode_info.mode_config_initialized){ 3960 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3961 drm_helper_force_disable_all(adev_to_drm(adev)); 3962 else 3963 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3964 } 3965 amdgpu_fence_driver_hw_fini(adev); 3966 3967 if (adev->pm_sysfs_en) 3968 amdgpu_pm_sysfs_fini(adev); 3969 if (adev->ucode_sysfs_en) 3970 amdgpu_ucode_sysfs_fini(adev); 3971 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3972 3973 amdgpu_fbdev_fini(adev); 3974 3975 amdgpu_irq_fini_hw(adev); 3976 3977 amdgpu_device_ip_fini_early(adev); 3978 3979 amdgpu_gart_dummy_page_fini(adev); 3980 3981 amdgpu_device_unmap_mmio(adev); 3982 } 3983 3984 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3985 { 3986 amdgpu_fence_driver_sw_fini(adev); 3987 amdgpu_device_ip_fini(adev); 3988 release_firmware(adev->firmware.gpu_info_fw); 3989 adev->firmware.gpu_info_fw = NULL; 3990 adev->accel_working = false; 3991 3992 amdgpu_reset_fini(adev); 3993 3994 /* free i2c buses */ 3995 if (!amdgpu_device_has_dc_support(adev)) 3996 amdgpu_i2c_fini(adev); 3997 3998 if (amdgpu_emu_mode != 1) 3999 amdgpu_atombios_fini(adev); 4000 4001 kfree(adev->bios); 4002 adev->bios = NULL; 4003 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4004 vga_switcheroo_unregister_client(adev->pdev); 4005 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4006 } 4007 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4008 vga_client_unregister(adev->pdev); 4009 4010 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4011 amdgpu_pmu_fini(adev); 4012 if (adev->mman.discovery_bin) 4013 amdgpu_discovery_fini(adev); 4014 4015 kfree(adev->pci_state); 4016 4017 } 4018 4019 /** 4020 * amdgpu_device_evict_resources - evict device resources 4021 * @adev: amdgpu device object 4022 * 4023 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4024 * of the vram memory type. Mainly used for evicting device resources 4025 * at suspend time. 4026 * 4027 */ 4028 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4029 { 4030 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4031 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4032 return; 4033 4034 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4035 DRM_WARN("evicting device resources failed\n"); 4036 4037 } 4038 4039 /* 4040 * Suspend & resume. 4041 */ 4042 /** 4043 * amdgpu_device_suspend - initiate device suspend 4044 * 4045 * @dev: drm dev pointer 4046 * @fbcon : notify the fbdev of suspend 4047 * 4048 * Puts the hw in the suspend state (all asics). 4049 * Returns 0 for success or an error on failure. 4050 * Called at driver suspend. 
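 * Suspend is staged: IP blocks run their phase-1 suspend hooks, KFD is
 * suspended (unless entering S0ix), VRAM buffers are evicted, the fence
 * driver hardware state is torn down, IP blocks run phase 2, and finally
 * the GART table is evicted using the CPU.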
4051 */ 4052 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4053 { 4054 struct amdgpu_device *adev = drm_to_adev(dev); 4055 4056 if (adev->shutdown) 4057 return 0; 4058 4059 #ifdef notyet 4060 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4061 return 0; 4062 #endif 4063 4064 adev->in_suspend = true; 4065 4066 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4067 DRM_WARN("smart shift update failed\n"); 4068 4069 drm_kms_helper_poll_disable(dev); 4070 4071 if (fbcon) 4072 amdgpu_fbdev_set_suspend(adev, 1); 4073 4074 cancel_delayed_work_sync(&adev->delayed_init_work); 4075 4076 amdgpu_ras_suspend(adev); 4077 4078 amdgpu_device_ip_suspend_phase1(adev); 4079 4080 if (!adev->in_s0ix) 4081 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4082 4083 /* First evict vram memory */ 4084 amdgpu_device_evict_resources(adev); 4085 4086 amdgpu_fence_driver_hw_fini(adev); 4087 4088 amdgpu_device_ip_suspend_phase2(adev); 4089 /* This second call to evict device resources is to evict 4090 * the gart page table using the CPU. 4091 */ 4092 amdgpu_device_evict_resources(adev); 4093 4094 return 0; 4095 } 4096 4097 /** 4098 * amdgpu_device_resume - initiate device resume 4099 * 4100 * @dev: drm dev pointer 4101 * @fbcon : notify the fbdev of resume 4102 * 4103 * Bring the hw back to operating state (all asics). 4104 * Returns 0 for success or an error on failure. 4105 * Called at driver resume. 4106 */ 4107 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4108 { 4109 struct amdgpu_device *adev = drm_to_adev(dev); 4110 int r = 0; 4111 4112 #ifdef notyet 4113 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4114 return 0; 4115 #endif 4116 4117 if (adev->in_s0ix) 4118 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 4119 4120 /* post card */ 4121 if (amdgpu_device_need_post(adev)) { 4122 r = amdgpu_device_asic_init(adev); 4123 if (r) 4124 dev_err(adev->dev, "amdgpu asic init failed\n"); 4125 } 4126 4127 r = amdgpu_device_ip_resume(adev); 4128 if (r) { 4129 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4130 return r; 4131 } 4132 amdgpu_fence_driver_hw_init(adev); 4133 4134 r = amdgpu_device_ip_late_init(adev); 4135 if (r) 4136 return r; 4137 4138 queue_delayed_work(system_wq, &adev->delayed_init_work, 4139 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4140 4141 if (!adev->in_s0ix) { 4142 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4143 if (r) 4144 return r; 4145 } 4146 4147 /* Make sure IB tests flushed */ 4148 flush_delayed_work(&adev->delayed_init_work); 4149 4150 if (fbcon) 4151 amdgpu_fbdev_set_suspend(adev, 0); 4152 4153 drm_kms_helper_poll_enable(dev); 4154 4155 amdgpu_ras_resume(adev); 4156 4157 /* 4158 * Most of the connector probing functions try to acquire runtime pm 4159 * refs to ensure that the GPU is powered on when connector polling is 4160 * performed. Since we're calling this from a runtime PM callback, 4161 * trying to acquire rpm refs will cause us to deadlock. 4162 * 4163 * Since we're guaranteed to be holding the rpm lock, it's safe to 4164 * temporarily disable the rpm helpers so this doesn't deadlock us. 
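 * (On Linux this is done below by bumping dev->dev->power.disable_depth
 * around the hotplug event and dropping it again afterwards.)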
4165 */ 4166 #if defined(CONFIG_PM) && defined(__linux__) 4167 dev->dev->power.disable_depth++; 4168 #endif 4169 if (!amdgpu_device_has_dc_support(adev)) 4170 drm_helper_hpd_irq_event(dev); 4171 else 4172 drm_kms_helper_hotplug_event(dev); 4173 #if defined(CONFIG_PM) && defined(__linux__) 4174 dev->dev->power.disable_depth--; 4175 #endif 4176 adev->in_suspend = false; 4177 4178 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4179 DRM_WARN("smart shift update failed\n"); 4180 4181 return 0; 4182 } 4183 4184 /** 4185 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4186 * 4187 * @adev: amdgpu_device pointer 4188 * 4189 * The list of all the hardware IPs that make up the asic is walked and 4190 * the check_soft_reset callbacks are run. check_soft_reset determines 4191 * if the asic is still hung or not. 4192 * Returns true if any of the IPs are still in a hung state, false if not. 4193 */ 4194 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4195 { 4196 int i; 4197 bool asic_hang = false; 4198 4199 if (amdgpu_sriov_vf(adev)) 4200 return true; 4201 4202 if (amdgpu_asic_need_full_reset(adev)) 4203 return true; 4204 4205 for (i = 0; i < adev->num_ip_blocks; i++) { 4206 if (!adev->ip_blocks[i].status.valid) 4207 continue; 4208 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4209 adev->ip_blocks[i].status.hang = 4210 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4211 if (adev->ip_blocks[i].status.hang) { 4212 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4213 asic_hang = true; 4214 } 4215 } 4216 return asic_hang; 4217 } 4218 4219 /** 4220 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4221 * 4222 * @adev: amdgpu_device pointer 4223 * 4224 * The list of all the hardware IPs that make up the asic is walked and the 4225 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4226 * handles any IP specific hardware or software state changes that are 4227 * necessary for a soft reset to succeed. 4228 * Returns 0 on success, negative error code on failure. 4229 */ 4230 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4231 { 4232 int i, r = 0; 4233 4234 for (i = 0; i < adev->num_ip_blocks; i++) { 4235 if (!adev->ip_blocks[i].status.valid) 4236 continue; 4237 if (adev->ip_blocks[i].status.hang && 4238 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4239 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4240 if (r) 4241 return r; 4242 } 4243 } 4244 4245 return 0; 4246 } 4247 4248 /** 4249 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4250 * 4251 * @adev: amdgpu_device pointer 4252 * 4253 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4254 * reset is necessary to recover. 4255 * Returns true if a full asic reset is required, false if not. 
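 * In practice a hang in the GMC, SMC, ACP, DCE or PSP block forces a full
 * reset, as checked below.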
4256 */ 4257 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4258 { 4259 int i; 4260 4261 if (amdgpu_asic_need_full_reset(adev)) 4262 return true; 4263 4264 for (i = 0; i < adev->num_ip_blocks; i++) { 4265 if (!adev->ip_blocks[i].status.valid) 4266 continue; 4267 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4268 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4269 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4270 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4271 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4272 if (adev->ip_blocks[i].status.hang) { 4273 dev_info(adev->dev, "Some block need full reset!\n"); 4274 return true; 4275 } 4276 } 4277 } 4278 return false; 4279 } 4280 4281 /** 4282 * amdgpu_device_ip_soft_reset - do a soft reset 4283 * 4284 * @adev: amdgpu_device pointer 4285 * 4286 * The list of all the hardware IPs that make up the asic is walked and the 4287 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4288 * IP specific hardware or software state changes that are necessary to soft 4289 * reset the IP. 4290 * Returns 0 on success, negative error code on failure. 4291 */ 4292 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4293 { 4294 int i, r = 0; 4295 4296 for (i = 0; i < adev->num_ip_blocks; i++) { 4297 if (!adev->ip_blocks[i].status.valid) 4298 continue; 4299 if (adev->ip_blocks[i].status.hang && 4300 adev->ip_blocks[i].version->funcs->soft_reset) { 4301 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4302 if (r) 4303 return r; 4304 } 4305 } 4306 4307 return 0; 4308 } 4309 4310 /** 4311 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4312 * 4313 * @adev: amdgpu_device pointer 4314 * 4315 * The list of all the hardware IPs that make up the asic is walked and the 4316 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4317 * handles any IP specific hardware or software state changes that are 4318 * necessary after the IP has been soft reset. 4319 * Returns 0 on success, negative error code on failure. 4320 */ 4321 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4322 { 4323 int i, r = 0; 4324 4325 for (i = 0; i < adev->num_ip_blocks; i++) { 4326 if (!adev->ip_blocks[i].status.valid) 4327 continue; 4328 if (adev->ip_blocks[i].status.hang && 4329 adev->ip_blocks[i].version->funcs->post_soft_reset) 4330 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4331 if (r) 4332 return r; 4333 } 4334 4335 return 0; 4336 } 4337 4338 /** 4339 * amdgpu_device_recover_vram - Recover some VRAM contents 4340 * 4341 * @adev: amdgpu_device pointer 4342 * 4343 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4344 * restore things like GPUVM page tables after a GPU reset where 4345 * the contents of VRAM might be lost. 4346 * 4347 * Returns: 4348 * 0 on success, negative error code on failure. 
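 * Each shadow copy-back is waited on with a timeout (8 seconds per wait
 * under SR-IOV runtime, 100 ms otherwise); on timeout or error the whole
 * recovery is reported as failed.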
4349 */ 4350 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4351 { 4352 struct dma_fence *fence = NULL, *next = NULL; 4353 struct amdgpu_bo *shadow; 4354 struct amdgpu_bo_vm *vmbo; 4355 long r = 1, tmo; 4356 4357 if (amdgpu_sriov_runtime(adev)) 4358 tmo = msecs_to_jiffies(8000); 4359 else 4360 tmo = msecs_to_jiffies(100); 4361 4362 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4363 mutex_lock(&adev->shadow_list_lock); 4364 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4365 shadow = &vmbo->bo; 4366 /* No need to recover an evicted BO */ 4367 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4368 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4369 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4370 continue; 4371 4372 r = amdgpu_bo_restore_shadow(shadow, &next); 4373 if (r) 4374 break; 4375 4376 if (fence) { 4377 tmo = dma_fence_wait_timeout(fence, false, tmo); 4378 dma_fence_put(fence); 4379 fence = next; 4380 if (tmo == 0) { 4381 r = -ETIMEDOUT; 4382 break; 4383 } else if (tmo < 0) { 4384 r = tmo; 4385 break; 4386 } 4387 } else { 4388 fence = next; 4389 } 4390 } 4391 mutex_unlock(&adev->shadow_list_lock); 4392 4393 if (fence) 4394 tmo = dma_fence_wait_timeout(fence, false, tmo); 4395 dma_fence_put(fence); 4396 4397 if (r < 0 || tmo <= 0) { 4398 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4399 return -EIO; 4400 } 4401 4402 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4403 return 0; 4404 } 4405 4406 4407 /** 4408 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4409 * 4410 * @adev: amdgpu_device pointer 4411 * @from_hypervisor: request from hypervisor 4412 * 4413 * do VF FLR and reinitialize Asic 4414 * return 0 means succeeded otherwise failed 4415 */ 4416 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4417 bool from_hypervisor) 4418 { 4419 int r; 4420 4421 if (from_hypervisor) 4422 r = amdgpu_virt_request_full_gpu(adev, true); 4423 else 4424 r = amdgpu_virt_reset_gpu(adev); 4425 if (r) 4426 return r; 4427 4428 amdgpu_amdkfd_pre_reset(adev); 4429 4430 /* Resume IP prior to SMC */ 4431 r = amdgpu_device_ip_reinit_early_sriov(adev); 4432 if (r) 4433 goto error; 4434 4435 amdgpu_virt_init_data_exchange(adev); 4436 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4437 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4438 4439 r = amdgpu_device_fw_loading(adev); 4440 if (r) 4441 return r; 4442 4443 /* now we are okay to resume SMC/CP/SDMA */ 4444 r = amdgpu_device_ip_reinit_late_sriov(adev); 4445 if (r) 4446 goto error; 4447 4448 amdgpu_irq_gpu_reset_resume_helper(adev); 4449 r = amdgpu_ib_ring_tests(adev); 4450 amdgpu_amdkfd_post_reset(adev); 4451 4452 error: 4453 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4454 amdgpu_inc_vram_lost(adev); 4455 r = amdgpu_device_recover_vram(adev); 4456 } 4457 amdgpu_virt_release_full_gpu(adev, true); 4458 4459 return r; 4460 } 4461 4462 /** 4463 * amdgpu_device_has_job_running - check if there is any job in mirror list 4464 * 4465 * @adev: amdgpu_device pointer 4466 * 4467 * check if there is any job in mirror list 4468 */ 4469 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4470 { 4471 int i; 4472 struct drm_sched_job *job; 4473 4474 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4475 struct amdgpu_ring *ring = adev->rings[i]; 4476 4477 if (!ring || !ring->sched.thread) 4478 continue; 4479 4480 spin_lock(&ring->sched.job_list_lock); 4481 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4482 struct drm_sched_job, list); 4483 spin_unlock(&ring->sched.job_list_lock); 4484 if (job) 4485 return true; 4486 } 4487 return false; 4488 } 4489 4490 /** 4491 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4492 * 4493 * @adev: amdgpu_device pointer 4494 * 4495 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4496 * a hung GPU. 4497 */ 4498 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4499 { 4500 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4501 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4502 return false; 4503 } 4504 4505 if (amdgpu_gpu_recovery == 0) 4506 goto disabled; 4507 4508 if (amdgpu_sriov_vf(adev)) 4509 return true; 4510 4511 if (amdgpu_gpu_recovery == -1) { 4512 switch (adev->asic_type) { 4513 case CHIP_BONAIRE: 4514 case CHIP_HAWAII: 4515 case CHIP_TOPAZ: 4516 case CHIP_TONGA: 4517 case CHIP_FIJI: 4518 case CHIP_POLARIS10: 4519 case CHIP_POLARIS11: 4520 case CHIP_POLARIS12: 4521 case CHIP_VEGAM: 4522 case CHIP_VEGA20: 4523 case CHIP_VEGA10: 4524 case CHIP_VEGA12: 4525 case CHIP_RAVEN: 4526 case CHIP_ARCTURUS: 4527 case CHIP_RENOIR: 4528 case CHIP_NAVI10: 4529 case CHIP_NAVI14: 4530 case CHIP_NAVI12: 4531 case CHIP_SIENNA_CICHLID: 4532 case CHIP_NAVY_FLOUNDER: 4533 case CHIP_DIMGREY_CAVEFISH: 4534 case CHIP_BEIGE_GOBY: 4535 case CHIP_VANGOGH: 4536 case CHIP_ALDEBARAN: 4537 break; 4538 default: 4539 goto disabled; 4540 } 4541 } 4542 4543 return true; 4544 4545 disabled: 4546 dev_info(adev->dev, "GPU recovery disabled.\n"); 4547 return false; 4548 } 4549 4550 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4551 { 4552 u32 i; 4553 int ret = 0; 4554 4555 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4556 4557 dev_info(adev->dev, "GPU mode1 reset\n"); 4558 4559 /* disable BM */ 4560 pci_clear_master(adev->pdev); 4561 4562 amdgpu_device_cache_pci_state(adev->pdev); 4563 4564 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4565 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4566 ret = amdgpu_dpm_mode1_reset(adev); 4567 } else { 4568 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4569 ret = psp_gpu_reset(adev); 4570 } 4571 4572 if (ret) 4573 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4574 4575 amdgpu_device_load_pci_state(adev->pdev); 4576 4577 /* wait for asic to come out of reset */ 4578 for (i = 0; i < adev->usec_timeout; i++) { 4579 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4580 4581 if (memsize != 0xffffffff) 4582 break; 4583 udelay(1); 4584 } 4585 4586 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4587 return ret; 4588 } 4589 4590 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4591 struct amdgpu_reset_context *reset_context) 4592 { 4593 int i, j, r = 0; 4594 struct amdgpu_job *job = NULL; 4595 bool need_full_reset = 4596 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4597 4598 if (reset_context->reset_req_dev == adev) 4599 job = reset_context->job; 4600 4601 if (amdgpu_sriov_vf(adev)) { 4602 /* stop the data exchange thread */ 4603 amdgpu_virt_fini_data_exchange(adev); 4604 } 4605 4606 /* block all schedulers and reset given job's ring */ 4607 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4608 struct amdgpu_ring *ring = adev->rings[i]; 4609 4610 if (!ring || !ring->sched.thread) 4611 continue; 4612 4613 /*clear job fence from fence drv to avoid force_completion 4614 *leave NULL and vm flush fence in fence drv */ 4615 for (j = 0; j <= ring->fence_drv.num_fences_mask; 
j++) { 4616 struct dma_fence *old, **ptr; 4617 4618 ptr = &ring->fence_drv.fences[j]; 4619 old = rcu_dereference_protected(*ptr, 1); 4620 if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) { 4621 RCU_INIT_POINTER(*ptr, NULL); 4622 } 4623 } 4624 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4625 amdgpu_fence_driver_force_completion(ring); 4626 } 4627 4628 if (job && job->vm) 4629 drm_sched_increase_karma(&job->base); 4630 4631 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4632 /* If reset handler not implemented, continue; otherwise return */ 4633 if (r == -ENOSYS) 4634 r = 0; 4635 else 4636 return r; 4637 4638 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4639 if (!amdgpu_sriov_vf(adev)) { 4640 4641 if (!need_full_reset) 4642 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4643 4644 if (!need_full_reset) { 4645 amdgpu_device_ip_pre_soft_reset(adev); 4646 r = amdgpu_device_ip_soft_reset(adev); 4647 amdgpu_device_ip_post_soft_reset(adev); 4648 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4649 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4650 need_full_reset = true; 4651 } 4652 } 4653 4654 if (need_full_reset) 4655 r = amdgpu_device_ip_suspend(adev); 4656 if (need_full_reset) 4657 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4658 else 4659 clear_bit(AMDGPU_NEED_FULL_RESET, 4660 &reset_context->flags); 4661 } 4662 4663 return r; 4664 } 4665 4666 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4667 struct amdgpu_reset_context *reset_context) 4668 { 4669 struct amdgpu_device *tmp_adev = NULL; 4670 bool need_full_reset, skip_hw_reset, vram_lost = false; 4671 int r = 0; 4672 4673 /* Try reset handler method first */ 4674 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4675 reset_list); 4676 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4677 /* If reset handler not implemented, continue; otherwise return */ 4678 if (r == -ENOSYS) 4679 r = 0; 4680 else 4681 return r; 4682 4683 /* Reset handler not implemented, use the default method */ 4684 need_full_reset = 4685 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4686 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4687 4688 /* 4689 * ASIC reset has to be done on all XGMI hive nodes ASAP 4690 * to allow proper links negotiation in FW (within 1 sec) 4691 */ 4692 if (!skip_hw_reset && need_full_reset) { 4693 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4694 /* For XGMI run all resets in parallel to speed up the process */ 4695 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4696 tmp_adev->gmc.xgmi.pending_reset = false; 4697 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4698 r = -EALREADY; 4699 } else 4700 r = amdgpu_asic_reset(tmp_adev); 4701 4702 if (r) { 4703 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4704 r, adev_to_drm(tmp_adev)->unique); 4705 break; 4706 } 4707 } 4708 4709 /* For XGMI wait for all resets to complete before proceed */ 4710 if (!r) { 4711 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4712 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4713 flush_work(&tmp_adev->xgmi_reset_work); 4714 r = tmp_adev->asic_reset_res; 4715 if (r) 4716 break; 4717 } 4718 } 4719 } 4720 } 4721 4722 if (!r && amdgpu_ras_intr_triggered()) { 4723 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4724 if (tmp_adev->mmhub.ras_funcs && 
4725 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4726 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4727 } 4728 4729 amdgpu_ras_intr_cleared(); 4730 } 4731 4732 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4733 if (need_full_reset) { 4734 /* post card */ 4735 r = amdgpu_device_asic_init(tmp_adev); 4736 if (r) { 4737 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4738 } else { 4739 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4740 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4741 if (r) 4742 goto out; 4743 4744 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4745 if (r) 4746 goto out; 4747 4748 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4749 if (vram_lost) { 4750 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4751 amdgpu_inc_vram_lost(tmp_adev); 4752 } 4753 4754 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4755 if (r) 4756 goto out; 4757 4758 r = amdgpu_device_fw_loading(tmp_adev); 4759 if (r) 4760 return r; 4761 4762 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4763 if (r) 4764 goto out; 4765 4766 if (vram_lost) 4767 amdgpu_device_fill_reset_magic(tmp_adev); 4768 4769 /* 4770 * Add this ASIC as tracked as reset was already 4771 * complete successfully. 4772 */ 4773 amdgpu_register_gpu_instance(tmp_adev); 4774 4775 if (!reset_context->hive && 4776 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4777 amdgpu_xgmi_add_device(tmp_adev); 4778 4779 r = amdgpu_device_ip_late_init(tmp_adev); 4780 if (r) 4781 goto out; 4782 4783 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4784 4785 /* 4786 * The GPU enters bad state once faulty pages 4787 * by ECC has reached the threshold, and ras 4788 * recovery is scheduled next. So add one check 4789 * here to break recovery if it indeed exceeds 4790 * bad page threshold, and remind user to 4791 * retire this GPU or setting one bigger 4792 * bad_page_threshold value to fix this once 4793 * probing driver again. 4794 */ 4795 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4796 /* must succeed. 
 */
					amdgpu_ras_resume(tmp_adev);
				} else {
					r = -EINVAL;
					goto out;
				}

				/* Update PSP FW topology after reset */
				if (reset_context->hive &&
				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(
							reset_context->hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	if (need_full_reset)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
		return false;

	if (hive) {
		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
	} else {
		down_write(&adev->reset_sem);
	}

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	atomic_set(&adev->in_gpu_reset, 0);
	up_write(&adev->reset_sem);
}

/*
 * To lock a list of amdgpu devices in a hive safely; if this is not a hive
 * with multiple nodes, it behaves the same as amdgpu_device_lock_adev.
 *
 * unlock won't require a roll back.
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * If the lock iteration broke in the middle of a hive, it may
		 * mean there is a race, or that a hive device locked up
		 * independently.  We may or may not be in trouble, so roll
		 * back the locks taken so far and give a warning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); 4905 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4906 amdgpu_device_unlock_adev(tmp_adev); 4907 } 4908 } 4909 return -EAGAIN; 4910 } 4911 4912 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4913 { 4914 STUB(); 4915 #ifdef notyet 4916 struct pci_dev *p = NULL; 4917 4918 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4919 adev->pdev->bus->number, 1); 4920 if (p) { 4921 pm_runtime_enable(&(p->dev)); 4922 pm_runtime_resume(&(p->dev)); 4923 } 4924 #endif 4925 } 4926 4927 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4928 { 4929 enum amd_reset_method reset_method; 4930 struct pci_dev *p = NULL; 4931 u64 expires; 4932 4933 /* 4934 * For now, only BACO and mode1 reset are confirmed 4935 * to suffer the audio issue without proper suspended. 4936 */ 4937 reset_method = amdgpu_asic_reset_method(adev); 4938 if ((reset_method != AMD_RESET_METHOD_BACO) && 4939 (reset_method != AMD_RESET_METHOD_MODE1)) 4940 return -EINVAL; 4941 4942 STUB(); 4943 return -ENOSYS; 4944 #ifdef notyet 4945 4946 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4947 adev->pdev->bus->number, 1); 4948 if (!p) 4949 return -ENODEV; 4950 4951 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4952 if (!expires) 4953 /* 4954 * If we cannot get the audio device autosuspend delay, 4955 * a fixed 4S interval will be used. Considering 3S is 4956 * the audio controller default autosuspend delay setting. 4957 * 4S used here is guaranteed to cover that. 4958 */ 4959 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4960 4961 while (!pm_runtime_status_suspended(&(p->dev))) { 4962 if (!pm_runtime_suspend(&(p->dev))) 4963 break; 4964 4965 if (expires < ktime_get_mono_fast_ns()) { 4966 dev_warn(adev->dev, "failed to suspend display audio\n"); 4967 /* TODO: abort the succeeding gpu reset? */ 4968 return -ETIMEDOUT; 4969 } 4970 } 4971 4972 pm_runtime_disable(&(p->dev)); 4973 4974 return 0; 4975 #endif 4976 } 4977 4978 static void amdgpu_device_recheck_guilty_jobs( 4979 struct amdgpu_device *adev, struct list_head *device_list_handle, 4980 struct amdgpu_reset_context *reset_context) 4981 { 4982 int i, r = 0; 4983 4984 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4985 struct amdgpu_ring *ring = adev->rings[i]; 4986 int ret = 0; 4987 struct drm_sched_job *s_job; 4988 4989 if (!ring || !ring->sched.thread) 4990 continue; 4991 4992 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4993 struct drm_sched_job, list); 4994 if (s_job == NULL) 4995 continue; 4996 4997 /* clear job's guilty and depend the folowing step to decide the real one */ 4998 drm_sched_reset_karma(s_job); 4999 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 5000 5001 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 5002 if (ret == 0) { /* timeout */ 5003 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 5004 ring->sched.name, s_job->id); 5005 5006 /* set guilty */ 5007 drm_sched_increase_karma(s_job); 5008 retry: 5009 /* do hw reset */ 5010 if (amdgpu_sriov_vf(adev)) { 5011 amdgpu_virt_fini_data_exchange(adev); 5012 r = amdgpu_device_reset_sriov(adev, false); 5013 if (r) 5014 adev->asic_reset_res = r; 5015 } else { 5016 clear_bit(AMDGPU_SKIP_HW_RESET, 5017 &reset_context->flags); 5018 r = amdgpu_do_asic_reset(device_list_handle, 5019 reset_context); 5020 if (r && r == -EAGAIN) 5021 goto retry; 5022 } 5023 5024 /* 5025 * add reset counter so that the following 5026 * resubmitted job could flush vmid 5027 */ 5028 atomic_inc(&adev->gpu_reset_counter); 5029 continue; 5030 } 5031 5032 /* got the hw fence, signal finished fence */ 5033 atomic_dec(ring->sched.score); 5034 dma_fence_get(&s_job->s_fence->finished); 5035 dma_fence_signal(&s_job->s_fence->finished); 5036 dma_fence_put(&s_job->s_fence->finished); 5037 5038 /* remove node from list and free the job */ 5039 spin_lock(&ring->sched.job_list_lock); 5040 list_del_init(&s_job->list); 5041 spin_unlock(&ring->sched.job_list_lock); 5042 ring->sched.ops->free_job(s_job); 5043 } 5044 } 5045 5046 /** 5047 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5048 * 5049 * @adev: amdgpu_device pointer 5050 * @job: which job trigger hang 5051 * 5052 * Attempt to reset the GPU if it has hung (all asics). 5053 * Attempt to do soft-reset or full-reset and reinitialize Asic 5054 * Returns 0 for success or an error on failure. 5055 */ 5056 5057 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5058 struct amdgpu_job *job) 5059 { 5060 struct list_head device_list, *device_list_handle = NULL; 5061 bool job_signaled = false; 5062 struct amdgpu_hive_info *hive = NULL; 5063 struct amdgpu_device *tmp_adev = NULL; 5064 int i, r = 0; 5065 bool need_emergency_restart = false; 5066 bool audio_suspended = false; 5067 int tmp_vram_lost_counter; 5068 struct amdgpu_reset_context reset_context; 5069 5070 memset(&reset_context, 0, sizeof(reset_context)); 5071 5072 /* 5073 * Special case: RAS triggered and full reset isn't supported 5074 */ 5075 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5076 5077 /* 5078 * Flush RAM to disk so that after reboot 5079 * the user can read log and see why the system rebooted. 5080 */ 5081 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5082 DRM_WARN("Emergency reboot."); 5083 5084 #ifdef notyet 5085 ksys_sync_helper(); 5086 emergency_restart(); 5087 #else 5088 panic("emergency_restart"); 5089 #endif 5090 } 5091 5092 dev_info(adev->dev, "GPU %s begin!\n", 5093 need_emergency_restart ? "jobs stop":"reset"); 5094 5095 /* 5096 * Here we trylock to avoid chain of resets executing from 5097 * either trigger by jobs on different adevs in XGMI hive or jobs on 5098 * different schedulers for same device while this TO handler is running. 5099 * We always reset all schedulers for device and all devices for XGMI 5100 * hive so that should take care of them too. 5101 */ 5102 hive = amdgpu_get_xgmi_hive(adev); 5103 if (hive) { 5104 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5105 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5106 job ? 
job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			if (job && job->vm)
				drm_sched_increase_karma(&job->base);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.job = job;
	reset_context.hive = hive;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/*
	 * Lock the device before we try to operate on the linked list;
	 * if we didn't get the device lock, don't touch the linked list
	 * since others may be iterating over it.
	 */
	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
			  job ? job->base.id : -1);

		/* even though we skipped this reset, we still need to set the job to guilty */
		if (job && job->vm)
			drm_sched_increase_karma(&job->base);
		goto skip_recovery;
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Try to put the audio codec into suspend state
		 * before gpu reset started.
		 *
		 * The power domain of the graphics device is shared with
		 * the AZ power domain.  Without this, we may change the
		 * audio hardware from behind the audio driver's back and
		 * trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after reset completed.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
5209 * 5210 * job->base holds a reference to parent fence 5211 */ 5212 if (job && job->base.s_fence->parent && 5213 dma_fence_is_signaled(job->base.s_fence->parent)) { 5214 job_signaled = true; 5215 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5216 goto skip_hw_reset; 5217 } 5218 5219 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5220 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5221 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5222 /*TODO Should we stop ?*/ 5223 if (r) { 5224 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5225 r, adev_to_drm(tmp_adev)->unique); 5226 tmp_adev->asic_reset_res = r; 5227 } 5228 } 5229 5230 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5231 /* Actual ASIC resets if needed.*/ 5232 /* TODO Implement XGMI hive reset logic for SRIOV */ 5233 if (amdgpu_sriov_vf(adev)) { 5234 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5235 if (r) 5236 adev->asic_reset_res = r; 5237 } else { 5238 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5239 if (r && r == -EAGAIN) 5240 goto retry; 5241 } 5242 5243 skip_hw_reset: 5244 5245 /* Post ASIC reset for all devs .*/ 5246 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5247 5248 /* 5249 * Sometimes a later bad compute job can block a good gfx job as gfx 5250 * and compute ring share internal GC HW mutually. We add an additional 5251 * guilty jobs recheck step to find the real guilty job, it synchronously 5252 * submits and pends for the first job being signaled. If it gets timeout, 5253 * we identify it as a real guilty job. 5254 */ 5255 if (amdgpu_gpu_recovery == 2 && 5256 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5257 amdgpu_device_recheck_guilty_jobs( 5258 tmp_adev, device_list_handle, &reset_context); 5259 5260 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5261 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5262 5263 if (!ring || !ring->sched.thread) 5264 continue; 5265 5266 /* No point to resubmit jobs if we didn't HW reset*/ 5267 if (!tmp_adev->asic_reset_res && !job_signaled) 5268 drm_sched_resubmit_jobs(&ring->sched); 5269 5270 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5271 } 5272 5273 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5274 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5275 } 5276 5277 tmp_adev->asic_reset_res = 0; 5278 5279 if (r) { 5280 /* bad news, how to tell it to userspace ? 
 */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it was not initialized
		 * before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |=
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5377 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5378 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5379 else if (speed_cap == PCIE_SPEED_5_0GT) 5380 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5381 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5382 else 5383 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5384 } 5385 /* platform caps */ 5386 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5387 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5388 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5389 } else { 5390 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5391 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5392 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5393 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5394 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5395 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5396 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5397 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5398 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5399 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5400 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5401 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5402 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5403 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5404 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5405 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5406 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5407 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5408 else 5409 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5410 5411 } 5412 } 5413 if (adev->pm.pcie_mlw_mask == 0) { 5414 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5415 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5416 } else { 5417 switch (platform_link_width) { 5418 case PCIE_LNK_X32: 5419 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5420 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5424 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5425 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5426 break; 5427 case PCIE_LNK_X16: 5428 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5432 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5433 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5434 break; 5435 case PCIE_LNK_X12: 5436 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5439 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5440 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5441 break; 5442 case PCIE_LNK_X8: 5443 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5447 break; 5448 case PCIE_LNK_X4: 5449 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5450 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5452 break; 5453 case PCIE_LNK_X2: 5454 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5456 break; 5457 case PCIE_LNK_X1: 5458 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5459 break; 5460 default: 5461 break; 5462 } 5463 } 5464 } 5465 } 5466 5467 int amdgpu_device_baco_enter(struct drm_device *dev) 5468 { 5469 struct amdgpu_device *adev = drm_to_adev(dev); 5470 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5471 
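	/*
	 * BACO (Bus Active, Chip Off) keeps the PCIe bus active while the
	 * chip is powered down.  Doorbell interrupts are disabled here and
	 * re-enabled in amdgpu_device_baco_exit(), so the two calls must
	 * always be paired.
	 */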
5472 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5473 return -ENOTSUPP; 5474 5475 if (ras && adev->ras_enabled && 5476 adev->nbio.funcs->enable_doorbell_interrupt) 5477 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5478 5479 return amdgpu_dpm_baco_enter(adev); 5480 } 5481 5482 int amdgpu_device_baco_exit(struct drm_device *dev) 5483 { 5484 struct amdgpu_device *adev = drm_to_adev(dev); 5485 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5486 int ret = 0; 5487 5488 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5489 return -ENOTSUPP; 5490 5491 ret = amdgpu_dpm_baco_exit(adev); 5492 if (ret) 5493 return ret; 5494 5495 if (ras && adev->ras_enabled && 5496 adev->nbio.funcs->enable_doorbell_interrupt) 5497 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5498 5499 if (amdgpu_passthrough(adev) && 5500 adev->nbio.funcs->clear_doorbell_interrupt) 5501 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5502 5503 return 0; 5504 } 5505 5506 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5507 { 5508 int i; 5509 5510 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5511 struct amdgpu_ring *ring = adev->rings[i]; 5512 5513 if (!ring || !ring->sched.thread) 5514 continue; 5515 5516 cancel_delayed_work_sync(&ring->sched.work_tdr); 5517 } 5518 } 5519 5520 /** 5521 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5522 * @pdev: PCI device struct 5523 * @state: PCI channel state 5524 * 5525 * Description: Called when a PCI error is detected. 5526 * 5527 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5528 */ 5529 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5530 { 5531 STUB(); 5532 return 0; 5533 #ifdef notyet 5534 struct drm_device *dev = pci_get_drvdata(pdev); 5535 struct amdgpu_device *adev = drm_to_adev(dev); 5536 int i; 5537 5538 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5539 5540 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5541 DRM_WARN("No support for XGMI hive yet..."); 5542 return PCI_ERS_RESULT_DISCONNECT; 5543 } 5544 5545 adev->pci_channel_state = state; 5546 5547 switch (state) { 5548 case pci_channel_io_normal: 5549 return PCI_ERS_RESULT_CAN_RECOVER; 5550 /* Fatal error, prepare for slot reset */ 5551 case pci_channel_io_frozen: 5552 /* 5553 * Cancel and wait for all TDRs in progress if failing to 5554 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5555 * 5556 * Locking adev->reset_sem will prevent any external access 5557 * to GPU during PCI error recovery 5558 */ 5559 while (!amdgpu_device_lock_adev(adev, NULL)) 5560 amdgpu_cancel_all_tdr(adev); 5561 5562 /* 5563 * Block any work scheduling as we do for regular GPU reset 5564 * for the duration of the recovery 5565 */ 5566 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5567 struct amdgpu_ring *ring = adev->rings[i]; 5568 5569 if (!ring || !ring->sched.thread) 5570 continue; 5571 5572 drm_sched_stop(&ring->sched, NULL); 5573 } 5574 atomic_inc(&adev->gpu_reset_counter); 5575 return PCI_ERS_RESULT_NEED_RESET; 5576 case pci_channel_io_perm_failure: 5577 /* Permanent error, prepare for device removal */ 5578 return PCI_ERS_RESULT_DISCONNECT; 5579 } 5580 5581 return PCI_ERS_RESULT_NEED_RESET; 5582 #endif 5583 } 5584 5585 /** 5586 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5587 * @pdev: pointer to PCI device 5588 */ 5589 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5590 { 5591 5592 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5593 5594 /* TODO - dump 
whatever for debugging purposes */ 5595 5596 /* This called only if amdgpu_pci_error_detected returns 5597 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5598 * works, no need to reset slot. 5599 */ 5600 5601 return PCI_ERS_RESULT_RECOVERED; 5602 } 5603 5604 /** 5605 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5606 * @pdev: PCI device struct 5607 * 5608 * Description: This routine is called by the pci error recovery 5609 * code after the PCI slot has been reset, just before we 5610 * should resume normal operations. 5611 */ 5612 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5613 { 5614 STUB(); 5615 return PCI_ERS_RESULT_RECOVERED; 5616 #ifdef notyet 5617 struct drm_device *dev = pci_get_drvdata(pdev); 5618 struct amdgpu_device *adev = drm_to_adev(dev); 5619 int r, i; 5620 struct amdgpu_reset_context reset_context; 5621 u32 memsize; 5622 struct list_head device_list; 5623 5624 DRM_INFO("PCI error: slot reset callback!!\n"); 5625 5626 memset(&reset_context, 0, sizeof(reset_context)); 5627 5628 INIT_LIST_HEAD(&device_list); 5629 list_add_tail(&adev->reset_list, &device_list); 5630 5631 /* wait for asic to come out of reset */ 5632 drm_msleep(500); 5633 5634 /* Restore PCI confspace */ 5635 amdgpu_device_load_pci_state(pdev); 5636 5637 /* confirm ASIC came out of reset */ 5638 for (i = 0; i < adev->usec_timeout; i++) { 5639 memsize = amdgpu_asic_get_config_memsize(adev); 5640 5641 if (memsize != 0xffffffff) 5642 break; 5643 udelay(1); 5644 } 5645 if (memsize == 0xffffffff) { 5646 r = -ETIME; 5647 goto out; 5648 } 5649 5650 reset_context.method = AMD_RESET_METHOD_NONE; 5651 reset_context.reset_req_dev = adev; 5652 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5653 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5654 5655 adev->no_hw_access = true; 5656 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5657 adev->no_hw_access = false; 5658 if (r) 5659 goto out; 5660 5661 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5662 5663 out: 5664 if (!r) { 5665 if (amdgpu_device_cache_pci_state(adev->pdev)) 5666 pci_restore_state(adev->pdev); 5667 5668 DRM_INFO("PCIe error recovery succeeded\n"); 5669 } else { 5670 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5671 amdgpu_device_unlock_adev(adev); 5672 } 5673 5674 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5675 #endif 5676 } 5677 5678 /** 5679 * amdgpu_pci_resume() - resume normal ops after PCI reset 5680 * @pdev: pointer to PCI device 5681 * 5682 * Called when the error recovery driver tells us that its 5683 * OK to resume normal operation. 
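 * The scheduler work below is only restarted when the previously recorded
 * channel state was pci_channel_io_frozen; for recoverable errors there is
 * nothing to undo here.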
5684 */ 5685 void amdgpu_pci_resume(struct pci_dev *pdev) 5686 { 5687 STUB(); 5688 #ifdef notyet 5689 struct drm_device *dev = pci_get_drvdata(pdev); 5690 struct amdgpu_device *adev = drm_to_adev(dev); 5691 int i; 5692 5693 5694 DRM_INFO("PCI error: resume callback!!\n"); 5695 5696 /* Only continue execution for the case of pci_channel_io_frozen */ 5697 if (adev->pci_channel_state != pci_channel_io_frozen) 5698 return; 5699 5700 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5701 struct amdgpu_ring *ring = adev->rings[i]; 5702 5703 if (!ring || !ring->sched.thread) 5704 continue; 5705 5706 5707 drm_sched_resubmit_jobs(&ring->sched); 5708 drm_sched_start(&ring->sched, true); 5709 } 5710 5711 amdgpu_device_unlock_adev(adev); 5712 #endif 5713 } 5714 5715 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5716 { 5717 return false; 5718 #ifdef notyet 5719 struct drm_device *dev = pci_get_drvdata(pdev); 5720 struct amdgpu_device *adev = drm_to_adev(dev); 5721 int r; 5722 5723 r = pci_save_state(pdev); 5724 if (!r) { 5725 kfree(adev->pci_state); 5726 5727 adev->pci_state = pci_store_saved_state(pdev); 5728 5729 if (!adev->pci_state) { 5730 DRM_ERROR("Failed to store PCI saved state"); 5731 return false; 5732 } 5733 } else { 5734 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5735 return false; 5736 } 5737 5738 return true; 5739 #endif 5740 } 5741 5742 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5743 { 5744 STUB(); 5745 return false; 5746 #ifdef notyet 5747 struct drm_device *dev = pci_get_drvdata(pdev); 5748 struct amdgpu_device *adev = drm_to_adev(dev); 5749 int r; 5750 5751 if (!adev->pci_state) 5752 return false; 5753 5754 r = pci_load_saved_state(pdev, adev->pci_state); 5755 5756 if (!r) { 5757 pci_restore_state(pdev); 5758 } else { 5759 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5760 return false; 5761 } 5762 5763 return true; 5764 #endif 5765 } 5766 5767 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5768 struct amdgpu_ring *ring) 5769 { 5770 #ifdef CONFIG_X86_64 5771 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5772 return; 5773 #endif 5774 if (adev->gmc.xgmi.connected_to_cpu) 5775 return; 5776 5777 if (ring && ring->funcs->emit_hdp_flush) 5778 amdgpu_ring_emit_hdp_flush(ring); 5779 else 5780 amdgpu_asic_flush_hdp(adev, ring); 5781 } 5782 5783 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5784 struct amdgpu_ring *ring) 5785 { 5786 #ifdef CONFIG_X86_64 5787 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 5788 return; 5789 #endif 5790 if (adev->gmc.xgmi.connected_to_cpu) 5791 return; 5792 5793 amdgpu_asic_invalidate_hdp(adev, ring); 5794 } 5795
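
/*
 * Illustrative example (not part of the driver): the HDP helpers above are
 * typically used to order CPU accesses to VRAM against GPU accesses.  A
 * hypothetical caller that fills a VRAM buffer through the CPU BAR and then
 * wants the GPU to observe the data might do something like:
 *
 *	memcpy_toio(vram_cpu_addr, data, size);
 *	amdgpu_device_flush_hdp(adev, NULL);
 *
 * and, conversely, call amdgpu_device_invalidate_hdp() before the CPU reads
 * back data the GPU has just written.  Here vram_cpu_addr, data and size are
 * placeholder names; both helpers are no-ops on APUs (when not in
 * passthrough) and on parts whose memory is cache-coherent with the CPU, as
 * implemented above.
 */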