1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 69 #include <linux/suspend.h> 70 #include <drm/task_barrier.h> 71 #include <linux/pm_runtime.h> 72 73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 83 84 #define AMDGPU_RESUME_MS 2000 85 86 const char *amdgpu_asic_name[] = { 87 "TAHITI", 88 "PITCAIRN", 89 "VERDE", 90 "OLAND", 91 "HAINAN", 92 "BONAIRE", 93 "KAVERI", 94 "KABINI", 95 "HAWAII", 96 "MULLINS", 97 "TOPAZ", 98 "TONGA", 99 "FIJI", 100 "CARRIZO", 101 "STONEY", 102 "POLARIS10", 103 "POLARIS11", 104 "POLARIS12", 105 "VEGAM", 106 "VEGA10", 107 "VEGA12", 108 "VEGA20", 109 "RAVEN", 110 "ARCTURUS", 111 "RENOIR", 112 "NAVI10", 113 "NAVI14", 114 "NAVI12", 115 "SIENNA_CICHLID", 116 "NAVY_FLOUNDER", 117 "LAST", 118 }; 119 120 /** 121 * DOC: pcie_replay_count 122 * 123 * The amdgpu driver provides a sysfs API for 
 * reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
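 *
 * Illustrative use (hypothetical caller, not part of this file) for
 * choosing a runtime power-off strategy from these two helpers:
 *
 *	if (amdgpu_device_supports_boco(dev))
 *		use HG/PX (ACPI) based runtime power control
 *	else if (amdgpu_device_supports_baco(dev))
 *		use BACO (bus active, chip off) for runtime power savings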
234 */ 235 bool amdgpu_device_supports_baco(struct drm_device *dev) 236 { 237 struct amdgpu_device *adev = drm_to_adev(dev); 238 239 return amdgpu_asic_supports_baco(adev); 240 } 241 242 /* 243 * VRAM access helper functions 244 */ 245 246 /** 247 * amdgpu_device_vram_access - read/write a buffer in vram 248 * 249 * @adev: amdgpu_device pointer 250 * @pos: offset of the buffer in vram 251 * @buf: virtual address of the buffer in system memory 252 * @size: read/write size, sizeof(@buf) must > @size 253 * @write: true - write to vram, otherwise - read from vram 254 */ 255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 256 uint32_t *buf, size_t size, bool write) 257 { 258 unsigned long flags; 259 uint32_t hi = ~0; 260 uint64_t last; 261 262 263 #ifdef CONFIG_64BIT 264 last = min(pos + size, adev->gmc.visible_vram_size); 265 if (last > pos) { 266 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 267 size_t count = last - pos; 268 269 if (write) { 270 memcpy_toio(addr, buf, count); 271 mb(); 272 amdgpu_asic_flush_hdp(adev, NULL); 273 } else { 274 amdgpu_asic_invalidate_hdp(adev, NULL); 275 mb(); 276 memcpy_fromio(buf, addr, count); 277 } 278 279 if (count == size) 280 return; 281 282 pos += count; 283 buf += count / 4; 284 size -= count; 285 } 286 #endif 287 288 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 289 for (last = pos + size; pos < last; pos += 4) { 290 uint32_t tmp = pos >> 31; 291 292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 293 if (tmp != hi) { 294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 295 hi = tmp; 296 } 297 if (write) 298 WREG32_NO_KIQ(mmMM_DATA, *buf++); 299 else 300 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 301 } 302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 303 } 304 305 /* 306 * register access helper functions. 307 */ 308 /** 309 * amdgpu_device_rreg - read a memory mapped IO or indirect register 310 * 311 * @adev: amdgpu_device pointer 312 * @reg: dword aligned register offset 313 * @acc_flags: access flags which require special behavior 314 * 315 * Returns the 32 bit value from the offset specified. 316 */ 317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 318 uint32_t reg, uint32_t acc_flags) 319 { 320 uint32_t ret; 321 322 if (adev->in_pci_err_recovery) 323 return 0; 324 325 if ((reg * 4) < adev->rmmio_size) { 326 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 327 amdgpu_sriov_runtime(adev) && 328 down_read_trylock(&adev->reset_sem)) { 329 ret = amdgpu_kiq_rreg(adev, reg); 330 up_read(&adev->reset_sem); 331 } else { 332 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 333 } 334 } else { 335 ret = adev->pcie_rreg(adev, reg * 4); 336 } 337 338 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 339 340 return ret; 341 } 342 343 /* 344 * MMIO register read with bytes helper functions 345 * @offset:bytes offset from MMIO start 346 * 347 */ 348 349 /** 350 * amdgpu_mm_rreg8 - read a memory mapped IO register 351 * 352 * @adev: amdgpu_device pointer 353 * @offset: byte aligned register offset 354 * 355 * Returns the 8 bit value from the offset specified. 
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
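 *
 * Offsets that fall outside the mapped PCI I/O window are reached
 * indirectly: the byte offset is written to mmMM_INDEX and the value is
 * then read back through mmMM_DATA, as the function body below shows.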
454 */ 455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 456 { 457 u32 val; 458 459 if (adev->in_pci_err_recovery) 460 return 0; 461 462 if ((reg * 4) < adev->rio_mem_size) { 463 val = bus_space_read_4(adev->rio_mem_bst, adev->rio_mem_bsh, reg); 464 bus_space_barrier(adev->rio_mem_bst, adev->rio_mem_bsh, 0, 465 adev->rio_mem_size, BUS_SPACE_BARRIER_READ); 466 } else { 467 bus_space_barrier(adev->rio_mem_bst, adev->rio_mem_bsh, 0, 468 adev->rio_mem_size, BUS_SPACE_BARRIER_WRITE); 469 bus_space_write_4(adev->rio_mem_bst, adev->rio_mem_bsh, 470 mmMM_INDEX * 4, reg * 4); 471 val = bus_space_read_4(adev->rio_mem_bst, adev->rio_mem_bsh, 472 mmMM_INDEX * 4); 473 bus_space_barrier(adev->rio_mem_bst, adev->rio_mem_bsh, 0, 474 adev->rio_mem_size, BUS_SPACE_BARRIER_READ); 475 } 476 477 return val; 478 } 479 480 /** 481 * amdgpu_io_wreg - write to an IO register 482 * 483 * @adev: amdgpu_device pointer 484 * @reg: dword aligned register offset 485 * @v: 32 bit value to write to the register 486 * 487 * Writes the value specified to the offset specified. 488 */ 489 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 490 { 491 if (adev->in_pci_err_recovery) 492 return; 493 494 if ((reg * 4) < adev->rio_mem_size) { 495 bus_space_barrier(adev->rio_mem_bst, adev->rio_mem_bsh, 0, 496 adev->rio_mem_size, BUS_SPACE_BARRIER_WRITE); 497 bus_space_write_4(adev->rio_mem_bst, adev->rio_mem_bsh, 498 reg * 4, v); 499 } else { 500 bus_space_barrier(adev->rio_mem_bst, adev->rio_mem_bsh, 0, 501 adev->rio_mem_size, BUS_SPACE_BARRIER_WRITE); 502 bus_space_write_4(adev->rio_mem_bst, adev->rio_mem_bsh, 503 mmMM_INDEX * 4, reg * 4); 504 bus_space_barrier(adev->rio_mem_bst, adev->rio_mem_bsh, 0, 505 adev->rio_mem_size, BUS_SPACE_BARRIER_WRITE); 506 bus_space_write_4(adev->rio_mem_bst, adev->rio_mem_bsh, 507 mmMM_DATA * 4, v); 508 509 } 510 } 511 512 /** 513 * amdgpu_mm_rdoorbell - read a doorbell dword 514 * 515 * @adev: amdgpu_device pointer 516 * @index: doorbell index 517 * 518 * Returns the value in the doorbell aperture at the 519 * requested doorbell index (CIK). 520 */ 521 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 522 { 523 if (adev->in_pci_err_recovery) 524 return 0; 525 526 if (index < adev->doorbell.num_doorbells) { 527 return readl(adev->doorbell.ptr + index); 528 } else { 529 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 530 return 0; 531 } 532 } 533 534 /** 535 * amdgpu_mm_wdoorbell - write a doorbell dword 536 * 537 * @adev: amdgpu_device pointer 538 * @index: doorbell index 539 * @v: value to write 540 * 541 * Writes @v to the doorbell aperture at the 542 * requested doorbell index (CIK). 543 */ 544 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 545 { 546 if (adev->in_pci_err_recovery) 547 return; 548 549 if (index < adev->doorbell.num_doorbells) { 550 writel(v, adev->doorbell.ptr + index); 551 } else { 552 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 553 } 554 } 555 556 /** 557 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 558 * 559 * @adev: amdgpu_device pointer 560 * @index: doorbell index 561 * 562 * Returns the value in the doorbell aperture at the 563 * requested doorbell index (VEGA10+). 
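 *
 * 64-bit doorbells are used on VEGA10 and later, where ring write pointers
 * no longer fit in 32 bits.  Illustrative example of the matching write
 * side (callers normally go through the WDOORBELL64() macro):
 *
 *	amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);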
564 */ 565 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 566 { 567 if (adev->in_pci_err_recovery) 568 return 0; 569 570 if (index < adev->doorbell.num_doorbells) { 571 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 572 } else { 573 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 574 return 0; 575 } 576 } 577 578 /** 579 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 580 * 581 * @adev: amdgpu_device pointer 582 * @index: doorbell index 583 * @v: value to write 584 * 585 * Writes @v to the doorbell aperture at the 586 * requested doorbell index (VEGA10+). 587 */ 588 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 589 { 590 if (adev->in_pci_err_recovery) 591 return; 592 593 if (index < adev->doorbell.num_doorbells) { 594 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 595 } else { 596 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 597 } 598 } 599 600 /** 601 * amdgpu_device_indirect_rreg - read an indirect register 602 * 603 * @adev: amdgpu_device pointer 604 * @pcie_index: mmio register offset 605 * @pcie_data: mmio register offset 606 * 607 * Returns the value of indirect register @reg_addr 608 */ 609 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 610 u32 pcie_index, u32 pcie_data, 611 u32 reg_addr) 612 { 613 unsigned long flags; 614 u32 r; 615 void __iomem *pcie_index_offset; 616 void __iomem *pcie_data_offset; 617 618 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 619 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 620 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 621 622 writel(reg_addr, pcie_index_offset); 623 readl(pcie_index_offset); 624 r = readl(pcie_data_offset); 625 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 626 627 return r; 628 } 629 630 /** 631 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 632 * 633 * @adev: amdgpu_device pointer 634 * @pcie_index: mmio register offset 635 * @pcie_data: mmio register offset 636 * 637 * Returns the value of indirect register @reg_addr 638 */ 639 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 640 u32 pcie_index, u32 pcie_data, 641 u32 reg_addr) 642 { 643 unsigned long flags; 644 u64 r; 645 void __iomem *pcie_index_offset; 646 void __iomem *pcie_data_offset; 647 648 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 649 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 650 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 651 652 /* read low 32 bits */ 653 writel(reg_addr, pcie_index_offset); 654 readl(pcie_index_offset); 655 r = readl(pcie_data_offset); 656 /* read high 32 bits */ 657 writel(reg_addr + 4, pcie_index_offset); 658 readl(pcie_index_offset); 659 r |= ((u64)readl(pcie_data_offset) << 32); 660 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 661 662 return r; 663 } 664 665 /** 666 * amdgpu_device_indirect_wreg - write an indirect register address 667 * 668 * @adev: amdgpu_device pointer 669 * @pcie_index: mmio register offset 670 * @pcie_data: mmio register offset 671 * @reg_addr: indirect register offset 672 * @reg_data: indirect register data 673 * 674 */ 675 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 676 u32 pcie_index, u32 pcie_data, 677 u32 reg_addr, u32 reg_data) 678 { 679 unsigned long flags; 680 void __iomem *pcie_index_offset; 681 void __iomem *pcie_data_offset; 682 683 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 684 pcie_index_offset = (void __iomem *)adev->rmmio + 
pcie_index * 4; 685 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 686 687 writel(reg_addr, pcie_index_offset); 688 readl(pcie_index_offset); 689 writel(reg_data, pcie_data_offset); 690 readl(pcie_data_offset); 691 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 692 } 693 694 /** 695 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 696 * 697 * @adev: amdgpu_device pointer 698 * @pcie_index: mmio register offset 699 * @pcie_data: mmio register offset 700 * @reg_addr: indirect register offset 701 * @reg_data: indirect register data 702 * 703 */ 704 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 705 u32 pcie_index, u32 pcie_data, 706 u32 reg_addr, u64 reg_data) 707 { 708 unsigned long flags; 709 void __iomem *pcie_index_offset; 710 void __iomem *pcie_data_offset; 711 712 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 713 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 714 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 715 716 /* write low 32 bits */ 717 writel(reg_addr, pcie_index_offset); 718 readl(pcie_index_offset); 719 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 720 readl(pcie_data_offset); 721 /* write high 32 bits */ 722 writel(reg_addr + 4, pcie_index_offset); 723 readl(pcie_index_offset); 724 writel((u32)(reg_data >> 32), pcie_data_offset); 725 readl(pcie_data_offset); 726 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 727 } 728 729 /** 730 * amdgpu_invalid_rreg - dummy reg read function 731 * 732 * @adev: amdgpu_device pointer 733 * @reg: offset of register 734 * 735 * Dummy register read function. Used for register blocks 736 * that certain asics don't have (all asics). 737 * Returns the value in the register. 738 */ 739 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 740 { 741 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 742 BUG(); 743 return 0; 744 } 745 746 /** 747 * amdgpu_invalid_wreg - dummy reg write function 748 * 749 * @adev: amdgpu_device pointer 750 * @reg: offset of register 751 * @v: value to write to the register 752 * 753 * Dummy register read function. Used for register blocks 754 * that certain asics don't have (all asics). 755 */ 756 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 757 { 758 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 759 reg, v); 760 BUG(); 761 } 762 763 /** 764 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 765 * 766 * @adev: amdgpu_device pointer 767 * @reg: offset of register 768 * 769 * Dummy register read function. Used for register blocks 770 * that certain asics don't have (all asics). 771 * Returns the value in the register. 772 */ 773 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 774 { 775 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 776 BUG(); 777 return 0; 778 } 779 780 /** 781 * amdgpu_invalid_wreg64 - dummy reg write function 782 * 783 * @adev: amdgpu_device pointer 784 * @reg: offset of register 785 * @v: value to write to the register 786 * 787 * Dummy register read function. Used for register blocks 788 * that certain asics don't have (all asics). 
789 */ 790 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 791 { 792 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 793 reg, v); 794 BUG(); 795 } 796 797 /** 798 * amdgpu_block_invalid_rreg - dummy reg read function 799 * 800 * @adev: amdgpu_device pointer 801 * @block: offset of instance 802 * @reg: offset of register 803 * 804 * Dummy register read function. Used for register blocks 805 * that certain asics don't have (all asics). 806 * Returns the value in the register. 807 */ 808 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 809 uint32_t block, uint32_t reg) 810 { 811 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 812 reg, block); 813 BUG(); 814 return 0; 815 } 816 817 /** 818 * amdgpu_block_invalid_wreg - dummy reg write function 819 * 820 * @adev: amdgpu_device pointer 821 * @block: offset of instance 822 * @reg: offset of register 823 * @v: value to write to the register 824 * 825 * Dummy register read function. Used for register blocks 826 * that certain asics don't have (all asics). 827 */ 828 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 829 uint32_t block, 830 uint32_t reg, uint32_t v) 831 { 832 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 833 reg, block, v); 834 BUG(); 835 } 836 837 /** 838 * amdgpu_device_asic_init - Wrapper for atom asic_init 839 * 840 * @adev: amdgpu_device pointer 841 * 842 * Does any asic specific work and then calls atom asic init. 843 */ 844 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 845 { 846 amdgpu_asic_pre_asic_init(adev); 847 848 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 849 } 850 851 /** 852 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 853 * 854 * @adev: amdgpu_device pointer 855 * 856 * Allocates a scratch page of VRAM for use by various things in the 857 * driver. 858 */ 859 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 860 { 861 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 862 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 863 &adev->vram_scratch.robj, 864 &adev->vram_scratch.gpu_addr, 865 (void **)&adev->vram_scratch.ptr); 866 } 867 868 /** 869 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 870 * 871 * @adev: amdgpu_device pointer 872 * 873 * Frees the VRAM scratch page. 874 */ 875 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 876 { 877 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 878 } 879 880 /** 881 * amdgpu_device_program_register_sequence - program an array of registers. 882 * 883 * @adev: amdgpu_device pointer 884 * @registers: pointer to the register array 885 * @array_size: size of the register array 886 * 887 * Programs an array or registers with and and or masks. 888 * This is a helper for setting golden registers. 
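 *
 * The array is consumed as {offset, and_mask, or_mask} triplets.  A
 * hypothetical example (register names made up for illustration):
 *
 *	static const u32 golden_settings[] = {
 *		mmFOO_CTRL, 0xffffffff, 0x00000001,
 *		mmBAR_CNTL, 0x0000ff00, 0x00003400,
 *	};
 *	amdgpu_device_program_register_sequence(adev, golden_settings,
 *						ARRAY_SIZE(golden_settings));
 *
 * With a full 0xffffffff and_mask the or_mask is written as-is; otherwise
 * the register is read, the and_mask bits are cleared and the or_mask is
 * OR-ed in before writing the result back.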
889 */ 890 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 891 const u32 *registers, 892 const u32 array_size) 893 { 894 u32 tmp, reg, and_mask, or_mask; 895 int i; 896 897 if (array_size % 3) 898 return; 899 900 for (i = 0; i < array_size; i +=3) { 901 reg = registers[i + 0]; 902 and_mask = registers[i + 1]; 903 or_mask = registers[i + 2]; 904 905 if (and_mask == 0xffffffff) { 906 tmp = or_mask; 907 } else { 908 tmp = RREG32(reg); 909 tmp &= ~and_mask; 910 if (adev->family >= AMDGPU_FAMILY_AI) 911 tmp |= (or_mask & and_mask); 912 else 913 tmp |= or_mask; 914 } 915 WREG32(reg, tmp); 916 } 917 } 918 919 /** 920 * amdgpu_device_pci_config_reset - reset the GPU 921 * 922 * @adev: amdgpu_device pointer 923 * 924 * Resets the GPU using the pci config reset sequence. 925 * Only applicable to asics prior to vega10. 926 */ 927 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 928 { 929 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 930 } 931 932 /* 933 * GPU doorbell aperture helpers function. 934 */ 935 /** 936 * amdgpu_device_doorbell_init - Init doorbell driver information. 937 * 938 * @adev: amdgpu_device pointer 939 * 940 * Init doorbell driver information (CIK) 941 * Returns 0 on success, error on failure. 942 */ 943 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 944 { 945 946 /* No doorbell on SI hardware generation */ 947 if (adev->asic_type < CHIP_BONAIRE) { 948 adev->doorbell.base = 0; 949 adev->doorbell.size = 0; 950 adev->doorbell.num_doorbells = 0; 951 adev->doorbell.ptr = NULL; 952 return 0; 953 } 954 955 #ifdef __linux 956 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 957 return -EINVAL; 958 #endif 959 960 amdgpu_asic_init_doorbell_index(adev); 961 962 /* doorbell bar mapping */ 963 #ifdef __linux__ 964 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 965 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 966 #endif 967 968 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 969 adev->doorbell_index.max_assignment+1); 970 if (adev->doorbell.num_doorbells == 0) 971 return -EINVAL; 972 973 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 974 * paging queue doorbell use the second page. The 975 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 976 * doorbells are in the first page. So with paging queue enabled, 977 * the max num_doorbells should + 1 page (0x400 in dword) 978 */ 979 if (adev->asic_type >= CHIP_VEGA10) 980 adev->doorbell.num_doorbells += 0x400; 981 982 #ifdef __linux__ 983 adev->doorbell.ptr = ioremap(adev->doorbell.base, 984 adev->doorbell.num_doorbells * 985 sizeof(u32)); 986 if (adev->doorbell.ptr == NULL) 987 return -ENOMEM; 988 #endif 989 990 return 0; 991 } 992 993 /** 994 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 995 * 996 * @adev: amdgpu_device pointer 997 * 998 * Tear down doorbell driver information (CIK) 999 */ 1000 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1001 { 1002 #ifdef __linux__ 1003 iounmap(adev->doorbell.ptr); 1004 #else 1005 if (adev->doorbell.size > 0) 1006 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 1007 adev->doorbell.size); 1008 #endif 1009 adev->doorbell.ptr = NULL; 1010 } 1011 1012 1013 1014 /* 1015 * amdgpu_device_wb_*() 1016 * Writeback is the method by which the GPU updates special pages in memory 1017 * with the status of certain GPU events (fences, ring pointers,etc.). 
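 *
 * A typical (illustrative) consumer reserves a slot at init time and
 * releases it again on teardown:
 *
 *	r = amdgpu_device_wb_get(adev, &ring->rptr_offs);
 *	...
 *	amdgpu_device_wb_free(adev, ring->rptr_offs);
 *
 * The returned value is a dword offset into the adev->wb.wb array.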
1018 */ 1019 1020 /** 1021 * amdgpu_device_wb_fini - Disable Writeback and free memory 1022 * 1023 * @adev: amdgpu_device pointer 1024 * 1025 * Disables Writeback and frees the Writeback memory (all asics). 1026 * Used at driver shutdown. 1027 */ 1028 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1029 { 1030 if (adev->wb.wb_obj) { 1031 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1032 &adev->wb.gpu_addr, 1033 (void **)&adev->wb.wb); 1034 adev->wb.wb_obj = NULL; 1035 } 1036 } 1037 1038 /** 1039 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1040 * 1041 * @adev: amdgpu_device pointer 1042 * 1043 * Initializes writeback and allocates writeback memory (all asics). 1044 * Used at driver startup. 1045 * Returns 0 on success or an -error on failure. 1046 */ 1047 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1048 { 1049 int r; 1050 1051 if (adev->wb.wb_obj == NULL) { 1052 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1053 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1054 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1055 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1056 (void **)&adev->wb.wb); 1057 if (r) { 1058 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1059 return r; 1060 } 1061 1062 adev->wb.num_wb = AMDGPU_MAX_WB; 1063 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1064 1065 /* clear wb memory */ 1066 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1067 } 1068 1069 return 0; 1070 } 1071 1072 /** 1073 * amdgpu_device_wb_get - Allocate a wb entry 1074 * 1075 * @adev: amdgpu_device pointer 1076 * @wb: wb index 1077 * 1078 * Allocate a wb slot for use by the driver (all asics). 1079 * Returns 0 on success or -EINVAL on failure. 1080 */ 1081 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1082 { 1083 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1084 1085 if (offset < adev->wb.num_wb) { 1086 __set_bit(offset, adev->wb.used); 1087 *wb = offset << 3; /* convert to dw offset */ 1088 return 0; 1089 } else { 1090 return -EINVAL; 1091 } 1092 } 1093 1094 /** 1095 * amdgpu_device_wb_free - Free a wb entry 1096 * 1097 * @adev: amdgpu_device pointer 1098 * @wb: wb index 1099 * 1100 * Free a wb slot allocated for use by the driver (all asics) 1101 */ 1102 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1103 { 1104 wb >>= 3; 1105 if (wb < adev->wb.num_wb) 1106 __clear_bit(wb, adev->wb.used); 1107 } 1108 1109 /** 1110 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1111 * 1112 * @adev: amdgpu_device pointer 1113 * 1114 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1115 * to fail, but if any of the BARs is not accessible after the size we abort 1116 * driver loading by returning -ENODEV. 
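 *
 * "Resizing" here means using the PCIe Resizable BAR capability to grow
 * BAR0 until it covers all of VRAM, so the CPU can reach the whole
 * framebuffer directly instead of going through a small aperture.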
1117 */ 1118 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1119 { 1120 #ifdef __linux__ 1121 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 1122 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 1123 struct pci_bus *root; 1124 struct resource *res; 1125 unsigned i; 1126 u16 cmd; 1127 int r; 1128 1129 /* Bypass for VF */ 1130 if (amdgpu_sriov_vf(adev)) 1131 return 0; 1132 1133 /* skip if the bios has already enabled large BAR */ 1134 if (adev->gmc.real_vram_size && 1135 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1136 return 0; 1137 1138 /* Check if the root BUS has 64bit memory resources */ 1139 root = adev->pdev->bus; 1140 while (root->parent) 1141 root = root->parent; 1142 1143 pci_bus_for_each_resource(root, res, i) { 1144 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1145 res->start > 0x100000000ull) 1146 break; 1147 } 1148 1149 /* Trying to resize is pointless without a root hub window above 4GB */ 1150 if (!res) 1151 return 0; 1152 1153 /* Disable memory decoding while we change the BAR addresses and size */ 1154 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1155 pci_write_config_word(adev->pdev, PCI_COMMAND, 1156 cmd & ~PCI_COMMAND_MEMORY); 1157 1158 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1159 amdgpu_device_doorbell_fini(adev); 1160 if (adev->asic_type >= CHIP_BONAIRE) 1161 pci_release_resource(adev->pdev, 2); 1162 1163 pci_release_resource(adev->pdev, 0); 1164 1165 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1166 if (r == -ENOSPC) 1167 DRM_INFO("Not enough PCI address space for a large BAR."); 1168 else if (r && r != -ENOTSUPP) 1169 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1170 1171 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1172 1173 /* When the doorbell or fb BAR isn't available we have no chance of 1174 * using the device. 1175 */ 1176 r = amdgpu_device_doorbell_init(adev); 1177 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1178 return -ENODEV; 1179 1180 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1181 #endif /* __linux__ */ 1182 1183 return 0; 1184 } 1185 1186 /* 1187 * GPU helpers function. 1188 */ 1189 /** 1190 * amdgpu_device_need_post - check if the hw need post or not 1191 * 1192 * @adev: amdgpu_device pointer 1193 * 1194 * Check if the asic has been initialized (all asics) at driver startup 1195 * or post is needed if hw reset is performed. 1196 * Returns true if need or false if not. 
1197 */ 1198 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1199 { 1200 uint32_t reg; 1201 1202 if (amdgpu_sriov_vf(adev)) 1203 return false; 1204 1205 if (amdgpu_passthrough(adev)) { 1206 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1207 * some old smc fw still need driver do vPost otherwise gpu hang, while 1208 * those smc fw version above 22.15 doesn't have this flaw, so we force 1209 * vpost executed for smc version below 22.15 1210 */ 1211 if (adev->asic_type == CHIP_FIJI) { 1212 int err; 1213 uint32_t fw_ver; 1214 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1215 /* force vPost if error occured */ 1216 if (err) 1217 return true; 1218 1219 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1220 if (fw_ver < 0x00160e00) 1221 return true; 1222 } 1223 } 1224 1225 if (adev->has_hw_reset) { 1226 adev->has_hw_reset = false; 1227 return true; 1228 } 1229 1230 /* bios scratch used on CIK+ */ 1231 if (adev->asic_type >= CHIP_BONAIRE) 1232 return amdgpu_atombios_scratch_need_asic_init(adev); 1233 1234 /* check MEM_SIZE for older asics */ 1235 reg = amdgpu_asic_get_config_memsize(adev); 1236 1237 if ((reg != 0) && (reg != 0xffffffff)) 1238 return false; 1239 1240 return true; 1241 } 1242 1243 /* if we get transitioned to only one device, take VGA back */ 1244 /** 1245 * amdgpu_device_vga_set_decode - enable/disable vga decode 1246 * 1247 * @cookie: amdgpu_device pointer 1248 * @state: enable/disable vga decode 1249 * 1250 * Enable/disable vga decode (all asics). 1251 * Returns VGA resource flags. 1252 */ 1253 #ifdef notyet 1254 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1255 { 1256 struct amdgpu_device *adev = cookie; 1257 amdgpu_asic_set_vga_state(adev, state); 1258 if (state) 1259 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1260 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1261 else 1262 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1263 } 1264 #endif 1265 1266 /** 1267 * amdgpu_device_check_block_size - validate the vm block size 1268 * 1269 * @adev: amdgpu_device pointer 1270 * 1271 * Validates the vm block size specified via module parameter. 1272 * The vm block size defines number of bits in page table versus page directory, 1273 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1274 * page table and the remaining bits are in the page directory. 1275 */ 1276 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1277 { 1278 /* defines number of bits in page table versus page directory, 1279 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1280 * page table and the remaining bits are in the page directory */ 1281 if (amdgpu_vm_block_size == -1) 1282 return; 1283 1284 if (amdgpu_vm_block_size < 9) { 1285 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1286 amdgpu_vm_block_size); 1287 amdgpu_vm_block_size = -1; 1288 } 1289 } 1290 1291 /** 1292 * amdgpu_device_check_vm_size - validate the vm size 1293 * 1294 * @adev: amdgpu_device pointer 1295 * 1296 * Validates the vm size in GB specified via module parameter. 1297 * The VM size is the size of the GPU virtual memory space in GB. 
1298 */ 1299 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1300 { 1301 /* no need to check the default value */ 1302 if (amdgpu_vm_size == -1) 1303 return; 1304 1305 if (amdgpu_vm_size < 1) { 1306 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1307 amdgpu_vm_size); 1308 amdgpu_vm_size = -1; 1309 } 1310 } 1311 1312 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1313 { 1314 #ifdef __linux__ 1315 struct sysinfo si; 1316 #endif 1317 bool is_os_64 = (sizeof(void *) == 8); 1318 uint64_t total_memory; 1319 uint64_t dram_size_seven_GB = 0x1B8000000; 1320 uint64_t dram_size_three_GB = 0xB8000000; 1321 1322 if (amdgpu_smu_memory_pool_size == 0) 1323 return; 1324 1325 if (!is_os_64) { 1326 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1327 goto def_value; 1328 } 1329 #ifdef __linux__ 1330 si_meminfo(&si); 1331 total_memory = (uint64_t)si.totalram * si.mem_unit; 1332 #else 1333 total_memory = ptoa(physmem); 1334 #endif 1335 1336 if ((amdgpu_smu_memory_pool_size == 1) || 1337 (amdgpu_smu_memory_pool_size == 2)) { 1338 if (total_memory < dram_size_three_GB) 1339 goto def_value1; 1340 } else if ((amdgpu_smu_memory_pool_size == 4) || 1341 (amdgpu_smu_memory_pool_size == 8)) { 1342 if (total_memory < dram_size_seven_GB) 1343 goto def_value1; 1344 } else { 1345 DRM_WARN("Smu memory pool size not supported\n"); 1346 goto def_value; 1347 } 1348 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1349 1350 return; 1351 1352 def_value1: 1353 DRM_WARN("No enough system memory\n"); 1354 def_value: 1355 adev->pm.smu_prv_buffer_size = 0; 1356 } 1357 1358 /** 1359 * amdgpu_device_check_arguments - validate module params 1360 * 1361 * @adev: amdgpu_device pointer 1362 * 1363 * Validates certain module parameters and updates 1364 * the associated values used by the driver (all asics). 
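 *
 * For example (behaviour of the checks below): amdgpu_sched_jobs=6 is not
 * a power of two and is rounded up to 8, while an amdgpu_gtt_size below
 * the 32MB minimum is rejected and falls back to -1 (auto).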
1365 */ 1366 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1367 { 1368 if (amdgpu_sched_jobs < 4) { 1369 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1370 amdgpu_sched_jobs); 1371 amdgpu_sched_jobs = 4; 1372 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1373 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1374 amdgpu_sched_jobs); 1375 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1376 } 1377 1378 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1379 /* gart size must be greater or equal to 32M */ 1380 dev_warn(adev->dev, "gart size (%d) too small\n", 1381 amdgpu_gart_size); 1382 amdgpu_gart_size = -1; 1383 } 1384 1385 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1386 /* gtt size must be greater or equal to 32M */ 1387 dev_warn(adev->dev, "gtt size (%d) too small\n", 1388 amdgpu_gtt_size); 1389 amdgpu_gtt_size = -1; 1390 } 1391 1392 /* valid range is between 4 and 9 inclusive */ 1393 if (amdgpu_vm_fragment_size != -1 && 1394 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1395 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1396 amdgpu_vm_fragment_size = -1; 1397 } 1398 1399 if (amdgpu_sched_hw_submission < 2) { 1400 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1401 amdgpu_sched_hw_submission); 1402 amdgpu_sched_hw_submission = 2; 1403 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1404 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1405 amdgpu_sched_hw_submission); 1406 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1407 } 1408 1409 amdgpu_device_check_smu_prv_buffer_size(adev); 1410 1411 amdgpu_device_check_vm_size(adev); 1412 1413 amdgpu_device_check_block_size(adev); 1414 1415 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1416 1417 amdgpu_gmc_tmz_set(adev); 1418 1419 if (amdgpu_num_kcq == -1) { 1420 amdgpu_num_kcq = 8; 1421 } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { 1422 amdgpu_num_kcq = 8; 1423 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n"); 1424 } 1425 1426 amdgpu_gmc_noretry_set(adev); 1427 1428 return 0; 1429 } 1430 1431 #ifdef __linux__ 1432 /** 1433 * amdgpu_switcheroo_set_state - set switcheroo state 1434 * 1435 * @pdev: pci dev pointer 1436 * @state: vga_switcheroo state 1437 * 1438 * Callback for the switcheroo driver. Suspends or resumes the 1439 * the asics before or after it is powered up using ACPI methods. 
1440 */ 1441 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1442 enum vga_switcheroo_state state) 1443 { 1444 struct drm_device *dev = pci_get_drvdata(pdev); 1445 int r; 1446 1447 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) 1448 return; 1449 1450 if (state == VGA_SWITCHEROO_ON) { 1451 pr_info("switched on\n"); 1452 /* don't suspend or resume card normally */ 1453 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1454 1455 pci_set_power_state(dev->pdev, PCI_D0); 1456 amdgpu_device_load_pci_state(dev->pdev); 1457 r = pci_enable_device(dev->pdev); 1458 if (r) 1459 DRM_WARN("pci_enable_device failed (%d)\n", r); 1460 amdgpu_device_resume(dev, true); 1461 1462 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1463 drm_kms_helper_poll_enable(dev); 1464 } else { 1465 pr_info("switched off\n"); 1466 drm_kms_helper_poll_disable(dev); 1467 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1468 amdgpu_device_suspend(dev, true); 1469 amdgpu_device_cache_pci_state(dev->pdev); 1470 /* Shut down the device */ 1471 pci_disable_device(dev->pdev); 1472 pci_set_power_state(dev->pdev, PCI_D3cold); 1473 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1474 } 1475 } 1476 1477 /** 1478 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1479 * 1480 * @pdev: pci dev pointer 1481 * 1482 * Callback for the switcheroo driver. Check of the switcheroo 1483 * state can be changed. 1484 * Returns true if the state can be changed, false if not. 1485 */ 1486 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1487 { 1488 struct drm_device *dev = pci_get_drvdata(pdev); 1489 1490 /* 1491 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1492 * locking inversion with the driver load path. And the access here is 1493 * completely racy anyway. So don't bother with locking for now. 1494 */ 1495 return atomic_read(&dev->open_count) == 0; 1496 } 1497 1498 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1499 .set_gpu_state = amdgpu_switcheroo_set_state, 1500 .reprobe = NULL, 1501 .can_switch = amdgpu_switcheroo_can_switch, 1502 }; 1503 #endif /* __linux__ */ 1504 1505 /** 1506 * amdgpu_device_ip_set_clockgating_state - set the CG state 1507 * 1508 * @dev: amdgpu_device pointer 1509 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1510 * @state: clockgating state (gate or ungate) 1511 * 1512 * Sets the requested clockgating state for all instances of 1513 * the hardware IP specified. 1514 * Returns the error code from the last instance. 1515 */ 1516 int amdgpu_device_ip_set_clockgating_state(void *dev, 1517 enum amd_ip_block_type block_type, 1518 enum amd_clockgating_state state) 1519 { 1520 struct amdgpu_device *adev = dev; 1521 int i, r = 0; 1522 1523 for (i = 0; i < adev->num_ip_blocks; i++) { 1524 if (!adev->ip_blocks[i].status.valid) 1525 continue; 1526 if (adev->ip_blocks[i].version->type != block_type) 1527 continue; 1528 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1529 continue; 1530 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1531 (void *)adev, state); 1532 if (r) 1533 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1534 adev->ip_blocks[i].version->funcs->name, r); 1535 } 1536 return r; 1537 } 1538 1539 /** 1540 * amdgpu_device_ip_set_powergating_state - set the PG state 1541 * 1542 * @dev: amdgpu_device pointer 1543 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
1544 * @state: powergating state (gate or ungate) 1545 * 1546 * Sets the requested powergating state for all instances of 1547 * the hardware IP specified. 1548 * Returns the error code from the last instance. 1549 */ 1550 int amdgpu_device_ip_set_powergating_state(void *dev, 1551 enum amd_ip_block_type block_type, 1552 enum amd_powergating_state state) 1553 { 1554 struct amdgpu_device *adev = dev; 1555 int i, r = 0; 1556 1557 for (i = 0; i < adev->num_ip_blocks; i++) { 1558 if (!adev->ip_blocks[i].status.valid) 1559 continue; 1560 if (adev->ip_blocks[i].version->type != block_type) 1561 continue; 1562 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1563 continue; 1564 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1565 (void *)adev, state); 1566 if (r) 1567 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1568 adev->ip_blocks[i].version->funcs->name, r); 1569 } 1570 return r; 1571 } 1572 1573 /** 1574 * amdgpu_device_ip_get_clockgating_state - get the CG state 1575 * 1576 * @adev: amdgpu_device pointer 1577 * @flags: clockgating feature flags 1578 * 1579 * Walks the list of IPs on the device and updates the clockgating 1580 * flags for each IP. 1581 * Updates @flags with the feature flags for each hardware IP where 1582 * clockgating is enabled. 1583 */ 1584 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1585 u32 *flags) 1586 { 1587 int i; 1588 1589 for (i = 0; i < adev->num_ip_blocks; i++) { 1590 if (!adev->ip_blocks[i].status.valid) 1591 continue; 1592 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1593 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1594 } 1595 } 1596 1597 /** 1598 * amdgpu_device_ip_wait_for_idle - wait for idle 1599 * 1600 * @adev: amdgpu_device pointer 1601 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1602 * 1603 * Waits for the request hardware IP to be idle. 1604 * Returns 0 for success or a negative error code on failure. 1605 */ 1606 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1607 enum amd_ip_block_type block_type) 1608 { 1609 int i, r; 1610 1611 for (i = 0; i < adev->num_ip_blocks; i++) { 1612 if (!adev->ip_blocks[i].status.valid) 1613 continue; 1614 if (adev->ip_blocks[i].version->type == block_type) { 1615 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1616 if (r) 1617 return r; 1618 break; 1619 } 1620 } 1621 return 0; 1622 1623 } 1624 1625 /** 1626 * amdgpu_device_ip_is_idle - is the hardware IP idle 1627 * 1628 * @adev: amdgpu_device pointer 1629 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1630 * 1631 * Check if the hardware IP is idle or not. 1632 * Returns true if it the IP is idle, false if not. 1633 */ 1634 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1635 enum amd_ip_block_type block_type) 1636 { 1637 int i; 1638 1639 for (i = 0; i < adev->num_ip_blocks; i++) { 1640 if (!adev->ip_blocks[i].status.valid) 1641 continue; 1642 if (adev->ip_blocks[i].version->type == block_type) 1643 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1644 } 1645 return true; 1646 1647 } 1648 1649 /** 1650 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1651 * 1652 * @adev: amdgpu_device pointer 1653 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1654 * 1655 * Returns a pointer to the hardware IP block structure 1656 * if it exists for the asic, otherwise NULL. 
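 *
 * Illustrative use:
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip)
 *		DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *			 ip->version->minor);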
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
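 *
 * The string is a semicolon separated list of "<pci address>,<crtc count>"
 * entries, where "all" matches every device.  An illustrative setting:
 *
 *	amdgpu.virtual_display=0000:03:00.0,2
 *
 * enables two virtual crtcs on the device at PCI address 0000:03:00.0.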
1730 */ 1731 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1732 { 1733 adev->enable_virtual_display = false; 1734 1735 #ifdef notyet 1736 if (amdgpu_virtual_display) { 1737 struct drm_device *ddev = adev_to_drm(adev); 1738 const char *pci_address_name = pci_name(ddev->pdev); 1739 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1740 1741 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1742 pciaddstr_tmp = pciaddstr; 1743 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1744 pciaddname = strsep(&pciaddname_tmp, ","); 1745 if (!strcmp("all", pciaddname) 1746 || !strcmp(pci_address_name, pciaddname)) { 1747 long num_crtc; 1748 int res = -1; 1749 1750 adev->enable_virtual_display = true; 1751 1752 if (pciaddname_tmp) 1753 res = kstrtol(pciaddname_tmp, 10, 1754 &num_crtc); 1755 1756 if (!res) { 1757 if (num_crtc < 1) 1758 num_crtc = 1; 1759 if (num_crtc > 6) 1760 num_crtc = 6; 1761 adev->mode_info.num_crtc = num_crtc; 1762 } else { 1763 adev->mode_info.num_crtc = 1; 1764 } 1765 break; 1766 } 1767 } 1768 1769 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1770 amdgpu_virtual_display, pci_address_name, 1771 adev->enable_virtual_display, adev->mode_info.num_crtc); 1772 1773 kfree(pciaddstr); 1774 } 1775 #endif 1776 } 1777 1778 /** 1779 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1780 * 1781 * @adev: amdgpu_device pointer 1782 * 1783 * Parses the asic configuration parameters specified in the gpu info 1784 * firmware and makes them availale to the driver for use in configuring 1785 * the asic. 1786 * Returns 0 on success, -EINVAL on failure. 1787 */ 1788 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1789 { 1790 const char *chip_name; 1791 char fw_name[40]; 1792 int err; 1793 const struct gpu_info_firmware_header_v1_0 *hdr; 1794 1795 adev->firmware.gpu_info_fw = NULL; 1796 1797 if (adev->mman.discovery_bin) { 1798 amdgpu_discovery_get_gfx_info(adev); 1799 1800 /* 1801 * FIXME: The bounding box is still needed by Navi12, so 1802 * temporarily read it from gpu_info firmware. Should be droped 1803 * when DAL no longer needs it. 
1804 */ 1805 if (adev->asic_type != CHIP_NAVI12) 1806 return 0; 1807 } 1808 1809 switch (adev->asic_type) { 1810 #ifdef CONFIG_DRM_AMDGPU_SI 1811 case CHIP_VERDE: 1812 case CHIP_TAHITI: 1813 case CHIP_PITCAIRN: 1814 case CHIP_OLAND: 1815 case CHIP_HAINAN: 1816 #endif 1817 #ifdef CONFIG_DRM_AMDGPU_CIK 1818 case CHIP_BONAIRE: 1819 case CHIP_HAWAII: 1820 case CHIP_KAVERI: 1821 case CHIP_KABINI: 1822 case CHIP_MULLINS: 1823 #endif 1824 case CHIP_TOPAZ: 1825 case CHIP_TONGA: 1826 case CHIP_FIJI: 1827 case CHIP_POLARIS10: 1828 case CHIP_POLARIS11: 1829 case CHIP_POLARIS12: 1830 case CHIP_VEGAM: 1831 case CHIP_CARRIZO: 1832 case CHIP_STONEY: 1833 case CHIP_VEGA20: 1834 case CHIP_SIENNA_CICHLID: 1835 case CHIP_NAVY_FLOUNDER: 1836 default: 1837 return 0; 1838 case CHIP_VEGA10: 1839 chip_name = "vega10"; 1840 break; 1841 case CHIP_VEGA12: 1842 chip_name = "vega12"; 1843 break; 1844 case CHIP_RAVEN: 1845 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1846 chip_name = "raven2"; 1847 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1848 chip_name = "picasso"; 1849 else 1850 chip_name = "raven"; 1851 break; 1852 case CHIP_ARCTURUS: 1853 chip_name = "arcturus"; 1854 break; 1855 case CHIP_RENOIR: 1856 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1857 chip_name = "renoir"; 1858 else 1859 chip_name = "green_sardine"; 1860 break; 1861 case CHIP_NAVI10: 1862 chip_name = "navi10"; 1863 break; 1864 case CHIP_NAVI14: 1865 chip_name = "navi14"; 1866 break; 1867 case CHIP_NAVI12: 1868 chip_name = "navi12"; 1869 break; 1870 } 1871 1872 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1873 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1874 if (err) { 1875 dev_err(adev->dev, 1876 "Failed to load gpu_info firmware \"%s\"\n", 1877 fw_name); 1878 goto out; 1879 } 1880 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1881 if (err) { 1882 dev_err(adev->dev, 1883 "Failed to validate gpu_info firmware \"%s\"\n", 1884 fw_name); 1885 goto out; 1886 } 1887 1888 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1889 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1890 1891 switch (hdr->version_major) { 1892 case 1: 1893 { 1894 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1895 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1896 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1897 1898 /* 1899 * Should be droped when DAL no longer needs it. 
1900 */ 1901 if (adev->asic_type == CHIP_NAVI12) 1902 goto parse_soc_bounding_box; 1903 1904 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1905 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1906 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1907 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1908 adev->gfx.config.max_texture_channel_caches = 1909 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1910 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1911 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1912 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1913 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1914 adev->gfx.config.double_offchip_lds_buf = 1915 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1916 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1917 adev->gfx.cu_info.max_waves_per_simd = 1918 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1919 adev->gfx.cu_info.max_scratch_slots_per_cu = 1920 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1921 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1922 if (hdr->version_minor >= 1) { 1923 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1924 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1925 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1926 adev->gfx.config.num_sc_per_sh = 1927 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1928 adev->gfx.config.num_packer_per_sc = 1929 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1930 } 1931 1932 parse_soc_bounding_box: 1933 /* 1934 * soc bounding box info is not integrated in disocovery table, 1935 * we always need to parse it from gpu info firmware if needed. 1936 */ 1937 if (hdr->version_minor == 2) { 1938 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1939 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1940 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1941 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1942 } 1943 break; 1944 } 1945 default: 1946 dev_err(adev->dev, 1947 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1948 err = -EINVAL; 1949 goto out; 1950 } 1951 out: 1952 return err; 1953 } 1954 1955 /** 1956 * amdgpu_device_ip_early_init - run early init for hardware IPs 1957 * 1958 * @adev: amdgpu_device pointer 1959 * 1960 * Early initialization pass for hardware IPs. The hardware IPs that make 1961 * up each asic are discovered each IP's early_init callback is run. This 1962 * is the first stage in initializing the asic. 1963 * Returns 0 on success, negative error code on failure. 
1964 */ 1965 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1966 { 1967 int i, r; 1968 1969 amdgpu_device_enable_virtual_display(adev); 1970 1971 if (amdgpu_sriov_vf(adev)) { 1972 r = amdgpu_virt_request_full_gpu(adev, true); 1973 if (r) 1974 return r; 1975 } 1976 1977 switch (adev->asic_type) { 1978 #ifdef CONFIG_DRM_AMDGPU_SI 1979 case CHIP_VERDE: 1980 case CHIP_TAHITI: 1981 case CHIP_PITCAIRN: 1982 case CHIP_OLAND: 1983 case CHIP_HAINAN: 1984 adev->family = AMDGPU_FAMILY_SI; 1985 r = si_set_ip_blocks(adev); 1986 if (r) 1987 return r; 1988 break; 1989 #endif 1990 #ifdef CONFIG_DRM_AMDGPU_CIK 1991 case CHIP_BONAIRE: 1992 case CHIP_HAWAII: 1993 case CHIP_KAVERI: 1994 case CHIP_KABINI: 1995 case CHIP_MULLINS: 1996 if (adev->flags & AMD_IS_APU) 1997 adev->family = AMDGPU_FAMILY_KV; 1998 else 1999 adev->family = AMDGPU_FAMILY_CI; 2000 2001 r = cik_set_ip_blocks(adev); 2002 if (r) 2003 return r; 2004 break; 2005 #endif 2006 case CHIP_TOPAZ: 2007 case CHIP_TONGA: 2008 case CHIP_FIJI: 2009 case CHIP_POLARIS10: 2010 case CHIP_POLARIS11: 2011 case CHIP_POLARIS12: 2012 case CHIP_VEGAM: 2013 case CHIP_CARRIZO: 2014 case CHIP_STONEY: 2015 if (adev->flags & AMD_IS_APU) 2016 adev->family = AMDGPU_FAMILY_CZ; 2017 else 2018 adev->family = AMDGPU_FAMILY_VI; 2019 2020 r = vi_set_ip_blocks(adev); 2021 if (r) 2022 return r; 2023 break; 2024 case CHIP_VEGA10: 2025 case CHIP_VEGA12: 2026 case CHIP_VEGA20: 2027 case CHIP_RAVEN: 2028 case CHIP_ARCTURUS: 2029 case CHIP_RENOIR: 2030 if (adev->flags & AMD_IS_APU) 2031 adev->family = AMDGPU_FAMILY_RV; 2032 else 2033 adev->family = AMDGPU_FAMILY_AI; 2034 2035 r = soc15_set_ip_blocks(adev); 2036 if (r) 2037 return r; 2038 break; 2039 case CHIP_NAVI10: 2040 case CHIP_NAVI14: 2041 case CHIP_NAVI12: 2042 case CHIP_SIENNA_CICHLID: 2043 case CHIP_NAVY_FLOUNDER: 2044 adev->family = AMDGPU_FAMILY_NV; 2045 2046 r = nv_set_ip_blocks(adev); 2047 if (r) 2048 return r; 2049 break; 2050 default: 2051 /* FIXME: not supported yet */ 2052 return -EINVAL; 2053 } 2054 2055 amdgpu_amdkfd_device_probe(adev); 2056 2057 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2058 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2059 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2060 2061 for (i = 0; i < adev->num_ip_blocks; i++) { 2062 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2063 DRM_ERROR("disabled ip block: %d <%s>\n", 2064 i, adev->ip_blocks[i].version->funcs->name); 2065 adev->ip_blocks[i].status.valid = false; 2066 } else { 2067 if (adev->ip_blocks[i].version->funcs->early_init) { 2068 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2069 if (r == -ENOENT) { 2070 adev->ip_blocks[i].status.valid = false; 2071 } else if (r) { 2072 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2073 adev->ip_blocks[i].version->funcs->name, r); 2074 return r; 2075 } else { 2076 adev->ip_blocks[i].status.valid = true; 2077 } 2078 } else { 2079 adev->ip_blocks[i].status.valid = true; 2080 } 2081 } 2082 /* get the vbios after the asic_funcs are set up */ 2083 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2084 r = amdgpu_device_parse_gpu_info_fw(adev); 2085 if (r) 2086 return r; 2087 2088 /* Read BIOS */ 2089 if (!amdgpu_get_bios(adev)) 2090 return -EINVAL; 2091 2092 r = amdgpu_atombios_init(adev); 2093 if (r) { 2094 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2095 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2096 return r; 2097 } 2098 } 2099 } 2100 2101 adev->cg_flags &= amdgpu_cg_mask; 2102 
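	/*
	 * Both masks default to 0xffffffff; the amdgpu.cg_mask and
	 * amdgpu.pg_mask module parameters can be used to strip individual
	 * clock- and powergating features for debugging.
	 */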
adev->pg_flags &= amdgpu_pg_mask; 2103 2104 return 0; 2105 } 2106 2107 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2108 { 2109 int i, r; 2110 2111 for (i = 0; i < adev->num_ip_blocks; i++) { 2112 if (!adev->ip_blocks[i].status.sw) 2113 continue; 2114 if (adev->ip_blocks[i].status.hw) 2115 continue; 2116 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2117 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2118 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2119 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2120 if (r) { 2121 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2122 adev->ip_blocks[i].version->funcs->name, r); 2123 return r; 2124 } 2125 adev->ip_blocks[i].status.hw = true; 2126 } 2127 } 2128 2129 return 0; 2130 } 2131 2132 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2133 { 2134 int i, r; 2135 2136 for (i = 0; i < adev->num_ip_blocks; i++) { 2137 if (!adev->ip_blocks[i].status.sw) 2138 continue; 2139 if (adev->ip_blocks[i].status.hw) 2140 continue; 2141 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2142 if (r) { 2143 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2144 adev->ip_blocks[i].version->funcs->name, r); 2145 return r; 2146 } 2147 adev->ip_blocks[i].status.hw = true; 2148 } 2149 2150 return 0; 2151 } 2152 2153 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2154 { 2155 int r = 0; 2156 int i; 2157 uint32_t smu_version; 2158 2159 if (adev->asic_type >= CHIP_VEGA10) { 2160 for (i = 0; i < adev->num_ip_blocks; i++) { 2161 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2162 continue; 2163 2164 /* no need to do the fw loading again if already done*/ 2165 if (adev->ip_blocks[i].status.hw == true) 2166 break; 2167 2168 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2169 r = adev->ip_blocks[i].version->funcs->resume(adev); 2170 if (r) { 2171 DRM_ERROR("resume of IP block <%s> failed %d\n", 2172 adev->ip_blocks[i].version->funcs->name, r); 2173 return r; 2174 } 2175 } else { 2176 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2177 if (r) { 2178 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2179 adev->ip_blocks[i].version->funcs->name, r); 2180 return r; 2181 } 2182 } 2183 2184 adev->ip_blocks[i].status.hw = true; 2185 break; 2186 } 2187 } 2188 2189 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2190 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2191 2192 return r; 2193 } 2194 2195 /** 2196 * amdgpu_device_ip_init - run init for hardware IPs 2197 * 2198 * @adev: amdgpu_device pointer 2199 * 2200 * Main initialization pass for hardware IPs. The list of all the hardware 2201 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2202 * are run. sw_init initializes the software state associated with each IP 2203 * and hw_init initializes the hardware associated with each IP. 2204 * Returns 0 on success, negative error code on failure. 
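 * The GMC block is hw-initialized as soon as its sw_init completes so that
 * VRAM scratch, writeback and (for SR-IOV/MCBP) CSA buffers can be allocated
 * before the remaining blocks are brought up in two further phases.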
2205 */ 2206 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2207 { 2208 int i, r; 2209 2210 r = amdgpu_ras_init(adev); 2211 if (r) 2212 return r; 2213 2214 for (i = 0; i < adev->num_ip_blocks; i++) { 2215 if (!adev->ip_blocks[i].status.valid) 2216 continue; 2217 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2218 if (r) { 2219 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2220 adev->ip_blocks[i].version->funcs->name, r); 2221 goto init_failed; 2222 } 2223 adev->ip_blocks[i].status.sw = true; 2224 2225 /* need to do gmc hw init early so we can allocate gpu mem */ 2226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2227 r = amdgpu_device_vram_scratch_init(adev); 2228 if (r) { 2229 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2230 goto init_failed; 2231 } 2232 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2233 if (r) { 2234 DRM_ERROR("hw_init %d failed %d\n", i, r); 2235 goto init_failed; 2236 } 2237 r = amdgpu_device_wb_init(adev); 2238 if (r) { 2239 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2240 goto init_failed; 2241 } 2242 adev->ip_blocks[i].status.hw = true; 2243 2244 /* right after GMC hw init, we create CSA */ 2245 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2246 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2247 AMDGPU_GEM_DOMAIN_VRAM, 2248 AMDGPU_CSA_SIZE); 2249 if (r) { 2250 DRM_ERROR("allocate CSA failed %d\n", r); 2251 goto init_failed; 2252 } 2253 } 2254 } 2255 } 2256 2257 if (amdgpu_sriov_vf(adev)) 2258 amdgpu_virt_init_data_exchange(adev); 2259 2260 r = amdgpu_ib_pool_init(adev); 2261 if (r) { 2262 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2263 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2264 goto init_failed; 2265 } 2266 2267 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2268 if (r) 2269 goto init_failed; 2270 2271 r = amdgpu_device_ip_hw_init_phase1(adev); 2272 if (r) 2273 goto init_failed; 2274 2275 r = amdgpu_device_fw_loading(adev); 2276 if (r) 2277 goto init_failed; 2278 2279 r = amdgpu_device_ip_hw_init_phase2(adev); 2280 if (r) 2281 goto init_failed; 2282 2283 /* 2284 * retired pages will be loaded from eeprom and reserved here, 2285 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2286 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2287 * for I2C communication which only true at this point. 2288 * 2289 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2290 * failure from bad gpu situation and stop amdgpu init process 2291 * accordingly. For other failed cases, it will still release all 2292 * the resource and print error message, rather than returning one 2293 * negative value to upper level. 2294 * 2295 * Note: theoretically, this should be called before all vram allocations 2296 * to protect retired page from abusing 2297 */ 2298 r = amdgpu_ras_recovery_init(adev); 2299 if (r) 2300 goto init_failed; 2301 2302 if (adev->gmc.xgmi.num_physical_nodes > 1) 2303 amdgpu_xgmi_add_device(adev); 2304 amdgpu_amdkfd_device_init(adev); 2305 2306 amdgpu_fru_get_product_info(adev); 2307 2308 init_failed: 2309 if (amdgpu_sriov_vf(adev)) 2310 amdgpu_virt_release_full_gpu(adev, true); 2311 2312 return r; 2313 } 2314 2315 /** 2316 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2317 * 2318 * @adev: amdgpu_device pointer 2319 * 2320 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2321 * this function before a GPU reset. 
If the value is retained after a 2322 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2323 */ 2324 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2325 { 2326 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2327 } 2328 2329 /** 2330 * amdgpu_device_check_vram_lost - check if vram is valid 2331 * 2332 * @adev: amdgpu_device pointer 2333 * 2334 * Checks the reset magic value written to the gart pointer in VRAM. 2335 * The driver calls this after a GPU reset to see if the contents of 2336 * VRAM is lost or now. 2337 * returns true if vram is lost, false if not. 2338 */ 2339 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2340 { 2341 if (memcmp(adev->gart.ptr, adev->reset_magic, 2342 AMDGPU_RESET_MAGIC_NUM)) 2343 return true; 2344 2345 if (!amdgpu_in_reset(adev)) 2346 return false; 2347 2348 /* 2349 * For all ASICs with baco/mode1 reset, the VRAM is 2350 * always assumed to be lost. 2351 */ 2352 switch (amdgpu_asic_reset_method(adev)) { 2353 case AMD_RESET_METHOD_BACO: 2354 case AMD_RESET_METHOD_MODE1: 2355 return true; 2356 default: 2357 return false; 2358 } 2359 } 2360 2361 /** 2362 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2363 * 2364 * @adev: amdgpu_device pointer 2365 * @state: clockgating state (gate or ungate) 2366 * 2367 * The list of all the hardware IPs that make up the asic is walked and the 2368 * set_clockgating_state callbacks are run. 2369 * Late initialization pass enabling clockgating for hardware IPs. 2370 * Fini or suspend, pass disabling clockgating for hardware IPs. 2371 * Returns 0 on success, negative error code on failure. 2372 */ 2373 2374 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2375 enum amd_clockgating_state state) 2376 { 2377 int i, j, r; 2378 2379 if (amdgpu_emu_mode == 1) 2380 return 0; 2381 2382 for (j = 0; j < adev->num_ip_blocks; j++) { 2383 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2384 if (!adev->ip_blocks[i].status.late_initialized) 2385 continue; 2386 /* skip CG for VCE/UVD, it's handled specially */ 2387 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2388 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2389 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2390 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2391 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2392 /* enable clockgating to save power */ 2393 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2394 state); 2395 if (r) { 2396 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2397 adev->ip_blocks[i].version->funcs->name, r); 2398 return r; 2399 } 2400 } 2401 } 2402 2403 return 0; 2404 } 2405 2406 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2407 { 2408 int i, j, r; 2409 2410 if (amdgpu_emu_mode == 1) 2411 return 0; 2412 2413 for (j = 0; j < adev->num_ip_blocks; j++) { 2414 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2415 if (!adev->ip_blocks[i].status.late_initialized) 2416 continue; 2417 /* skip CG for VCE/UVD, it's handled specially */ 2418 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2419 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2420 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2421 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2422 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2423 /* enable powergating to save power */ 2424 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2425 state); 2426 if (r) { 2427 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2428 adev->ip_blocks[i].version->funcs->name, r); 2429 return r; 2430 } 2431 } 2432 } 2433 return 0; 2434 } 2435 2436 static int amdgpu_device_enable_mgpu_fan_boost(void) 2437 { 2438 struct amdgpu_gpu_instance *gpu_ins; 2439 struct amdgpu_device *adev; 2440 int i, ret = 0; 2441 2442 mutex_lock(&mgpu_info.mutex); 2443 2444 /* 2445 * MGPU fan boost feature should be enabled 2446 * only when there are two or more dGPUs in 2447 * the system 2448 */ 2449 if (mgpu_info.num_dgpu < 2) 2450 goto out; 2451 2452 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2453 gpu_ins = &(mgpu_info.gpu_ins[i]); 2454 adev = gpu_ins->adev; 2455 if (!(adev->flags & AMD_IS_APU) && 2456 !gpu_ins->mgpu_fan_enabled) { 2457 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2458 if (ret) 2459 break; 2460 2461 gpu_ins->mgpu_fan_enabled = 1; 2462 } 2463 } 2464 2465 out: 2466 mutex_unlock(&mgpu_info.mutex); 2467 2468 return ret; 2469 } 2470 2471 /** 2472 * amdgpu_device_ip_late_init - run late init for hardware IPs 2473 * 2474 * @adev: amdgpu_device pointer 2475 * 2476 * Late initialization pass for hardware IPs. The list of all the hardware 2477 * IPs that make up the asic is walked and the late_init callbacks are run. 2478 * late_init covers any special initialization that an IP requires 2479 * after all of the have been initialized or something that needs to happen 2480 * late in the init process. 2481 * Returns 0 on success, negative error code on failure. 2482 */ 2483 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2484 { 2485 struct amdgpu_gpu_instance *gpu_instance; 2486 int i = 0, r; 2487 2488 for (i = 0; i < adev->num_ip_blocks; i++) { 2489 if (!adev->ip_blocks[i].status.hw) 2490 continue; 2491 if (adev->ip_blocks[i].version->funcs->late_init) { 2492 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2493 if (r) { 2494 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2495 adev->ip_blocks[i].version->funcs->name, r); 2496 return r; 2497 } 2498 } 2499 adev->ip_blocks[i].status.late_initialized = true; 2500 } 2501 2502 amdgpu_ras_set_error_query_ready(adev, true); 2503 2504 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2505 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2506 2507 amdgpu_device_fill_reset_magic(adev); 2508 2509 r = amdgpu_device_enable_mgpu_fan_boost(); 2510 if (r) 2511 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2512 2513 2514 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2515 mutex_lock(&mgpu_info.mutex); 2516 2517 /* 2518 * Reset device p-state to low as this was booted with high. 2519 * 2520 * This should be performed only after all devices from the same 2521 * hive get initialized. 2522 * 2523 * However, it's unknown how many device in the hive in advance. 2524 * As this is counted one by one during devices initializations. 
2525 * 2526 * So, we wait for all XGMI interlinked devices initialized. 2527 * This may bring some delays as those devices may come from 2528 * different hives. But that should be OK. 2529 */ 2530 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2531 for (i = 0; i < mgpu_info.num_gpu; i++) { 2532 gpu_instance = &(mgpu_info.gpu_ins[i]); 2533 if (gpu_instance->adev->flags & AMD_IS_APU) 2534 continue; 2535 2536 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2537 AMDGPU_XGMI_PSTATE_MIN); 2538 if (r) { 2539 DRM_ERROR("pstate setting failed (%d).\n", r); 2540 break; 2541 } 2542 } 2543 } 2544 2545 mutex_unlock(&mgpu_info.mutex); 2546 } 2547 2548 return 0; 2549 } 2550 2551 /** 2552 * amdgpu_device_ip_fini - run fini for hardware IPs 2553 * 2554 * @adev: amdgpu_device pointer 2555 * 2556 * Main teardown pass for hardware IPs. The list of all the hardware 2557 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2558 * are run. hw_fini tears down the hardware associated with each IP 2559 * and sw_fini tears down any software state associated with each IP. 2560 * Returns 0 on success, negative error code on failure. 2561 */ 2562 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2563 { 2564 int i, r; 2565 2566 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2567 amdgpu_virt_release_ras_err_handler_data(adev); 2568 2569 amdgpu_ras_pre_fini(adev); 2570 2571 if (adev->gmc.xgmi.num_physical_nodes > 1) 2572 amdgpu_xgmi_remove_device(adev); 2573 2574 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2575 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2576 2577 amdgpu_amdkfd_device_fini(adev); 2578 2579 /* need to disable SMC first */ 2580 for (i = 0; i < adev->num_ip_blocks; i++) { 2581 if (!adev->ip_blocks[i].status.hw) 2582 continue; 2583 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2584 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2585 /* XXX handle errors */ 2586 if (r) { 2587 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2588 adev->ip_blocks[i].version->funcs->name, r); 2589 } 2590 adev->ip_blocks[i].status.hw = false; 2591 break; 2592 } 2593 } 2594 2595 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2596 if (!adev->ip_blocks[i].status.hw) 2597 continue; 2598 2599 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2600 /* XXX handle errors */ 2601 if (r) { 2602 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2603 adev->ip_blocks[i].version->funcs->name, r); 2604 } 2605 2606 adev->ip_blocks[i].status.hw = false; 2607 } 2608 2609 2610 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2611 if (!adev->ip_blocks[i].status.sw) 2612 continue; 2613 2614 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2615 amdgpu_ucode_free_bo(adev); 2616 amdgpu_free_static_csa(&adev->virt.csa_obj); 2617 amdgpu_device_wb_fini(adev); 2618 amdgpu_device_vram_scratch_fini(adev); 2619 amdgpu_ib_pool_fini(adev); 2620 } 2621 2622 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2623 /* XXX handle errors */ 2624 if (r) { 2625 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2626 adev->ip_blocks[i].version->funcs->name, r); 2627 } 2628 adev->ip_blocks[i].status.sw = false; 2629 adev->ip_blocks[i].status.valid = false; 2630 } 2631 2632 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2633 if (!adev->ip_blocks[i].status.late_initialized) 2634 continue; 2635 if (adev->ip_blocks[i].version->funcs->late_fini) 2636 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2637 
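		/*
		 * Once cleared, set_cg_state()/set_pg_state() will skip this
		 * block, so no further CG/PG transitions are attempted on it.
		 */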
adev->ip_blocks[i].status.late_initialized = false; 2638 } 2639 2640 amdgpu_ras_fini(adev); 2641 2642 if (amdgpu_sriov_vf(adev)) 2643 if (amdgpu_virt_release_full_gpu(adev, false)) 2644 DRM_ERROR("failed to release exclusive mode on fini\n"); 2645 2646 return 0; 2647 } 2648 2649 /** 2650 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2651 * 2652 * @work: work_struct. 2653 */ 2654 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2655 { 2656 struct amdgpu_device *adev = 2657 container_of(work, struct amdgpu_device, delayed_init_work.work); 2658 int r; 2659 2660 r = amdgpu_ib_ring_tests(adev); 2661 if (r) 2662 DRM_ERROR("ib ring test failed (%d).\n", r); 2663 } 2664 2665 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2666 { 2667 struct amdgpu_device *adev = 2668 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2669 2670 mutex_lock(&adev->gfx.gfx_off_mutex); 2671 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2672 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2673 adev->gfx.gfx_off_state = true; 2674 } 2675 mutex_unlock(&adev->gfx.gfx_off_mutex); 2676 } 2677 2678 /** 2679 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2680 * 2681 * @adev: amdgpu_device pointer 2682 * 2683 * Main suspend function for hardware IPs. The list of all the hardware 2684 * IPs that make up the asic is walked, clockgating is disabled and the 2685 * suspend callbacks are run. suspend puts the hardware and software state 2686 * in each IP into a state suitable for suspend. 2687 * Returns 0 on success, negative error code on failure. 2688 */ 2689 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2690 { 2691 int i, r; 2692 2693 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2694 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2695 2696 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2697 if (!adev->ip_blocks[i].status.valid) 2698 continue; 2699 2700 /* displays are handled separately */ 2701 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2702 continue; 2703 2704 /* XXX handle errors */ 2705 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2706 /* XXX handle errors */ 2707 if (r) { 2708 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2709 adev->ip_blocks[i].version->funcs->name, r); 2710 return r; 2711 } 2712 2713 adev->ip_blocks[i].status.hw = false; 2714 } 2715 2716 return 0; 2717 } 2718 2719 /** 2720 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2721 * 2722 * @adev: amdgpu_device pointer 2723 * 2724 * Main suspend function for hardware IPs. The list of all the hardware 2725 * IPs that make up the asic is walked, clockgating is disabled and the 2726 * suspend callbacks are run. suspend puts the hardware and software state 2727 * in each IP into a state suitable for suspend. 2728 * Returns 0 on success, negative error code on failure. 
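 * Phase 2 covers every block except the display (DCE) blocks, which were
 * already handled in phase 1. The PSP block is skipped when a RAS ATHUB
 * error interrupt has been raised, and on bare metal the SMC is moved to
 * the requested MP1 state on the way down.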
2729 */ 2730 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2731 { 2732 int i, r; 2733 2734 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2735 if (!adev->ip_blocks[i].status.valid) 2736 continue; 2737 /* displays are handled in phase1 */ 2738 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2739 continue; 2740 /* PSP lost connection when err_event_athub occurs */ 2741 if (amdgpu_ras_intr_triggered() && 2742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2743 adev->ip_blocks[i].status.hw = false; 2744 continue; 2745 } 2746 /* XXX handle errors */ 2747 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2748 /* XXX handle errors */ 2749 if (r) { 2750 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2751 adev->ip_blocks[i].version->funcs->name, r); 2752 } 2753 adev->ip_blocks[i].status.hw = false; 2754 /* handle putting the SMC in the appropriate state */ 2755 if(!amdgpu_sriov_vf(adev)){ 2756 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2757 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2758 if (r) { 2759 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2760 adev->mp1_state, r); 2761 return r; 2762 } 2763 } 2764 } 2765 adev->ip_blocks[i].status.hw = false; 2766 } 2767 2768 return 0; 2769 } 2770 2771 /** 2772 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2773 * 2774 * @adev: amdgpu_device pointer 2775 * 2776 * Main suspend function for hardware IPs. The list of all the hardware 2777 * IPs that make up the asic is walked, clockgating is disabled and the 2778 * suspend callbacks are run. suspend puts the hardware and software state 2779 * in each IP into a state suitable for suspend. 2780 * Returns 0 on success, negative error code on failure. 2781 */ 2782 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2783 { 2784 int r; 2785 2786 if (amdgpu_sriov_vf(adev)) 2787 amdgpu_virt_request_full_gpu(adev, false); 2788 2789 r = amdgpu_device_ip_suspend_phase1(adev); 2790 if (r) 2791 return r; 2792 r = amdgpu_device_ip_suspend_phase2(adev); 2793 2794 if (amdgpu_sriov_vf(adev)) 2795 amdgpu_virt_release_full_gpu(adev, false); 2796 2797 return r; 2798 } 2799 2800 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2801 { 2802 int i, r; 2803 2804 static enum amd_ip_block_type ip_order[] = { 2805 AMD_IP_BLOCK_TYPE_GMC, 2806 AMD_IP_BLOCK_TYPE_COMMON, 2807 AMD_IP_BLOCK_TYPE_PSP, 2808 AMD_IP_BLOCK_TYPE_IH, 2809 }; 2810 2811 for (i = 0; i < adev->num_ip_blocks; i++) { 2812 int j; 2813 struct amdgpu_ip_block *block; 2814 2815 block = &adev->ip_blocks[i]; 2816 block->status.hw = false; 2817 2818 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2819 2820 if (block->version->type != ip_order[j] || 2821 !block->status.valid) 2822 continue; 2823 2824 r = block->version->funcs->hw_init(adev); 2825 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2826 if (r) 2827 return r; 2828 block->status.hw = true; 2829 } 2830 } 2831 2832 return 0; 2833 } 2834 2835 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2836 { 2837 int i, r; 2838 2839 static enum amd_ip_block_type ip_order[] = { 2840 AMD_IP_BLOCK_TYPE_SMC, 2841 AMD_IP_BLOCK_TYPE_DCE, 2842 AMD_IP_BLOCK_TYPE_GFX, 2843 AMD_IP_BLOCK_TYPE_SDMA, 2844 AMD_IP_BLOCK_TYPE_UVD, 2845 AMD_IP_BLOCK_TYPE_VCE, 2846 AMD_IP_BLOCK_TYPE_VCN 2847 }; 2848 2849 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2850 int j; 2851 struct amdgpu_ip_block *block; 2852 2853 for (j = 0; j < adev->num_ip_blocks; j++) { 2854 
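			/*
			 * Scan for the block matching ip_order[i]: the SMC is
			 * resumed rather than re-initialized, every other block
			 * in the list gets a fresh hw_init.
			 */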
block = &adev->ip_blocks[j]; 2855 2856 if (block->version->type != ip_order[i] || 2857 !block->status.valid || 2858 block->status.hw) 2859 continue; 2860 2861 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2862 r = block->version->funcs->resume(adev); 2863 else 2864 r = block->version->funcs->hw_init(adev); 2865 2866 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2867 if (r) 2868 return r; 2869 block->status.hw = true; 2870 } 2871 } 2872 2873 return 0; 2874 } 2875 2876 /** 2877 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2878 * 2879 * @adev: amdgpu_device pointer 2880 * 2881 * First resume function for hardware IPs. The list of all the hardware 2882 * IPs that make up the asic is walked and the resume callbacks are run for 2883 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2884 * after a suspend and updates the software state as necessary. This 2885 * function is also used for restoring the GPU after a GPU reset. 2886 * Returns 0 on success, negative error code on failure. 2887 */ 2888 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2889 { 2890 int i, r; 2891 2892 for (i = 0; i < adev->num_ip_blocks; i++) { 2893 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2894 continue; 2895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2896 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2898 2899 r = adev->ip_blocks[i].version->funcs->resume(adev); 2900 if (r) { 2901 DRM_ERROR("resume of IP block <%s> failed %d\n", 2902 adev->ip_blocks[i].version->funcs->name, r); 2903 return r; 2904 } 2905 adev->ip_blocks[i].status.hw = true; 2906 } 2907 } 2908 2909 return 0; 2910 } 2911 2912 /** 2913 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2914 * 2915 * @adev: amdgpu_device pointer 2916 * 2917 * First resume function for hardware IPs. The list of all the hardware 2918 * IPs that make up the asic is walked and the resume callbacks are run for 2919 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2920 * functional state after a suspend and updates the software state as 2921 * necessary. This function is also used for restoring the GPU after a GPU 2922 * reset. 2923 * Returns 0 on success, negative error code on failure. 2924 */ 2925 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2926 { 2927 int i, r; 2928 2929 for (i = 0; i < adev->num_ip_blocks; i++) { 2930 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2931 continue; 2932 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2933 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2934 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2935 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2936 continue; 2937 r = adev->ip_blocks[i].version->funcs->resume(adev); 2938 if (r) { 2939 DRM_ERROR("resume of IP block <%s> failed %d\n", 2940 adev->ip_blocks[i].version->funcs->name, r); 2941 return r; 2942 } 2943 adev->ip_blocks[i].status.hw = true; 2944 } 2945 2946 return 0; 2947 } 2948 2949 /** 2950 * amdgpu_device_ip_resume - run resume for hardware IPs 2951 * 2952 * @adev: amdgpu_device pointer 2953 * 2954 * Main resume function for hardware IPs. 
The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#if defined(CONFIG_DRM_AMD_DC)
#if defined(CONFIG_DRM_AMD_DC_SI)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
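		 * With the default amdgpu.dc=-1 (auto) these chips therefore
		 * stay on the non-DC path; passing amdgpu.dc=1 opts them in,
		 * which is why the check below is "> 0" rather than the
		 * "!= 0" used for the newer ASICs.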
3029 */ 3030 return amdgpu_dc > 0; 3031 case CHIP_HAWAII: 3032 case CHIP_CARRIZO: 3033 case CHIP_STONEY: 3034 case CHIP_POLARIS10: 3035 case CHIP_POLARIS11: 3036 case CHIP_POLARIS12: 3037 case CHIP_VEGAM: 3038 case CHIP_TONGA: 3039 case CHIP_FIJI: 3040 case CHIP_VEGA10: 3041 case CHIP_VEGA12: 3042 case CHIP_VEGA20: 3043 #if defined(CONFIG_DRM_AMD_DC_DCN) 3044 case CHIP_RAVEN: 3045 case CHIP_NAVI10: 3046 case CHIP_NAVI14: 3047 case CHIP_NAVI12: 3048 case CHIP_RENOIR: 3049 #endif 3050 #if defined(CONFIG_DRM_AMD_DC_DCN3_0) 3051 case CHIP_SIENNA_CICHLID: 3052 case CHIP_NAVY_FLOUNDER: 3053 #endif 3054 return amdgpu_dc != 0; 3055 #endif 3056 default: 3057 if (amdgpu_dc > 0) 3058 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3059 "but isn't supported by ASIC, ignoring\n"); 3060 return false; 3061 } 3062 } 3063 3064 /** 3065 * amdgpu_device_has_dc_support - check if dc is supported 3066 * 3067 * @adev: amdgpu_device pointer 3068 * 3069 * Returns true for supported, false for not supported 3070 */ 3071 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3072 { 3073 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3074 return false; 3075 3076 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3077 } 3078 3079 3080 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3081 { 3082 struct amdgpu_device *adev = 3083 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3084 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3085 3086 /* It's a bug to not have a hive within this function */ 3087 if (WARN_ON(!hive)) 3088 return; 3089 3090 /* 3091 * Use task barrier to synchronize all xgmi reset works across the 3092 * hive. task_barrier_enter and task_barrier_exit will block 3093 * until all the threads running the xgmi reset works reach 3094 * those points. task_barrier_full will do both blocks. 3095 */ 3096 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3097 3098 task_barrier_enter(&hive->tb); 3099 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3100 3101 if (adev->asic_reset_res) 3102 goto fail; 3103 3104 task_barrier_exit(&hive->tb); 3105 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3106 3107 if (adev->asic_reset_res) 3108 goto fail; 3109 3110 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3111 adev->mmhub.funcs->reset_ras_error_count(adev); 3112 } else { 3113 3114 task_barrier_full(&hive->tb); 3115 adev->asic_reset_res = amdgpu_asic_reset(adev); 3116 } 3117 3118 fail: 3119 if (adev->asic_reset_res) 3120 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3121 adev->asic_reset_res, adev_to_drm(adev)->unique); 3122 amdgpu_put_xgmi_hive(hive); 3123 } 3124 3125 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3126 { 3127 char *input = amdgpu_lockup_timeout; 3128 char *timeout_setting = NULL; 3129 int index = 0; 3130 long timeout; 3131 int ret = 0; 3132 3133 /* 3134 * By default timeout for non compute jobs is 10000. 3135 * And there is no timeout enforced on compute jobs. 3136 * In SR-IOV or passthrough mode, timeout for compute 3137 * jobs are 60000 by default. 
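 * The (currently #ifdef notyet) parsing below takes the value as a comma
 * separated list in the order gfx,compute,sdma,video; for example
 * amdgpu.lockup_timeout=10000,60000,10000,10000 (illustrative values only)
 * would set those four timeouts respectively, while a single value applies
 * to all non-compute queues.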
3138 */ 3139 adev->gfx_timeout = msecs_to_jiffies(10000); 3140 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3141 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3142 adev->compute_timeout = msecs_to_jiffies(60000); 3143 else 3144 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3145 3146 #ifdef notyet 3147 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3148 while ((timeout_setting = strsep(&input, ",")) && 3149 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3150 ret = kstrtol(timeout_setting, 0, &timeout); 3151 if (ret) 3152 return ret; 3153 3154 if (timeout == 0) { 3155 index++; 3156 continue; 3157 } else if (timeout < 0) { 3158 timeout = MAX_SCHEDULE_TIMEOUT; 3159 } else { 3160 timeout = msecs_to_jiffies(timeout); 3161 } 3162 3163 switch (index++) { 3164 case 0: 3165 adev->gfx_timeout = timeout; 3166 break; 3167 case 1: 3168 adev->compute_timeout = timeout; 3169 break; 3170 case 2: 3171 adev->sdma_timeout = timeout; 3172 break; 3173 case 3: 3174 adev->video_timeout = timeout; 3175 break; 3176 default: 3177 break; 3178 } 3179 } 3180 /* 3181 * There is only one value specified and 3182 * it should apply to all non-compute jobs. 3183 */ 3184 if (index == 1) { 3185 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3186 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3187 adev->compute_timeout = adev->gfx_timeout; 3188 } 3189 } 3190 #endif 3191 3192 return ret; 3193 } 3194 3195 static const struct attribute *amdgpu_dev_attributes[] = { 3196 &dev_attr_product_name.attr, 3197 &dev_attr_product_number.attr, 3198 &dev_attr_serial_number.attr, 3199 &dev_attr_pcie_replay_count.attr, 3200 NULL 3201 }; 3202 3203 3204 /** 3205 * amdgpu_device_init - initialize the driver 3206 * 3207 * @adev: amdgpu_device pointer 3208 * @flags: driver flags 3209 * 3210 * Initializes the driver info and hw (all asics). 3211 * Returns 0 for success or an error on failure. 3212 * Called at driver startup. 
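 * Roughly: map the MMIO and doorbell BARs, run early/sw/hw init for every IP
 * block, bring up the fence driver and mode config, then register the sysfs
 * attributes and schedule the delayed IB tests.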
3213 */ 3214 int amdgpu_device_init(struct amdgpu_device *adev, 3215 uint32_t flags) 3216 { 3217 struct drm_device *ddev = adev_to_drm(adev); 3218 struct pci_dev *pdev = adev->pdev; 3219 int r, i; 3220 bool boco = false; 3221 u32 max_MBps; 3222 3223 adev->shutdown = false; 3224 adev->flags = flags; 3225 3226 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3227 adev->asic_type = amdgpu_force_asic_type; 3228 else 3229 adev->asic_type = flags & AMD_ASIC_MASK; 3230 3231 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3232 if (amdgpu_emu_mode == 1) 3233 adev->usec_timeout *= 10; 3234 adev->gmc.gart_size = 512 * 1024 * 1024; 3235 adev->accel_working = false; 3236 adev->num_rings = 0; 3237 adev->mman.buffer_funcs = NULL; 3238 adev->mman.buffer_funcs_ring = NULL; 3239 adev->vm_manager.vm_pte_funcs = NULL; 3240 adev->vm_manager.vm_pte_num_scheds = 0; 3241 adev->gmc.gmc_funcs = NULL; 3242 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3243 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3244 3245 adev->smc_rreg = &amdgpu_invalid_rreg; 3246 adev->smc_wreg = &amdgpu_invalid_wreg; 3247 adev->pcie_rreg = &amdgpu_invalid_rreg; 3248 adev->pcie_wreg = &amdgpu_invalid_wreg; 3249 adev->pciep_rreg = &amdgpu_invalid_rreg; 3250 adev->pciep_wreg = &amdgpu_invalid_wreg; 3251 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3252 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3253 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3254 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3255 adev->didt_rreg = &amdgpu_invalid_rreg; 3256 adev->didt_wreg = &amdgpu_invalid_wreg; 3257 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3258 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3259 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3260 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3261 3262 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3263 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3264 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3265 3266 /* mutex initialization are all done here so we 3267 * can recall function without having locking issues */ 3268 atomic_set(&adev->irq.ih.lock, 0); 3269 rw_init(&adev->firmware.mutex, "agfw"); 3270 rw_init(&adev->pm.mutex, "agpm"); 3271 rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk"); 3272 rw_init(&adev->srbm_mutex, "srbm"); 3273 rw_init(&adev->gfx.pipe_reserve_mutex, "pipers"); 3274 rw_init(&adev->gfx.gfx_off_mutex, "gfxoff"); 3275 rw_init(&adev->grbm_idx_mutex, "grbmidx"); 3276 rw_init(&adev->mn_lock, "agpumn"); 3277 rw_init(&adev->virt.vf_errors.lock, "vferr"); 3278 hash_init(adev->mn_hash); 3279 atomic_set(&adev->in_gpu_reset, 0); 3280 rw_init(&adev->reset_sem, "amrs"); 3281 rw_init(&adev->psp.mutex, "agpsp"); 3282 rw_init(&adev->notifier_lock, "agnf"); 3283 3284 r = amdgpu_device_check_arguments(adev); 3285 if (r) 3286 return r; 3287 3288 mtx_init(&adev->mmio_idx_lock, IPL_TTY); 3289 mtx_init(&adev->smc_idx_lock, IPL_TTY); 3290 mtx_init(&adev->pcie_idx_lock, IPL_TTY); 3291 mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY); 3292 mtx_init(&adev->didt_idx_lock, IPL_TTY); 3293 mtx_init(&adev->gc_cac_idx_lock, IPL_TTY); 3294 mtx_init(&adev->se_cac_idx_lock, IPL_TTY); 3295 mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY); 3296 mtx_init(&adev->mm_stats.lock, IPL_NONE); 3297 3298 INIT_LIST_HEAD(&adev->shadow_list); 3299 rw_init(&adev->shadow_list_lock, "sdwlst"); 3300 3301 INIT_DELAYED_WORK(&adev->delayed_init_work, 3302 amdgpu_device_delayed_init_work_handler); 3303 
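	/*
	 * delayed_init_work runs the IB ring tests shortly after init/resume
	 * (AMDGPU_RESUME_MS); gfx_off_delay_work below re-enables GFXOFF via
	 * the SMU once no ring is holding a gfx_off request any more.
	 */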
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3304 amdgpu_device_delay_enable_gfx_off); 3305 3306 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3307 3308 adev->gfx.gfx_off_req_count = 1; 3309 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3310 3311 atomic_set(&adev->throttling_logging_enabled, 1); 3312 /* 3313 * If throttling continues, logging will be performed every minute 3314 * to avoid log flooding. "-1" is subtracted since the thermal 3315 * throttling interrupt comes every second. Thus, the total logging 3316 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3317 * for throttling interrupt) = 60 seconds. 3318 */ 3319 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3320 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3321 3322 #ifdef __linux__ 3323 /* Registers mapping */ 3324 /* TODO: block userspace mapping of io register */ 3325 if (adev->asic_type >= CHIP_BONAIRE) { 3326 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3327 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3328 } else { 3329 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3330 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3331 } 3332 3333 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3334 if (adev->rmmio == NULL) { 3335 return -ENOMEM; 3336 } 3337 #endif 3338 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3339 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3340 3341 /* io port mapping */ 3342 #ifdef __linux__ 3343 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3344 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3345 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3346 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3347 break; 3348 } 3349 } 3350 if (adev->rio_mem == NULL) 3351 DRM_INFO("PCI I/O BAR is not found.\n"); 3352 #endif 3353 3354 /* enable PCIE atomic ops */ 3355 #ifdef notyet 3356 r = pci_enable_atomic_ops_to_root(adev->pdev, 3357 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3358 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3359 if (r) { 3360 adev->have_atomics_support = false; 3361 DRM_INFO("PCIE atomic ops is not supported\n"); 3362 } else { 3363 adev->have_atomics_support = true; 3364 } 3365 #else 3366 adev->have_atomics_support = false; 3367 #endif 3368 3369 amdgpu_device_get_pcie_info(adev); 3370 3371 if (amdgpu_mcbp) 3372 DRM_INFO("MCBP is enabled\n"); 3373 3374 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3375 adev->enable_mes = true; 3376 3377 /* detect hw virtualization here */ 3378 amdgpu_detect_virtualization(adev); 3379 3380 r = amdgpu_device_get_job_timeout_settings(adev); 3381 if (r) { 3382 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3383 goto failed_unmap; 3384 } 3385 3386 /* early init functions */ 3387 r = amdgpu_device_ip_early_init(adev); 3388 if (r) 3389 goto failed_unmap; 3390 3391 /* doorbell bar mapping and doorbell index init*/ 3392 amdgpu_device_doorbell_init(adev); 3393 3394 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3395 /* this will fail for cards that aren't VGA class devices, just 3396 * ignore it */ 3397 #ifdef notyet 3398 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3399 #endif 3400 3401 if (amdgpu_device_supports_boco(ddev)) 3402 boco = true; 3403 if (amdgpu_has_atpx() && 3404 (amdgpu_is_atpx_hybrid() || 3405 amdgpu_has_atpx_dgpu_power_cntl()) && 3406 !pci_is_thunderbolt_attached(adev->pdev)) 3407 
vga_switcheroo_register_client(adev->pdev, 3408 &amdgpu_switcheroo_ops, boco); 3409 if (boco) 3410 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3411 3412 if (amdgpu_emu_mode == 1) { 3413 /* post the asic on emulation mode */ 3414 emu_soc_asic_init(adev); 3415 goto fence_driver_init; 3416 } 3417 3418 /* detect if we are with an SRIOV vbios */ 3419 amdgpu_device_detect_sriov_bios(adev); 3420 3421 /* check if we need to reset the asic 3422 * E.g., driver was not cleanly unloaded previously, etc. 3423 */ 3424 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3425 r = amdgpu_asic_reset(adev); 3426 if (r) { 3427 dev_err(adev->dev, "asic reset on init failed\n"); 3428 goto failed; 3429 } 3430 } 3431 3432 pci_enable_pcie_error_reporting(adev->ddev.pdev); 3433 3434 /* Post card if necessary */ 3435 if (amdgpu_device_need_post(adev)) { 3436 if (!adev->bios) { 3437 dev_err(adev->dev, "no vBIOS found\n"); 3438 r = -EINVAL; 3439 goto failed; 3440 } 3441 DRM_INFO("GPU posting now...\n"); 3442 r = amdgpu_device_asic_init(adev); 3443 if (r) { 3444 dev_err(adev->dev, "gpu post error!\n"); 3445 goto failed; 3446 } 3447 } 3448 3449 if (adev->is_atom_fw) { 3450 /* Initialize clocks */ 3451 r = amdgpu_atomfirmware_get_clock_info(adev); 3452 if (r) { 3453 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3454 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3455 goto failed; 3456 } 3457 } else { 3458 /* Initialize clocks */ 3459 r = amdgpu_atombios_get_clock_info(adev); 3460 if (r) { 3461 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3462 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3463 goto failed; 3464 } 3465 /* init i2c buses */ 3466 if (!amdgpu_device_has_dc_support(adev)) 3467 amdgpu_atombios_i2c_init(adev); 3468 } 3469 3470 fence_driver_init: 3471 /* Fence driver */ 3472 r = amdgpu_fence_driver_init(adev); 3473 if (r) { 3474 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3475 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3476 goto failed; 3477 } 3478 3479 /* init the mode config */ 3480 drm_mode_config_init(adev_to_drm(adev)); 3481 3482 r = amdgpu_device_ip_init(adev); 3483 if (r) { 3484 /* failed in exclusive mode due to timeout */ 3485 if (amdgpu_sriov_vf(adev) && 3486 !amdgpu_sriov_runtime(adev) && 3487 amdgpu_virt_mmio_blocked(adev) && 3488 !amdgpu_virt_wait_reset(adev)) { 3489 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3490 /* Don't send request since VF is inactive. 
*/ 3491 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3492 adev->virt.ops = NULL; 3493 r = -EAGAIN; 3494 goto failed; 3495 } 3496 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3497 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3498 goto failed; 3499 } 3500 3501 dev_info(adev->dev, 3502 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3503 adev->gfx.config.max_shader_engines, 3504 adev->gfx.config.max_sh_per_se, 3505 adev->gfx.config.max_cu_per_sh, 3506 adev->gfx.cu_info.number); 3507 3508 #ifdef __OpenBSD__ 3509 { 3510 const char *chip_name; 3511 3512 switch (adev->asic_type) { 3513 case CHIP_RAVEN: 3514 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 3515 chip_name = "RAVEN2"; 3516 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 3517 chip_name = "PICASSO"; 3518 else 3519 chip_name = "RAVEN"; 3520 break; 3521 case CHIP_RENOIR: 3522 if (adev->apu_flags & AMD_APU_IS_RENOIR) 3523 chip_name = "RENOIR"; 3524 else 3525 chip_name = "GREEN_SARDINE"; 3526 break; 3527 default: 3528 chip_name = amdgpu_asic_name[adev->asic_type]; 3529 } 3530 printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname, 3531 chip_name, adev->gfx.cu_info.number, adev->rev_id); 3532 } 3533 #endif 3534 3535 adev->accel_working = true; 3536 3537 amdgpu_vm_check_compute_bug(adev); 3538 3539 /* Initialize the buffer migration limit. */ 3540 if (amdgpu_moverate >= 0) 3541 max_MBps = amdgpu_moverate; 3542 else 3543 max_MBps = 8; /* Allow 8 MB/s. */ 3544 /* Get a log2 for easy divisions. */ 3545 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3546 3547 amdgpu_fbdev_init(adev); 3548 3549 r = amdgpu_pm_sysfs_init(adev); 3550 if (r) { 3551 adev->pm_sysfs_en = false; 3552 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3553 } else 3554 adev->pm_sysfs_en = true; 3555 3556 r = amdgpu_ucode_sysfs_init(adev); 3557 if (r) { 3558 adev->ucode_sysfs_en = false; 3559 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3560 } else 3561 adev->ucode_sysfs_en = true; 3562 3563 if ((amdgpu_testing & 1)) { 3564 if (adev->accel_working) 3565 amdgpu_test_moves(adev); 3566 else 3567 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3568 } 3569 if (amdgpu_benchmarking) { 3570 if (adev->accel_working) 3571 amdgpu_benchmark(adev, amdgpu_benchmarking); 3572 else 3573 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3574 } 3575 3576 /* 3577 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3578 * Otherwise the mgpu fan boost feature will be skipped due to the 3579 * gpu instance is counted less. 3580 */ 3581 amdgpu_register_gpu_instance(adev); 3582 3583 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3584 * explicit gating rather than handling it automatically. 3585 */ 3586 r = amdgpu_device_ip_late_init(adev); 3587 if (r) { 3588 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3589 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3590 goto failed; 3591 } 3592 3593 /* must succeed. 
*/ 3594 amdgpu_ras_resume(adev); 3595 3596 queue_delayed_work(system_wq, &adev->delayed_init_work, 3597 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3598 3599 if (amdgpu_sriov_vf(adev)) 3600 flush_delayed_work(&adev->delayed_init_work); 3601 3602 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3603 if (r) 3604 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3605 3606 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3607 r = amdgpu_pmu_init(adev); 3608 if (r) 3609 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3610 3611 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3612 if (amdgpu_device_cache_pci_state(adev->pdev)) 3613 pci_restore_state(pdev); 3614 3615 return 0; 3616 3617 failed: 3618 amdgpu_vf_error_trans_all(adev); 3619 if (boco) 3620 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3621 3622 failed_unmap: 3623 #ifdef __linux__ 3624 iounmap(adev->rmmio); 3625 adev->rmmio = NULL; 3626 #endif 3627 3628 return r; 3629 } 3630 3631 /** 3632 * amdgpu_device_fini - tear down the driver 3633 * 3634 * @adev: amdgpu_device pointer 3635 * 3636 * Tear down the driver info (all asics). 3637 * Called at driver shutdown. 3638 */ 3639 void amdgpu_device_fini(struct amdgpu_device *adev) 3640 { 3641 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3642 flush_delayed_work(&adev->delayed_init_work); 3643 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3644 adev->shutdown = true; 3645 3646 kfree(adev->pci_state); 3647 3648 /* make sure IB test finished before entering exclusive mode 3649 * to avoid preemption on IB test 3650 * */ 3651 if (amdgpu_sriov_vf(adev)) { 3652 amdgpu_virt_request_full_gpu(adev, false); 3653 amdgpu_virt_fini_data_exchange(adev); 3654 } 3655 3656 /* disable all interrupts */ 3657 amdgpu_irq_disable_all(adev); 3658 if (adev->mode_info.mode_config_initialized){ 3659 if (!amdgpu_device_has_dc_support(adev)) 3660 drm_helper_force_disable_all(adev_to_drm(adev)); 3661 else 3662 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3663 } 3664 amdgpu_fence_driver_fini(adev); 3665 if (adev->pm_sysfs_en) 3666 amdgpu_pm_sysfs_fini(adev); 3667 amdgpu_fbdev_fini(adev); 3668 amdgpu_device_ip_fini(adev); 3669 release_firmware(adev->firmware.gpu_info_fw); 3670 adev->firmware.gpu_info_fw = NULL; 3671 adev->accel_working = false; 3672 /* free i2c buses */ 3673 if (!amdgpu_device_has_dc_support(adev)) 3674 amdgpu_i2c_fini(adev); 3675 3676 if (amdgpu_emu_mode != 1) 3677 amdgpu_atombios_fini(adev); 3678 3679 kfree(adev->bios); 3680 adev->bios = NULL; 3681 if (amdgpu_has_atpx() && 3682 (amdgpu_is_atpx_hybrid() || 3683 amdgpu_has_atpx_dgpu_power_cntl()) && 3684 !pci_is_thunderbolt_attached(adev->pdev)) 3685 vga_switcheroo_unregister_client(adev->pdev); 3686 if (amdgpu_device_supports_boco(adev_to_drm(adev))) 3687 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3688 vga_client_register(adev->pdev, NULL, NULL, NULL); 3689 #ifdef __linux__ 3690 if (adev->rio_mem) 3691 pci_iounmap(adev->pdev, adev->rio_mem); 3692 adev->rio_mem = NULL; 3693 iounmap(adev->rmmio); 3694 #else 3695 if (adev->rio_mem_size > 0) 3696 bus_space_unmap(adev->rio_mem_bst, adev->rio_mem_bsh, 3697 adev->rio_mem_size); 3698 adev->rio_mem_size = 0; 3699 3700 if (adev->rmmio_size > 0) 3701 bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh, 3702 adev->rmmio_size); 3703 adev->rmmio_size = 0; 3704 #endif 3705 adev->rmmio = NULL; 3706 amdgpu_device_doorbell_fini(adev); 3707 3708 if (adev->ucode_sysfs_en) 3709 amdgpu_ucode_sysfs_fini(adev); 3710 3711 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3712 if 
(IS_ENABLED(CONFIG_PERF_EVENTS)) 3713 amdgpu_pmu_fini(adev); 3714 if (adev->mman.discovery_bin) 3715 amdgpu_discovery_fini(adev); 3716 } 3717 3718 3719 /* 3720 * Suspend & resume. 3721 */ 3722 /** 3723 * amdgpu_device_suspend - initiate device suspend 3724 * 3725 * @dev: drm dev pointer 3726 * @fbcon : notify the fbdev of suspend 3727 * 3728 * Puts the hw in the suspend state (all asics). 3729 * Returns 0 for success or an error on failure. 3730 * Called at driver suspend. 3731 */ 3732 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3733 { 3734 struct amdgpu_device *adev; 3735 struct drm_crtc *crtc; 3736 struct drm_connector *connector; 3737 struct drm_connector_list_iter iter; 3738 int r; 3739 3740 adev = drm_to_adev(dev); 3741 if (adev->shutdown) 3742 return 0; 3743 3744 #ifdef notyet 3745 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3746 return 0; 3747 #endif 3748 3749 adev->in_suspend = true; 3750 drm_kms_helper_poll_disable(dev); 3751 3752 if (fbcon) 3753 amdgpu_fbdev_set_suspend(adev, 1); 3754 3755 cancel_delayed_work_sync(&adev->delayed_init_work); 3756 3757 if (!amdgpu_device_has_dc_support(adev)) { 3758 /* turn off display hw */ 3759 drm_modeset_lock_all(dev); 3760 drm_connector_list_iter_begin(dev, &iter); 3761 drm_for_each_connector_iter(connector, &iter) 3762 drm_helper_connector_dpms(connector, 3763 DRM_MODE_DPMS_OFF); 3764 drm_connector_list_iter_end(&iter); 3765 drm_modeset_unlock_all(dev); 3766 /* unpin the front buffers and cursors */ 3767 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3768 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3769 struct drm_framebuffer *fb = crtc->primary->fb; 3770 struct amdgpu_bo *robj; 3771 3772 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3773 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3774 r = amdgpu_bo_reserve(aobj, true); 3775 if (r == 0) { 3776 amdgpu_bo_unpin(aobj); 3777 amdgpu_bo_unreserve(aobj); 3778 } 3779 } 3780 3781 if (fb == NULL || fb->obj[0] == NULL) { 3782 continue; 3783 } 3784 robj = gem_to_amdgpu_bo(fb->obj[0]); 3785 /* don't unpin kernel fb objects */ 3786 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3787 r = amdgpu_bo_reserve(robj, true); 3788 if (r == 0) { 3789 amdgpu_bo_unpin(robj); 3790 amdgpu_bo_unreserve(robj); 3791 } 3792 } 3793 } 3794 } 3795 3796 amdgpu_ras_suspend(adev); 3797 3798 r = amdgpu_device_ip_suspend_phase1(adev); 3799 3800 amdgpu_amdkfd_suspend(adev, !fbcon); 3801 3802 /* evict vram memory */ 3803 amdgpu_bo_evict_vram(adev); 3804 3805 amdgpu_fence_driver_suspend(adev); 3806 3807 r = amdgpu_device_ip_suspend_phase2(adev); 3808 3809 /* evict remaining vram memory 3810 * This second call to evict vram is to evict the gart page table 3811 * using the CPU. 3812 */ 3813 amdgpu_bo_evict_vram(adev); 3814 3815 return 0; 3816 } 3817 3818 /** 3819 * amdgpu_device_resume - initiate device resume 3820 * 3821 * @dev: drm dev pointer 3822 * @fbcon : notify the fbdev of resume 3823 * 3824 * Bring the hw back to operating state (all asics). 3825 * Returns 0 for success or an error on failure. 3826 * Called at driver resume. 
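 * Resume re-posts the card via atombios if needed, brings the IP blocks back
 * in two phases (reloading PSP/SMU firmware in between), then re-runs late
 * init before the mode is restored and output polling is re-enabled.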
3827 */ 3828 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3829 { 3830 struct drm_connector *connector; 3831 struct drm_connector_list_iter iter; 3832 struct amdgpu_device *adev = drm_to_adev(dev); 3833 struct drm_crtc *crtc; 3834 int r = 0; 3835 3836 #ifdef notyet 3837 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3838 return 0; 3839 #endif 3840 3841 /* post card */ 3842 if (amdgpu_device_need_post(adev)) { 3843 r = amdgpu_device_asic_init(adev); 3844 if (r) 3845 dev_err(adev->dev, "amdgpu asic init failed\n"); 3846 } 3847 3848 r = amdgpu_device_ip_resume(adev); 3849 if (r) { 3850 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3851 return r; 3852 } 3853 amdgpu_fence_driver_resume(adev); 3854 3855 3856 r = amdgpu_device_ip_late_init(adev); 3857 if (r) 3858 return r; 3859 3860 queue_delayed_work(system_wq, &adev->delayed_init_work, 3861 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3862 3863 if (!amdgpu_device_has_dc_support(adev)) { 3864 /* pin cursors */ 3865 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3866 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3867 3868 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3869 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3870 r = amdgpu_bo_reserve(aobj, true); 3871 if (r == 0) { 3872 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3873 if (r != 0) 3874 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); 3875 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3876 amdgpu_bo_unreserve(aobj); 3877 } 3878 } 3879 } 3880 } 3881 r = amdgpu_amdkfd_resume(adev, !fbcon); 3882 if (r) 3883 return r; 3884 3885 /* Make sure IB tests flushed */ 3886 flush_delayed_work(&adev->delayed_init_work); 3887 3888 /* blat the mode back in */ 3889 if (fbcon) { 3890 if (!amdgpu_device_has_dc_support(adev)) { 3891 /* pre DCE11 */ 3892 drm_helper_resume_force_mode(dev); 3893 3894 /* turn on display hw */ 3895 drm_modeset_lock_all(dev); 3896 3897 drm_connector_list_iter_begin(dev, &iter); 3898 drm_for_each_connector_iter(connector, &iter) 3899 drm_helper_connector_dpms(connector, 3900 DRM_MODE_DPMS_ON); 3901 drm_connector_list_iter_end(&iter); 3902 3903 drm_modeset_unlock_all(dev); 3904 } 3905 amdgpu_fbdev_set_suspend(adev, 0); 3906 } 3907 3908 drm_kms_helper_poll_enable(dev); 3909 3910 amdgpu_ras_resume(adev); 3911 3912 /* 3913 * Most of the connector probing functions try to acquire runtime pm 3914 * refs to ensure that the GPU is powered on when connector polling is 3915 * performed. Since we're calling this from a runtime PM callback, 3916 * trying to acquire rpm refs will cause us to deadlock. 3917 * 3918 * Since we're guaranteed to be holding the rpm lock, it's safe to 3919 * temporarily disable the rpm helpers so this doesn't deadlock us. 3920 */ 3921 #if defined(CONFIG_PM) && defined(__linux__) 3922 dev->dev->power.disable_depth++; 3923 #endif 3924 if (!amdgpu_device_has_dc_support(adev)) 3925 drm_helper_hpd_irq_event(dev); 3926 else 3927 drm_kms_helper_hotplug_event(dev); 3928 #if defined(CONFIG_PM) && defined(__linux__) 3929 dev->dev->power.disable_depth--; 3930 #endif 3931 adev->in_suspend = false; 3932 3933 return 0; 3934 } 3935 3936 /** 3937 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3938 * 3939 * @adev: amdgpu_device pointer 3940 * 3941 * The list of all the hardware IPs that make up the asic is walked and 3942 * the check_soft_reset callbacks are run. check_soft_reset determines 3943 * if the asic is still hung or not. 
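 * (Sketch of the loop below: every valid IP block that implements
 * check_soft_reset() has its status.hang flag refreshed from that
 * callback, and the per-block results are OR-ed into the return value.)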
3944 * Returns true if any of the IPs are still in a hung state, false if not. 3945 */ 3946 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3947 { 3948 int i; 3949 bool asic_hang = false; 3950 3951 if (amdgpu_sriov_vf(adev)) 3952 return true; 3953 3954 if (amdgpu_asic_need_full_reset(adev)) 3955 return true; 3956 3957 for (i = 0; i < adev->num_ip_blocks; i++) { 3958 if (!adev->ip_blocks[i].status.valid) 3959 continue; 3960 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3961 adev->ip_blocks[i].status.hang = 3962 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3963 if (adev->ip_blocks[i].status.hang) { 3964 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3965 asic_hang = true; 3966 } 3967 } 3968 return asic_hang; 3969 } 3970 3971 /** 3972 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3973 * 3974 * @adev: amdgpu_device pointer 3975 * 3976 * The list of all the hardware IPs that make up the asic is walked and the 3977 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3978 * handles any IP specific hardware or software state changes that are 3979 * necessary for a soft reset to succeed. 3980 * Returns 0 on success, negative error code on failure. 3981 */ 3982 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3983 { 3984 int i, r = 0; 3985 3986 for (i = 0; i < adev->num_ip_blocks; i++) { 3987 if (!adev->ip_blocks[i].status.valid) 3988 continue; 3989 if (adev->ip_blocks[i].status.hang && 3990 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3991 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3992 if (r) 3993 return r; 3994 } 3995 } 3996 3997 return 0; 3998 } 3999 4000 /** 4001 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4002 * 4003 * @adev: amdgpu_device pointer 4004 * 4005 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4006 * reset is necessary to recover. 4007 * Returns true if a full asic reset is required, false if not. 4008 */ 4009 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4010 { 4011 int i; 4012 4013 if (amdgpu_asic_need_full_reset(adev)) 4014 return true; 4015 4016 for (i = 0; i < adev->num_ip_blocks; i++) { 4017 if (!adev->ip_blocks[i].status.valid) 4018 continue; 4019 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4020 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4021 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4022 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4023 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4024 if (adev->ip_blocks[i].status.hang) { 4025 dev_info(adev->dev, "Some block need full reset!\n"); 4026 return true; 4027 } 4028 } 4029 } 4030 return false; 4031 } 4032 4033 /** 4034 * amdgpu_device_ip_soft_reset - do a soft reset 4035 * 4036 * @adev: amdgpu_device pointer 4037 * 4038 * The list of all the hardware IPs that make up the asic is walked and the 4039 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4040 * IP specific hardware or software state changes that are necessary to soft 4041 * reset the IP. 4042 * Returns 0 on success, negative error code on failure. 
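 *
 * The three soft-reset hooks are normally driven together from
 * amdgpu_device_pre_asic_reset(); conceptually:
 *
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   r = amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   if (r || amdgpu_device_ip_check_soft_reset(adev))
 *           need_full_reset = true;
 *
 * i.e. if the soft reset fails, or blocks are still hung afterwards, the
 * caller falls back to a full ASIC reset.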
4043 */ 4044 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4045 { 4046 int i, r = 0; 4047 4048 for (i = 0; i < adev->num_ip_blocks; i++) { 4049 if (!adev->ip_blocks[i].status.valid) 4050 continue; 4051 if (adev->ip_blocks[i].status.hang && 4052 adev->ip_blocks[i].version->funcs->soft_reset) { 4053 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4054 if (r) 4055 return r; 4056 } 4057 } 4058 4059 return 0; 4060 } 4061 4062 /** 4063 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4064 * 4065 * @adev: amdgpu_device pointer 4066 * 4067 * The list of all the hardware IPs that make up the asic is walked and the 4068 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4069 * handles any IP specific hardware or software state changes that are 4070 * necessary after the IP has been soft reset. 4071 * Returns 0 on success, negative error code on failure. 4072 */ 4073 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4074 { 4075 int i, r = 0; 4076 4077 for (i = 0; i < adev->num_ip_blocks; i++) { 4078 if (!adev->ip_blocks[i].status.valid) 4079 continue; 4080 if (adev->ip_blocks[i].status.hang && 4081 adev->ip_blocks[i].version->funcs->post_soft_reset) 4082 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4083 if (r) 4084 return r; 4085 } 4086 4087 return 0; 4088 } 4089 4090 /** 4091 * amdgpu_device_recover_vram - Recover some VRAM contents 4092 * 4093 * @adev: amdgpu_device pointer 4094 * 4095 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4096 * restore things like GPUVM page tables after a GPU reset where 4097 * the contents of VRAM might be lost. 4098 * 4099 * Returns: 4100 * 0 on success, negative error code on failure. 4101 */ 4102 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4103 { 4104 struct dma_fence *fence = NULL, *next = NULL; 4105 struct amdgpu_bo *shadow; 4106 long r = 1, tmo; 4107 4108 if (amdgpu_sriov_runtime(adev)) 4109 tmo = msecs_to_jiffies(8000); 4110 else 4111 tmo = msecs_to_jiffies(100); 4112 4113 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4114 mutex_lock(&adev->shadow_list_lock); 4115 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4116 4117 /* No need to recover an evicted BO */ 4118 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4119 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4120 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4121 continue; 4122 4123 r = amdgpu_bo_restore_shadow(shadow, &next); 4124 if (r) 4125 break; 4126 4127 if (fence) { 4128 tmo = dma_fence_wait_timeout(fence, false, tmo); 4129 dma_fence_put(fence); 4130 fence = next; 4131 if (tmo == 0) { 4132 r = -ETIMEDOUT; 4133 break; 4134 } else if (tmo < 0) { 4135 r = tmo; 4136 break; 4137 } 4138 } else { 4139 fence = next; 4140 } 4141 } 4142 mutex_unlock(&adev->shadow_list_lock); 4143 4144 if (fence) 4145 tmo = dma_fence_wait_timeout(fence, false, tmo); 4146 dma_fence_put(fence); 4147 4148 if (r < 0 || tmo <= 0) { 4149 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4150 return -EIO; 4151 } 4152 4153 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4154 return 0; 4155 } 4156 4157 4158 /** 4159 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4160 * 4161 * @adev: amdgpu_device pointer 4162 * @from_hypervisor: request from hypervisor 4163 * 4164 * do VF FLR and reinitialize Asic 4165 * return 0 means succeeded otherwise failed 4166 */ 4167 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4168 bool from_hypervisor) 4169 { 4170 int r; 4171 4172 if (from_hypervisor) 4173 r = amdgpu_virt_request_full_gpu(adev, true); 4174 else 4175 r = amdgpu_virt_reset_gpu(adev); 4176 if (r) 4177 return r; 4178 4179 amdgpu_amdkfd_pre_reset(adev); 4180 4181 /* Resume IP prior to SMC */ 4182 r = amdgpu_device_ip_reinit_early_sriov(adev); 4183 if (r) 4184 goto error; 4185 4186 amdgpu_virt_init_data_exchange(adev); 4187 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4188 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4189 4190 r = amdgpu_device_fw_loading(adev); 4191 if (r) 4192 return r; 4193 4194 /* now we are okay to resume SMC/CP/SDMA */ 4195 r = amdgpu_device_ip_reinit_late_sriov(adev); 4196 if (r) 4197 goto error; 4198 4199 amdgpu_irq_gpu_reset_resume_helper(adev); 4200 r = amdgpu_ib_ring_tests(adev); 4201 amdgpu_amdkfd_post_reset(adev); 4202 4203 error: 4204 amdgpu_virt_release_full_gpu(adev, true); 4205 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4206 amdgpu_inc_vram_lost(adev); 4207 r = amdgpu_device_recover_vram(adev); 4208 } 4209 4210 return r; 4211 } 4212 4213 /** 4214 * amdgpu_device_has_job_running - check if there is any job in mirror list 4215 * 4216 * @adev: amdgpu_device pointer 4217 * 4218 * check if there is any job in mirror list 4219 */ 4220 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4221 { 4222 int i; 4223 struct drm_sched_job *job; 4224 4225 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4226 struct amdgpu_ring *ring = adev->rings[i]; 4227 4228 if (!ring || !ring->sched.thread) 4229 continue; 4230 4231 spin_lock(&ring->sched.job_list_lock); 4232 job = list_first_entry_or_null(&ring->sched.ring_mirror_list, 4233 struct drm_sched_job, node); 4234 spin_unlock(&ring->sched.job_list_lock); 4235 if (job) 4236 return true; 4237 } 4238 return false; 4239 } 4240 4241 /** 4242 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4243 * 4244 * @adev: amdgpu_device pointer 4245 * 4246 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4247 * a hung GPU. 
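 *
 * In short: if the soft-reset check reports no real hang we bail out early,
 * amdgpu_gpu_recovery=0 disables recovery, amdgpu_gpu_recovery=-1 (auto)
 * enables it only for the ASICs listed in the switch below, any other value
 * force-enables it, and SR-IOV VFs skip the per-ASIC check entirely.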
4248 */ 4249 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4250 { 4251 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4252 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4253 return false; 4254 } 4255 4256 if (amdgpu_gpu_recovery == 0) 4257 goto disabled; 4258 4259 if (amdgpu_sriov_vf(adev)) 4260 return true; 4261 4262 if (amdgpu_gpu_recovery == -1) { 4263 switch (adev->asic_type) { 4264 case CHIP_BONAIRE: 4265 case CHIP_HAWAII: 4266 case CHIP_TOPAZ: 4267 case CHIP_TONGA: 4268 case CHIP_FIJI: 4269 case CHIP_POLARIS10: 4270 case CHIP_POLARIS11: 4271 case CHIP_POLARIS12: 4272 case CHIP_VEGAM: 4273 case CHIP_VEGA20: 4274 case CHIP_VEGA10: 4275 case CHIP_VEGA12: 4276 case CHIP_RAVEN: 4277 case CHIP_ARCTURUS: 4278 case CHIP_RENOIR: 4279 case CHIP_NAVI10: 4280 case CHIP_NAVI14: 4281 case CHIP_NAVI12: 4282 case CHIP_SIENNA_CICHLID: 4283 break; 4284 default: 4285 goto disabled; 4286 } 4287 } 4288 4289 return true; 4290 4291 disabled: 4292 dev_info(adev->dev, "GPU recovery disabled.\n"); 4293 return false; 4294 } 4295 4296 4297 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4298 struct amdgpu_job *job, 4299 bool *need_full_reset_arg) 4300 { 4301 int i, r = 0; 4302 bool need_full_reset = *need_full_reset_arg; 4303 4304 amdgpu_debugfs_wait_dump(adev); 4305 4306 if (amdgpu_sriov_vf(adev)) { 4307 /* stop the data exchange thread */ 4308 amdgpu_virt_fini_data_exchange(adev); 4309 } 4310 4311 /* block all schedulers and reset given job's ring */ 4312 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4313 struct amdgpu_ring *ring = adev->rings[i]; 4314 4315 if (!ring || !ring->sched.thread) 4316 continue; 4317 4318 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4319 amdgpu_fence_driver_force_completion(ring); 4320 } 4321 4322 if(job) 4323 drm_sched_increase_karma(&job->base); 4324 4325 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4326 if (!amdgpu_sriov_vf(adev)) { 4327 4328 if (!need_full_reset) 4329 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4330 4331 if (!need_full_reset) { 4332 amdgpu_device_ip_pre_soft_reset(adev); 4333 r = amdgpu_device_ip_soft_reset(adev); 4334 amdgpu_device_ip_post_soft_reset(adev); 4335 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4336 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4337 need_full_reset = true; 4338 } 4339 } 4340 4341 if (need_full_reset) 4342 r = amdgpu_device_ip_suspend(adev); 4343 4344 *need_full_reset_arg = need_full_reset; 4345 } 4346 4347 return r; 4348 } 4349 4350 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4351 struct list_head *device_list_handle, 4352 bool *need_full_reset_arg, 4353 bool skip_hw_reset) 4354 { 4355 struct amdgpu_device *tmp_adev = NULL; 4356 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4357 int r = 0; 4358 4359 /* 4360 * ASIC reset has to be done on all HGMI hive nodes ASAP 4361 * to allow proper links negotiation in FW (within 1 sec) 4362 */ 4363 if (!skip_hw_reset && need_full_reset) { 4364 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4365 /* For XGMI run all resets in parallel to speed up the process */ 4366 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4367 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4368 r = -EALREADY; 4369 } else 4370 r = amdgpu_asic_reset(tmp_adev); 4371 4372 if (r) { 4373 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4374 r, 
adev_to_drm(tmp_adev)->unique); 4375 break; 4376 } 4377 } 4378 4379 /* For XGMI wait for all resets to complete before proceed */ 4380 if (!r) { 4381 list_for_each_entry(tmp_adev, device_list_handle, 4382 gmc.xgmi.head) { 4383 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4384 flush_work(&tmp_adev->xgmi_reset_work); 4385 r = tmp_adev->asic_reset_res; 4386 if (r) 4387 break; 4388 } 4389 } 4390 } 4391 } 4392 4393 if (!r && amdgpu_ras_intr_triggered()) { 4394 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4395 if (tmp_adev->mmhub.funcs && 4396 tmp_adev->mmhub.funcs->reset_ras_error_count) 4397 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4398 } 4399 4400 amdgpu_ras_intr_cleared(); 4401 } 4402 4403 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4404 if (need_full_reset) { 4405 /* post card */ 4406 if (amdgpu_device_asic_init(tmp_adev)) 4407 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4408 4409 if (!r) { 4410 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4411 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4412 if (r) 4413 goto out; 4414 4415 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4416 if (vram_lost) { 4417 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4418 amdgpu_inc_vram_lost(tmp_adev); 4419 } 4420 4421 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4422 if (r) 4423 goto out; 4424 4425 r = amdgpu_device_fw_loading(tmp_adev); 4426 if (r) 4427 return r; 4428 4429 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4430 if (r) 4431 goto out; 4432 4433 if (vram_lost) 4434 amdgpu_device_fill_reset_magic(tmp_adev); 4435 4436 /* 4437 * Add this ASIC as tracked as reset was already 4438 * complete successfully. 4439 */ 4440 amdgpu_register_gpu_instance(tmp_adev); 4441 4442 r = amdgpu_device_ip_late_init(tmp_adev); 4443 if (r) 4444 goto out; 4445 4446 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4447 4448 /* 4449 * The GPU enters bad state once faulty pages 4450 * by ECC has reached the threshold, and ras 4451 * recovery is scheduled next. So add one check 4452 * here to break recovery if it indeed exceeds 4453 * bad page threshold, and remind user to 4454 * retire this GPU or setting one bigger 4455 * bad_page_threshold value to fix this once 4456 * probing driver again. 4457 */ 4458 if (!amdgpu_ras_check_err_threshold(tmp_adev)) { 4459 /* must succeed. 
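 * Resuming RAS here re-enables the error handling that was
 * suspended around the reset; if the ECC bad-page threshold was
 * exceeded instead, the else branch below aborts recovery with
 * -EINVAL so the user can retire the GPU or raise
 * bad_page_threshold (see the comment above this check).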
*/ 4460 amdgpu_ras_resume(tmp_adev); 4461 } else { 4462 r = -EINVAL; 4463 goto out; 4464 } 4465 4466 /* Update PSP FW topology after reset */ 4467 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4468 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4469 } 4470 } 4471 4472 out: 4473 if (!r) { 4474 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4475 r = amdgpu_ib_ring_tests(tmp_adev); 4476 if (r) { 4477 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4478 need_full_reset = true; 4479 r = -EAGAIN; 4480 goto end; 4481 } 4482 } 4483 4484 if (!r) 4485 r = amdgpu_device_recover_vram(tmp_adev); 4486 else 4487 tmp_adev->asic_reset_res = r; 4488 } 4489 4490 end: 4491 *need_full_reset_arg = need_full_reset; 4492 return r; 4493 } 4494 4495 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4496 struct amdgpu_hive_info *hive) 4497 { 4498 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4499 return false; 4500 4501 if (hive) { 4502 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4503 } else { 4504 down_write(&adev->reset_sem); 4505 } 4506 4507 atomic_inc(&adev->gpu_reset_counter); 4508 switch (amdgpu_asic_reset_method(adev)) { 4509 case AMD_RESET_METHOD_MODE1: 4510 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4511 break; 4512 case AMD_RESET_METHOD_MODE2: 4513 adev->mp1_state = PP_MP1_STATE_RESET; 4514 break; 4515 default: 4516 adev->mp1_state = PP_MP1_STATE_NONE; 4517 break; 4518 } 4519 4520 return true; 4521 } 4522 4523 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4524 { 4525 amdgpu_vf_error_trans_all(adev); 4526 adev->mp1_state = PP_MP1_STATE_NONE; 4527 atomic_set(&adev->in_gpu_reset, 0); 4528 up_write(&adev->reset_sem); 4529 } 4530 4531 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4532 { 4533 STUB(); 4534 #ifdef notyet 4535 struct pci_dev *p = NULL; 4536 4537 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4538 adev->pdev->bus->number, 1); 4539 if (p) { 4540 pm_runtime_enable(&(p->dev)); 4541 pm_runtime_resume(&(p->dev)); 4542 } 4543 #endif 4544 } 4545 4546 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4547 { 4548 enum amd_reset_method reset_method; 4549 struct pci_dev *p = NULL; 4550 u64 expires; 4551 4552 /* 4553 * For now, only BACO and mode1 reset are confirmed 4554 * to suffer the audio issue without proper suspended. 4555 */ 4556 reset_method = amdgpu_asic_reset_method(adev); 4557 if ((reset_method != AMD_RESET_METHOD_BACO) && 4558 (reset_method != AMD_RESET_METHOD_MODE1)) 4559 return -EINVAL; 4560 4561 STUB(); 4562 return -ENOSYS; 4563 #ifdef notyet 4564 4565 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4566 adev->pdev->bus->number, 1); 4567 if (!p) 4568 return -ENODEV; 4569 4570 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4571 if (!expires) 4572 /* 4573 * If we cannot get the audio device autosuspend delay, 4574 * a fixed 4S interval will be used. Considering 3S is 4575 * the audio controller default autosuspend delay setting. 4576 * 4S used here is guaranteed to cover that. 4577 */ 4578 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4579 4580 while (!pm_runtime_status_suspended(&(p->dev))) { 4581 if (!pm_runtime_suspend(&(p->dev))) 4582 break; 4583 4584 if (expires < ktime_get_mono_fast_ns()) { 4585 dev_warn(adev->dev, "failed to suspend display audio\n"); 4586 /* TODO: abort the succeeding gpu reset? 
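 * As it stands the caller (amdgpu_device_gpu_recover()) just sees
 * the -ETIMEDOUT, leaves audio_suspended false and continues with
 * the reset, so the codec is simply not resumed by us afterwards.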
*/ 4587 return -ETIMEDOUT; 4588 } 4589 } 4590 4591 pm_runtime_disable(&(p->dev)); 4592 4593 return 0; 4594 #endif 4595 } 4596 4597 /** 4598 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4599 * 4600 * @adev: amdgpu_device pointer 4601 * @job: which job trigger hang 4602 * 4603 * Attempt to reset the GPU if it has hung (all asics). 4604 * Attempt to do soft-reset or full-reset and reinitialize Asic 4605 * Returns 0 for success or an error on failure. 4606 */ 4607 4608 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4609 struct amdgpu_job *job) 4610 { 4611 struct list_head device_list, *device_list_handle = NULL; 4612 bool need_full_reset = false; 4613 bool job_signaled = false; 4614 struct amdgpu_hive_info *hive = NULL; 4615 struct amdgpu_device *tmp_adev = NULL; 4616 int i, r = 0; 4617 bool need_emergency_restart = false; 4618 bool audio_suspended = false; 4619 4620 /* 4621 * Special case: RAS triggered and full reset isn't supported 4622 */ 4623 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4624 4625 /* 4626 * Flush RAM to disk so that after reboot 4627 * the user can read log and see why the system rebooted. 4628 */ 4629 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4630 DRM_WARN("Emergency reboot."); 4631 4632 #ifdef notyet 4633 ksys_sync_helper(); 4634 emergency_restart(); 4635 #else 4636 panic("emergency_restart"); 4637 #endif 4638 } 4639 4640 dev_info(adev->dev, "GPU %s begin!\n", 4641 need_emergency_restart ? "jobs stop":"reset"); 4642 4643 /* 4644 * Here we trylock to avoid chain of resets executing from 4645 * either trigger by jobs on different adevs in XGMI hive or jobs on 4646 * different schedulers for same device while this TO handler is running. 4647 * We always reset all schedulers for device and all devices for XGMI 4648 * hive so that should take care of them too. 4649 */ 4650 hive = amdgpu_get_xgmi_hive(adev); 4651 if (hive) { 4652 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4653 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4654 job ? job->base.id : -1, hive->hive_id); 4655 amdgpu_put_xgmi_hive(hive); 4656 return 0; 4657 } 4658 mutex_lock(&hive->hive_lock); 4659 } 4660 4661 /* 4662 * Build list of devices to reset. 4663 * In case we are in XGMI hive mode, resort the device list 4664 * to put adev in the 1st position. 4665 */ 4666 INIT_LIST_HEAD(&device_list); 4667 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4668 if (!hive) 4669 return -ENODEV; 4670 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4671 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4672 device_list_handle = &hive->device_list; 4673 } else { 4674 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4675 device_list_handle = &device_list; 4676 } 4677 4678 /* block all schedulers and reset given job's ring */ 4679 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4680 if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4681 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4682 job ? job->base.id : -1); 4683 r = 0; 4684 goto skip_recovery; 4685 } 4686 4687 /* 4688 * Try to put the audio codec into suspend state 4689 * before gpu reset started. 4690 * 4691 * Due to the power domain of the graphics device 4692 * is shared with AZ power domain. Without this, 4693 * we may change the audio hardware from behind 4694 * the audio driver's back. That will trigger 4695 * some audio codec errors. 
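 * audio_suspended records whether we actually managed to suspend the
 * codec, so that the skip_sched_resume path below only calls
 * amdgpu_device_resume_display_audio() for devices we put to sleep.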
4696 */ 4697 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4698 audio_suspended = true; 4699 4700 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4701 4702 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4703 4704 if (!amdgpu_sriov_vf(tmp_adev)) 4705 amdgpu_amdkfd_pre_reset(tmp_adev); 4706 4707 /* 4708 * Mark these ASICs to be reseted as untracked first 4709 * And add them back after reset completed 4710 */ 4711 amdgpu_unregister_gpu_instance(tmp_adev); 4712 4713 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4714 4715 /* disable ras on ALL IPs */ 4716 if (!need_emergency_restart && 4717 amdgpu_device_ip_need_full_reset(tmp_adev)) 4718 amdgpu_ras_suspend(tmp_adev); 4719 4720 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4721 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4722 4723 if (!ring || !ring->sched.thread) 4724 continue; 4725 4726 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4727 4728 if (need_emergency_restart) 4729 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4730 } 4731 } 4732 4733 if (need_emergency_restart) 4734 goto skip_sched_resume; 4735 4736 /* 4737 * Must check guilty signal here since after this point all old 4738 * HW fences are force signaled. 4739 * 4740 * job->base holds a reference to parent fence 4741 */ 4742 if (job && job->base.s_fence->parent && 4743 dma_fence_is_signaled(job->base.s_fence->parent)) { 4744 job_signaled = true; 4745 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4746 goto skip_hw_reset; 4747 } 4748 4749 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4750 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4751 r = amdgpu_device_pre_asic_reset(tmp_adev, 4752 (tmp_adev == adev) ? job : NULL, 4753 &need_full_reset); 4754 /*TODO Should we stop ?*/ 4755 if (r) { 4756 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4757 r, adev_to_drm(tmp_adev)->unique); 4758 tmp_adev->asic_reset_res = r; 4759 } 4760 } 4761 4762 /* Actual ASIC resets if needed.*/ 4763 /* TODO Implement XGMI hive reset logic for SRIOV */ 4764 if (amdgpu_sriov_vf(adev)) { 4765 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4766 if (r) 4767 adev->asic_reset_res = r; 4768 } else { 4769 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4770 if (r && r == -EAGAIN) 4771 goto retry; 4772 } 4773 4774 skip_hw_reset: 4775 4776 /* Post ASIC reset for all devs .*/ 4777 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4778 4779 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4780 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4781 4782 if (!ring || !ring->sched.thread) 4783 continue; 4784 4785 /* No point to resubmit jobs if we didn't HW reset*/ 4786 if (!tmp_adev->asic_reset_res && !job_signaled) 4787 drm_sched_resubmit_jobs(&ring->sched); 4788 4789 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4790 } 4791 4792 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4793 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4794 } 4795 4796 tmp_adev->asic_reset_res = 0; 4797 4798 if (r) { 4799 /* bad news, how to tell it to userspace ? 
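 * For now the failure is only logged against the reset counter and
 * recorded via amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL)
 * below; there is no dedicated notification to userspace from here.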
*/ 4800 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4801 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4802 } else { 4803 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4804 } 4805 } 4806 4807 skip_sched_resume: 4808 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4809 /*unlock kfd: SRIOV would do it separately */ 4810 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4811 amdgpu_amdkfd_post_reset(tmp_adev); 4812 if (audio_suspended) 4813 amdgpu_device_resume_display_audio(tmp_adev); 4814 amdgpu_device_unlock_adev(tmp_adev); 4815 } 4816 4817 skip_recovery: 4818 if (hive) { 4819 atomic_set(&hive->in_reset, 0); 4820 mutex_unlock(&hive->hive_lock); 4821 amdgpu_put_xgmi_hive(hive); 4822 } 4823 4824 if (r) 4825 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4826 return r; 4827 } 4828 4829 /** 4830 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 4831 * 4832 * @adev: amdgpu_device pointer 4833 * 4834 * Fetches and stores in the driver the PCIE capabilities (gen speed 4835 * and lanes) of the slot the device is in. Handles APUs and 4836 * virtualized environments where PCIE config space may not be available. 4837 */ 4838 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4839 { 4840 struct pci_dev *pdev; 4841 enum pci_bus_speed speed_cap, platform_speed_cap; 4842 enum pcie_link_width platform_link_width; 4843 4844 if (amdgpu_pcie_gen_cap) 4845 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4846 4847 if (amdgpu_pcie_lane_cap) 4848 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4849 4850 /* covers APUs as well */ 4851 if (pci_is_root_bus(adev->pdev->bus)) { 4852 if (adev->pm.pcie_gen_mask == 0) 4853 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4854 if (adev->pm.pcie_mlw_mask == 0) 4855 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4856 return; 4857 } 4858 4859 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4860 return; 4861 4862 pcie_bandwidth_available(adev->pdev, NULL, 4863 &platform_speed_cap, &platform_link_width); 4864 4865 if (adev->pm.pcie_gen_mask == 0) { 4866 /* asic caps */ 4867 pdev = adev->pdev; 4868 speed_cap = pcie_get_speed_cap(pdev); 4869 if (speed_cap == PCI_SPEED_UNKNOWN) { 4870 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4871 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4872 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4873 } else { 4874 if (speed_cap == PCIE_SPEED_16_0GT) 4875 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4876 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4877 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4878 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4879 else if (speed_cap == PCIE_SPEED_8_0GT) 4880 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4881 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4882 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4883 else if (speed_cap == PCIE_SPEED_5_0GT) 4884 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4885 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4886 else 4887 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4888 } 4889 /* platform caps */ 4890 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4891 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4892 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4893 } else { 4894 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4895 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4896 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4897 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4898 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4899 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4900 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4901 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4902 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4903 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4904 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4905 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4906 else 4907 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4908 4909 } 4910 } 4911 if (adev->pm.pcie_mlw_mask == 0) { 4912 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4913 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4914 } else { 4915 switch (platform_link_width) { 4916 case PCIE_LNK_X32: 4917 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4918 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4919 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4920 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4921 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4922 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4923 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4924 break; 4925 case PCIE_LNK_X16: 4926 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4927 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4928 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4929 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4930 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4931 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4932 break; 4933 case PCIE_LNK_X12: 4934 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4935 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4936 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4937 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4938 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4939 break; 4940 case PCIE_LNK_X8: 4941 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4942 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4943 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4944 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4945 break; 4946 case PCIE_LNK_X4: 4947 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4948 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4950 break; 4951 case PCIE_LNK_X2: 4952 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4953 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4954 break; 4955 case PCIE_LNK_X1: 4956 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4957 break; 4958 default: 4959 break; 4960 } 4961 } 4962 } 4963 } 4964 4965 int amdgpu_device_baco_enter(struct drm_device *dev) 4966 { 4967 struct amdgpu_device *adev = drm_to_adev(dev); 4968 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4969 4970 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4971 return -ENOTSUPP; 4972 4973 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4974 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4975 4976 return amdgpu_dpm_baco_enter(adev); 4977 } 4978 4979 int amdgpu_device_baco_exit(struct drm_device *dev) 4980 { 4981 struct amdgpu_device *adev = drm_to_adev(dev); 4982 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4983 int ret = 0; 4984 4985 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 4986 return -ENOTSUPP; 4987 4988 ret = amdgpu_dpm_baco_exit(adev); 4989 if (ret) 4990 return ret; 4991 4992 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 4993 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4994 4995 return 0; 4996 } 4997 4998 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 4999 { 5000 int i; 5001 5002 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5003 struct amdgpu_ring *ring = 
adev->rings[i]; 5004 5005 if (!ring || !ring->sched.thread) 5006 continue; 5007 5008 cancel_delayed_work_sync(&ring->sched.work_tdr); 5009 } 5010 } 5011 5012 /** 5013 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5014 * @pdev: PCI device struct 5015 * @state: PCI channel state 5016 * 5017 * Description: Called when a PCI error is detected. 5018 * 5019 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5020 */ 5021 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5022 { 5023 STUB(); 5024 return 0; 5025 #ifdef notyet 5026 struct drm_device *dev = pci_get_drvdata(pdev); 5027 struct amdgpu_device *adev = drm_to_adev(dev); 5028 int i; 5029 5030 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5031 5032 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5033 DRM_WARN("No support for XGMI hive yet..."); 5034 return PCI_ERS_RESULT_DISCONNECT; 5035 } 5036 5037 switch (state) { 5038 case pci_channel_io_normal: 5039 return PCI_ERS_RESULT_CAN_RECOVER; 5040 /* Fatal error, prepare for slot reset */ 5041 case pci_channel_io_frozen: 5042 /* 5043 * Cancel and wait for all TDRs in progress if failing to 5044 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5045 * 5046 * Locking adev->reset_sem will prevent any external access 5047 * to GPU during PCI error recovery 5048 */ 5049 while (!amdgpu_device_lock_adev(adev, NULL)) 5050 amdgpu_cancel_all_tdr(adev); 5051 5052 /* 5053 * Block any work scheduling as we do for regular GPU reset 5054 * for the duration of the recovery 5055 */ 5056 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5057 struct amdgpu_ring *ring = adev->rings[i]; 5058 5059 if (!ring || !ring->sched.thread) 5060 continue; 5061 5062 drm_sched_stop(&ring->sched, NULL); 5063 } 5064 return PCI_ERS_RESULT_NEED_RESET; 5065 case pci_channel_io_perm_failure: 5066 /* Permanent error, prepare for device removal */ 5067 return PCI_ERS_RESULT_DISCONNECT; 5068 } 5069 5070 return PCI_ERS_RESULT_NEED_RESET; 5071 #endif 5072 } 5073 5074 /** 5075 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5076 * @pdev: pointer to PCI device 5077 */ 5078 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5079 { 5080 5081 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5082 5083 /* TODO - dump whatever for debugging purposes */ 5084 5085 /* This called only if amdgpu_pci_error_detected returns 5086 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5087 * works, no need to reset slot. 5088 */ 5089 5090 return PCI_ERS_RESULT_RECOVERED; 5091 } 5092 5093 /** 5094 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5095 * @pdev: PCI device struct 5096 * 5097 * Description: This routine is called by the pci error recovery 5098 * code after the PCI slot has been reset, just before we 5099 * should resume normal operations. 
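 *
 * On Linux this restores the cached PCI config space, polls
 * amdgpu_asic_get_config_memsize() until the ASIC responds again and then
 * runs the usual pre_asic_reset/do_asic_reset sequence; on this port the
 * body is still stubbed out.
 *
 * Return: PCI_ERS_RESULT_RECOVERED on success, PCI_ERS_RESULT_DISCONNECT
 * otherwise (the stubbed build always reports RECOVERED).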
5100 */ 5101 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5102 { 5103 STUB(); 5104 return PCI_ERS_RESULT_RECOVERED; 5105 #ifdef notyet 5106 struct drm_device *dev = pci_get_drvdata(pdev); 5107 struct amdgpu_device *adev = drm_to_adev(dev); 5108 int r, i; 5109 bool need_full_reset = true; 5110 u32 memsize; 5111 struct list_head device_list; 5112 5113 DRM_INFO("PCI error: slot reset callback!!\n"); 5114 5115 INIT_LIST_HEAD(&device_list); 5116 list_add_tail(&adev->gmc.xgmi.head, &device_list); 5117 5118 /* wait for asic to come out of reset */ 5119 drm_msleep(500); 5120 5121 /* Restore PCI confspace */ 5122 amdgpu_device_load_pci_state(pdev); 5123 5124 /* confirm ASIC came out of reset */ 5125 for (i = 0; i < adev->usec_timeout; i++) { 5126 memsize = amdgpu_asic_get_config_memsize(adev); 5127 5128 if (memsize != 0xffffffff) 5129 break; 5130 udelay(1); 5131 } 5132 if (memsize == 0xffffffff) { 5133 r = -ETIME; 5134 goto out; 5135 } 5136 5137 adev->in_pci_err_recovery = true; 5138 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5139 adev->in_pci_err_recovery = false; 5140 if (r) 5141 goto out; 5142 5143 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5144 5145 out: 5146 if (!r) { 5147 if (amdgpu_device_cache_pci_state(adev->pdev)) 5148 pci_restore_state(adev->pdev); 5149 5150 DRM_INFO("PCIe error recovery succeeded\n"); 5151 } else { 5152 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5153 amdgpu_device_unlock_adev(adev); 5154 } 5155 5156 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5157 #endif 5158 } 5159 5160 /** 5161 * amdgpu_pci_resume() - resume normal ops after PCI reset 5162 * @pdev: pointer to PCI device 5163 * 5164 * Called when the error recovery driver tells us that its 5165 * OK to resume normal operation. Use completion to allow 5166 * halted scsi ops to resume. 
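 * For amdgpu this means restarting the DRM schedulers that were stopped
 * in amdgpu_pci_error_detected() and dropping the reset lock via
 * amdgpu_device_unlock_adev() (again only in the notyet/Linux path).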
5167 */ 5168 void amdgpu_pci_resume(struct pci_dev *pdev) 5169 { 5170 STUB(); 5171 #ifdef notyet 5172 struct drm_device *dev = pci_get_drvdata(pdev); 5173 struct amdgpu_device *adev = drm_to_adev(dev); 5174 int i; 5175 5176 5177 DRM_INFO("PCI error: resume callback!!\n"); 5178 5179 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5180 struct amdgpu_ring *ring = adev->rings[i]; 5181 5182 if (!ring || !ring->sched.thread) 5183 continue; 5184 5185 5186 drm_sched_resubmit_jobs(&ring->sched); 5187 drm_sched_start(&ring->sched, true); 5188 } 5189 5190 amdgpu_device_unlock_adev(adev); 5191 #endif 5192 } 5193 5194 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5195 { 5196 return false; 5197 #ifdef notyet 5198 struct drm_device *dev = pci_get_drvdata(pdev); 5199 struct amdgpu_device *adev = drm_to_adev(dev); 5200 int r; 5201 5202 r = pci_save_state(pdev); 5203 if (!r) { 5204 kfree(adev->pci_state); 5205 5206 adev->pci_state = pci_store_saved_state(pdev); 5207 5208 if (!adev->pci_state) { 5209 DRM_ERROR("Failed to store PCI saved state"); 5210 return false; 5211 } 5212 } else { 5213 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5214 return false; 5215 } 5216 5217 return true; 5218 #endif 5219 } 5220 5221 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5222 { 5223 STUB(); 5224 return false; 5225 #ifdef notyet 5226 struct drm_device *dev = pci_get_drvdata(pdev); 5227 struct amdgpu_device *adev = drm_to_adev(dev); 5228 int r; 5229 5230 if (!adev->pci_state) 5231 return false; 5232 5233 r = pci_load_saved_state(pdev, adev->pci_state); 5234 5235 if (!r) { 5236 pci_restore_state(pdev); 5237 } else { 5238 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5239 return false; 5240 } 5241 5242 return true; 5243 #endif 5244 } 5245 5246 5247
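/*
 * Usage sketch for the PCI-state helpers above (Linux/#ifdef notyet paths;
 * both are still stubs on this port): amdgpu_device_cache_pci_state() is
 * called at the end of device init and again after a successful slot reset
 * so that a known-good copy of config space is always at hand, and
 * amdgpu_pci_slot_reset() replays that copy with
 * amdgpu_device_load_pci_state() before the ASIC is re-initialized:
 *
 *   amdgpu_device_load_pci_state(pdev);              // replay cached confspace
 *   ...
 *   if (amdgpu_device_cache_pci_state(adev->pdev))
 *           pci_restore_state(adev->pdev);           // refresh the cached copy
 */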