1 /* $NetBSD: amdgpu_device.c,v 1.21 2024/07/01 12:09:52 riastradh Exp $ */ 2 3 /* 4 * Copyright 2008 Advanced Micro Devices, Inc. 5 * Copyright 2008 Red Hat Inc. 6 * Copyright 2009 Jerome Glisse. 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in 16 * all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 22 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 23 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 * OTHER DEALINGS IN THE SOFTWARE. 25 * 26 * Authors: Dave Airlie 27 * Alex Deucher 28 * Jerome Glisse 29 */ 30 #include <sys/cdefs.h> 31 __KERNEL_RCSID(0, "$NetBSD: amdgpu_device.c,v 1.21 2024/07/01 12:09:52 riastradh Exp $"); 32 33 #include <linux/power_supply.h> 34 #include <linux/kthread.h> 35 #include <linux/module.h> 36 #include <linux/console.h> 37 #include <linux/slab.h> 38 #include <linux/reboot.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_probe_helper.h> 42 #include <drm/amdgpu_drm.h> 43 #include <linux/vgaarb.h> 44 #include <linux/vga_switcheroo.h> 45 #include <linux/efi.h> 46 #include "amdgpu.h" 47 #include "amdgpu_trace.h" 48 #include "amdgpu_i2c.h" 49 #include "atom.h" 50 #include "amdgpu_atombios.h" 51 #include "amdgpu_atomfirmware.h" 52 #include "amd_pcie.h" 53 #ifdef CONFIG_DRM_AMDGPU_SI 54 #include "si.h" 55 #endif 56 #ifdef CONFIG_DRM_AMDGPU_CIK 57 #include "cik.h" 58 #endif 59 #include "vi.h" 60 #include "soc15.h" 61 #include "nv.h" 62 #include "bif/bif_4_1_d.h" 63 #include <linux/pci.h> 64 #include <linux/firmware.h> 65 #include "amdgpu_vf_error.h" 66 67 #include "amdgpu_amdkfd.h" 68 #include "amdgpu_pm.h" 69 70 #include "amdgpu_xgmi.h" 71 #include "amdgpu_ras.h" 72 #include "amdgpu_pmu.h" 73 74 #include <linux/suspend.h> 75 #include <drm/task_barrier.h> 76 #include <linux/nbsd-namespace.h> 77 78 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 88 89 #define AMDGPU_RESUME_MS 2000 90 91 const char *amdgpu_asic_name[] = { 92 "TAHITI", 93 "PITCAIRN", 94 "VERDE", 95 "OLAND", 96 "HAINAN", 97 "BONAIRE", 98 "KAVERI", 99 "KABINI", 100 "HAWAII", 101 "MULLINS", 102 "TOPAZ", 103 "TONGA", 104 "FIJI", 105 "CARRIZO", 106 "STONEY", 107 "POLARIS10", 108 "POLARIS11", 109 "POLARIS12", 110 "VEGAM", 111 "VEGA10", 112 "VEGA12", 113 "VEGA20", 114 "RAVEN", 115 "ARCTURUS", 116 
"RENOIR", 117 "NAVI10", 118 "NAVI14", 119 "NAVI12", 120 "LAST", 121 }; 122 123 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 124 125 /** 126 * DOC: pcie_replay_count 127 * 128 * The amdgpu driver provides a sysfs API for reporting the total number 129 * of PCIe replays (NAKs) 130 * The file pcie_replay_count is used for this and returns the total 131 * number of replays as a sum of the NAKs generated and NAKs received 132 */ 133 134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 135 struct device_attribute *attr, char *buf) 136 { 137 struct drm_device *ddev = dev_get_drvdata(dev); 138 struct amdgpu_device *adev = ddev->dev_private; 139 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 140 141 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 142 } 143 144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 145 amdgpu_device_get_pcie_replay_count, NULL); 146 147 #endif /* __NetBSD__ */ 148 149 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 150 151 /** 152 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 153 * 154 * @dev: drm_device pointer 155 * 156 * Returns true if the device is a dGPU with HG/PX power control, 157 * otherwise return false. 158 */ 159 bool amdgpu_device_supports_boco(struct drm_device *dev) 160 { 161 struct amdgpu_device *adev = dev->dev_private; 162 163 if (adev->flags & AMD_IS_PX) 164 return true; 165 return false; 166 } 167 168 /** 169 * amdgpu_device_supports_baco - Does the device support BACO 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device supporte BACO, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_baco(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = dev->dev_private; 179 180 return amdgpu_asic_supports_baco(adev); 181 } 182 183 /** 184 * VRAM access helper functions. 185 * 186 * amdgpu_device_vram_access - read/write a buffer in vram 187 * 188 * @adev: amdgpu_device pointer 189 * @pos: offset of the buffer in vram 190 * @buf: virtual address of the buffer in system memory 191 * @size: read/write size, sizeof(@buf) must > @size 192 * @write: true - write to vram, otherwise - read from vram 193 */ 194 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 195 uint32_t *buf, size_t size, bool write) 196 { 197 uint64_t last; 198 unsigned long flags; 199 200 last = size - 4; 201 for (last += pos; pos <= last; pos += 4) { 202 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 203 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 204 WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31); 205 if (write) 206 WREG32_NO_KIQ(mmMM_DATA, *buf++); 207 else 208 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 209 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 210 } 211 } 212 213 /* 214 * MMIO register access helper functions. 215 */ 216 /** 217 * amdgpu_mm_rreg - read a memory mapped IO register 218 * 219 * @adev: amdgpu_device pointer 220 * @reg: dword aligned register offset 221 * @acc_flags: access flags which require special behavior 222 * 223 * Returns the 32 bit value from the offset specified. 
/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
		return amdgpu_kiq_rreg(adev, reg);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
#ifdef __NetBSD__
		return bus_space_read_4(adev->rmmiot, adev->rmmioh, 4*reg);
#else
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
#endif
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
		    4*reg);
		ret = bus_space_read_4(adev->rmmiot, adev->rmmioh,
		    4*mmMM_DATA);
#else
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
#endif
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with byte helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
	if (offset < adev->rmmio_size)
#ifdef __NetBSD__
		return bus_space_read_1(adev->rmmiot, adev->rmmioh, offset);
#else
		return (readb(adev->rmmio + offset));
#endif
	BUG();
}

/*
 * MMIO register write with byte helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
	if (offset < adev->rmmio_size)
#ifdef __NetBSD__
		bus_space_write_1(adev->rmmiot, adev->rmmioh, offset, value);
#else
		writeb(value, adev->rmmio + offset);
#endif
	else
		BUG();
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
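 *
 * Note (descriptive, added for clarity): offsets inside the mapped
 * register BAR are written directly; anything else, or an access with
 * AMDGPU_REGS_IDX set, goes through the indirect mmMM_INDEX/mmMM_DATA
 * window under mmio_idx_lock, and KIQ-routed accesses under SR-IOV are
 * forwarded to amdgpu_kiq_wreg() instead.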
317 */ 318 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 319 uint32_t acc_flags) 320 { 321 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 322 323 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { 324 adev->last_mm_index = v; 325 } 326 327 if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) 328 return amdgpu_kiq_wreg(adev, reg, v); 329 330 if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) 331 #ifdef __NetBSD__ 332 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*reg, v); 333 #else 334 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 335 #endif 336 else { 337 unsigned long flags; 338 339 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 340 #ifdef __NetBSD__ 341 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX, 342 reg*4); 343 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_DATA, v); 344 #else 345 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 346 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 347 #endif 348 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 349 } 350 351 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { 352 udelay(500); 353 } 354 } 355 356 /** 357 * amdgpu_io_rreg - read an IO register 358 * 359 * @adev: amdgpu_device pointer 360 * @reg: dword aligned register offset 361 * 362 * Returns the 32 bit value from the offset specified. 363 */ 364 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 365 { 366 if ((reg * 4) < adev->rio_mem_size) 367 #ifdef __NetBSD__ 368 return bus_space_read_4(adev->rio_memt, adev->rio_memh, 4*reg); 369 #else 370 return ioread32(adev->rio_mem + (reg * 4)); 371 #endif 372 else { 373 #ifdef __NetBSD__ 374 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX, 375 4*reg); 376 return bus_space_read_4(adev->rio_memt, adev->rio_memh, 377 4*mmMM_DATA); 378 #else 379 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 380 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 381 #endif 382 } 383 } 384 385 /** 386 * amdgpu_io_wreg - write to an IO register 387 * 388 * @adev: amdgpu_device pointer 389 * @reg: dword aligned register offset 390 * @v: 32 bit value to write to the register 391 * 392 * Writes the value specified to the offset specified. 393 */ 394 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 395 { 396 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { 397 adev->last_mm_index = v; 398 } 399 400 if ((reg * 4) < adev->rio_mem_size) 401 #ifdef __NetBSD__ 402 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*reg, v); 403 #else 404 iowrite32(v, adev->rio_mem + (reg * 4)); 405 #endif 406 else { 407 #ifdef __NetBSD__ 408 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX, 409 4*reg); 410 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_DATA, 411 v); 412 #else 413 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 414 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 415 #endif 416 } 417 418 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { 419 udelay(500); 420 } 421 } 422 423 /** 424 * amdgpu_mm_rdoorbell - read a doorbell dword 425 * 426 * @adev: amdgpu_device pointer 427 * @index: doorbell index 428 * 429 * Returns the value in the doorbell aperture at the 430 * requested doorbell index (CIK). 
431 */ 432 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 433 { 434 if (index < adev->doorbell.num_doorbells) { 435 #ifdef __NetBSD__ 436 return bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 437 4*index); 438 #else 439 return readl(adev->doorbell.ptr + index); 440 #endif 441 } else { 442 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 443 return 0; 444 } 445 } 446 447 /** 448 * amdgpu_mm_wdoorbell - write a doorbell dword 449 * 450 * @adev: amdgpu_device pointer 451 * @index: doorbell index 452 * @v: value to write 453 * 454 * Writes @v to the doorbell aperture at the 455 * requested doorbell index (CIK). 456 */ 457 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 458 { 459 if (index < adev->doorbell.num_doorbells) { 460 #ifdef __NetBSD__ 461 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh, 462 4*index, v); 463 #else 464 writel(v, adev->doorbell.ptr + index); 465 #endif 466 } else { 467 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 468 } 469 } 470 471 /** 472 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 473 * 474 * @adev: amdgpu_device pointer 475 * @index: doorbell index 476 * 477 * Returns the value in the doorbell aperture at the 478 * requested doorbell index (VEGA10+). 479 */ 480 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 481 { 482 if (index < adev->doorbell.num_doorbells) { 483 #ifdef __NetBSD__ 484 #ifdef _LP64 485 return bus_space_read_8(adev->doorbell.bst, adev->doorbell.bsh, 486 4*index); 487 #else 488 uint64_t lo, hi; 489 #if _BYTE_ORDER == _LITTLE_ENDIAN 490 lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 491 4*index); 492 hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 493 4*index + 4); 494 #else 495 hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 496 4*index); 497 lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 498 4*index + 4); 499 #endif 500 return lo | (hi << 32); 501 #endif 502 #else 503 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 504 #endif 505 } else { 506 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 507 return 0; 508 } 509 } 510 511 /** 512 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 513 * 514 * @adev: amdgpu_device pointer 515 * @index: doorbell index 516 * @v: value to write 517 * 518 * Writes @v to the doorbell aperture at the 519 * requested doorbell index (VEGA10+). 520 */ 521 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 522 { 523 if (index < adev->doorbell.num_doorbells) { 524 #ifdef __NetBSD__ 525 #ifdef _LP64 526 bus_space_write_8(adev->doorbell.bst, adev->doorbell.bsh, 527 4*index, v); 528 #else 529 /* 530 * XXX This might not be as atomic as one might hope... 
531 */ 532 #if _BYTE_ORDER == _LITTLE_ENDIAN 533 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh, 534 4*index, v & 0xffffffffU); 535 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh, 536 4*index + 4, v >> 32); 537 #else 538 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh, 539 4*index, v >> 32); 540 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh, 541 4*index + 4, v & 0xffffffffU); 542 #endif 543 #endif 544 #else 545 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 546 #endif 547 } else { 548 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 549 } 550 } 551 552 /** 553 * amdgpu_invalid_rreg - dummy reg read function 554 * 555 * @adev: amdgpu device pointer 556 * @reg: offset of register 557 * 558 * Dummy register read function. Used for register blocks 559 * that certain asics don't have (all asics). 560 * Returns the value in the register. 561 */ 562 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 563 { 564 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 565 BUG(); 566 return 0; 567 } 568 569 /** 570 * amdgpu_invalid_wreg - dummy reg write function 571 * 572 * @adev: amdgpu device pointer 573 * @reg: offset of register 574 * @v: value to write to the register 575 * 576 * Dummy register read function. Used for register blocks 577 * that certain asics don't have (all asics). 578 */ 579 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 580 { 581 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 582 reg, v); 583 BUG(); 584 } 585 586 /** 587 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 588 * 589 * @adev: amdgpu device pointer 590 * @reg: offset of register 591 * 592 * Dummy register read function. Used for register blocks 593 * that certain asics don't have (all asics). 594 * Returns the value in the register. 595 */ 596 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 597 { 598 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 599 BUG(); 600 return 0; 601 } 602 603 /** 604 * amdgpu_invalid_wreg64 - dummy reg write function 605 * 606 * @adev: amdgpu device pointer 607 * @reg: offset of register 608 * @v: value to write to the register 609 * 610 * Dummy register read function. Used for register blocks 611 * that certain asics don't have (all asics). 612 */ 613 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 614 { 615 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08"PRIX64"\n", 616 reg, v); 617 BUG(); 618 } 619 620 /** 621 * amdgpu_block_invalid_rreg - dummy reg read function 622 * 623 * @adev: amdgpu device pointer 624 * @block: offset of instance 625 * @reg: offset of register 626 * 627 * Dummy register read function. Used for register blocks 628 * that certain asics don't have (all asics). 629 * Returns the value in the register. 630 */ 631 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 632 uint32_t block, uint32_t reg) 633 { 634 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 635 reg, block); 636 BUG(); 637 return 0; 638 } 639 640 /** 641 * amdgpu_block_invalid_wreg - dummy reg write function 642 * 643 * @adev: amdgpu device pointer 644 * @block: offset of instance 645 * @reg: offset of register 646 * @v: value to write to the register 647 * 648 * Dummy register read function. Used for register blocks 649 * that certain asics don't have (all asics). 
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)__UNVOLATILE(&adev->vram_scratch.ptr));
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
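 *
 * Descriptive note added for clarity: on NetBSD the doorbell aperture is
 * mapped with bus_space_map() into adev->doorbell.bst/bsh, while on Linux
 * it is ioremap()ed to adev->doorbell.ptr.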
751 */ 752 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 753 { 754 755 /* No doorbell on SI hardware generation */ 756 if (adev->asic_type < CHIP_BONAIRE) { 757 adev->doorbell.base = 0; 758 adev->doorbell.size = 0; 759 adev->doorbell.num_doorbells = 0; 760 #ifndef __NetBSD__ 761 adev->doorbell.ptr = NULL; 762 #endif 763 return 0; 764 } 765 766 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 767 return -EINVAL; 768 769 amdgpu_asic_init_doorbell_index(adev); 770 771 /* doorbell bar mapping */ 772 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 773 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 774 775 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 776 adev->doorbell_index.max_assignment+1); 777 if (adev->doorbell.num_doorbells == 0) 778 return -EINVAL; 779 780 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 781 * paging queue doorbell use the second page. The 782 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 783 * doorbells are in the first page. So with paging queue enabled, 784 * the max num_doorbells should + 1 page (0x400 in dword) 785 */ 786 if (adev->asic_type >= CHIP_VEGA10) 787 adev->doorbell.num_doorbells += 0x400; 788 789 #ifdef __NetBSD__ 790 int r; 791 adev->doorbell.bst = adev->pdev->pd_pa.pa_memt; 792 /* XXX errno NetBSD->Linux */ 793 r = -bus_space_map(adev->doorbell.bst, adev->doorbell.base, 794 adev->doorbell.num_doorbells * sizeof(u32), 0, 795 &adev->doorbell.bsh); 796 if (r) 797 return r; 798 #else 799 adev->doorbell.ptr = ioremap(adev->doorbell.base, 800 adev->doorbell.num_doorbells * 801 sizeof(u32)); 802 if (adev->doorbell.ptr == NULL) 803 return -ENOMEM; 804 #endif 805 806 return 0; 807 } 808 809 /** 810 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 811 * 812 * @adev: amdgpu_device pointer 813 * 814 * Tear down doorbell driver information (CIK) 815 */ 816 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 817 { 818 #ifdef __NetBSD__ 819 if (adev->doorbell.num_doorbells) { 820 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 821 adev->doorbell.num_doorbells * sizeof(u32)); 822 adev->doorbell.num_doorbells = 0; 823 } 824 #else 825 iounmap(adev->doorbell.ptr); 826 adev->doorbell.ptr = NULL; 827 #endif 828 } 829 830 831 832 /* 833 * amdgpu_device_wb_*() 834 * Writeback is the method by which the GPU updates special pages in memory 835 * with the status of certain GPU events (fences, ring pointers,etc.). 836 */ 837 838 /** 839 * amdgpu_device_wb_fini - Disable Writeback and free memory 840 * 841 * @adev: amdgpu_device pointer 842 * 843 * Disables Writeback and frees the Writeback memory (all asics). 844 * Used at driver shutdown. 845 */ 846 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 847 { 848 if (adev->wb.wb_obj) { 849 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 850 &adev->wb.gpu_addr, 851 (void **)__UNVOLATILE(&adev->wb.wb)); 852 adev->wb.wb_obj = NULL; 853 } 854 } 855 856 /** 857 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 858 * 859 * @adev: amdgpu_device pointer 860 * 861 * Initializes writeback and allocates writeback memory (all asics). 862 * Used at driver startup. 863 * Returns 0 on success or an -error on failure. 
864 */ 865 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 866 { 867 int r; 868 869 if (adev->wb.wb_obj == NULL) { 870 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 871 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 872 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 873 &adev->wb.wb_obj, &adev->wb.gpu_addr, 874 (void **)__UNVOLATILE(&adev->wb.wb)); 875 if (r) { 876 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 877 return r; 878 } 879 880 adev->wb.num_wb = AMDGPU_MAX_WB; 881 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 882 883 /* clear wb memory */ 884 memset(__UNVOLATILE(adev->wb.wb), 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 885 } 886 887 return 0; 888 } 889 890 /** 891 * amdgpu_device_wb_get - Allocate a wb entry 892 * 893 * @adev: amdgpu_device pointer 894 * @wb: wb index 895 * 896 * Allocate a wb slot for use by the driver (all asics). 897 * Returns 0 on success or -EINVAL on failure. 898 */ 899 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 900 { 901 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 902 903 if (offset < adev->wb.num_wb) { 904 __set_bit(offset, adev->wb.used); 905 *wb = offset << 3; /* convert to dw offset */ 906 return 0; 907 } else { 908 return -EINVAL; 909 } 910 } 911 912 /** 913 * amdgpu_device_wb_free - Free a wb entry 914 * 915 * @adev: amdgpu_device pointer 916 * @wb: wb index 917 * 918 * Free a wb slot allocated for use by the driver (all asics) 919 */ 920 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 921 { 922 wb >>= 3; 923 if (wb < adev->wb.num_wb) 924 __clear_bit(wb, adev->wb.used); 925 } 926 927 /** 928 * amdgpu_device_resize_fb_bar - try to resize FB BAR 929 * 930 * @adev: amdgpu_device pointer 931 * 932 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 933 * to fail, but if any of the BARs is not accessible after the size we abort 934 * driver loading by returning -ENODEV. 935 */ 936 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 937 { 938 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 939 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 940 struct pci_bus *root; 941 struct resource *res; 942 unsigned i; 943 u16 cmd; 944 int r; 945 946 /* Bypass for VF */ 947 if (amdgpu_sriov_vf(adev)) 948 return 0; 949 950 #ifdef __NetBSD__ /* XXX amdgpu fb resize */ 951 __USE(space_needed); 952 __USE(rbar_size); 953 __USE(root); 954 __USE(res); 955 __USE(i); 956 __USE(cmd); 957 __USE(r); 958 #else 959 960 /* Check if the root BUS has 64bit memory resources */ 961 root = adev->pdev->bus; 962 while (root->parent) 963 root = root->parent; 964 965 pci_bus_for_each_resource(root, res, i) { 966 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 967 res->start > 0x100000000ull) 968 break; 969 } 970 971 /* Trying to resize is pointless without a root hub window above 4GB */ 972 if (!res) 973 return 0; 974 975 /* Disable memory decoding while we change the BAR addresses and size */ 976 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 977 pci_write_config_word(adev->pdev, PCI_COMMAND, 978 cmd & ~PCI_COMMAND_MEMORY); 979 980 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 981 amdgpu_device_doorbell_fini(adev); 982 if (adev->asic_type >= CHIP_BONAIRE) 983 pci_release_resource(adev->pdev, 2); 984 985 pci_release_resource(adev->pdev, 0); 986 987 r = pci_resize_resource(adev->pdev, 0, rbar_size); 988 if (r == -ENOSPC) 989 DRM_INFO("Not enough PCI address space for a large BAR."); 990 else if (r && r != -ENOTSUPP) 991 DRM_ERROR("Problem resizing BAR0 (%d).", r); 992 993 pci_assign_unassigned_bus_resources(adev->pdev->bus); 994 995 /* When the doorbell or fb BAR isn't available we have no chance of 996 * using the device. 997 */ 998 r = amdgpu_device_doorbell_init(adev); 999 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1000 return -ENODEV; 1001 1002 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1003 1004 #endif 1005 1006 return 0; 1007 } 1008 1009 /* 1010 * GPU helpers function. 1011 */ 1012 /** 1013 * amdgpu_device_need_post - check if the hw need post or not 1014 * 1015 * @adev: amdgpu_device pointer 1016 * 1017 * Check if the asic has been initialized (all asics) at driver startup 1018 * or post is needed if hw reset is performed. 1019 * Returns true if need or false if not. 1020 */ 1021 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1022 { 1023 uint32_t reg; 1024 1025 if (amdgpu_sriov_vf(adev)) 1026 return false; 1027 1028 if (amdgpu_passthrough(adev)) { 1029 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1030 * some old smc fw still need driver do vPost otherwise gpu hang, while 1031 * those smc fw version above 22.15 doesn't have this flaw, so we force 1032 * vpost executed for smc version below 22.15 1033 */ 1034 if (adev->asic_type == CHIP_FIJI) { 1035 int err; 1036 uint32_t fw_ver; 1037 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1038 /* force vPost if error occured */ 1039 if (err) 1040 return true; 1041 1042 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1043 if (fw_ver < 0x00160e00) 1044 return true; 1045 } 1046 } 1047 1048 if (adev->has_hw_reset) { 1049 adev->has_hw_reset = false; 1050 return true; 1051 } 1052 1053 /* bios scratch used on CIK+ */ 1054 if (adev->asic_type >= CHIP_BONAIRE) 1055 return amdgpu_atombios_scratch_need_asic_init(adev); 1056 1057 /* check MEM_SIZE for older asics */ 1058 reg = amdgpu_asic_get_config_memsize(adev); 1059 1060 if ((reg != 0) && (reg != 0xffffffff)) 1061 return false; 1062 1063 return true; 1064 } 1065 1066 #ifndef __NetBSD__ /* XXX amdgpu vga */ 1067 /* if we get transitioned to only one device, take VGA back */ 1068 /** 1069 * amdgpu_device_vga_set_decode - enable/disable vga decode 1070 * 1071 * @cookie: amdgpu_device pointer 1072 * @state: enable/disable vga decode 1073 * 1074 * Enable/disable vga decode (all asics). 1075 * Returns VGA resource flags. 1076 */ 1077 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1078 { 1079 struct amdgpu_device *adev = cookie; 1080 amdgpu_asic_set_vga_state(adev, state); 1081 if (state) 1082 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1083 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1084 else 1085 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1086 } 1087 #endif /* __NetBSD__ */ 1088 1089 /** 1090 * amdgpu_device_check_block_size - validate the vm block size 1091 * 1092 * @adev: amdgpu_device pointer 1093 * 1094 * Validates the vm block size specified via module parameter. 
1095 * The vm block size defines number of bits in page table versus page directory, 1096 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1097 * page table and the remaining bits are in the page directory. 1098 */ 1099 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1100 { 1101 /* defines number of bits in page table versus page directory, 1102 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1103 * page table and the remaining bits are in the page directory */ 1104 if (amdgpu_vm_block_size == -1) 1105 return; 1106 1107 if (amdgpu_vm_block_size < 9) { 1108 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1109 amdgpu_vm_block_size); 1110 amdgpu_vm_block_size = -1; 1111 } 1112 } 1113 1114 /** 1115 * amdgpu_device_check_vm_size - validate the vm size 1116 * 1117 * @adev: amdgpu_device pointer 1118 * 1119 * Validates the vm size in GB specified via module parameter. 1120 * The VM size is the size of the GPU virtual memory space in GB. 1121 */ 1122 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1123 { 1124 /* no need to check the default value */ 1125 if (amdgpu_vm_size == -1) 1126 return; 1127 1128 if (amdgpu_vm_size < 1) { 1129 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1130 amdgpu_vm_size); 1131 amdgpu_vm_size = -1; 1132 } 1133 } 1134 1135 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1136 { 1137 struct sysinfo si; 1138 bool is_os_64 = (sizeof(void *) == 8); 1139 uint64_t total_memory; 1140 uint64_t dram_size_seven_GB = 0x1B8000000; 1141 uint64_t dram_size_three_GB = 0xB8000000; 1142 1143 if (amdgpu_smu_memory_pool_size == 0) 1144 return; 1145 1146 if (!is_os_64) { 1147 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1148 goto def_value; 1149 } 1150 si_meminfo(&si); 1151 total_memory = (uint64_t)si.totalram * si.mem_unit; 1152 1153 if ((amdgpu_smu_memory_pool_size == 1) || 1154 (amdgpu_smu_memory_pool_size == 2)) { 1155 if (total_memory < dram_size_three_GB) 1156 goto def_value1; 1157 } else if ((amdgpu_smu_memory_pool_size == 4) || 1158 (amdgpu_smu_memory_pool_size == 8)) { 1159 if (total_memory < dram_size_seven_GB) 1160 goto def_value1; 1161 } else { 1162 DRM_WARN("Smu memory pool size not supported\n"); 1163 goto def_value; 1164 } 1165 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1166 1167 return; 1168 1169 def_value1: 1170 DRM_WARN("No enough system memory\n"); 1171 def_value: 1172 adev->pm.smu_prv_buffer_size = 0; 1173 } 1174 1175 /** 1176 * amdgpu_device_check_arguments - validate module params 1177 * 1178 * @adev: amdgpu_device pointer 1179 * 1180 * Validates certain module parameters and updates 1181 * the associated values used by the driver (all asics). 
1182 */ 1183 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1184 { 1185 if (amdgpu_sched_jobs < 4) { 1186 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1187 amdgpu_sched_jobs); 1188 amdgpu_sched_jobs = 4; 1189 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1190 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1191 amdgpu_sched_jobs); 1192 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1193 } 1194 1195 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1196 /* gart size must be greater or equal to 32M */ 1197 dev_warn(adev->dev, "gart size (%d) too small\n", 1198 amdgpu_gart_size); 1199 amdgpu_gart_size = -1; 1200 } 1201 1202 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1203 /* gtt size must be greater or equal to 32M */ 1204 dev_warn(adev->dev, "gtt size (%d) too small\n", 1205 amdgpu_gtt_size); 1206 amdgpu_gtt_size = -1; 1207 } 1208 1209 /* valid range is between 4 and 9 inclusive */ 1210 if (amdgpu_vm_fragment_size != -1 && 1211 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1212 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1213 amdgpu_vm_fragment_size = -1; 1214 } 1215 1216 amdgpu_device_check_smu_prv_buffer_size(adev); 1217 1218 amdgpu_device_check_vm_size(adev); 1219 1220 amdgpu_device_check_block_size(adev); 1221 1222 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1223 1224 return 0; 1225 } 1226 1227 #ifndef __NetBSD__ /* XXX amdgpu vga */ 1228 /** 1229 * amdgpu_switcheroo_set_state - set switcheroo state 1230 * 1231 * @pdev: pci dev pointer 1232 * @state: vga_switcheroo state 1233 * 1234 * Callback for the switcheroo driver. Suspends or resumes the 1235 * the asics before or after it is powered up using ACPI methods. 1236 */ 1237 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) 1238 { 1239 struct drm_device *dev = pci_get_drvdata(pdev); 1240 int r; 1241 1242 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) 1243 return; 1244 1245 if (state == VGA_SWITCHEROO_ON) { 1246 pr_info("amdgpu: switched on\n"); 1247 /* don't suspend or resume card normally */ 1248 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1249 1250 #ifndef __NetBSD__ /* pmf handles this for us. */ 1251 pci_set_power_state(dev->pdev, PCI_D0); 1252 pci_restore_state(dev->pdev); 1253 r = pci_enable_device(dev->pdev); 1254 if (r) 1255 DRM_WARN("pci_enable_device failed (%d)\n", r); 1256 #endif 1257 amdgpu_device_resume(dev, true); 1258 1259 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1260 drm_kms_helper_poll_enable(dev); 1261 } else { 1262 pr_info("amdgpu: switched off\n"); 1263 drm_kms_helper_poll_disable(dev); 1264 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1265 amdgpu_device_suspend(dev, true); 1266 #ifndef __NetBSD__ /* pmf handles this for us. */ 1267 pci_save_state(dev->pdev); 1268 /* Shut down the device */ 1269 pci_disable_device(dev->pdev); 1270 pci_set_power_state(dev->pdev, PCI_D3cold); 1271 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1272 #endif 1273 } 1274 } 1275 1276 /** 1277 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1278 * 1279 * @pdev: pci dev pointer 1280 * 1281 * Callback for the switcheroo driver. Check of the switcheroo 1282 * state can be changed. 1283 * Returns true if the state can be changed, false if not. 
1284 */ 1285 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1286 { 1287 struct drm_device *dev = pci_get_drvdata(pdev); 1288 1289 /* 1290 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1291 * locking inversion with the driver load path. And the access here is 1292 * completely racy anyway. So don't bother with locking for now. 1293 */ 1294 return dev->open_count == 0; 1295 } 1296 1297 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1298 .set_gpu_state = amdgpu_switcheroo_set_state, 1299 .reprobe = NULL, 1300 .can_switch = amdgpu_switcheroo_can_switch, 1301 }; 1302 #endif /* __NetBSD__ */ 1303 1304 /** 1305 * amdgpu_device_ip_set_clockgating_state - set the CG state 1306 * 1307 * @dev: amdgpu_device pointer 1308 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1309 * @state: clockgating state (gate or ungate) 1310 * 1311 * Sets the requested clockgating state for all instances of 1312 * the hardware IP specified. 1313 * Returns the error code from the last instance. 1314 */ 1315 int amdgpu_device_ip_set_clockgating_state(void *dev, 1316 enum amd_ip_block_type block_type, 1317 enum amd_clockgating_state state) 1318 { 1319 struct amdgpu_device *adev = dev; 1320 int i, r = 0; 1321 1322 for (i = 0; i < adev->num_ip_blocks; i++) { 1323 if (!adev->ip_blocks[i].status.valid) 1324 continue; 1325 if (adev->ip_blocks[i].version->type != block_type) 1326 continue; 1327 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1328 continue; 1329 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1330 (void *)adev, state); 1331 if (r) 1332 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1333 adev->ip_blocks[i].version->funcs->name, r); 1334 } 1335 return r; 1336 } 1337 1338 /** 1339 * amdgpu_device_ip_set_powergating_state - set the PG state 1340 * 1341 * @dev: amdgpu_device pointer 1342 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1343 * @state: powergating state (gate or ungate) 1344 * 1345 * Sets the requested powergating state for all instances of 1346 * the hardware IP specified. 1347 * Returns the error code from the last instance. 1348 */ 1349 int amdgpu_device_ip_set_powergating_state(void *dev, 1350 enum amd_ip_block_type block_type, 1351 enum amd_powergating_state state) 1352 { 1353 struct amdgpu_device *adev = dev; 1354 int i, r = 0; 1355 1356 for (i = 0; i < adev->num_ip_blocks; i++) { 1357 if (!adev->ip_blocks[i].status.valid) 1358 continue; 1359 if (adev->ip_blocks[i].version->type != block_type) 1360 continue; 1361 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1362 continue; 1363 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1364 (void *)adev, state); 1365 if (r) 1366 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1367 adev->ip_blocks[i].version->funcs->name, r); 1368 } 1369 return r; 1370 } 1371 1372 /** 1373 * amdgpu_device_ip_get_clockgating_state - get the CG state 1374 * 1375 * @adev: amdgpu_device pointer 1376 * @flags: clockgating feature flags 1377 * 1378 * Walks the list of IPs on the device and updates the clockgating 1379 * flags for each IP. 1380 * Updates @flags with the feature flags for each hardware IP where 1381 * clockgating is enabled. 
1382 */ 1383 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1384 u32 *flags) 1385 { 1386 int i; 1387 1388 for (i = 0; i < adev->num_ip_blocks; i++) { 1389 if (!adev->ip_blocks[i].status.valid) 1390 continue; 1391 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1392 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1393 } 1394 } 1395 1396 /** 1397 * amdgpu_device_ip_wait_for_idle - wait for idle 1398 * 1399 * @adev: amdgpu_device pointer 1400 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1401 * 1402 * Waits for the request hardware IP to be idle. 1403 * Returns 0 for success or a negative error code on failure. 1404 */ 1405 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1406 enum amd_ip_block_type block_type) 1407 { 1408 int i, r; 1409 1410 for (i = 0; i < adev->num_ip_blocks; i++) { 1411 if (!adev->ip_blocks[i].status.valid) 1412 continue; 1413 if (adev->ip_blocks[i].version->type == block_type) { 1414 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1415 if (r) 1416 return r; 1417 break; 1418 } 1419 } 1420 return 0; 1421 1422 } 1423 1424 /** 1425 * amdgpu_device_ip_is_idle - is the hardware IP idle 1426 * 1427 * @adev: amdgpu_device pointer 1428 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1429 * 1430 * Check if the hardware IP is idle or not. 1431 * Returns true if it the IP is idle, false if not. 1432 */ 1433 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1434 enum amd_ip_block_type block_type) 1435 { 1436 int i; 1437 1438 for (i = 0; i < adev->num_ip_blocks; i++) { 1439 if (!adev->ip_blocks[i].status.valid) 1440 continue; 1441 if (adev->ip_blocks[i].version->type == block_type) 1442 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1443 } 1444 return true; 1445 1446 } 1447 1448 /** 1449 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1450 * 1451 * @adev: amdgpu_device pointer 1452 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1453 * 1454 * Returns a pointer to the hardware IP block structure 1455 * if it exists for the asic, otherwise NULL. 1456 */ 1457 struct amdgpu_ip_block * 1458 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1459 enum amd_ip_block_type type) 1460 { 1461 int i; 1462 1463 for (i = 0; i < adev->num_ip_blocks; i++) 1464 if (adev->ip_blocks[i].version->type == type) 1465 return &adev->ip_blocks[i]; 1466 1467 return NULL; 1468 } 1469 1470 /** 1471 * amdgpu_device_ip_block_version_cmp 1472 * 1473 * @adev: amdgpu_device pointer 1474 * @type: enum amd_ip_block_type 1475 * @major: major version 1476 * @minor: minor version 1477 * 1478 * return 0 if equal or greater 1479 * return 1 if smaller or the ip_block doesn't exist 1480 */ 1481 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1482 enum amd_ip_block_type type, 1483 u32 major, u32 minor) 1484 { 1485 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1486 1487 if (ip_block && ((ip_block->version->major > major) || 1488 ((ip_block->version->major == major) && 1489 (ip_block->version->minor >= minor)))) 1490 return 0; 1491 1492 return 1; 1493 } 1494 1495 /** 1496 * amdgpu_device_ip_block_add 1497 * 1498 * @adev: amdgpu_device pointer 1499 * @ip_block_version: pointer to the IP to add 1500 * 1501 * Adds the IP block driver information to the collection of IPs 1502 * on the asic. 
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
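 * The firmware image is requested as "amdgpu/<chip>_gpu_info.bin" and,
 * depending on the header's minor version, may also carry packer/SC
 * counts and the SOC bounding box (descriptive note added for clarity).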
1584 */ 1585 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1586 { 1587 const char *chip_name; 1588 char fw_name[30]; 1589 int err; 1590 const struct gpu_info_firmware_header_v1_0 *hdr; 1591 1592 adev->firmware.gpu_info_fw = NULL; 1593 1594 switch (adev->asic_type) { 1595 case CHIP_TOPAZ: 1596 case CHIP_TONGA: 1597 case CHIP_FIJI: 1598 case CHIP_POLARIS10: 1599 case CHIP_POLARIS11: 1600 case CHIP_POLARIS12: 1601 case CHIP_VEGAM: 1602 case CHIP_CARRIZO: 1603 case CHIP_STONEY: 1604 #ifdef CONFIG_DRM_AMDGPU_SI 1605 case CHIP_VERDE: 1606 case CHIP_TAHITI: 1607 case CHIP_PITCAIRN: 1608 case CHIP_OLAND: 1609 case CHIP_HAINAN: 1610 #endif 1611 #ifdef CONFIG_DRM_AMDGPU_CIK 1612 case CHIP_BONAIRE: 1613 case CHIP_HAWAII: 1614 case CHIP_KAVERI: 1615 case CHIP_KABINI: 1616 case CHIP_MULLINS: 1617 #endif 1618 case CHIP_VEGA20: 1619 default: 1620 return 0; 1621 case CHIP_VEGA10: 1622 chip_name = "vega10"; 1623 break; 1624 case CHIP_VEGA12: 1625 chip_name = "vega12"; 1626 break; 1627 case CHIP_RAVEN: 1628 if (adev->rev_id >= 8) 1629 chip_name = "raven2"; 1630 else if (adev->pdev->device == 0x15d8) 1631 chip_name = "picasso"; 1632 else 1633 chip_name = "raven"; 1634 break; 1635 case CHIP_ARCTURUS: 1636 chip_name = "arcturus"; 1637 break; 1638 case CHIP_RENOIR: 1639 chip_name = "renoir"; 1640 break; 1641 case CHIP_NAVI10: 1642 chip_name = "navi10"; 1643 break; 1644 case CHIP_NAVI14: 1645 chip_name = "navi14"; 1646 break; 1647 case CHIP_NAVI12: 1648 chip_name = "navi12"; 1649 break; 1650 } 1651 1652 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1653 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1654 if (err) { 1655 dev_err(adev->dev, 1656 "Failed to load gpu_info firmware \"%s\"\n", 1657 fw_name); 1658 goto out; 1659 } 1660 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1661 if (err) { 1662 dev_err(adev->dev, 1663 "Failed to validate gpu_info firmware \"%s\"\n", 1664 fw_name); 1665 goto out; 1666 } 1667 1668 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1669 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1670 1671 switch (hdr->version_major) { 1672 case 1: 1673 { 1674 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1675 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1676 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1677 1678 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 1679 goto parse_soc_bounding_box; 1680 1681 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1682 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1683 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1684 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1685 adev->gfx.config.max_texture_channel_caches = 1686 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1687 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1688 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1689 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1690 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1691 adev->gfx.config.double_offchip_lds_buf = 1692 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1693 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1694 adev->gfx.cu_info.max_waves_per_simd = 1695 
le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1696 adev->gfx.cu_info.max_scratch_slots_per_cu = 1697 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1698 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1699 if (hdr->version_minor >= 1) { 1700 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1701 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1702 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1703 adev->gfx.config.num_sc_per_sh = 1704 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1705 adev->gfx.config.num_packer_per_sc = 1706 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1707 } 1708 1709 parse_soc_bounding_box: 1710 /* 1711 * soc bounding box info is not integrated in disocovery table, 1712 * we always need to parse it from gpu info firmware. 1713 */ 1714 if (hdr->version_minor == 2) { 1715 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1716 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1717 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1718 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1719 } 1720 break; 1721 } 1722 default: 1723 dev_err(adev->dev, 1724 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1725 err = -EINVAL; 1726 goto out; 1727 } 1728 out: 1729 return err; 1730 } 1731 1732 /** 1733 * amdgpu_device_ip_early_init - run early init for hardware IPs 1734 * 1735 * @adev: amdgpu_device pointer 1736 * 1737 * Early initialization pass for hardware IPs. The hardware IPs that make 1738 * up each asic are discovered each IP's early_init callback is run. This 1739 * is the first stage in initializing the asic. 1740 * Returns 0 on success, negative error code on failure. 1741 */ 1742 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1743 { 1744 int i, r; 1745 1746 amdgpu_device_enable_virtual_display(adev); 1747 1748 switch (adev->asic_type) { 1749 case CHIP_TOPAZ: 1750 case CHIP_TONGA: 1751 case CHIP_FIJI: 1752 case CHIP_POLARIS10: 1753 case CHIP_POLARIS11: 1754 case CHIP_POLARIS12: 1755 case CHIP_VEGAM: 1756 case CHIP_CARRIZO: 1757 case CHIP_STONEY: 1758 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) 1759 adev->family = AMDGPU_FAMILY_CZ; 1760 else 1761 adev->family = AMDGPU_FAMILY_VI; 1762 1763 r = vi_set_ip_blocks(adev); 1764 if (r) 1765 return r; 1766 break; 1767 #ifdef CONFIG_DRM_AMDGPU_SI 1768 case CHIP_VERDE: 1769 case CHIP_TAHITI: 1770 case CHIP_PITCAIRN: 1771 case CHIP_OLAND: 1772 case CHIP_HAINAN: 1773 adev->family = AMDGPU_FAMILY_SI; 1774 r = si_set_ip_blocks(adev); 1775 if (r) 1776 return r; 1777 break; 1778 #endif 1779 #ifdef CONFIG_DRM_AMDGPU_CIK 1780 case CHIP_BONAIRE: 1781 case CHIP_HAWAII: 1782 case CHIP_KAVERI: 1783 case CHIP_KABINI: 1784 case CHIP_MULLINS: 1785 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII)) 1786 adev->family = AMDGPU_FAMILY_CI; 1787 else 1788 adev->family = AMDGPU_FAMILY_KV; 1789 1790 r = cik_set_ip_blocks(adev); 1791 if (r) 1792 return r; 1793 break; 1794 #endif 1795 case CHIP_VEGA10: 1796 case CHIP_VEGA12: 1797 case CHIP_VEGA20: 1798 case CHIP_RAVEN: 1799 case CHIP_ARCTURUS: 1800 case CHIP_RENOIR: 1801 if (adev->asic_type == CHIP_RAVEN || 1802 adev->asic_type == CHIP_RENOIR) 1803 adev->family = AMDGPU_FAMILY_RV; 1804 else 1805 adev->family = AMDGPU_FAMILY_AI; 1806 1807 r = soc15_set_ip_blocks(adev); 1808 if (r) 1809 return r; 1810 break; 1811 case CHIP_NAVI10: 1812 case CHIP_NAVI14: 1813 case CHIP_NAVI12: 1814 adev->family = AMDGPU_FAMILY_NV; 1815 1816 r = 
nv_set_ip_blocks(adev); 1817 if (r) 1818 return r; 1819 break; 1820 default: 1821 /* FIXME: not supported yet */ 1822 return -EINVAL; 1823 } 1824 1825 r = amdgpu_device_parse_gpu_info_fw(adev); 1826 if (r) 1827 return r; 1828 1829 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 1830 amdgpu_discovery_get_gfx_info(adev); 1831 1832 amdgpu_amdkfd_device_probe(adev); 1833 1834 if (amdgpu_sriov_vf(adev)) { 1835 r = amdgpu_virt_request_full_gpu(adev, true); 1836 if (r) 1837 return -EAGAIN; 1838 } 1839 1840 adev->pm.pp_feature = amdgpu_pp_feature_mask; 1841 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 1842 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 1843 1844 for (i = 0; i < adev->num_ip_blocks; i++) { 1845 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 1846 DRM_ERROR("disabled ip block: %d <%s>\n", 1847 i, adev->ip_blocks[i].version->funcs->name); 1848 adev->ip_blocks[i].status.valid = false; 1849 } else { 1850 if (adev->ip_blocks[i].version->funcs->early_init) { 1851 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 1852 if (r == -ENOENT) { 1853 adev->ip_blocks[i].status.valid = false; 1854 } else if (r) { 1855 DRM_ERROR("early_init of IP block <%s> failed %d\n", 1856 adev->ip_blocks[i].version->funcs->name, r); 1857 return r; 1858 } else { 1859 adev->ip_blocks[i].status.valid = true; 1860 } 1861 } else { 1862 adev->ip_blocks[i].status.valid = true; 1863 } 1864 } 1865 /* get the vbios after the asic_funcs are set up */ 1866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 1867 /* Read BIOS */ 1868 if (!amdgpu_get_bios(adev)) 1869 return -EINVAL; 1870 1871 r = amdgpu_atombios_init(adev); 1872 if (r) { 1873 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 1874 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 1875 return r; 1876 } 1877 } 1878 } 1879 1880 adev->cg_flags &= amdgpu_cg_mask; 1881 adev->pg_flags &= amdgpu_pg_mask; 1882 1883 return 0; 1884 } 1885 1886 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1887 { 1888 int i, r; 1889 1890 for (i = 0; i < adev->num_ip_blocks; i++) { 1891 if (!adev->ip_blocks[i].status.sw) 1892 continue; 1893 if (adev->ip_blocks[i].status.hw) 1894 continue; 1895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1896 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1898 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1899 if (r) { 1900 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1901 adev->ip_blocks[i].version->funcs->name, r); 1902 return r; 1903 } 1904 adev->ip_blocks[i].status.hw = true; 1905 } 1906 } 1907 1908 return 0; 1909 } 1910 1911 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1912 { 1913 int i, r; 1914 1915 for (i = 0; i < adev->num_ip_blocks; i++) { 1916 if (!adev->ip_blocks[i].status.sw) 1917 continue; 1918 if (adev->ip_blocks[i].status.hw) 1919 continue; 1920 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1921 if (r) { 1922 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1923 adev->ip_blocks[i].version->funcs->name, r); 1924 return r; 1925 } 1926 adev->ip_blocks[i].status.hw = true; 1927 } 1928 1929 return 0; 1930 } 1931 1932 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1933 { 1934 int r = 0; 1935 int i; 1936 uint32_t smu_version; 1937 1938 if (adev->asic_type >= CHIP_VEGA10) { 1939 for (i = 0; i < adev->num_ip_blocks; i++) { 1940 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1941 continue; 1942 1943 /* no need to do the fw loading again if already done*/ 1944 if (adev->ip_blocks[i].status.hw == true) 1945 break; 1946 1947 if (adev->in_gpu_reset || adev->in_suspend) { 1948 r = adev->ip_blocks[i].version->funcs->resume(adev); 1949 if (r) { 1950 DRM_ERROR("resume of IP block <%s> failed %d\n", 1951 adev->ip_blocks[i].version->funcs->name, r); 1952 return r; 1953 } 1954 } else { 1955 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1956 if (r) { 1957 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1958 adev->ip_blocks[i].version->funcs->name, r); 1959 return r; 1960 } 1961 } 1962 1963 adev->ip_blocks[i].status.hw = true; 1964 break; 1965 } 1966 } 1967 1968 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 1969 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1970 1971 return r; 1972 } 1973 1974 /** 1975 * amdgpu_device_ip_init - run init for hardware IPs 1976 * 1977 * @adev: amdgpu_device pointer 1978 * 1979 * Main initialization pass for hardware IPs. The list of all the hardware 1980 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 1981 * are run. sw_init initializes the software state associated with each IP 1982 * and hw_init initializes the hardware associated with each IP. 1983 * Returns 0 on success, negative error code on failure. 1984 */ 1985 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 1986 { 1987 int i, r; 1988 1989 r = amdgpu_ras_init(adev); 1990 if (r) 1991 return r; 1992 1993 for (i = 0; i < adev->num_ip_blocks; i++) { 1994 if (!adev->ip_blocks[i].status.valid) 1995 continue; 1996 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 1997 if (r) { 1998 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 1999 adev->ip_blocks[i].version->funcs->name, r); 2000 goto init_failed; 2001 } 2002 adev->ip_blocks[i].status.sw = true; 2003 2004 /* need to do gmc hw init early so we can allocate gpu mem */ 2005 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2006 r = amdgpu_device_vram_scratch_init(adev); 2007 if (r) { 2008 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2009 goto init_failed; 2010 } 2011 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2012 if (r) { 2013 DRM_ERROR("hw_init %d failed %d\n", i, r); 2014 goto init_failed; 2015 } 2016 r = amdgpu_device_wb_init(adev); 2017 if (r) { 2018 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2019 goto init_failed; 2020 } 2021 adev->ip_blocks[i].status.hw = true; 2022 2023 /* right after GMC hw init, we create CSA */ 2024 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2025 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2026 AMDGPU_GEM_DOMAIN_VRAM, 2027 AMDGPU_CSA_SIZE); 2028 if (r) { 2029 DRM_ERROR("allocate CSA failed %d\n", r); 2030 goto init_failed; 2031 } 2032 } 2033 } 2034 } 2035 2036 if (amdgpu_sriov_vf(adev)) 2037 amdgpu_virt_init_data_exchange(adev); 2038 2039 r = amdgpu_ib_pool_init(adev); 2040 if (r) { 2041 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2043 goto init_failed; 2044 } 2045 2046 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2047 if (r) 2048 goto init_failed; 2049 2050 r = amdgpu_device_ip_hw_init_phase1(adev); 2051 if (r) 2052 goto init_failed; 2053 2054 r = amdgpu_device_fw_loading(adev); 2055 if (r) 2056 goto init_failed; 2057 2058 r = amdgpu_device_ip_hw_init_phase2(adev); 2059 if (r) 2060 goto 
init_failed; 2061 2062 /* 2063 * Retired pages will be loaded from eeprom and reserved here; 2064 * this should be called after amdgpu_device_ip_hw_init_phase2 since 2065 * for some ASICs the RAS EEPROM code relies on the SMU being fully 2066 * functional for I2C communication, which is only true at this point. 2067 * recovery_init may fail, but it can free all resources allocated by 2068 * itself and its failure should not stop the amdgpu init process. 2069 * 2070 * Note: theoretically, this should be called before all vram allocations 2071 * to keep retired pages from being allocated and used. 2072 */ 2073 amdgpu_ras_recovery_init(adev); 2074 2075 if (adev->gmc.xgmi.num_physical_nodes > 1) 2076 amdgpu_xgmi_add_device(adev); 2077 amdgpu_amdkfd_device_init(adev); 2078 2079 init_failed: 2080 if (amdgpu_sriov_vf(adev)) 2081 amdgpu_virt_release_full_gpu(adev, true); 2082 2083 return r; 2084 } 2085 2086 /** 2087 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2088 * 2089 * @adev: amdgpu_device pointer 2090 * 2091 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2092 * this function before a GPU reset. If the value is retained after a 2093 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2094 */ 2095 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2096 { 2097 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2098 } 2099 2100 /** 2101 * amdgpu_device_check_vram_lost - check if vram is valid 2102 * 2103 * @adev: amdgpu_device pointer 2104 * 2105 * Checks the reset magic value written to the gart pointer in VRAM. 2106 * The driver calls this after a GPU reset to see if the contents of 2107 * VRAM is lost or not. 2108 * Returns true if vram is lost, false if not. 2109 */ 2110 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2111 { 2112 return !!memcmp(adev->gart.ptr, adev->reset_magic, 2113 AMDGPU_RESET_MAGIC_NUM); 2114 } 2115 2116 /** 2117 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2118 * 2119 * @adev: amdgpu_device pointer 2120 * @state: clockgating state (gate or ungate) 2121 * 2122 * The list of all the hardware IPs that make up the asic is walked and the 2123 * set_clockgating_state callbacks are run. 2124 * During late init this pass enables clockgating for hardware IPs; 2125 * during fini or suspend it disables clockgating. 2126 * Returns 0 on success, negative error code on failure. 2127 */ 2128 2129 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2130 enum amd_clockgating_state state) 2131 { 2132 int i, j, r; 2133 2134 if (amdgpu_emu_mode == 1) 2135 return 0; 2136 2137 for (j = 0; j < adev->num_ip_blocks; j++) { 2138 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2139 if (!adev->ip_blocks[i].status.late_initialized) 2140 continue; 2141 /* skip CG for VCE/UVD, it's handled specially */ 2142 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2145 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2146 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2147 /* enable clockgating to save power */ 2148 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2149 state); 2150 if (r) { 2151 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2152 adev->ip_blocks[i].version->funcs->name, r); 2153 return r; 2154 } 2155 } 2156 } 2157 2158 return 0; 2159 } 2160 2161 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2162 { 2163 int i, j, r; 2164 2165 if (amdgpu_emu_mode == 1) 2166 return 0; 2167 2168 for (j = 0; j < adev->num_ip_blocks; j++) { 2169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2170 if (!adev->ip_blocks[i].status.late_initialized) 2171 continue; 2172 /* skip CG for VCE/UVD, it's handled specially */ 2173 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2174 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2177 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2178 /* enable powergating to save power */ 2179 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2180 state); 2181 if (r) { 2182 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2183 adev->ip_blocks[i].version->funcs->name, r); 2184 return r; 2185 } 2186 } 2187 } 2188 return 0; 2189 } 2190 2191 static int amdgpu_device_enable_mgpu_fan_boost(void) 2192 { 2193 struct amdgpu_gpu_instance *gpu_ins; 2194 struct amdgpu_device *adev; 2195 int i, ret = 0; 2196 2197 mutex_lock(&mgpu_info.mutex); 2198 2199 /* 2200 * MGPU fan boost feature should be enabled 2201 * only when there are two or more dGPUs in 2202 * the system 2203 */ 2204 if (mgpu_info.num_dgpu < 2) 2205 goto out; 2206 2207 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2208 gpu_ins = &(mgpu_info.gpu_ins[i]); 2209 adev = gpu_ins->adev; 2210 if (!(adev->flags & AMD_IS_APU) && 2211 !gpu_ins->mgpu_fan_enabled && 2212 adev->powerplay.pp_funcs && 2213 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { 2214 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2215 if (ret) 2216 break; 2217 2218 gpu_ins->mgpu_fan_enabled = 1; 2219 } 2220 } 2221 2222 out: 2223 mutex_unlock(&mgpu_info.mutex); 2224 2225 return ret; 2226 } 2227 2228 /** 2229 * amdgpu_device_ip_late_init - run late init for hardware IPs 2230 * 2231 * @adev: amdgpu_device pointer 2232 * 2233 * Late initialization pass for hardware IPs. The list of all the hardware 2234 * IPs that make up the asic is walked and the late_init callbacks are run. 2235 * late_init covers any special initialization that an IP requires 2236 * after all of the have been initialized or something that needs to happen 2237 * late in the init process. 2238 * Returns 0 on success, negative error code on failure. 
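 *
 * As a rough orientation (a sketch based only on the call sites visible in
 * this file), late_init runs after the main hardware bring-up, both at cold
 * init and on resume:
 *
 *	r = amdgpu_device_ip_init(adev);	// sw_init + the hw_init phases
 *	...
 *	r = amdgpu_device_ip_late_init(adev);	// late_init, then CG/PG gating
 *
 * amdgpu_device_resume() uses the same ordering and calls this after
 * amdgpu_device_ip_resume().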
2239 */ 2240 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2241 { 2242 struct amdgpu_gpu_instance *gpu_instance; 2243 int i = 0, r; 2244 2245 for (i = 0; i < adev->num_ip_blocks; i++) { 2246 if (!adev->ip_blocks[i].status.hw) 2247 continue; 2248 if (adev->ip_blocks[i].version->funcs->late_init) { 2249 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2250 if (r) { 2251 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2252 adev->ip_blocks[i].version->funcs->name, r); 2253 return r; 2254 } 2255 } 2256 adev->ip_blocks[i].status.late_initialized = true; 2257 } 2258 2259 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2260 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2261 2262 amdgpu_device_fill_reset_magic(adev); 2263 2264 r = amdgpu_device_enable_mgpu_fan_boost(); 2265 if (r) 2266 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2267 2268 2269 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2270 mutex_lock(&mgpu_info.mutex); 2271 2272 /* 2273 * Reset device p-state to low as this was booted with high. 2274 * 2275 * This should be performed only after all devices from the same 2276 * hive get initialized. 2277 * 2278 * However, it's unknown how many device in the hive in advance. 2279 * As this is counted one by one during devices initializations. 2280 * 2281 * So, we wait for all XGMI interlinked devices initialized. 2282 * This may bring some delays as those devices may come from 2283 * different hives. But that should be OK. 2284 */ 2285 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2286 for (i = 0; i < mgpu_info.num_gpu; i++) { 2287 gpu_instance = &(mgpu_info.gpu_ins[i]); 2288 if (gpu_instance->adev->flags & AMD_IS_APU) 2289 continue; 2290 2291 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0); 2292 if (r) { 2293 DRM_ERROR("pstate setting failed (%d).\n", r); 2294 break; 2295 } 2296 } 2297 } 2298 2299 mutex_unlock(&mgpu_info.mutex); 2300 } 2301 2302 return 0; 2303 } 2304 2305 /** 2306 * amdgpu_device_ip_fini - run fini for hardware IPs 2307 * 2308 * @adev: amdgpu_device pointer 2309 * 2310 * Main teardown pass for hardware IPs. The list of all the hardware 2311 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2312 * are run. hw_fini tears down the hardware associated with each IP 2313 * and sw_fini tears down any software state associated with each IP. 2314 * Returns 0 on success, negative error code on failure. 
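 *
 * A condensed sketch of the teardown order implemented below (assuming all
 * blocks came up successfully):
 *
 *	hw_fini(SMC)				// SMC is shut down first
 *	hw_fini(ip[N-1] .. ip[0])		// remaining blocks, reverse order
 *	sw_fini(ip[N-1] .. ip[0])		// the GMC block also frees the ucode bo,
 *						// CSA, wb, vram scratch and IB pool
 *	late_fini(ip[N-1] .. ip[0])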
2315 */ 2316 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2317 { 2318 int i, r; 2319 2320 amdgpu_ras_pre_fini(adev); 2321 2322 if (adev->gmc.xgmi.num_physical_nodes > 1) 2323 amdgpu_xgmi_remove_device(adev); 2324 2325 amdgpu_amdkfd_device_fini(adev); 2326 2327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2328 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2329 2330 /* need to disable SMC first */ 2331 for (i = 0; i < adev->num_ip_blocks; i++) { 2332 if (!adev->ip_blocks[i].status.hw) 2333 continue; 2334 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2335 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2336 /* XXX handle errors */ 2337 if (r) { 2338 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2339 adev->ip_blocks[i].version->funcs->name, r); 2340 } 2341 adev->ip_blocks[i].status.hw = false; 2342 break; 2343 } 2344 } 2345 2346 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2347 if (!adev->ip_blocks[i].status.hw) 2348 continue; 2349 2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2351 /* XXX handle errors */ 2352 if (r) { 2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2354 adev->ip_blocks[i].version->funcs->name, r); 2355 } 2356 2357 adev->ip_blocks[i].status.hw = false; 2358 } 2359 2360 2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2362 if (!adev->ip_blocks[i].status.sw) 2363 continue; 2364 2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2366 amdgpu_ucode_free_bo(adev); 2367 amdgpu_free_static_csa(&adev->virt.csa_obj); 2368 amdgpu_device_wb_fini(adev); 2369 amdgpu_device_vram_scratch_fini(adev); 2370 amdgpu_ib_pool_fini(adev); 2371 } 2372 2373 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2374 /* XXX handle errors */ 2375 if (r) { 2376 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 } 2379 adev->ip_blocks[i].status.sw = false; 2380 adev->ip_blocks[i].status.valid = false; 2381 } 2382 2383 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2384 if (!adev->ip_blocks[i].status.late_initialized) 2385 continue; 2386 if (adev->ip_blocks[i].version->funcs->late_fini) 2387 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2388 adev->ip_blocks[i].status.late_initialized = false; 2389 } 2390 2391 amdgpu_ras_fini(adev); 2392 2393 if (amdgpu_sriov_vf(adev)) 2394 if (amdgpu_virt_release_full_gpu(adev, false)) 2395 DRM_ERROR("failed to release exclusive mode on fini\n"); 2396 2397 return 0; 2398 } 2399 2400 /** 2401 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2402 * 2403 * @work: work_struct. 
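 *
 * The handler only runs the deferred IB ring tests.  For reference, the work
 * is scheduled elsewhere in this file as:
 *
 *	queue_delayed_work(system_wq, &adev->delayed_init_work,
 *			   msecs_to_jiffies(AMDGPU_RESUME_MS));
 *
 * and callers that depend on the tests having finished first call
 * flush_delayed_work(&adev->delayed_init_work).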
2404 */ 2405 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2406 { 2407 struct amdgpu_device *adev = 2408 container_of(work, struct amdgpu_device, delayed_init_work.work); 2409 int r; 2410 2411 r = amdgpu_ib_ring_tests(adev); 2412 if (r) 2413 DRM_ERROR("ib ring test failed (%d).\n", r); 2414 } 2415 2416 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2417 { 2418 struct amdgpu_device *adev = 2419 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2420 2421 mutex_lock(&adev->gfx.gfx_off_mutex); 2422 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2423 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2424 adev->gfx.gfx_off_state = true; 2425 } 2426 mutex_unlock(&adev->gfx.gfx_off_mutex); 2427 } 2428 2429 /** 2430 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2431 * 2432 * @adev: amdgpu_device pointer 2433 * 2434 * Main suspend function for hardware IPs. The list of all the hardware 2435 * IPs that make up the asic is walked, clockgating is disabled and the 2436 * suspend callbacks are run. suspend puts the hardware and software state 2437 * in each IP into a state suitable for suspend. 2438 * Returns 0 on success, negative error code on failure. 2439 */ 2440 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2441 { 2442 int i, r; 2443 2444 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2445 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2446 2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2448 if (!adev->ip_blocks[i].status.valid) 2449 continue; 2450 /* displays are handled separately */ 2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 2452 /* XXX handle errors */ 2453 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2454 /* XXX handle errors */ 2455 if (r) { 2456 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2457 adev->ip_blocks[i].version->funcs->name, r); 2458 return r; 2459 } 2460 adev->ip_blocks[i].status.hw = false; 2461 } 2462 } 2463 2464 return 0; 2465 } 2466 2467 /** 2468 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Main suspend function for hardware IPs. The list of all the hardware 2473 * IPs that make up the asic is walked, clockgating is disabled and the 2474 * suspend callbacks are run. suspend puts the hardware and software state 2475 * in each IP into a state suitable for suspend. 2476 * Returns 0 on success, negative error code on failure. 
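 *
 * Sketch of how the two suspend phases are combined (see
 * amdgpu_device_ip_suspend() and amdgpu_device_suspend() below; the VRAM
 * eviction step only happens on the S3/S4 path):
 *
 *	amdgpu_device_ip_suspend_phase1(adev);	// display (DCE) blocks only
 *	amdgpu_bo_evict_vram(adev);
 *	amdgpu_device_ip_suspend_phase2(adev);	// everything else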
2477 */ 2478 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2479 { 2480 int i, r __unused; 2481 2482 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2483 if (!adev->ip_blocks[i].status.valid) 2484 continue; 2485 /* displays are handled in phase1 */ 2486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2487 continue; 2488 /* PSP lost connection when err_event_athub occurs */ 2489 if (amdgpu_ras_intr_triggered() && 2490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2491 adev->ip_blocks[i].status.hw = false; 2492 continue; 2493 } 2494 /* XXX handle errors */ 2495 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2496 /* XXX handle errors */ 2497 if (r) { 2498 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2499 adev->ip_blocks[i].version->funcs->name, r); 2500 } 2501 adev->ip_blocks[i].status.hw = false; 2502 /* handle putting the SMC in the appropriate state */ 2503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2504 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2505 if (r) { 2506 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2507 adev->mp1_state, r); 2508 return r; 2509 } 2510 } 2511 2512 adev->ip_blocks[i].status.hw = false; 2513 } 2514 2515 return 0; 2516 } 2517 2518 /** 2519 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2520 * 2521 * @adev: amdgpu_device pointer 2522 * 2523 * Main suspend function for hardware IPs. The list of all the hardware 2524 * IPs that make up the asic is walked, clockgating is disabled and the 2525 * suspend callbacks are run. suspend puts the hardware and software state 2526 * in each IP into a state suitable for suspend. 2527 * Returns 0 on success, negative error code on failure. 2528 */ 2529 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2530 { 2531 int r; 2532 2533 if (amdgpu_sriov_vf(adev)) 2534 amdgpu_virt_request_full_gpu(adev, false); 2535 2536 r = amdgpu_device_ip_suspend_phase1(adev); 2537 if (r) 2538 return r; 2539 r = amdgpu_device_ip_suspend_phase2(adev); 2540 2541 if (amdgpu_sriov_vf(adev)) 2542 amdgpu_virt_release_full_gpu(adev, false); 2543 2544 return r; 2545 } 2546 2547 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2548 { 2549 int i, r; 2550 2551 static enum amd_ip_block_type ip_order[] = { 2552 AMD_IP_BLOCK_TYPE_GMC, 2553 AMD_IP_BLOCK_TYPE_COMMON, 2554 AMD_IP_BLOCK_TYPE_PSP, 2555 AMD_IP_BLOCK_TYPE_IH, 2556 }; 2557 2558 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2559 int j; 2560 struct amdgpu_ip_block *block; 2561 2562 for (j = 0; j < adev->num_ip_blocks; j++) { 2563 block = &adev->ip_blocks[j]; 2564 2565 block->status.hw = false; 2566 if (block->version->type != ip_order[i] || 2567 !block->status.valid) 2568 continue; 2569 2570 r = block->version->funcs->hw_init(adev); 2571 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2572 if (r) 2573 return r; 2574 block->status.hw = true; 2575 } 2576 } 2577 2578 return 0; 2579 } 2580 2581 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2582 { 2583 int i, r; 2584 2585 static enum amd_ip_block_type ip_order[] = { 2586 AMD_IP_BLOCK_TYPE_SMC, 2587 AMD_IP_BLOCK_TYPE_DCE, 2588 AMD_IP_BLOCK_TYPE_GFX, 2589 AMD_IP_BLOCK_TYPE_SDMA, 2590 AMD_IP_BLOCK_TYPE_UVD, 2591 AMD_IP_BLOCK_TYPE_VCE, 2592 AMD_IP_BLOCK_TYPE_VCN 2593 }; 2594 2595 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2596 int j; 2597 struct amdgpu_ip_block *block; 2598 2599 for (j = 0; j < adev->num_ip_blocks; j++) { 2600 block = &adev->ip_blocks[j]; 
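			/*
			 * Note: each pass of the outer loop looks for blocks
			 * matching the fixed SR-IOV reinit order above; SMC is
			 * brought back via its resume callback while the other
			 * block types are re-initialized with hw_init.
			 */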
2601 2602 if (block->version->type != ip_order[i] || 2603 !block->status.valid || 2604 block->status.hw) 2605 continue; 2606 2607 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2608 r = block->version->funcs->resume(adev); 2609 else 2610 r = block->version->funcs->hw_init(adev); 2611 2612 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2613 if (r) 2614 return r; 2615 block->status.hw = true; 2616 } 2617 } 2618 2619 return 0; 2620 } 2621 2622 /** 2623 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2624 * 2625 * @adev: amdgpu_device pointer 2626 * 2627 * First resume function for hardware IPs. The list of all the hardware 2628 * IPs that make up the asic is walked and the resume callbacks are run for 2629 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2630 * after a suspend and updates the software state as necessary. This 2631 * function is also used for restoring the GPU after a GPU reset. 2632 * Returns 0 on success, negative error code on failure. 2633 */ 2634 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2635 { 2636 int i, r; 2637 2638 for (i = 0; i < adev->num_ip_blocks; i++) { 2639 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2640 continue; 2641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2642 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2643 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2644 2645 r = adev->ip_blocks[i].version->funcs->resume(adev); 2646 if (r) { 2647 DRM_ERROR("resume of IP block <%s> failed %d\n", 2648 adev->ip_blocks[i].version->funcs->name, r); 2649 return r; 2650 } 2651 adev->ip_blocks[i].status.hw = true; 2652 } 2653 } 2654 2655 return 0; 2656 } 2657 2658 /** 2659 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2660 * 2661 * @adev: amdgpu_device pointer 2662 * 2663 * Second resume function for hardware IPs. The list of all the hardware 2664 * IPs that make up the asic is walked and the resume callbacks are run for 2665 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 2666 * functional state after a suspend and updates the software state as 2667 * necessary. This function is also used for restoring the GPU after a GPU 2668 * reset. 2669 * Returns 0 on success, negative error code on failure. 2670 */ 2671 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2672 { 2673 int i, r; 2674 2675 for (i = 0; i < adev->num_ip_blocks; i++) { 2676 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2677 continue; 2678 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2679 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2680 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2681 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2682 continue; 2683 r = adev->ip_blocks[i].version->funcs->resume(adev); 2684 if (r) { 2685 DRM_ERROR("resume of IP block <%s> failed %d\n", 2686 adev->ip_blocks[i].version->funcs->name, r); 2687 return r; 2688 } 2689 adev->ip_blocks[i].status.hw = true; 2690 } 2691 2692 return 0; 2693 } 2694 2695 /** 2696 * amdgpu_device_ip_resume - run resume for hardware IPs 2697 * 2698 * @adev: amdgpu_device pointer 2699 * 2700 * Main resume function for hardware IPs. The hardware IPs 2701 * are split into two resume functions because they are 2702 * also used in recovering from a GPU reset and some additional 2703 * steps need to be taken between them.
In this case (S3/S4) they are 2704 * run sequentially. 2705 * Returns 0 on success, negative error code on failure. 2706 */ 2707 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2708 { 2709 int r; 2710 2711 r = amdgpu_device_ip_resume_phase1(adev); 2712 if (r) 2713 return r; 2714 2715 r = amdgpu_device_fw_loading(adev); 2716 if (r) 2717 return r; 2718 2719 r = amdgpu_device_ip_resume_phase2(adev); 2720 2721 return r; 2722 } 2723 2724 /** 2725 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2726 * 2727 * @adev: amdgpu_device pointer 2728 * 2729 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2730 */ 2731 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2732 { 2733 if (amdgpu_sriov_vf(adev)) { 2734 if (adev->is_atom_fw) { 2735 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2736 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2737 } else { 2738 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2739 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2740 } 2741 2742 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2744 } 2745 } 2746 2747 /** 2748 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2749 * 2750 * @asic_type: AMD asic type 2751 * 2752 * Check if there is DC (new modesetting infrastructre) support for an asic. 2753 * returns true if DC has support, false if not. 2754 */ 2755 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2756 { 2757 switch (asic_type) { 2758 #if defined(CONFIG_DRM_AMD_DC) 2759 case CHIP_BONAIRE: 2760 case CHIP_KAVERI: 2761 case CHIP_KABINI: 2762 case CHIP_MULLINS: 2763 /* 2764 * We have systems in the wild with these ASICs that require 2765 * LVDS and VGA support which is not supported with DC. 2766 * 2767 * Fallback to the non-DC driver here by default so as not to 2768 * cause regressions. 2769 */ 2770 return amdgpu_dc > 0; 2771 case CHIP_HAWAII: 2772 case CHIP_CARRIZO: 2773 case CHIP_STONEY: 2774 case CHIP_POLARIS10: 2775 case CHIP_POLARIS11: 2776 case CHIP_POLARIS12: 2777 case CHIP_VEGAM: 2778 case CHIP_TONGA: 2779 case CHIP_FIJI: 2780 case CHIP_VEGA10: 2781 case CHIP_VEGA12: 2782 case CHIP_VEGA20: 2783 #if defined(CONFIG_DRM_AMD_DC_DCN) 2784 case CHIP_RAVEN: 2785 case CHIP_NAVI10: 2786 case CHIP_NAVI14: 2787 case CHIP_NAVI12: 2788 case CHIP_RENOIR: 2789 #endif 2790 return amdgpu_dc != 0; 2791 #endif 2792 default: 2793 if (amdgpu_dc > 0) 2794 DRM_INFO("Display Core has been requested via kernel parameter " 2795 "but isn't supported by ASIC, ignoring\n"); 2796 return false; 2797 } 2798 } 2799 2800 /** 2801 * amdgpu_device_has_dc_support - check if dc is supported 2802 * 2803 * @adev: amdgpu_device_pointer 2804 * 2805 * Returns true for supported, false for not supported 2806 */ 2807 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2808 { 2809 if (amdgpu_sriov_vf(adev)) 2810 return false; 2811 2812 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2813 } 2814 2815 2816 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2817 { 2818 struct amdgpu_device *adev = 2819 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2820 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); 2821 2822 /* It's a bug to not have a hive within this function */ 2823 if (WARN_ON(!hive)) 2824 return; 2825 2826 /* 2827 * Use task barrier to synchronize all xgmi reset works across the 2828 * hive. 
task_barrier_enter and task_barrier_exit will block 2829 * until all the threads running the xgmi reset works reach 2830 * those points. task_barrier_full will do both blocks. 2831 */ 2832 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 2833 2834 task_barrier_enter(&hive->tb); 2835 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); 2836 2837 if (adev->asic_reset_res) 2838 goto fail; 2839 2840 task_barrier_exit(&hive->tb); 2841 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); 2842 2843 if (adev->asic_reset_res) 2844 goto fail; 2845 } else { 2846 2847 task_barrier_full(&hive->tb); 2848 adev->asic_reset_res = amdgpu_asic_reset(adev); 2849 } 2850 2851 fail: 2852 if (adev->asic_reset_res) 2853 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2854 adev->asic_reset_res, adev->ddev->unique); 2855 } 2856 2857 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 2858 { 2859 char *input = amdgpu_lockup_timeout; 2860 char *timeout_setting = NULL; 2861 int index = 0; 2862 long timeout; 2863 int ret = 0; 2864 2865 /* 2866 * By default timeout for non compute jobs is 10000. 2867 * And there is no timeout enforced on compute jobs. 2868 * In SR-IOV or passthrough mode, timeout for compute 2869 * jobs are 10000 by default. 2870 */ 2871 adev->gfx_timeout = msecs_to_jiffies(10000); 2872 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2873 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2874 adev->compute_timeout = adev->gfx_timeout; 2875 else 2876 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2877 2878 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2879 while ((timeout_setting = strsep(&input, ",")) && 2880 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2881 ret = kstrtol(timeout_setting, 0, &timeout); 2882 if (ret) 2883 return ret; 2884 2885 if (timeout == 0) { 2886 index++; 2887 continue; 2888 } else if (timeout < 0) { 2889 timeout = MAX_SCHEDULE_TIMEOUT; 2890 } else { 2891 timeout = msecs_to_jiffies(timeout); 2892 } 2893 2894 switch (index++) { 2895 case 0: 2896 adev->gfx_timeout = timeout; 2897 break; 2898 case 1: 2899 adev->compute_timeout = timeout; 2900 break; 2901 case 2: 2902 adev->sdma_timeout = timeout; 2903 break; 2904 case 3: 2905 adev->video_timeout = timeout; 2906 break; 2907 default: 2908 break; 2909 } 2910 } 2911 /* 2912 * There is only one value specified and 2913 * it should apply to all non-compute jobs. 2914 */ 2915 if (index == 1) { 2916 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2917 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2918 adev->compute_timeout = adev->gfx_timeout; 2919 } 2920 } 2921 2922 return ret; 2923 } 2924 2925 /** 2926 * amdgpu_device_init - initialize the driver 2927 * 2928 * @adev: amdgpu_device pointer 2929 * @ddev: drm dev pointer 2930 * @pdev: pci dev pointer 2931 * @flags: driver flags 2932 * 2933 * Initializes the driver info and hw (all asics). 2934 * Returns 0 for success or an error on failure. 2935 * Called at driver startup. 
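 *
 * Minimal caller sketch (hypothetical; the real call site lives in the KMS
 * load path rather than in this file):
 *
 *	r = amdgpu_device_init(adev, ddev, pdev, flags);
 *	if (r)
 *		goto out_free;		// hypothetical error label
 *
 * @flags carries the ASIC type (masked with AMD_ASIC_MASK below) along with
 * the AMD_IS_APU-style device flags used throughout this file.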
2936 */ 2937 int amdgpu_device_init(struct amdgpu_device *adev, 2938 struct drm_device *ddev, 2939 struct pci_dev *pdev, 2940 uint32_t flags) 2941 { 2942 int r, i; 2943 bool boco = false; 2944 u32 max_MBps; 2945 2946 adev->shutdown = false; 2947 adev->dev = pci_dev_dev(pdev); 2948 adev->ddev = ddev; 2949 adev->pdev = pdev; 2950 adev->flags = flags; 2951 2952 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 2953 adev->asic_type = amdgpu_force_asic_type; 2954 else 2955 adev->asic_type = flags & AMD_ASIC_MASK; 2956 2957 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2958 if (amdgpu_emu_mode == 1) 2959 adev->usec_timeout *= 2; 2960 adev->gmc.gart_size = 512 * 1024 * 1024; 2961 adev->accel_working = false; 2962 adev->num_rings = 0; 2963 adev->mman.buffer_funcs = NULL; 2964 adev->mman.buffer_funcs_ring = NULL; 2965 adev->vm_manager.vm_pte_funcs = NULL; 2966 adev->vm_manager.vm_pte_num_scheds = 0; 2967 adev->gmc.gmc_funcs = NULL; 2968 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 2969 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 2970 2971 adev->smc_rreg = &amdgpu_invalid_rreg; 2972 adev->smc_wreg = &amdgpu_invalid_wreg; 2973 adev->pcie_rreg = &amdgpu_invalid_rreg; 2974 adev->pcie_wreg = &amdgpu_invalid_wreg; 2975 adev->pciep_rreg = &amdgpu_invalid_rreg; 2976 adev->pciep_wreg = &amdgpu_invalid_wreg; 2977 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 2978 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 2979 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 2980 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 2981 adev->didt_rreg = &amdgpu_invalid_rreg; 2982 adev->didt_wreg = &amdgpu_invalid_wreg; 2983 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 2984 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 2985 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 2986 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 2987 2988 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 2989 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 2990 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 2991 2992 /* mutex initialization are all done here so we 2993 * can recall function without having locking issues */ 2994 atomic_set(&adev->irq.ih.lock, 0); 2995 mutex_init(&adev->firmware.mutex); 2996 mutex_init(&adev->pm.mutex); 2997 mutex_init(&adev->gfx.gpu_clock_mutex); 2998 mutex_init(&adev->srbm_mutex); 2999 mutex_init(&adev->gfx.pipe_reserve_mutex); 3000 mutex_init(&adev->gfx.gfx_off_mutex); 3001 mutex_init(&adev->grbm_idx_mutex); 3002 mutex_init(&adev->mn_lock); 3003 mutex_init(&adev->virt.vf_errors.lock); 3004 hash_init(adev->mn_hash); 3005 mutex_init(&adev->lock_reset); 3006 mutex_init(&adev->psp.mutex); 3007 mutex_init(&adev->notifier_lock); 3008 3009 spin_lock_init(&adev->mmio_idx_lock); 3010 spin_lock_init(&adev->smc_idx_lock); 3011 spin_lock_init(&adev->pcie_idx_lock); 3012 spin_lock_init(&adev->uvd_ctx_idx_lock); 3013 spin_lock_init(&adev->didt_idx_lock); 3014 spin_lock_init(&adev->gc_cac_idx_lock); 3015 spin_lock_init(&adev->se_cac_idx_lock); 3016 spin_lock_init(&adev->audio_endpt_idx_lock); 3017 spin_lock_init(&adev->mm_stats.lock); 3018 3019 INIT_LIST_HEAD(&adev->shadow_list); 3020 mutex_init(&adev->shadow_list_lock); 3021 3022 INIT_LIST_HEAD(&adev->ring_lru_list); 3023 spin_lock_init(&adev->ring_lru_list_lock); 3024 3025 INIT_DELAYED_WORK(&adev->delayed_init_work, 3026 amdgpu_device_delayed_init_work_handler); 3027 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3028 
amdgpu_device_delay_enable_gfx_off); 3029 3030 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3031 3032 r = amdgpu_device_check_arguments(adev); 3033 if (r) 3034 return r; 3035 3036 adev->gfx.gfx_off_req_count = 1; 3037 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; 3038 3039 /* Registers mapping */ 3040 /* TODO: block userspace mapping of io register */ 3041 if (adev->asic_type >= CHIP_BONAIRE) { 3042 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3043 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3044 } else { 3045 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3046 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3047 } 3048 3049 #ifdef __NetBSD__ 3050 const int bar = (adev->asic_type >= CHIP_BONAIRE ? 5 : 2); 3051 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(bar), 3052 pci_mapreg_type(adev->pdev->pd_pa.pa_pc, 3053 adev->pdev->pd_pa.pa_tag, PCI_BAR(bar)), 3054 0, 3055 &adev->rmmiot, &adev->rmmioh, 3056 &adev->rmmio_base, &adev->rmmio_size)) 3057 return -EIO; 3058 DRM_INFO("register mmio base: 0x%8"PRIXMAX"\n", 3059 (uintmax_t)adev->rmmio_base); 3060 DRM_INFO("register mmio size: %"PRIuMAX"\n", 3061 (uintmax_t)adev->rmmio_size); 3062 #else 3063 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3064 if (adev->rmmio == NULL) { 3065 return -ENOMEM; 3066 } 3067 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3068 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3069 #endif 3070 3071 /* io port mapping */ 3072 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3073 #ifdef __NetBSD__ 3074 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(i), 3075 PCI_MAPREG_TYPE_IO, 0, 3076 &adev->rio_memt, &adev->rio_memh, 3077 NULL, &adev->rio_mem_size) == 0) 3078 break; 3079 #else 3080 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3081 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3082 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3083 break; 3084 } 3085 #endif 3086 } 3087 #ifdef __NetBSD__ 3088 if (i == DEVICE_COUNT_RESOURCE) 3089 #else 3090 if (adev->rio_mem == NULL) 3091 #endif 3092 DRM_INFO("PCI I/O BAR is not found.\n"); 3093 3094 /* enable PCIE atomic ops */ 3095 r = pci_enable_atomic_ops_to_root(adev->pdev, 3096 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3097 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3098 if (r) { 3099 adev->have_atomics_support = false; 3100 DRM_INFO("PCIE atomic ops is not supported\n"); 3101 } else { 3102 adev->have_atomics_support = true; 3103 } 3104 3105 amdgpu_device_get_pcie_info(adev); 3106 3107 if (amdgpu_mcbp) 3108 DRM_INFO("MCBP is enabled\n"); 3109 3110 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3111 adev->enable_mes = true; 3112 3113 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { 3114 r = amdgpu_discovery_init(adev); 3115 if (r) { 3116 dev_err(adev->dev, "amdgpu_discovery_init failed\n"); 3117 return r; 3118 } 3119 } 3120 3121 /* early init functions */ 3122 r = amdgpu_device_ip_early_init(adev); 3123 if (r) 3124 return r; 3125 3126 r = amdgpu_device_get_job_timeout_settings(adev); 3127 if (r) { 3128 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3129 return r; 3130 } 3131 3132 /* doorbell bar mapping and doorbell index init*/ 3133 amdgpu_device_doorbell_init(adev); 3134 3135 #ifndef __NetBSD__ /* XXX amdgpu vga */ 3136 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3137 /* this will fail for cards that aren't VGA class devices, just 3138 * ignore it */ 3139 vga_client_register(adev->pdev, 
adev, NULL, amdgpu_device_vga_set_decode); 3140 3141 if (amdgpu_device_supports_boco(ddev)) 3142 boco = true; 3143 if (amdgpu_has_atpx() && 3144 (amdgpu_is_atpx_hybrid() || 3145 amdgpu_has_atpx_dgpu_power_cntl()) && 3146 !pci_is_thunderbolt_attached(adev->pdev)) 3147 vga_switcheroo_register_client(adev->pdev, 3148 &amdgpu_switcheroo_ops, boco); 3149 if (boco) 3150 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3151 #endif 3152 3153 if (amdgpu_emu_mode == 1) { 3154 /* post the asic on emulation mode */ 3155 emu_soc_asic_init(adev); 3156 goto fence_driver_init; 3157 } 3158 3159 /* detect if we are with an SRIOV vbios */ 3160 amdgpu_device_detect_sriov_bios(adev); 3161 3162 /* check if we need to reset the asic 3163 * E.g., driver was not cleanly unloaded previously, etc. 3164 */ 3165 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3166 r = amdgpu_asic_reset(adev); 3167 if (r) { 3168 dev_err(adev->dev, "asic reset on init failed\n"); 3169 goto failed; 3170 } 3171 } 3172 3173 /* Post card if necessary */ 3174 if (amdgpu_device_need_post(adev)) { 3175 if (!adev->bios) { 3176 dev_err(adev->dev, "no vBIOS found\n"); 3177 r = -EINVAL; 3178 goto failed; 3179 } 3180 DRM_INFO("GPU posting now...\n"); 3181 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3182 if (r) { 3183 dev_err(adev->dev, "gpu post error!\n"); 3184 goto failed; 3185 } 3186 } 3187 3188 if (adev->is_atom_fw) { 3189 /* Initialize clocks */ 3190 r = amdgpu_atomfirmware_get_clock_info(adev); 3191 if (r) { 3192 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3193 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3194 goto failed; 3195 } 3196 } else { 3197 /* Initialize clocks */ 3198 r = amdgpu_atombios_get_clock_info(adev); 3199 if (r) { 3200 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3201 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3202 goto failed; 3203 } 3204 /* init i2c buses */ 3205 if (!amdgpu_device_has_dc_support(adev)) 3206 amdgpu_atombios_i2c_init(adev); 3207 } 3208 3209 fence_driver_init: 3210 /* Fence driver */ 3211 r = amdgpu_fence_driver_init(adev); 3212 if (r) { 3213 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3214 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3215 goto failed; 3216 } 3217 3218 /* init the mode config */ 3219 drm_mode_config_init(adev->ddev); 3220 3221 r = amdgpu_device_ip_init(adev); 3222 if (r) { 3223 /* failed in exclusive mode due to timeout */ 3224 if (amdgpu_sriov_vf(adev) && 3225 !amdgpu_sriov_runtime(adev) && 3226 amdgpu_virt_mmio_blocked(adev) && 3227 !amdgpu_virt_wait_reset(adev)) { 3228 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3229 /* Don't send request since VF is inactive. */ 3230 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3231 adev->virt.ops = NULL; 3232 r = -EAGAIN; 3233 goto failed; 3234 } 3235 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3236 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3237 goto failed; 3238 } 3239 3240 DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3241 adev->gfx.config.max_shader_engines, 3242 adev->gfx.config.max_sh_per_se, 3243 adev->gfx.config.max_cu_per_sh, 3244 adev->gfx.cu_info.number); 3245 3246 amdgpu_ctx_init_sched(adev); 3247 3248 adev->accel_working = true; 3249 3250 amdgpu_vm_check_compute_bug(adev); 3251 3252 /* Initialize the buffer migration limit. 
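 * (Worked example: with the default of 8 MB/s below, ilog2(8) == 3; a module
 * parameter of amdgpu_moverate=0 is clamped through max(1u, ...) to
 * ilog2(1) == 0.  Storing the log2 lets the throttling math use shifts
 * instead of divisions.)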
*/ 3253 if (amdgpu_moverate >= 0) 3254 max_MBps = amdgpu_moverate; 3255 else 3256 max_MBps = 8; /* Allow 8 MB/s. */ 3257 /* Get a log2 for easy divisions. */ 3258 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3259 3260 amdgpu_fbdev_init(adev); 3261 3262 r = amdgpu_pm_sysfs_init(adev); 3263 if (r) { 3264 adev->pm_sysfs_en = false; 3265 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3266 } else 3267 adev->pm_sysfs_en = true; 3268 3269 r = amdgpu_ucode_sysfs_init(adev); 3270 if (r) { 3271 adev->ucode_sysfs_en = false; 3272 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3273 } else 3274 adev->ucode_sysfs_en = true; 3275 3276 r = amdgpu_debugfs_gem_init(adev); 3277 if (r) 3278 DRM_ERROR("registering gem debugfs failed (%d).\n", r); 3279 3280 r = amdgpu_debugfs_regs_init(adev); 3281 if (r) 3282 DRM_ERROR("registering register debugfs failed (%d).\n", r); 3283 3284 r = amdgpu_debugfs_firmware_init(adev); 3285 if (r) 3286 DRM_ERROR("registering firmware debugfs failed (%d).\n", r); 3287 3288 r = amdgpu_debugfs_init(adev); 3289 if (r) 3290 DRM_ERROR("Creating debugfs files failed (%d).\n", r); 3291 3292 if ((amdgpu_testing & 1)) { 3293 if (adev->accel_working) 3294 amdgpu_test_moves(adev); 3295 else 3296 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3297 } 3298 if (amdgpu_benchmarking) { 3299 if (adev->accel_working) 3300 amdgpu_benchmark(adev, amdgpu_benchmarking); 3301 else 3302 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3303 } 3304 3305 /* 3306 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3307 * Otherwise the mgpu fan boost feature will be skipped due to the 3308 * gpu instance is counted less. 3309 */ 3310 amdgpu_register_gpu_instance(adev); 3311 3312 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3313 * explicit gating rather than handling it automatically. 3314 */ 3315 r = amdgpu_device_ip_late_init(adev); 3316 if (r) { 3317 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3318 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3319 goto failed; 3320 } 3321 3322 /* must succeed. */ 3323 amdgpu_ras_resume(adev); 3324 3325 queue_delayed_work(system_wq, &adev->delayed_init_work, 3326 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3327 3328 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 3329 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 3330 if (r) { 3331 dev_err(adev->dev, "Could not create pcie_replay_count"); 3332 return r; 3333 } 3334 #endif 3335 3336 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3337 r = amdgpu_pmu_init(adev); 3338 if (r) 3339 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3340 3341 return 0; 3342 3343 failed: 3344 amdgpu_vf_error_trans_all(adev); 3345 if (boco) 3346 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3347 3348 return r; 3349 } 3350 3351 /** 3352 * amdgpu_device_fini - tear down the driver 3353 * 3354 * @adev: amdgpu_device pointer 3355 * 3356 * Tear down the driver info (all asics). 3357 * Called at driver shutdown. 
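 *
 * Teardown sketch (roughly the reverse of amdgpu_device_init()):
 *
 *	amdgpu_irq_disable_all(adev);		// quiesce interrupts first
 *	amdgpu_fence_driver_fini(adev);
 *	amdgpu_device_ip_fini(adev);		// hw_fini/sw_fini/late_fini
 *	...					// unmap MMIO, destroy locks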
3358 */ 3359 void amdgpu_device_fini(struct amdgpu_device *adev) 3360 { 3361 int r __unused; 3362 3363 DRM_INFO("amdgpu: finishing device.\n"); 3364 flush_delayed_work(&adev->delayed_init_work); 3365 adev->shutdown = true; 3366 3367 /* disable all interrupts */ 3368 amdgpu_irq_disable_all(adev); 3369 if (adev->mode_info.mode_config_initialized){ 3370 if (!amdgpu_device_has_dc_support(adev)) 3371 drm_helper_force_disable_all(adev->ddev); 3372 else 3373 drm_atomic_helper_shutdown(adev->ddev); 3374 } 3375 amdgpu_fence_driver_fini(adev); 3376 if (adev->pm_sysfs_en) 3377 amdgpu_pm_sysfs_fini(adev); 3378 amdgpu_fbdev_fini(adev); 3379 r = amdgpu_device_ip_fini(adev); 3380 if (adev->firmware.gpu_info_fw) { 3381 release_firmware(adev->firmware.gpu_info_fw); 3382 adev->firmware.gpu_info_fw = NULL; 3383 } 3384 adev->accel_working = false; 3385 /* free i2c buses */ 3386 if (!amdgpu_device_has_dc_support(adev)) 3387 amdgpu_i2c_fini(adev); 3388 3389 if (amdgpu_emu_mode != 1) 3390 amdgpu_atombios_fini(adev); 3391 3392 kfree(adev->bios); 3393 adev->bios = NULL; 3394 #ifndef __NetBSD__ /* XXX amdgpu vga */ 3395 if (amdgpu_has_atpx() && 3396 (amdgpu_is_atpx_hybrid() || 3397 amdgpu_has_atpx_dgpu_power_cntl()) && 3398 !pci_is_thunderbolt_attached(adev->pdev)) 3399 vga_switcheroo_unregister_client(adev->pdev); 3400 if (amdgpu_device_supports_boco(adev->ddev)) 3401 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3402 vga_client_register(adev->pdev, NULL, NULL, NULL); 3403 #endif 3404 #ifdef __NetBSD__ 3405 if (adev->rio_mem_size) 3406 bus_space_unmap(adev->rio_memt, adev->rio_memh, 3407 adev->rio_mem_size); 3408 adev->rio_mem_size = 0; 3409 bus_space_unmap(adev->rmmiot, adev->rmmioh, adev->rmmio_size); 3410 #else 3411 if (adev->rio_mem) 3412 pci_iounmap(adev->pdev, adev->rio_mem); 3413 adev->rio_mem = NULL; 3414 iounmap(adev->rmmio); 3415 adev->rmmio = NULL; 3416 #endif 3417 amdgpu_device_doorbell_fini(adev); 3418 3419 amdgpu_debugfs_regs_cleanup(adev); 3420 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 3421 device_remove_file(adev->dev, &dev_attr_pcie_replay_count); 3422 #endif 3423 if (adev->ucode_sysfs_en) 3424 amdgpu_ucode_sysfs_fini(adev); 3425 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3426 amdgpu_pmu_fini(adev); 3427 amdgpu_debugfs_preempt_cleanup(adev); 3428 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 3429 amdgpu_discovery_fini(adev); 3430 spin_lock_destroy(&adev->ring_lru_list_lock); 3431 mutex_destroy(&adev->shadow_list_lock); 3432 spin_lock_destroy(&adev->mm_stats.lock); 3433 spin_lock_destroy(&adev->audio_endpt_idx_lock); 3434 spin_lock_destroy(&adev->se_cac_idx_lock); 3435 spin_lock_destroy(&adev->gc_cac_idx_lock); 3436 spin_lock_destroy(&adev->didt_idx_lock); 3437 spin_lock_destroy(&adev->uvd_ctx_idx_lock); 3438 spin_lock_destroy(&adev->pcie_idx_lock); 3439 spin_lock_destroy(&adev->smc_idx_lock); 3440 spin_lock_destroy(&adev->mmio_idx_lock); 3441 mutex_destroy(&adev->notifier_lock); 3442 mutex_destroy(&adev->psp.mutex); 3443 mutex_destroy(&adev->lock_reset); 3444 /* hash_destroy(adev->mn_hash)? */ 3445 mutex_destroy(&adev->virt.vf_errors.lock); 3446 mutex_destroy(&adev->mn_lock); 3447 mutex_destroy(&adev->grbm_idx_mutex); 3448 mutex_destroy(&adev->gfx.gfx_off_mutex); 3449 mutex_destroy(&adev->gfx.pipe_reserve_mutex); 3450 mutex_destroy(&adev->srbm_mutex); 3451 mutex_destroy(&adev->gfx.gpu_clock_mutex); 3452 mutex_destroy(&adev->pm.mutex); 3453 mutex_destroy(&adev->firmware.mutex); 3454 } 3455 3456 3457 /* 3458 * Suspend & resume. 
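 *
 * A hedged sketch of the expected pairing from a power-management hook
 * (hypothetical caller; the real callbacks live in the drm/pm glue code):
 *
 *	amdgpu_device_suspend(ddev, true);	// fbcon notified of suspend
 *	... system sleeps ...
 *	amdgpu_device_resume(ddev, true);	// fbcon notified of resume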
3459 */ 3460 /** 3461 * amdgpu_device_suspend - initiate device suspend 3462 * 3463 * @dev: drm dev pointer 3464 * @suspend: suspend state 3465 * @fbcon : notify the fbdev of suspend 3466 * 3467 * Puts the hw in the suspend state (all asics). 3468 * Returns 0 for success or an error on failure. 3469 * Called at driver suspend. 3470 */ 3471 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3472 { 3473 struct amdgpu_device *adev; 3474 struct drm_crtc *crtc; 3475 struct drm_connector *connector; 3476 struct drm_connector_list_iter iter; 3477 int r; 3478 3479 if (dev == NULL || dev->dev_private == NULL) { 3480 return -ENODEV; 3481 } 3482 3483 adev = dev->dev_private; 3484 3485 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3486 return 0; 3487 3488 adev->in_suspend = true; 3489 drm_kms_helper_poll_disable(dev); 3490 3491 if (fbcon) 3492 amdgpu_fbdev_set_suspend(adev, 1); 3493 3494 cancel_delayed_work_sync(&adev->delayed_init_work); 3495 3496 if (!amdgpu_device_has_dc_support(adev)) { 3497 /* turn off display hw */ 3498 drm_modeset_lock_all(dev); 3499 drm_connector_list_iter_begin(dev, &iter); 3500 drm_for_each_connector_iter(connector, &iter) 3501 drm_helper_connector_dpms(connector, 3502 DRM_MODE_DPMS_OFF); 3503 drm_connector_list_iter_end(&iter); 3504 drm_modeset_unlock_all(dev); 3505 /* unpin the front buffers and cursors */ 3506 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3507 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3508 struct drm_framebuffer *fb = crtc->primary->fb; 3509 struct amdgpu_bo *robj; 3510 3511 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3512 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3513 r = amdgpu_bo_reserve(aobj, true); 3514 if (r == 0) { 3515 amdgpu_bo_unpin(aobj); 3516 amdgpu_bo_unreserve(aobj); 3517 } 3518 } 3519 3520 if (fb == NULL || fb->obj[0] == NULL) { 3521 continue; 3522 } 3523 robj = gem_to_amdgpu_bo(fb->obj[0]); 3524 /* don't unpin kernel fb objects */ 3525 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3526 r = amdgpu_bo_reserve(robj, true); 3527 if (r == 0) { 3528 amdgpu_bo_unpin(robj); 3529 amdgpu_bo_unreserve(robj); 3530 } 3531 } 3532 } 3533 } 3534 3535 amdgpu_amdkfd_suspend(adev); 3536 3537 amdgpu_ras_suspend(adev); 3538 3539 r = amdgpu_device_ip_suspend_phase1(adev); 3540 3541 /* evict vram memory */ 3542 amdgpu_bo_evict_vram(adev); 3543 3544 amdgpu_fence_driver_suspend(adev); 3545 3546 r = amdgpu_device_ip_suspend_phase2(adev); 3547 3548 /* evict remaining vram memory 3549 * This second call to evict vram is to evict the gart page table 3550 * using the CPU. 3551 */ 3552 amdgpu_bo_evict_vram(adev); 3553 3554 return 0; 3555 } 3556 3557 /** 3558 * amdgpu_device_resume - initiate device resume 3559 * 3560 * @dev: drm dev pointer 3561 * @resume: resume state 3562 * @fbcon : notify the fbdev of resume 3563 * 3564 * Bring the hw back to operating state (all asics). 3565 * Returns 0 for success or an error on failure. 3566 * Called at driver resume. 
3567 */ 3568 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3569 { 3570 struct drm_connector *connector; 3571 struct drm_connector_list_iter iter; 3572 struct amdgpu_device *adev = dev->dev_private; 3573 struct drm_crtc *crtc; 3574 int r = 0; 3575 3576 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3577 return 0; 3578 3579 /* post card */ 3580 if (amdgpu_device_need_post(adev)) { 3581 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3582 if (r) 3583 DRM_ERROR("amdgpu asic init failed\n"); 3584 } 3585 3586 r = amdgpu_device_ip_resume(adev); 3587 if (r) { 3588 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3589 return r; 3590 } 3591 amdgpu_fence_driver_resume(adev); 3592 3593 3594 r = amdgpu_device_ip_late_init(adev); 3595 if (r) 3596 return r; 3597 3598 queue_delayed_work(system_wq, &adev->delayed_init_work, 3599 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3600 3601 if (!amdgpu_device_has_dc_support(adev)) { 3602 /* pin cursors */ 3603 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3604 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3605 3606 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3607 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3608 r = amdgpu_bo_reserve(aobj, true); 3609 if (r == 0) { 3610 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3611 if (r != 0) 3612 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3613 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3614 amdgpu_bo_unreserve(aobj); 3615 } 3616 } 3617 } 3618 } 3619 r = amdgpu_amdkfd_resume(adev); 3620 if (r) 3621 return r; 3622 3623 /* Make sure IB tests flushed */ 3624 flush_delayed_work(&adev->delayed_init_work); 3625 3626 /* blat the mode back in */ 3627 if (fbcon) { 3628 if (!amdgpu_device_has_dc_support(adev)) { 3629 /* pre DCE11 */ 3630 drm_helper_resume_force_mode(dev); 3631 3632 /* turn on display hw */ 3633 drm_modeset_lock_all(dev); 3634 3635 drm_connector_list_iter_begin(dev, &iter); 3636 drm_for_each_connector_iter(connector, &iter) 3637 drm_helper_connector_dpms(connector, 3638 DRM_MODE_DPMS_ON); 3639 drm_connector_list_iter_end(&iter); 3640 3641 drm_modeset_unlock_all(dev); 3642 } 3643 amdgpu_fbdev_set_suspend(adev, 0); 3644 } 3645 3646 drm_kms_helper_poll_enable(dev); 3647 3648 amdgpu_ras_resume(adev); 3649 3650 /* 3651 * Most of the connector probing functions try to acquire runtime pm 3652 * refs to ensure that the GPU is powered on when connector polling is 3653 * performed. Since we're calling this from a runtime PM callback, 3654 * trying to acquire rpm refs will cause us to deadlock. 3655 * 3656 * Since we're guaranteed to be holding the rpm lock, it's safe to 3657 * temporarily disable the rpm helpers so this doesn't deadlock us. 3658 */ 3659 #ifdef CONFIG_PM 3660 dev->dev->power.disable_depth++; 3661 #endif 3662 if (!amdgpu_device_has_dc_support(adev)) 3663 drm_helper_hpd_irq_event(dev); 3664 else 3665 drm_kms_helper_hotplug_event(dev); 3666 #ifdef CONFIG_PM 3667 dev->dev->power.disable_depth--; 3668 #endif 3669 adev->in_suspend = false; 3670 3671 return 0; 3672 } 3673 3674 /** 3675 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3676 * 3677 * @adev: amdgpu_device pointer 3678 * 3679 * The list of all the hardware IPs that make up the asic is walked and 3680 * the check_soft_reset callbacks are run. check_soft_reset determines 3681 * if the asic is still hung or not. 3682 * Returns true if any of the IPs are still in a hung state, false if not. 
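 *
 * Together with the helpers that follow, this implements the soft-reset
 * sequence used by amdgpu_device_pre_asic_reset() further below:
 *
 *	if (!amdgpu_device_ip_need_full_reset(adev)) {
 *		amdgpu_device_ip_pre_soft_reset(adev);
 *		r = amdgpu_device_ip_soft_reset(adev);
 *		amdgpu_device_ip_post_soft_reset(adev);
 *		if (r || amdgpu_device_ip_check_soft_reset(adev))
 *			need_full_reset = true;	// fall back to full reset
 *	}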
3683 */ 3684 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3685 { 3686 int i; 3687 bool asic_hang = false; 3688 3689 if (amdgpu_sriov_vf(adev)) 3690 return true; 3691 3692 if (amdgpu_asic_need_full_reset(adev)) 3693 return true; 3694 3695 for (i = 0; i < adev->num_ip_blocks; i++) { 3696 if (!adev->ip_blocks[i].status.valid) 3697 continue; 3698 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3699 adev->ip_blocks[i].status.hang = 3700 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3701 if (adev->ip_blocks[i].status.hang) { 3702 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3703 asic_hang = true; 3704 } 3705 } 3706 return asic_hang; 3707 } 3708 3709 /** 3710 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3711 * 3712 * @adev: amdgpu_device pointer 3713 * 3714 * The list of all the hardware IPs that make up the asic is walked and the 3715 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3716 * handles any IP specific hardware or software state changes that are 3717 * necessary for a soft reset to succeed. 3718 * Returns 0 on success, negative error code on failure. 3719 */ 3720 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3721 { 3722 int i, r = 0; 3723 3724 for (i = 0; i < adev->num_ip_blocks; i++) { 3725 if (!adev->ip_blocks[i].status.valid) 3726 continue; 3727 if (adev->ip_blocks[i].status.hang && 3728 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3729 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3730 if (r) 3731 return r; 3732 } 3733 } 3734 3735 return 0; 3736 } 3737 3738 /** 3739 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3740 * 3741 * @adev: amdgpu_device pointer 3742 * 3743 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3744 * reset is necessary to recover. 3745 * Returns true if a full asic reset is required, false if not. 3746 */ 3747 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3748 { 3749 int i; 3750 3751 if (amdgpu_asic_need_full_reset(adev)) 3752 return true; 3753 3754 for (i = 0; i < adev->num_ip_blocks; i++) { 3755 if (!adev->ip_blocks[i].status.valid) 3756 continue; 3757 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3758 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3759 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3760 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3762 if (adev->ip_blocks[i].status.hang) { 3763 DRM_INFO("Some block need full reset!\n"); 3764 return true; 3765 } 3766 } 3767 } 3768 return false; 3769 } 3770 3771 /** 3772 * amdgpu_device_ip_soft_reset - do a soft reset 3773 * 3774 * @adev: amdgpu_device pointer 3775 * 3776 * The list of all the hardware IPs that make up the asic is walked and the 3777 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3778 * IP specific hardware or software state changes that are necessary to soft 3779 * reset the IP. 3780 * Returns 0 on success, negative error code on failure. 
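 *
 * Only blocks that were flagged as hung by
 * amdgpu_device_ip_check_soft_reset() and that actually provide a
 * soft_reset callback are touched here; all other blocks are skipped.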

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	long r = 1, tmo;

	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	DRM_INFO("recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {

		/* No need to recover an evicted BO */
		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	DRM_INFO("recover vram bo from shadow done\n");
	return 0;
}
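
/*
 * Illustrative sketch only (not built, helper name hypothetical): the
 * shadow-restore loop above pipelines the copies by waiting on the
 * previous fence while the next restore is already in flight, relying on
 * dma_fence_wait_timeout() returning the remaining timeout (> 0) on
 * success, 0 on timeout and a negative error code otherwise.  The same
 * wait pattern in isolation:
 */
#if 0
static long example_wait_prev_fence(struct dma_fence *fence, long tmo)
{
	tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);
	if (tmo == 0)
		return -ETIMEDOUT;	/* ran out of time */
	return tmo;			/* remaining jiffies, or negative error */
}
#endif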

/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need to recover gart prior to running SMC/CP/SDMA resume */
	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		DRM_INFO("Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	DRM_INFO("GPU recovery disabled.\n");
	return false;
}
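
/*
 * Illustrative sketch only (not built, names hypothetical): a job timeout
 * handler is expected to gate on amdgpu_device_should_recover_gpu() before
 * kicking off amdgpu_device_gpu_recover(), roughly as below; otherwise it
 * only reports the timeout.
 */
#if 0
static void example_job_timedout(struct amdgpu_ring *ring, struct amdgpu_job *job)
{
	if (amdgpu_device_should_recover_gpu(ring->adev))
		amdgpu_device_gpu_recover(ring->adev, job);
	else
		DRM_INFO("GPU recovery not attempted for this timeout.\n");
}
#endif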

static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				DRM_INFO("soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}

static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
					  r, tmp_adev->ddev->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceeding */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered())
		amdgpu_ras_intr_cleared();

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
				DRM_WARN("asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(
					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC as tracked, as the reset already
				 * completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				/* must succeed. */
				amdgpu_ras_resume(tmp_adev);

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
	if (trylock) {
		if (!mutex_trylock(&adev->lock_reset))
			return false;
	} else
		mutex_lock(&adev->lock_reset);

	atomic_inc(&adev->gpu_reset_counter);
	adev->in_gpu_reset = true;
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	adev->in_gpu_reset = false;
	mutex_unlock(&adev->lock_reset);
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset, job_signaled;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool in_ras_intr = amdgpu_ras_intr_triggered();
	bool use_baco =
		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
		true : false;

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {

		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);

	dev_info(adev->dev, "GPU %s begin!\n",
		 (in_ras_intr && !use_baco) ? "jobs stop" : "reset");

	cancel_delayed_work_sync(&adev->delayed_init_work);

	hive = amdgpu_get_xgmi_hive(adev, false);

	/*
	 * Here we trylock to avoid a chain of resets executing from
	 * either jobs triggered on different adevs in an XGMI hive or jobs on
	 * different schedulers for the same device while this TO handler is running.
	 * We always reset all schedulers for a device and all devices for an XGMI
	 * hive, so that should take care of them too.
	 */

	if (hive && !mutex_trylock(&hive->reset_lock)) {
		DRM_INFO("Bailing on TDR for s_job:%"PRIx64", hive: %"PRIx64" as another already in progress",
			 job ? job->base.id : -1, hive->hive_id);
		return 0;
	}

	/* Start with adev pre asic reset first for soft reset check. */
	if (!amdgpu_device_lock_adev(adev, !hive)) {
		DRM_INFO("Bailing on TDR for s_job:%"PRIx64", as another already in progress",
			 job ? job->base.id : -1);
		return 0;
	}

	/* Block kfd: SRIOV would do it separately */
	if (!amdgpu_sriov_vf(adev))
		amdgpu_amdkfd_pre_reset(adev);

	/* Build list of devices to reset */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			/* unlock kfd: SRIOV would do it separately */
			if (!amdgpu_sriov_vf(adev))
				amdgpu_amdkfd_post_reset(adev);
			amdgpu_device_unlock_adev(adev);
			return -ENODEV;
		}

		/*
		 * In case we are in XGMI hive mode, device reset is done for all the
		 * nodes in the hive to retrain all XGMI links and hence the reset
		 * sequence is executed in loop on all nodes.
		 */
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (tmp_adev != adev) {
			amdgpu_device_lock_adev(tmp_adev, false);
			if (!amdgpu_sriov_vf(tmp_adev))
				amdgpu_amdkfd_pre_reset(tmp_adev);
		}

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		/* disable ras on ALL IPs */
		if (!(in_ras_intr && !use_baco) &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (in_ras_intr && !use_baco)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (in_ras_intr && !use_baco)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent))
		job_signaled = true;

	if (job_signaled) {
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

	/* Guilty job will be freed after this */
	r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
	if (r) {
		/* TODO Should we stop ? */
		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
			  r, adev->ddev->unique);
		adev->asic_reset_res = r;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		if (tmp_adev == adev)
			continue;

		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/* TODO Should we stop ? */
		if (r) {
			DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, tmp_adev->ddev->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(tmp_adev->ddev);
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

	if (hive)
		mutex_unlock(&hive->reset_lock);

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
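
/*
 * Illustrative sketch only (not built, helper name hypothetical): because
 * amdgpu_device_get_pcie_info() honours the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters (and only probes when the masks
 * are still zero), a caller could pin the caps instead of probing, e.g.
 * limiting the device to gen3 x8 using the same CAIL masks as above.
 */
#if 0
static void example_force_pcie_gen3_x8(struct amdgpu_device *adev)
{
	adev->pm.pcie_gen_mask = CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
				 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
				 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
				 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
				 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
				 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;
	adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
				 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
				 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
				 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
}
#endif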

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev->ddev))
		return -ENOTSUPP;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev->ddev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	return 0;
}
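
/*
 * Illustrative sketch only (not built, name hypothetical): callers such as
 * runtime-PM hooks are expected to pair the two helpers above, bailing out
 * if BACO entry fails and otherwise exiting BACO to wake the device back
 * up.
 */
#if 0
static int example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* ... device sits in BACO here ... */

	return amdgpu_device_baco_exit(dev);
}
#endif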