/*	$NetBSD: amdgpu_device.c,v 1.18 2023/02/21 11:39:39 riastradh Exp $	*/

/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_device.c,v 1.18 2023/02/21 11:39:39 riastradh Exp $");

#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/reboot.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/nbsd-namespace.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS 2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"LAST",
};

#ifndef __NetBSD__ /* XXX amdgpu sysfs */

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

#endif /* __NetBSD__ */

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;

	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	uint64_t last;
	unsigned long flags;

	last = size - 4;
	for (last += pos; pos <= last; pos += 4) {
		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31);
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
}

/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
		return amdgpu_kiq_rreg(adev, reg);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
#ifdef __NetBSD__
		return bus_space_read_4(adev->rmmiot, adev->rmmioh, 4*reg);
#else
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
#endif
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
		    4*reg);
		ret = bus_space_read_4(adev->rmmiot, adev->rmmioh,
		    4*mmMM_DATA);
#else
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
#endif
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
	if (offset < adev->rmmio_size)
#ifdef __NetBSD__
		return bus_space_read_1(adev->rmmiot, adev->rmmioh, offset);
#else
		return (readb(adev->rmmio + offset));
#endif
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
	if (offset < adev->rmmio_size)
#ifdef __NetBSD__
		bus_space_write_1(adev->rmmiot, adev->rmmioh, offset, value);
#else
		writeb(value, adev->rmmio + offset);
#endif
	else
		BUG();
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
		return amdgpu_kiq_wreg(adev, reg, v);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*reg, v);
#else
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
#endif
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
		    reg*4);
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_DATA, v);
#else
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
#endif
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if ((reg * 4) < adev->rio_mem_size)
#ifdef __NetBSD__
		return bus_space_read_4(adev->rio_memt, adev->rio_memh, 4*reg);
#else
		return ioread32(adev->rio_mem + (reg * 4));
#endif
	else {
#ifdef __NetBSD__
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX,
		    4*reg);
		return bus_space_read_4(adev->rio_memt, adev->rio_memh,
		    4*mmMM_DATA);
#else
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
#endif
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
		adev->last_mm_index = v;
	}

	if ((reg * 4) < adev->rio_mem_size)
#ifdef __NetBSD__
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*reg, v);
#else
		iowrite32(v, adev->rio_mem + (reg * 4));
#endif
	else {
#ifdef __NetBSD__
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX,
		    4*reg);
		bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_DATA,
		    v);
#else
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
#endif
	}

	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
		udelay(500);
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
		return bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
#else
		return readl(adev->doorbell.ptr + index);
#endif
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v);
#else
		writel(v, adev->doorbell.ptr + index);
#endif
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
#ifdef _LP64
		return bus_space_read_8(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
#else
		uint64_t lo, hi;
#if _BYTE_ORDER == _LITTLE_ENDIAN
		lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
		hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4);
#else
		hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index);
		lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4);
#endif
		return lo | (hi << 32);
#endif
#else
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
#endif
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (index < adev->doorbell.num_doorbells) {
#ifdef __NetBSD__
#ifdef _LP64
		bus_space_write_8(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v);
#else
		/*
		 * XXX This might not be as atomic as one might hope...
		 */
#if _BYTE_ORDER == _LITTLE_ENDIAN
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v & 0xffffffffU);
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4, v >> 32);
#else
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v >> 32);
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4, v & 0xffffffffU);
#endif
#endif
#else
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
#endif
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08"PRIX64"\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)__UNVOLATILE(&adev->vram_scratch.ptr));
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
#ifndef __NetBSD__
		adev->doorbell.ptr = NULL;
#endif
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should + 1 page (0x400 in dword)
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

#ifdef __NetBSD__
	int r;
	adev->doorbell.bst = adev->pdev->pd_pa.pa_memt;
	/* XXX errno NetBSD->Linux */
	r = -bus_space_map(adev->doorbell.bst, adev->doorbell.base,
	    adev->doorbell.num_doorbells * sizeof(u32), 0,
	    &adev->doorbell.bsh);
	if (r)
		return r;
#else
	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;
#endif

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
#ifdef __NetBSD__
	if (adev->doorbell.num_doorbells) {
		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
		    adev->doorbell.num_doorbells * sizeof(u32));
		adev->doorbell.num_doorbells = 0;
	}
#else
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
#endif
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)__UNVOLATILE(&adev->wb.wb));
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)__UNVOLATILE(&adev->wb.wb));
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset(__UNVOLATILE(adev->wb.wb), 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

#ifdef __NetBSD__ /* XXX amdgpu fb resize */
	__USE(space_needed);
	__USE(rbar_size);
	__USE(root);
	__USE(res);
	__USE(i);
	__USE(cmd);
	__USE(r);
#else

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

#endif

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu hangs;
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

#ifndef __NetBSD__ /* XXX amdgpu vga */
/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
#endif /* __NetBSD__ */

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the page directory;
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

#ifndef __NetBSD__ /* XXX amdgpu vga */
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asics before or after they are powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("amdgpu: switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

#ifndef __NetBSD__ /* pmf handles this for us. */
		pci_set_power_state(dev->pdev, PCI_D0);
		pci_restore_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
#endif
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("amdgpu: switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
#ifndef __NetBSD__ /* pmf handles this for us. */
		pci_save_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
#endif
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return dev->open_count == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};
#endif /* __NetBSD__ */

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if equal or greater,
 * 1 if smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[30];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_VEGA20:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->rev_id >= 8)
			chip_name = "raven2";
		else if (adev->pdev->device == 0x15d8)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		chip_name = "renoir";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * we always need to parse it from the gpu info firmware.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
	int i, r;

	amdgpu_device_enable_virtual_display(adev);

	switch (adev->asic_type) {
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
			adev->family = AMDGPU_FAMILY_CZ;
		else
			adev->family = AMDGPU_FAMILY_VI;

		r = vi_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
		adev->family = AMDGPU_FAMILY_SI;
		r = si_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
			adev->family = AMDGPU_FAMILY_CI;
		else
			adev->family = AMDGPU_FAMILY_KV;

		r = cik_set_ip_blocks(adev);
		if (r)
			return r;
		break;
#endif
	case CHIP_VEGA10:
	case CHIP_VEGA12:
	case CHIP_VEGA20:
	case CHIP_RAVEN:
	case CHIP_ARCTURUS:
	case CHIP_RENOIR:
		if (adev->asic_type == CHIP_RAVEN ||
		    adev->asic_type == CHIP_RENOIR)
			adev->family = AMDGPU_FAMILY_RV;
		else
			adev->family = AMDGPU_FAMILY_AI;

		r = soc15_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	case CHIP_NAVI10:
	case CHIP_NAVI14:
	case CHIP_NAVI12:
		adev->family = AMDGPU_FAMILY_NV;

		r = nv_set_ip_blocks(adev);
		if (r)
			return r;
		break;
	default:
		/* FIXME: not supported yet */
		return -EINVAL;
	}

	r = amdgpu_device_parse_gpu_info_fw(adev);
	if (r)
		return r;

	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
		amdgpu_discovery_get_gfx_info(adev);

	amdgpu_amdkfd_device_probe(adev);

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return -EAGAIN;
	}

	adev->pm.pp_feature = amdgpu_pp_feature_mask;
	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
			DRM_ERROR("disabled ip block: %d <%s>\n",
				  i, adev->ip_blocks[i].version->funcs->name);
			adev->ip_blocks[i].status.valid = false;
		} else {
			if (adev->ip_blocks[i].version->funcs->early_init) {
				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
				if (r == -ENOENT) {
					adev->ip_blocks[i].status.valid = false;
				} else if (r) {
					DRM_ERROR("early_init of IP block <%s> failed %d\n",
						  adev->ip_blocks[i].version->funcs->name, r);
					return r;
				} else {
					adev->ip_blocks[i].status.valid = true;
				}
			} else {
				adev->ip_blocks[i].status.valid = true;
			}
		}
		/* get the vbios after the asic_funcs are set up */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* Read BIOS */
			if (!amdgpu_get_bios(adev))
				return -EINVAL;

			r = amdgpu_atombios_init(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
				return r;
			}
		}
	}

	adev->cg_flags &= amdgpu_cg_mask;
	adev->pg_flags &= amdgpu_pg_mask;

	return 0;
}

static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
			if (r) {
				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.sw)
			continue;
		if (adev->ip_blocks[i].status.hw)
			continue;
		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
		if (r) {
			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
	int r = 0;
	int i;
	uint32_t smu_version;

	if (adev->asic_type >= CHIP_VEGA10) {
		for (i = 0; i < adev->num_ip_blocks; i++) {
			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1941 continue; 1942 1943 /* no need to do the fw loading again if already done*/ 1944 if (adev->ip_blocks[i].status.hw == true) 1945 break; 1946 1947 if (adev->in_gpu_reset || adev->in_suspend) { 1948 r = adev->ip_blocks[i].version->funcs->resume(adev); 1949 if (r) { 1950 DRM_ERROR("resume of IP block <%s> failed %d\n", 1951 adev->ip_blocks[i].version->funcs->name, r); 1952 return r; 1953 } 1954 } else { 1955 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1956 if (r) { 1957 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1958 adev->ip_blocks[i].version->funcs->name, r); 1959 return r; 1960 } 1961 } 1962 1963 adev->ip_blocks[i].status.hw = true; 1964 break; 1965 } 1966 } 1967 1968 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 1969 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1970 1971 return r; 1972 } 1973 1974 /** 1975 * amdgpu_device_ip_init - run init for hardware IPs 1976 * 1977 * @adev: amdgpu_device pointer 1978 * 1979 * Main initialization pass for hardware IPs. The list of all the hardware 1980 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 1981 * are run. sw_init initializes the software state associated with each IP 1982 * and hw_init initializes the hardware associated with each IP. 1983 * Returns 0 on success, negative error code on failure. 1984 */ 1985 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 1986 { 1987 int i, r; 1988 1989 r = amdgpu_ras_init(adev); 1990 if (r) 1991 return r; 1992 1993 for (i = 0; i < adev->num_ip_blocks; i++) { 1994 if (!adev->ip_blocks[i].status.valid) 1995 continue; 1996 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 1997 if (r) { 1998 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 1999 adev->ip_blocks[i].version->funcs->name, r); 2000 goto init_failed; 2001 } 2002 adev->ip_blocks[i].status.sw = true; 2003 2004 /* need to do gmc hw init early so we can allocate gpu mem */ 2005 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2006 r = amdgpu_device_vram_scratch_init(adev); 2007 if (r) { 2008 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2009 goto init_failed; 2010 } 2011 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2012 if (r) { 2013 DRM_ERROR("hw_init %d failed %d\n", i, r); 2014 goto init_failed; 2015 } 2016 r = amdgpu_device_wb_init(adev); 2017 if (r) { 2018 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2019 goto init_failed; 2020 } 2021 adev->ip_blocks[i].status.hw = true; 2022 2023 /* right after GMC hw init, we create CSA */ 2024 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2025 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2026 AMDGPU_GEM_DOMAIN_VRAM, 2027 AMDGPU_CSA_SIZE); 2028 if (r) { 2029 DRM_ERROR("allocate CSA failed %d\n", r); 2030 goto init_failed; 2031 } 2032 } 2033 } 2034 } 2035 2036 if (amdgpu_sriov_vf(adev)) 2037 amdgpu_virt_init_data_exchange(adev); 2038 2039 r = amdgpu_ib_pool_init(adev); 2040 if (r) { 2041 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2043 goto init_failed; 2044 } 2045 2046 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2047 if (r) 2048 goto init_failed; 2049 2050 r = amdgpu_device_ip_hw_init_phase1(adev); 2051 if (r) 2052 goto init_failed; 2053 2054 r = amdgpu_device_fw_loading(adev); 2055 if (r) 2056 goto init_failed; 2057 2058 r = amdgpu_device_ip_hw_init_phase2(adev); 2059 if (r) 2060 goto 
init_failed; 2061 2062 /* 2063 * retired pages will be loaded from eeprom and reserved here; 2064 * this must be called after amdgpu_device_ip_hw_init_phase2 since 2065 * for some ASICs the RAS EEPROM code relies on the SMU being fully functional 2066 * for I2C communication, which is only true at this point. 2067 * recovery_init may fail, but it can free all resources allocated by 2068 * itself and its failure should not stop the amdgpu init process. 2069 * 2070 * Note: theoretically, this should be called before all vram allocations 2071 * to keep retired pages from being reused 2072 */ 2073 amdgpu_ras_recovery_init(adev); 2074 2075 if (adev->gmc.xgmi.num_physical_nodes > 1) 2076 amdgpu_xgmi_add_device(adev); 2077 amdgpu_amdkfd_device_init(adev); 2078 2079 init_failed: 2080 if (amdgpu_sriov_vf(adev)) 2081 amdgpu_virt_release_full_gpu(adev, true); 2082 2083 return r; 2084 } 2085 2086 /** 2087 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2088 * 2089 * @adev: amdgpu_device pointer 2090 * 2091 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2092 * this function before a GPU reset. If the value is retained after a 2093 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2094 */ 2095 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2096 { 2097 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2098 } 2099 2100 /** 2101 * amdgpu_device_check_vram_lost - check if vram is valid 2102 * 2103 * @adev: amdgpu_device pointer 2104 * 2105 * Checks the reset magic value written to the gart pointer in VRAM. 2106 * The driver calls this after a GPU reset to see if the contents of 2107 * VRAM have been lost or not. 2108 * Returns true if vram is lost, false if not. 2109 */ 2110 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2111 { 2112 return !!memcmp(adev->gart.ptr, adev->reset_magic, 2113 AMDGPU_RESET_MAGIC_NUM); 2114 } 2115 2116 /** 2117 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2118 * 2119 * @adev: amdgpu_device pointer 2120 * @state: clockgating state (gate or ungate) 2121 * 2122 * The list of all the hardware IPs that make up the asic is walked and the 2123 * set_clockgating_state callbacks are run. 2124 * During the late init pass this enables clockgating for the hardware IPs; 2125 * during fini or suspend it disables clockgating. 2126 * Returns 0 on success, negative error code on failure. 2127 */ 2128 2129 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2130 enum amd_clockgating_state state) 2131 { 2132 int i, j, r; 2133 2134 if (amdgpu_emu_mode == 1) 2135 return 0; 2136 2137 for (j = 0; j < adev->num_ip_blocks; j++) { 2138 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2139 if (!adev->ip_blocks[i].status.late_initialized) 2140 continue; 2141 /* skip CG for VCE/UVD, it's handled specially */ 2142 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2145 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2146 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2147 /* enable clockgating to save power */ 2148 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2149 state); 2150 if (r) { 2151 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2152 adev->ip_blocks[i].version->funcs->name, r); 2153 return r; 2154 } 2155 } 2156 } 2157 2158 return 0; 2159 } 2160 2161 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 2162 { 2163 int i, j, r; 2164 2165 if (amdgpu_emu_mode == 1) 2166 return 0; 2167 2168 for (j = 0; j < adev->num_ip_blocks; j++) { 2169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2170 if (!adev->ip_blocks[i].status.late_initialized) 2171 continue; 2172 /* skip PG for VCE/UVD, it's handled specially */ 2173 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2174 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2177 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2178 /* enable powergating to save power */ 2179 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2180 state); 2181 if (r) { 2182 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2183 adev->ip_blocks[i].version->funcs->name, r); 2184 return r; 2185 } 2186 } 2187 } 2188 return 0; 2189 } 2190 2191 static int amdgpu_device_enable_mgpu_fan_boost(void) 2192 { 2193 struct amdgpu_gpu_instance *gpu_ins; 2194 struct amdgpu_device *adev; 2195 int i, ret = 0; 2196 2197 mutex_lock(&mgpu_info.mutex); 2198 2199 /* 2200 * MGPU fan boost feature should be enabled 2201 * only when there are two or more dGPUs in 2202 * the system 2203 */ 2204 if (mgpu_info.num_dgpu < 2) 2205 goto out; 2206 2207 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2208 gpu_ins = &(mgpu_info.gpu_ins[i]); 2209 adev = gpu_ins->adev; 2210 if (!(adev->flags & AMD_IS_APU) && 2211 !gpu_ins->mgpu_fan_enabled && 2212 adev->powerplay.pp_funcs && 2213 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { 2214 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2215 if (ret) 2216 break; 2217 2218 gpu_ins->mgpu_fan_enabled = 1; 2219 } 2220 } 2221 2222 out: 2223 mutex_unlock(&mgpu_info.mutex); 2224 2225 return ret; 2226 } 2227 2228 /** 2229 * amdgpu_device_ip_late_init - run late init for hardware IPs 2230 * 2231 * @adev: amdgpu_device pointer 2232 * 2233 * Late initialization pass for hardware IPs. The list of all the hardware 2234 * IPs that make up the asic is walked and the late_init callbacks are run. 2235 * late_init covers any special initialization that an IP requires 2236 * after all of the IPs have been initialized or something that needs to happen 2237 * late in the init process. 2238 * Returns 0 on success, negative error code on failure.
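 * It also enables clockgating and powergating, records the VRAM reset magic, enables the multi-GPU fan boost where applicable and lowers the XGMI p-state once the whole hive is initialized.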
2239 */ 2240 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2241 { 2242 struct amdgpu_gpu_instance *gpu_instance; 2243 int i = 0, r; 2244 2245 for (i = 0; i < adev->num_ip_blocks; i++) { 2246 if (!adev->ip_blocks[i].status.hw) 2247 continue; 2248 if (adev->ip_blocks[i].version->funcs->late_init) { 2249 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2250 if (r) { 2251 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2252 adev->ip_blocks[i].version->funcs->name, r); 2253 return r; 2254 } 2255 } 2256 adev->ip_blocks[i].status.late_initialized = true; 2257 } 2258 2259 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2260 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2261 2262 amdgpu_device_fill_reset_magic(adev); 2263 2264 r = amdgpu_device_enable_mgpu_fan_boost(); 2265 if (r) 2266 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2267 2268 2269 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2270 mutex_lock(&mgpu_info.mutex); 2271 2272 /* 2273 * Reset the device p-state to low, since it boots in the high state. 2274 * 2275 * This should be performed only after all devices from the same 2276 * hive have been initialized. 2277 * 2278 * However, we don't know in advance how many devices are in the hive, 2279 * since they are counted one by one as each device initializes. 2280 * 2281 * So we wait until all XGMI interlinked devices are initialized. 2282 * This may add some delay, as those devices may come from 2283 * different hives. But that should be OK. 2284 */ 2285 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2286 for (i = 0; i < mgpu_info.num_gpu; i++) { 2287 gpu_instance = &(mgpu_info.gpu_ins[i]); 2288 if (gpu_instance->adev->flags & AMD_IS_APU) 2289 continue; 2290 2291 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0); 2292 if (r) { 2293 DRM_ERROR("pstate setting failed (%d).\n", r); 2294 break; 2295 } 2296 } 2297 } 2298 2299 mutex_unlock(&mgpu_info.mutex); 2300 } 2301 2302 return 0; 2303 } 2304 2305 /** 2306 * amdgpu_device_ip_fini - run fini for hardware IPs 2307 * 2308 * @adev: amdgpu_device pointer 2309 * 2310 * Main teardown pass for hardware IPs. The list of all the hardware 2311 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2312 * are run. hw_fini tears down the hardware associated with each IP 2313 * and sw_fini tears down any software state associated with each IP. 2314 * Returns 0 on success, negative error code on failure.
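 * Called from amdgpu_device_fini() during driver teardown.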
2315 */ 2316 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2317 { 2318 int i, r; 2319 2320 amdgpu_ras_pre_fini(adev); 2321 2322 if (adev->gmc.xgmi.num_physical_nodes > 1) 2323 amdgpu_xgmi_remove_device(adev); 2324 2325 amdgpu_amdkfd_device_fini(adev); 2326 2327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2328 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2329 2330 /* need to disable SMC first */ 2331 for (i = 0; i < adev->num_ip_blocks; i++) { 2332 if (!adev->ip_blocks[i].status.hw) 2333 continue; 2334 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2335 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2336 /* XXX handle errors */ 2337 if (r) { 2338 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2339 adev->ip_blocks[i].version->funcs->name, r); 2340 } 2341 adev->ip_blocks[i].status.hw = false; 2342 break; 2343 } 2344 } 2345 2346 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2347 if (!adev->ip_blocks[i].status.hw) 2348 continue; 2349 2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2351 /* XXX handle errors */ 2352 if (r) { 2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2354 adev->ip_blocks[i].version->funcs->name, r); 2355 } 2356 2357 adev->ip_blocks[i].status.hw = false; 2358 } 2359 2360 2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2362 if (!adev->ip_blocks[i].status.sw) 2363 continue; 2364 2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2366 amdgpu_ucode_free_bo(adev); 2367 amdgpu_free_static_csa(&adev->virt.csa_obj); 2368 amdgpu_device_wb_fini(adev); 2369 amdgpu_device_vram_scratch_fini(adev); 2370 amdgpu_ib_pool_fini(adev); 2371 } 2372 2373 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2374 /* XXX handle errors */ 2375 if (r) { 2376 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 } 2379 adev->ip_blocks[i].status.sw = false; 2380 adev->ip_blocks[i].status.valid = false; 2381 } 2382 2383 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2384 if (!adev->ip_blocks[i].status.late_initialized) 2385 continue; 2386 if (adev->ip_blocks[i].version->funcs->late_fini) 2387 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2388 adev->ip_blocks[i].status.late_initialized = false; 2389 } 2390 2391 amdgpu_ras_fini(adev); 2392 2393 if (amdgpu_sriov_vf(adev)) 2394 if (amdgpu_virt_release_full_gpu(adev, false)) 2395 DRM_ERROR("failed to release exclusive mode on fini\n"); 2396 2397 return 0; 2398 } 2399 2400 /** 2401 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2402 * 2403 * @work: work_struct. 
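 * Runs the IB ring tests deferred from amdgpu_device_init() and amdgpu_device_resume() via delayed_init_work.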
2404 */ 2405 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2406 { 2407 struct amdgpu_device *adev = 2408 container_of(work, struct amdgpu_device, delayed_init_work.work); 2409 int r; 2410 2411 r = amdgpu_ib_ring_tests(adev); 2412 if (r) 2413 DRM_ERROR("ib ring test failed (%d).\n", r); 2414 } 2415 2416 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2417 { 2418 struct amdgpu_device *adev = 2419 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2420 2421 mutex_lock(&adev->gfx.gfx_off_mutex); 2422 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2423 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2424 adev->gfx.gfx_off_state = true; 2425 } 2426 mutex_unlock(&adev->gfx.gfx_off_mutex); 2427 } 2428 2429 /** 2430 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2431 * 2432 * @adev: amdgpu_device pointer 2433 * 2434 * Main suspend function for hardware IPs. The list of all the hardware 2435 * IPs that make up the asic is walked, clockgating is disabled and the 2436 * suspend callbacks are run. suspend puts the hardware and software state 2437 * in each IP into a state suitable for suspend. 2438 * Returns 0 on success, negative error code on failure. 2439 */ 2440 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2441 { 2442 int i, r; 2443 2444 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2445 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2446 2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2448 if (!adev->ip_blocks[i].status.valid) 2449 continue; 2450 /* displays are handled separately */ 2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 2452 /* XXX handle errors */ 2453 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2454 /* XXX handle errors */ 2455 if (r) { 2456 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2457 adev->ip_blocks[i].version->funcs->name, r); 2458 return r; 2459 } 2460 adev->ip_blocks[i].status.hw = false; 2461 } 2462 } 2463 2464 return 0; 2465 } 2466 2467 /** 2468 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Main suspend function for hardware IPs. The list of all the hardware 2473 * IPs that make up the asic is walked, clockgating is disabled and the 2474 * suspend callbacks are run. suspend puts the hardware and software state 2475 * in each IP into a state suitable for suspend. 2476 * Returns 0 on success, negative error code on failure. 
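 * Phase 2 covers everything except the display hardware, which was already suspended in phase 1, and also puts the SMC into the requested mp1 state.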
2477 */ 2478 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2479 { 2480 int i, r __unused; 2481 2482 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2483 if (!adev->ip_blocks[i].status.valid) 2484 continue; 2485 /* displays are handled in phase1 */ 2486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2487 continue; 2488 /* PSP lost connection when err_event_athub occurs */ 2489 if (amdgpu_ras_intr_triggered() && 2490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2491 adev->ip_blocks[i].status.hw = false; 2492 continue; 2493 } 2494 /* XXX handle errors */ 2495 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2496 /* XXX handle errors */ 2497 if (r) { 2498 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2499 adev->ip_blocks[i].version->funcs->name, r); 2500 } 2501 adev->ip_blocks[i].status.hw = false; 2502 /* handle putting the SMC in the appropriate state */ 2503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2504 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2505 if (r) { 2506 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2507 adev->mp1_state, r); 2508 return r; 2509 } 2510 } 2511 2512 adev->ip_blocks[i].status.hw = false; 2513 } 2514 2515 return 0; 2516 } 2517 2518 /** 2519 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2520 * 2521 * @adev: amdgpu_device pointer 2522 * 2523 * Main suspend function for hardware IPs. The list of all the hardware 2524 * IPs that make up the asic is walked, clockgating is disabled and the 2525 * suspend callbacks are run. suspend puts the hardware and software state 2526 * in each IP into a state suitable for suspend. 2527 * Returns 0 on success, negative error code on failure. 2528 */ 2529 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2530 { 2531 int r; 2532 2533 if (amdgpu_sriov_vf(adev)) 2534 amdgpu_virt_request_full_gpu(adev, false); 2535 2536 r = amdgpu_device_ip_suspend_phase1(adev); 2537 if (r) 2538 return r; 2539 r = amdgpu_device_ip_suspend_phase2(adev); 2540 2541 if (amdgpu_sriov_vf(adev)) 2542 amdgpu_virt_release_full_gpu(adev, false); 2543 2544 return r; 2545 } 2546 2547 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2548 { 2549 int i, r; 2550 2551 static enum amd_ip_block_type ip_order[] = { 2552 AMD_IP_BLOCK_TYPE_GMC, 2553 AMD_IP_BLOCK_TYPE_COMMON, 2554 AMD_IP_BLOCK_TYPE_PSP, 2555 AMD_IP_BLOCK_TYPE_IH, 2556 }; 2557 2558 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2559 int j; 2560 struct amdgpu_ip_block *block; 2561 2562 for (j = 0; j < adev->num_ip_blocks; j++) { 2563 block = &adev->ip_blocks[j]; 2564 2565 block->status.hw = false; 2566 if (block->version->type != ip_order[i] || 2567 !block->status.valid) 2568 continue; 2569 2570 r = block->version->funcs->hw_init(adev); 2571 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2572 if (r) 2573 return r; 2574 block->status.hw = true; 2575 } 2576 } 2577 2578 return 0; 2579 } 2580 2581 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2582 { 2583 int i, r; 2584 2585 static enum amd_ip_block_type ip_order[] = { 2586 AMD_IP_BLOCK_TYPE_SMC, 2587 AMD_IP_BLOCK_TYPE_DCE, 2588 AMD_IP_BLOCK_TYPE_GFX, 2589 AMD_IP_BLOCK_TYPE_SDMA, 2590 AMD_IP_BLOCK_TYPE_UVD, 2591 AMD_IP_BLOCK_TYPE_VCE, 2592 AMD_IP_BLOCK_TYPE_VCN 2593 }; 2594 2595 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2596 int j; 2597 struct amdgpu_ip_block *block; 2598 2599 for (j = 0; j < adev->num_ip_blocks; j++) { 2600 block = &adev->ip_blocks[j]; 
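/* Only bring up the block matching this ip_order entry; skip blocks that are invalid or already running. The SMC is resumed, everything else is re-initialized via hw_init. */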
2601 2602 if (block->version->type != ip_order[i] || 2603 !block->status.valid || 2604 block->status.hw) 2605 continue; 2606 2607 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2608 r = block->version->funcs->resume(adev); 2609 else 2610 r = block->version->funcs->hw_init(adev); 2611 2612 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2613 if (r) 2614 return r; 2615 block->status.hw = true; 2616 } 2617 } 2618 2619 return 0; 2620 } 2621 2622 /** 2623 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2624 * 2625 * @adev: amdgpu_device pointer 2626 * 2627 * First resume function for hardware IPs. The list of all the hardware 2628 * IPs that make up the asic is walked and the resume callbacks are run for 2629 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2630 * after a suspend and updates the software state as necessary. This 2631 * function is also used for restoring the GPU after a GPU reset. 2632 * Returns 0 on success, negative error code on failure. 2633 */ 2634 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2635 { 2636 int i, r; 2637 2638 for (i = 0; i < adev->num_ip_blocks; i++) { 2639 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2640 continue; 2641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2642 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2643 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2644 2645 r = adev->ip_blocks[i].version->funcs->resume(adev); 2646 if (r) { 2647 DRM_ERROR("resume of IP block <%s> failed %d\n", 2648 adev->ip_blocks[i].version->funcs->name, r); 2649 return r; 2650 } 2651 adev->ip_blocks[i].status.hw = true; 2652 } 2653 } 2654 2655 return 0; 2656 } 2657 2658 /** 2659 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2660 * 2661 * @adev: amdgpu_device pointer 2662 * 2663 * Second resume function for hardware IPs. The list of all the hardware 2664 * IPs that make up the asic is walked and the resume callbacks are run for 2665 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 2666 * functional state after a suspend and updates the software state as 2667 * necessary. This function is also used for restoring the GPU after a GPU 2668 * reset. 2669 * Returns 0 on success, negative error code on failure. 2670 */ 2671 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2672 { 2673 int i, r; 2674 2675 for (i = 0; i < adev->num_ip_blocks; i++) { 2676 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2677 continue; 2678 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2679 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2680 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2681 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2682 continue; 2683 r = adev->ip_blocks[i].version->funcs->resume(adev); 2684 if (r) { 2685 DRM_ERROR("resume of IP block <%s> failed %d\n", 2686 adev->ip_blocks[i].version->funcs->name, r); 2687 return r; 2688 } 2689 adev->ip_blocks[i].status.hw = true; 2690 } 2691 2692 return 0; 2693 } 2694 2695 /** 2696 * amdgpu_device_ip_resume - run resume for hardware IPs 2697 * 2698 * @adev: amdgpu_device pointer 2699 * 2700 * Main resume function for hardware IPs. The hardware IPs 2701 * are split into two resume functions because they are 2702 * also used in recovering from a GPU reset, and some additional 2703 * steps need to be taken between them.
In this case (S3/S4) they are 2704 * run sequentially. 2705 * Returns 0 on success, negative error code on failure. 2706 */ 2707 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2708 { 2709 int r; 2710 2711 r = amdgpu_device_ip_resume_phase1(adev); 2712 if (r) 2713 return r; 2714 2715 r = amdgpu_device_fw_loading(adev); 2716 if (r) 2717 return r; 2718 2719 r = amdgpu_device_ip_resume_phase2(adev); 2720 2721 return r; 2722 } 2723 2724 /** 2725 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2726 * 2727 * @adev: amdgpu_device pointer 2728 * 2729 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2730 */ 2731 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2732 { 2733 if (amdgpu_sriov_vf(adev)) { 2734 if (adev->is_atom_fw) { 2735 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2736 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2737 } else { 2738 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2739 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2740 } 2741 2742 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2744 } 2745 } 2746 2747 /** 2748 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2749 * 2750 * @asic_type: AMD asic type 2751 * 2752 * Check if there is DC (new modesetting infrastructure) support for an asic. 2753 * Returns true if DC has support, false if not. 2754 */ 2755 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2756 { 2757 switch (asic_type) { 2758 #if defined(CONFIG_DRM_AMD_DC) 2759 case CHIP_BONAIRE: 2760 case CHIP_KAVERI: 2761 case CHIP_KABINI: 2762 case CHIP_MULLINS: 2763 /* 2764 * We have systems in the wild with these ASICs that require 2765 * LVDS and VGA support which is not supported with DC. 2766 * 2767 * Fall back to the non-DC driver here by default so as not to 2768 * cause regressions. 2769 */ 2770 return amdgpu_dc > 0; 2771 case CHIP_HAWAII: 2772 case CHIP_CARRIZO: 2773 case CHIP_STONEY: 2774 case CHIP_POLARIS10: 2775 case CHIP_POLARIS11: 2776 case CHIP_POLARIS12: 2777 case CHIP_VEGAM: 2778 case CHIP_TONGA: 2779 case CHIP_FIJI: 2780 case CHIP_VEGA10: 2781 case CHIP_VEGA12: 2782 case CHIP_VEGA20: 2783 #if defined(CONFIG_DRM_AMD_DC_DCN) 2784 case CHIP_RAVEN: 2785 case CHIP_NAVI10: 2786 case CHIP_NAVI14: 2787 case CHIP_NAVI12: 2788 case CHIP_RENOIR: 2789 #endif 2790 return amdgpu_dc != 0; 2791 #endif 2792 default: 2793 if (amdgpu_dc > 0) 2794 DRM_INFO("Display Core has been requested via kernel parameter " 2795 "but isn't supported by ASIC, ignoring\n"); 2796 return false; 2797 } 2798 } 2799 2800 /** 2801 * amdgpu_device_has_dc_support - check if dc is supported 2802 * 2803 * @adev: amdgpu_device pointer 2804 * 2805 * Returns true for supported, false for not supported 2806 */ 2807 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2808 { 2809 if (amdgpu_sriov_vf(adev)) 2810 return false; 2811 2812 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2813 } 2814 2815 2816 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2817 { 2818 struct amdgpu_device *adev = 2819 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2820 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); 2821 2822 /* It's a bug to not have a hive within this function */ 2823 if (WARN_ON(!hive)) 2824 return; 2825 2826 /* 2827 * Use task barrier to synchronize all xgmi reset works across the 2828 * hive.
task_barrier_enter and task_barrier_exit will block 2829 * until all the threads running the xgmi reset works reach 2830 * those points. task_barrier_full will do both blocks. 2831 */ 2832 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 2833 2834 task_barrier_enter(&hive->tb); 2835 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); 2836 2837 if (adev->asic_reset_res) 2838 goto fail; 2839 2840 task_barrier_exit(&hive->tb); 2841 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); 2842 2843 if (adev->asic_reset_res) 2844 goto fail; 2845 } else { 2846 2847 task_barrier_full(&hive->tb); 2848 adev->asic_reset_res = amdgpu_asic_reset(adev); 2849 } 2850 2851 fail: 2852 if (adev->asic_reset_res) 2853 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2854 adev->asic_reset_res, adev->ddev->unique); 2855 } 2856 2857 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 2858 { 2859 char *input = amdgpu_lockup_timeout; 2860 char *timeout_setting = NULL; 2861 int index = 0; 2862 long timeout; 2863 int ret = 0; 2864 2865 /* 2866 * By default timeout for non compute jobs is 10000. 2867 * And there is no timeout enforced on compute jobs. 2868 * In SR-IOV or passthrough mode, timeout for compute 2869 * jobs are 10000 by default. 2870 */ 2871 adev->gfx_timeout = msecs_to_jiffies(10000); 2872 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2873 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2874 adev->compute_timeout = adev->gfx_timeout; 2875 else 2876 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2877 2878 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2879 while ((timeout_setting = strsep(&input, ",")) && 2880 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2881 ret = kstrtol(timeout_setting, 0, &timeout); 2882 if (ret) 2883 return ret; 2884 2885 if (timeout == 0) { 2886 index++; 2887 continue; 2888 } else if (timeout < 0) { 2889 timeout = MAX_SCHEDULE_TIMEOUT; 2890 } else { 2891 timeout = msecs_to_jiffies(timeout); 2892 } 2893 2894 switch (index++) { 2895 case 0: 2896 adev->gfx_timeout = timeout; 2897 break; 2898 case 1: 2899 adev->compute_timeout = timeout; 2900 break; 2901 case 2: 2902 adev->sdma_timeout = timeout; 2903 break; 2904 case 3: 2905 adev->video_timeout = timeout; 2906 break; 2907 default: 2908 break; 2909 } 2910 } 2911 /* 2912 * There is only one value specified and 2913 * it should apply to all non-compute jobs. 2914 */ 2915 if (index == 1) { 2916 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2917 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2918 adev->compute_timeout = adev->gfx_timeout; 2919 } 2920 } 2921 2922 return ret; 2923 } 2924 2925 /** 2926 * amdgpu_device_init - initialize the driver 2927 * 2928 * @adev: amdgpu_device pointer 2929 * @ddev: drm dev pointer 2930 * @pdev: pci dev pointer 2931 * @flags: driver flags 2932 * 2933 * Initializes the driver info and hw (all asics). 2934 * Returns 0 for success or an error on failure. 2935 * Called at driver startup. 
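 * Most of the work is split across amdgpu_device_ip_early_init(), amdgpu_device_ip_init() and amdgpu_device_ip_late_init(); the IB ring tests are deferred to delayed_init_work.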
2936 */ 2937 int amdgpu_device_init(struct amdgpu_device *adev, 2938 struct drm_device *ddev, 2939 struct pci_dev *pdev, 2940 uint32_t flags) 2941 { 2942 int r, i; 2943 bool boco = false; 2944 u32 max_MBps; 2945 2946 adev->shutdown = false; 2947 adev->dev = pci_dev_dev(pdev); 2948 adev->ddev = ddev; 2949 adev->pdev = pdev; 2950 adev->flags = flags; 2951 2952 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 2953 adev->asic_type = amdgpu_force_asic_type; 2954 else 2955 adev->asic_type = flags & AMD_ASIC_MASK; 2956 2957 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2958 if (amdgpu_emu_mode == 1) 2959 adev->usec_timeout *= 2; 2960 adev->gmc.gart_size = 512 * 1024 * 1024; 2961 adev->accel_working = false; 2962 adev->num_rings = 0; 2963 adev->mman.buffer_funcs = NULL; 2964 adev->mman.buffer_funcs_ring = NULL; 2965 adev->vm_manager.vm_pte_funcs = NULL; 2966 adev->vm_manager.vm_pte_num_scheds = 0; 2967 adev->gmc.gmc_funcs = NULL; 2968 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 2969 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 2970 2971 adev->smc_rreg = &amdgpu_invalid_rreg; 2972 adev->smc_wreg = &amdgpu_invalid_wreg; 2973 adev->pcie_rreg = &amdgpu_invalid_rreg; 2974 adev->pcie_wreg = &amdgpu_invalid_wreg; 2975 adev->pciep_rreg = &amdgpu_invalid_rreg; 2976 adev->pciep_wreg = &amdgpu_invalid_wreg; 2977 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 2978 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 2979 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 2980 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 2981 adev->didt_rreg = &amdgpu_invalid_rreg; 2982 adev->didt_wreg = &amdgpu_invalid_wreg; 2983 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 2984 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 2985 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 2986 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 2987 2988 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 2989 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 2990 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 2991 2992 /* mutex initialization are all done here so we 2993 * can recall function without having locking issues */ 2994 atomic_set(&adev->irq.ih.lock, 0); 2995 mutex_init(&adev->firmware.mutex); 2996 mutex_init(&adev->pm.mutex); 2997 mutex_init(&adev->gfx.gpu_clock_mutex); 2998 mutex_init(&adev->srbm_mutex); 2999 mutex_init(&adev->gfx.pipe_reserve_mutex); 3000 mutex_init(&adev->gfx.gfx_off_mutex); 3001 mutex_init(&adev->grbm_idx_mutex); 3002 mutex_init(&adev->mn_lock); 3003 mutex_init(&adev->virt.vf_errors.lock); 3004 hash_init(adev->mn_hash); 3005 mutex_init(&adev->lock_reset); 3006 mutex_init(&adev->psp.mutex); 3007 mutex_init(&adev->notifier_lock); 3008 3009 spin_lock_init(&adev->mmio_idx_lock); 3010 spin_lock_init(&adev->smc_idx_lock); 3011 spin_lock_init(&adev->pcie_idx_lock); 3012 spin_lock_init(&adev->uvd_ctx_idx_lock); 3013 spin_lock_init(&adev->didt_idx_lock); 3014 spin_lock_init(&adev->gc_cac_idx_lock); 3015 spin_lock_init(&adev->se_cac_idx_lock); 3016 spin_lock_init(&adev->audio_endpt_idx_lock); 3017 spin_lock_init(&adev->mm_stats.lock); 3018 3019 INIT_LIST_HEAD(&adev->shadow_list); 3020 mutex_init(&adev->shadow_list_lock); 3021 3022 INIT_LIST_HEAD(&adev->ring_lru_list); 3023 spin_lock_init(&adev->ring_lru_list_lock); 3024 3025 INIT_DELAYED_WORK(&adev->delayed_init_work, 3026 amdgpu_device_delayed_init_work_handler); 3027 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3028 
amdgpu_device_delay_enable_gfx_off); 3029 3030 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3031 3032 r = amdgpu_device_check_arguments(adev); 3033 if (r) 3034 return r; 3035 3036 adev->gfx.gfx_off_req_count = 1; 3037 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; 3038 3039 /* Registers mapping */ 3040 /* TODO: block userspace mapping of io register */ 3041 if (adev->asic_type >= CHIP_BONAIRE) { 3042 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3043 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3044 } else { 3045 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3046 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3047 } 3048 3049 #ifdef __NetBSD__ 3050 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(5), 3051 pci_mapreg_type(adev->pdev->pd_pa.pa_pc, 3052 adev->pdev->pd_pa.pa_tag, PCI_BAR(5)), 3053 0, 3054 &adev->rmmiot, &adev->rmmioh, 3055 &adev->rmmio_base, &adev->rmmio_size)) 3056 return -EIO; 3057 DRM_INFO("register mmio base: 0x%8"PRIXMAX"\n", 3058 (uintmax_t)adev->rmmio_base); 3059 DRM_INFO("register mmio size: %"PRIuMAX"\n", 3060 (uintmax_t)adev->rmmio_size); 3061 #else 3062 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3063 if (adev->rmmio == NULL) { 3064 return -ENOMEM; 3065 } 3066 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3067 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3068 #endif 3069 3070 /* io port mapping */ 3071 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3072 #ifdef __NetBSD__ 3073 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(i), 3074 PCI_MAPREG_TYPE_IO, 0, 3075 &adev->rio_memt, &adev->rio_memh, 3076 NULL, &adev->rio_mem_size) == 0) 3077 break; 3078 #else 3079 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3080 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3081 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3082 break; 3083 } 3084 #endif 3085 } 3086 #ifdef __NetBSD__ 3087 if (i == DEVICE_COUNT_RESOURCE) 3088 DRM_ERROR("Unable to find PCI I/O BAR\n"); 3089 #else 3090 if (adev->rio_mem == NULL) 3091 #endif 3092 DRM_INFO("PCI I/O BAR is not found.\n"); 3093 3094 /* enable PCIE atomic ops */ 3095 #ifndef __NetBSD__ /* XXX amdgpu pcie atomics */ 3096 r = pci_enable_atomic_ops_to_root(adev->pdev, 3097 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3098 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3099 if (r) { 3100 adev->have_atomics_support = false; 3101 DRM_INFO("PCIE atomic ops is not supported\n"); 3102 } else { 3103 adev->have_atomics_support = true; 3104 } 3105 #endif 3106 3107 amdgpu_device_get_pcie_info(adev); 3108 3109 if (amdgpu_mcbp) 3110 DRM_INFO("MCBP is enabled\n"); 3111 3112 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3113 adev->enable_mes = true; 3114 3115 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { 3116 r = amdgpu_discovery_init(adev); 3117 if (r) { 3118 dev_err(adev->dev, "amdgpu_discovery_init failed\n"); 3119 return r; 3120 } 3121 } 3122 3123 /* early init functions */ 3124 r = amdgpu_device_ip_early_init(adev); 3125 if (r) 3126 return r; 3127 3128 r = amdgpu_device_get_job_timeout_settings(adev); 3129 if (r) { 3130 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3131 return r; 3132 } 3133 3134 /* doorbell bar mapping and doorbell index init*/ 3135 amdgpu_device_doorbell_init(adev); 3136 3137 #ifndef __NetBSD__ /* XXX amdgpu vga */ 3138 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3139 /* this will fail for cards that aren't VGA class devices, just 3140 * 
ignore it */ 3141 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3142 3143 if (amdgpu_device_supports_boco(ddev)) 3144 boco = true; 3145 if (amdgpu_has_atpx() && 3146 (amdgpu_is_atpx_hybrid() || 3147 amdgpu_has_atpx_dgpu_power_cntl()) && 3148 !pci_is_thunderbolt_attached(adev->pdev)) 3149 vga_switcheroo_register_client(adev->pdev, 3150 &amdgpu_switcheroo_ops, boco); 3151 if (boco) 3152 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3153 #endif 3154 3155 if (amdgpu_emu_mode == 1) { 3156 /* post the asic on emulation mode */ 3157 emu_soc_asic_init(adev); 3158 goto fence_driver_init; 3159 } 3160 3161 /* detect if we are with an SRIOV vbios */ 3162 amdgpu_device_detect_sriov_bios(adev); 3163 3164 /* check if we need to reset the asic 3165 * E.g., driver was not cleanly unloaded previously, etc. 3166 */ 3167 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3168 r = amdgpu_asic_reset(adev); 3169 if (r) { 3170 dev_err(adev->dev, "asic reset on init failed\n"); 3171 goto failed; 3172 } 3173 } 3174 3175 /* Post card if necessary */ 3176 if (amdgpu_device_need_post(adev)) { 3177 if (!adev->bios) { 3178 dev_err(adev->dev, "no vBIOS found\n"); 3179 r = -EINVAL; 3180 goto failed; 3181 } 3182 DRM_INFO("GPU posting now...\n"); 3183 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3184 if (r) { 3185 dev_err(adev->dev, "gpu post error!\n"); 3186 goto failed; 3187 } 3188 } 3189 3190 if (adev->is_atom_fw) { 3191 /* Initialize clocks */ 3192 r = amdgpu_atomfirmware_get_clock_info(adev); 3193 if (r) { 3194 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3195 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3196 goto failed; 3197 } 3198 } else { 3199 /* Initialize clocks */ 3200 r = amdgpu_atombios_get_clock_info(adev); 3201 if (r) { 3202 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3203 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3204 goto failed; 3205 } 3206 /* init i2c buses */ 3207 if (!amdgpu_device_has_dc_support(adev)) 3208 amdgpu_atombios_i2c_init(adev); 3209 } 3210 3211 fence_driver_init: 3212 /* Fence driver */ 3213 r = amdgpu_fence_driver_init(adev); 3214 if (r) { 3215 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3216 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3217 goto failed; 3218 } 3219 3220 /* init the mode config */ 3221 drm_mode_config_init(adev->ddev); 3222 3223 r = amdgpu_device_ip_init(adev); 3224 if (r) { 3225 /* failed in exclusive mode due to timeout */ 3226 if (amdgpu_sriov_vf(adev) && 3227 !amdgpu_sriov_runtime(adev) && 3228 amdgpu_virt_mmio_blocked(adev) && 3229 !amdgpu_virt_wait_reset(adev)) { 3230 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3231 /* Don't send request since VF is inactive. */ 3232 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3233 adev->virt.ops = NULL; 3234 r = -EAGAIN; 3235 goto failed; 3236 } 3237 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3238 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3239 goto failed; 3240 } 3241 3242 DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3243 adev->gfx.config.max_shader_engines, 3244 adev->gfx.config.max_sh_per_se, 3245 adev->gfx.config.max_cu_per_sh, 3246 adev->gfx.cu_info.number); 3247 3248 amdgpu_ctx_init_sched(adev); 3249 3250 adev->accel_working = true; 3251 3252 amdgpu_vm_check_compute_bug(adev); 3253 3254 /* Initialize the buffer migration limit. 
*/ 3255 if (amdgpu_moverate >= 0) 3256 max_MBps = amdgpu_moverate; 3257 else 3258 max_MBps = 8; /* Allow 8 MB/s. */ 3259 /* Get a log2 for easy divisions. */ 3260 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3261 3262 amdgpu_fbdev_init(adev); 3263 3264 r = amdgpu_pm_sysfs_init(adev); 3265 if (r) { 3266 adev->pm_sysfs_en = false; 3267 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3268 } else 3269 adev->pm_sysfs_en = true; 3270 3271 r = amdgpu_ucode_sysfs_init(adev); 3272 if (r) { 3273 adev->ucode_sysfs_en = false; 3274 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3275 } else 3276 adev->ucode_sysfs_en = true; 3277 3278 r = amdgpu_debugfs_gem_init(adev); 3279 if (r) 3280 DRM_ERROR("registering gem debugfs failed (%d).\n", r); 3281 3282 r = amdgpu_debugfs_regs_init(adev); 3283 if (r) 3284 DRM_ERROR("registering register debugfs failed (%d).\n", r); 3285 3286 r = amdgpu_debugfs_firmware_init(adev); 3287 if (r) 3288 DRM_ERROR("registering firmware debugfs failed (%d).\n", r); 3289 3290 r = amdgpu_debugfs_init(adev); 3291 if (r) 3292 DRM_ERROR("Creating debugfs files failed (%d).\n", r); 3293 3294 if ((amdgpu_testing & 1)) { 3295 if (adev->accel_working) 3296 amdgpu_test_moves(adev); 3297 else 3298 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3299 } 3300 if (amdgpu_benchmarking) { 3301 if (adev->accel_working) 3302 amdgpu_benchmark(adev, amdgpu_benchmarking); 3303 else 3304 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3305 } 3306 3307 /* 3308 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3309 * Otherwise the mgpu fan boost feature will be skipped due to the 3310 * gpu instance is counted less. 3311 */ 3312 amdgpu_register_gpu_instance(adev); 3313 3314 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3315 * explicit gating rather than handling it automatically. 3316 */ 3317 r = amdgpu_device_ip_late_init(adev); 3318 if (r) { 3319 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3320 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3321 goto failed; 3322 } 3323 3324 /* must succeed. */ 3325 amdgpu_ras_resume(adev); 3326 3327 queue_delayed_work(system_wq, &adev->delayed_init_work, 3328 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3329 3330 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 3331 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 3332 if (r) { 3333 dev_err(adev->dev, "Could not create pcie_replay_count"); 3334 return r; 3335 } 3336 #endif 3337 3338 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3339 r = amdgpu_pmu_init(adev); 3340 if (r) 3341 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3342 3343 return 0; 3344 3345 failed: 3346 amdgpu_vf_error_trans_all(adev); 3347 if (boco) 3348 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3349 3350 return r; 3351 } 3352 3353 /** 3354 * amdgpu_device_fini - tear down the driver 3355 * 3356 * @adev: amdgpu_device pointer 3357 * 3358 * Tear down the driver info (all asics). 3359 * Called at driver shutdown. 
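 * Undoes the work of amdgpu_device_init(): IP blocks are finalized, MMIO and I/O mappings are unmapped and the remaining locks are destroyed.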
3360 */ 3361 void amdgpu_device_fini(struct amdgpu_device *adev) 3362 { 3363 int r __unused; 3364 3365 DRM_INFO("amdgpu: finishing device.\n"); 3366 flush_delayed_work(&adev->delayed_init_work); 3367 adev->shutdown = true; 3368 3369 /* disable all interrupts */ 3370 amdgpu_irq_disable_all(adev); 3371 if (adev->mode_info.mode_config_initialized){ 3372 if (!amdgpu_device_has_dc_support(adev)) 3373 drm_helper_force_disable_all(adev->ddev); 3374 else 3375 drm_atomic_helper_shutdown(adev->ddev); 3376 } 3377 amdgpu_fence_driver_fini(adev); 3378 if (adev->pm_sysfs_en) 3379 amdgpu_pm_sysfs_fini(adev); 3380 amdgpu_fbdev_fini(adev); 3381 r = amdgpu_device_ip_fini(adev); 3382 if (adev->firmware.gpu_info_fw) { 3383 release_firmware(adev->firmware.gpu_info_fw); 3384 adev->firmware.gpu_info_fw = NULL; 3385 } 3386 adev->accel_working = false; 3387 /* free i2c buses */ 3388 if (!amdgpu_device_has_dc_support(adev)) 3389 amdgpu_i2c_fini(adev); 3390 3391 if (amdgpu_emu_mode != 1) 3392 amdgpu_atombios_fini(adev); 3393 3394 kfree(adev->bios); 3395 adev->bios = NULL; 3396 #ifndef __NetBSD__ /* XXX amdgpu vga */ 3397 if (amdgpu_has_atpx() && 3398 (amdgpu_is_atpx_hybrid() || 3399 amdgpu_has_atpx_dgpu_power_cntl()) && 3400 !pci_is_thunderbolt_attached(adev->pdev)) 3401 vga_switcheroo_unregister_client(adev->pdev); 3402 if (amdgpu_device_supports_boco(adev->ddev)) 3403 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3404 vga_client_register(adev->pdev, NULL, NULL, NULL); 3405 #endif 3406 #ifdef __NetBSD__ 3407 if (adev->rio_mem_size) 3408 bus_space_unmap(adev->rio_memt, adev->rio_memh, 3409 adev->rio_mem_size); 3410 adev->rio_mem_size = 0; 3411 bus_space_unmap(adev->rmmiot, adev->rmmioh, adev->rmmio_size); 3412 #else 3413 if (adev->rio_mem) 3414 pci_iounmap(adev->pdev, adev->rio_mem); 3415 adev->rio_mem = NULL; 3416 iounmap(adev->rmmio); 3417 adev->rmmio = NULL; 3418 #endif 3419 amdgpu_device_doorbell_fini(adev); 3420 3421 amdgpu_debugfs_regs_cleanup(adev); 3422 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 3423 device_remove_file(adev->dev, &dev_attr_pcie_replay_count); 3424 #endif 3425 if (adev->ucode_sysfs_en) 3426 amdgpu_ucode_sysfs_fini(adev); 3427 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3428 amdgpu_pmu_fini(adev); 3429 amdgpu_debugfs_preempt_cleanup(adev); 3430 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 3431 amdgpu_discovery_fini(adev); 3432 spin_lock_destroy(&adev->ring_lru_list_lock); 3433 mutex_destroy(&adev->shadow_list_lock); 3434 spin_lock_destroy(&adev->mm_stats.lock); 3435 spin_lock_destroy(&adev->audio_endpt_idx_lock); 3436 spin_lock_destroy(&adev->se_cac_idx_lock); 3437 spin_lock_destroy(&adev->gc_cac_idx_lock); 3438 spin_lock_destroy(&adev->didt_idx_lock); 3439 spin_lock_destroy(&adev->uvd_ctx_idx_lock); 3440 spin_lock_destroy(&adev->pcie_idx_lock); 3441 spin_lock_destroy(&adev->smc_idx_lock); 3442 spin_lock_destroy(&adev->mmio_idx_lock); 3443 mutex_destroy(&adev->notifier_lock); 3444 mutex_destroy(&adev->psp.mutex); 3445 mutex_destroy(&adev->lock_reset); 3446 /* hash_destroy(adev->mn_hash)? */ 3447 mutex_destroy(&adev->virt.vf_errors.lock); 3448 mutex_destroy(&adev->mn_lock); 3449 mutex_destroy(&adev->grbm_idx_mutex); 3450 mutex_destroy(&adev->gfx.gfx_off_mutex); 3451 mutex_destroy(&adev->gfx.pipe_reserve_mutex); 3452 mutex_destroy(&adev->srbm_mutex); 3453 mutex_destroy(&adev->gfx.gpu_clock_mutex); 3454 mutex_destroy(&adev->pm.mutex); 3455 mutex_destroy(&adev->firmware.mutex); 3456 } 3457 3458 3459 /* 3460 * Suspend & resume. 
3461 */ 3462 /** 3463 * amdgpu_device_suspend - initiate device suspend 3464 * 3465 * @dev: drm dev pointer 3466 * @suspend: suspend state 3467 * @fbcon : notify the fbdev of suspend 3468 * 3469 * Puts the hw in the suspend state (all asics). 3470 * Returns 0 for success or an error on failure. 3471 * Called at driver suspend. 3472 */ 3473 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3474 { 3475 struct amdgpu_device *adev; 3476 struct drm_crtc *crtc; 3477 struct drm_connector *connector; 3478 struct drm_connector_list_iter iter; 3479 int r; 3480 3481 if (dev == NULL || dev->dev_private == NULL) { 3482 return -ENODEV; 3483 } 3484 3485 adev = dev->dev_private; 3486 3487 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3488 return 0; 3489 3490 adev->in_suspend = true; 3491 drm_kms_helper_poll_disable(dev); 3492 3493 if (fbcon) 3494 amdgpu_fbdev_set_suspend(adev, 1); 3495 3496 cancel_delayed_work_sync(&adev->delayed_init_work); 3497 3498 if (!amdgpu_device_has_dc_support(adev)) { 3499 /* turn off display hw */ 3500 drm_modeset_lock_all(dev); 3501 drm_connector_list_iter_begin(dev, &iter); 3502 drm_for_each_connector_iter(connector, &iter) 3503 drm_helper_connector_dpms(connector, 3504 DRM_MODE_DPMS_OFF); 3505 drm_connector_list_iter_end(&iter); 3506 drm_modeset_unlock_all(dev); 3507 /* unpin the front buffers and cursors */ 3508 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3509 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3510 struct drm_framebuffer *fb = crtc->primary->fb; 3511 struct amdgpu_bo *robj; 3512 3513 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3514 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3515 r = amdgpu_bo_reserve(aobj, true); 3516 if (r == 0) { 3517 amdgpu_bo_unpin(aobj); 3518 amdgpu_bo_unreserve(aobj); 3519 } 3520 } 3521 3522 if (fb == NULL || fb->obj[0] == NULL) { 3523 continue; 3524 } 3525 robj = gem_to_amdgpu_bo(fb->obj[0]); 3526 /* don't unpin kernel fb objects */ 3527 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3528 r = amdgpu_bo_reserve(robj, true); 3529 if (r == 0) { 3530 amdgpu_bo_unpin(robj); 3531 amdgpu_bo_unreserve(robj); 3532 } 3533 } 3534 } 3535 } 3536 3537 amdgpu_amdkfd_suspend(adev); 3538 3539 amdgpu_ras_suspend(adev); 3540 3541 r = amdgpu_device_ip_suspend_phase1(adev); 3542 3543 /* evict vram memory */ 3544 amdgpu_bo_evict_vram(adev); 3545 3546 amdgpu_fence_driver_suspend(adev); 3547 3548 r = amdgpu_device_ip_suspend_phase2(adev); 3549 3550 /* evict remaining vram memory 3551 * This second call to evict vram is to evict the gart page table 3552 * using the CPU. 3553 */ 3554 amdgpu_bo_evict_vram(adev); 3555 3556 return 0; 3557 } 3558 3559 /** 3560 * amdgpu_device_resume - initiate device resume 3561 * 3562 * @dev: drm dev pointer 3563 * @resume: resume state 3564 * @fbcon : notify the fbdev of resume 3565 * 3566 * Bring the hw back to operating state (all asics). 3567 * Returns 0 for success or an error on failure. 3568 * Called at driver resume. 
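 * Re-posts the card if necessary, resumes all hardware IPs and re-queues delayed_init_work so the IB ring tests run again after resume.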
3569 */ 3570 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3571 { 3572 struct drm_connector *connector; 3573 struct drm_connector_list_iter iter; 3574 struct amdgpu_device *adev = dev->dev_private; 3575 struct drm_crtc *crtc; 3576 int r = 0; 3577 3578 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3579 return 0; 3580 3581 /* post card */ 3582 if (amdgpu_device_need_post(adev)) { 3583 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3584 if (r) 3585 DRM_ERROR("amdgpu asic init failed\n"); 3586 } 3587 3588 r = amdgpu_device_ip_resume(adev); 3589 if (r) { 3590 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3591 return r; 3592 } 3593 amdgpu_fence_driver_resume(adev); 3594 3595 3596 r = amdgpu_device_ip_late_init(adev); 3597 if (r) 3598 return r; 3599 3600 queue_delayed_work(system_wq, &adev->delayed_init_work, 3601 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3602 3603 if (!amdgpu_device_has_dc_support(adev)) { 3604 /* pin cursors */ 3605 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3606 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3607 3608 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3609 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3610 r = amdgpu_bo_reserve(aobj, true); 3611 if (r == 0) { 3612 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3613 if (r != 0) 3614 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3615 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3616 amdgpu_bo_unreserve(aobj); 3617 } 3618 } 3619 } 3620 } 3621 r = amdgpu_amdkfd_resume(adev); 3622 if (r) 3623 return r; 3624 3625 /* Make sure IB tests flushed */ 3626 flush_delayed_work(&adev->delayed_init_work); 3627 3628 /* blat the mode back in */ 3629 if (fbcon) { 3630 if (!amdgpu_device_has_dc_support(adev)) { 3631 /* pre DCE11 */ 3632 drm_helper_resume_force_mode(dev); 3633 3634 /* turn on display hw */ 3635 drm_modeset_lock_all(dev); 3636 3637 drm_connector_list_iter_begin(dev, &iter); 3638 drm_for_each_connector_iter(connector, &iter) 3639 drm_helper_connector_dpms(connector, 3640 DRM_MODE_DPMS_ON); 3641 drm_connector_list_iter_end(&iter); 3642 3643 drm_modeset_unlock_all(dev); 3644 } 3645 amdgpu_fbdev_set_suspend(adev, 0); 3646 } 3647 3648 drm_kms_helper_poll_enable(dev); 3649 3650 amdgpu_ras_resume(adev); 3651 3652 /* 3653 * Most of the connector probing functions try to acquire runtime pm 3654 * refs to ensure that the GPU is powered on when connector polling is 3655 * performed. Since we're calling this from a runtime PM callback, 3656 * trying to acquire rpm refs will cause us to deadlock. 3657 * 3658 * Since we're guaranteed to be holding the rpm lock, it's safe to 3659 * temporarily disable the rpm helpers so this doesn't deadlock us. 3660 */ 3661 #ifdef CONFIG_PM 3662 dev->dev->power.disable_depth++; 3663 #endif 3664 if (!amdgpu_device_has_dc_support(adev)) 3665 drm_helper_hpd_irq_event(dev); 3666 else 3667 drm_kms_helper_hotplug_event(dev); 3668 #ifdef CONFIG_PM 3669 dev->dev->power.disable_depth--; 3670 #endif 3671 adev->in_suspend = false; 3672 3673 return 0; 3674 } 3675 3676 /** 3677 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3678 * 3679 * @adev: amdgpu_device pointer 3680 * 3681 * The list of all the hardware IPs that make up the asic is walked and 3682 * the check_soft_reset callbacks are run. check_soft_reset determines 3683 * if the asic is still hung or not. 3684 * Returns true if any of the IPs are still in a hung state, false if not. 
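 * Note that SR-IOV VFs and ASICs that require a full reset are always reported as hung.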
3685 */ 3686 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3687 { 3688 int i; 3689 bool asic_hang = false; 3690 3691 if (amdgpu_sriov_vf(adev)) 3692 return true; 3693 3694 if (amdgpu_asic_need_full_reset(adev)) 3695 return true; 3696 3697 for (i = 0; i < adev->num_ip_blocks; i++) { 3698 if (!adev->ip_blocks[i].status.valid) 3699 continue; 3700 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3701 adev->ip_blocks[i].status.hang = 3702 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3703 if (adev->ip_blocks[i].status.hang) { 3704 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3705 asic_hang = true; 3706 } 3707 } 3708 return asic_hang; 3709 } 3710 3711 /** 3712 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3713 * 3714 * @adev: amdgpu_device pointer 3715 * 3716 * The list of all the hardware IPs that make up the asic is walked and the 3717 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3718 * handles any IP specific hardware or software state changes that are 3719 * necessary for a soft reset to succeed. 3720 * Returns 0 on success, negative error code on failure. 3721 */ 3722 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3723 { 3724 int i, r = 0; 3725 3726 for (i = 0; i < adev->num_ip_blocks; i++) { 3727 if (!adev->ip_blocks[i].status.valid) 3728 continue; 3729 if (adev->ip_blocks[i].status.hang && 3730 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3731 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3732 if (r) 3733 return r; 3734 } 3735 } 3736 3737 return 0; 3738 } 3739 3740 /** 3741 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3742 * 3743 * @adev: amdgpu_device pointer 3744 * 3745 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3746 * reset is necessary to recover. 3747 * Returns true if a full asic reset is required, false if not. 3748 */ 3749 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3750 { 3751 int i; 3752 3753 if (amdgpu_asic_need_full_reset(adev)) 3754 return true; 3755 3756 for (i = 0; i < adev->num_ip_blocks; i++) { 3757 if (!adev->ip_blocks[i].status.valid) 3758 continue; 3759 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3760 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3761 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3762 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3763 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3764 if (adev->ip_blocks[i].status.hang) { 3765 DRM_INFO("Some block need full reset!\n"); 3766 return true; 3767 } 3768 } 3769 } 3770 return false; 3771 } 3772 3773 /** 3774 * amdgpu_device_ip_soft_reset - do a soft reset 3775 * 3776 * @adev: amdgpu_device pointer 3777 * 3778 * The list of all the hardware IPs that make up the asic is walked and the 3779 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3780 * IP specific hardware or software state changes that are necessary to soft 3781 * reset the IP. 3782 * Returns 0 on success, negative error code on failure. 
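 * Only blocks previously flagged as hung by amdgpu_device_ip_check_soft_reset() are reset.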
3783 */ 3784 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3785 { 3786 int i, r = 0; 3787 3788 for (i = 0; i < adev->num_ip_blocks; i++) { 3789 if (!adev->ip_blocks[i].status.valid) 3790 continue; 3791 if (adev->ip_blocks[i].status.hang && 3792 adev->ip_blocks[i].version->funcs->soft_reset) { 3793 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3794 if (r) 3795 return r; 3796 } 3797 } 3798 3799 return 0; 3800 } 3801 3802 /** 3803 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3804 * 3805 * @adev: amdgpu_device pointer 3806 * 3807 * The list of all the hardware IPs that make up the asic is walked and the 3808 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3809 * handles any IP specific hardware or software state changes that are 3810 * necessary after the IP has been soft reset. 3811 * Returns 0 on success, negative error code on failure. 3812 */ 3813 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3814 { 3815 int i, r = 0; 3816 3817 for (i = 0; i < adev->num_ip_blocks; i++) { 3818 if (!adev->ip_blocks[i].status.valid) 3819 continue; 3820 if (adev->ip_blocks[i].status.hang && 3821 adev->ip_blocks[i].version->funcs->post_soft_reset) 3822 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3823 if (r) 3824 return r; 3825 } 3826 3827 return 0; 3828 } 3829 3830 /** 3831 * amdgpu_device_recover_vram - Recover some VRAM contents 3832 * 3833 * @adev: amdgpu_device pointer 3834 * 3835 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3836 * restore things like GPUVM page tables after a GPU reset where 3837 * the contents of VRAM might be lost. 3838 * 3839 * Returns: 3840 * 0 on success, negative error code on failure. 3841 */ 3842 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3843 { 3844 struct dma_fence *fence = NULL, *next = NULL; 3845 struct amdgpu_bo *shadow; 3846 long r = 1, tmo; 3847 3848 if (amdgpu_sriov_runtime(adev)) 3849 tmo = msecs_to_jiffies(8000); 3850 else 3851 tmo = msecs_to_jiffies(100); 3852 3853 DRM_INFO("recover vram bo from shadow start\n"); 3854 mutex_lock(&adev->shadow_list_lock); 3855 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3856 3857 /* No need to recover an evicted BO */ 3858 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3859 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3860 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3861 continue; 3862 3863 r = amdgpu_bo_restore_shadow(shadow, &next); 3864 if (r) 3865 break; 3866 3867 if (fence) { 3868 tmo = dma_fence_wait_timeout(fence, false, tmo); 3869 dma_fence_put(fence); 3870 fence = next; 3871 if (tmo == 0) { 3872 r = -ETIMEDOUT; 3873 break; 3874 } else if (tmo < 0) { 3875 r = tmo; 3876 break; 3877 } 3878 } else { 3879 fence = next; 3880 } 3881 } 3882 mutex_unlock(&adev->shadow_list_lock); 3883 3884 if (fence) 3885 tmo = dma_fence_wait_timeout(fence, false, tmo); 3886 dma_fence_put(fence); 3887 3888 if (r < 0 || tmo <= 0) { 3889 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3890 return -EIO; 3891 } 3892 3893 DRM_INFO("recover vram bo from shadow done\n"); 3894 return 0; 3895 } 3896 3897 3898 /** 3899 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3900 * 3901 * @adev: amdgpu device pointer 3902 * @from_hypervisor: request from hypervisor 3903 * 3904 * do VF FLR and reinitialize Asic 3905 * return 0 means succeeded otherwise failed 3906 */ 3907 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 
3908 bool from_hypervisor) 3909 { 3910 int r; 3911 3912 if (from_hypervisor) 3913 r = amdgpu_virt_request_full_gpu(adev, true); 3914 else 3915 r = amdgpu_virt_reset_gpu(adev); 3916 if (r) 3917 return r; 3918 3919 /* Resume IP prior to SMC */ 3920 r = amdgpu_device_ip_reinit_early_sriov(adev); 3921 if (r) 3922 goto error; 3923 3924 amdgpu_virt_init_data_exchange(adev); 3925 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3926 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); 3927 3928 r = amdgpu_device_fw_loading(adev); 3929 if (r) 3930 return r; 3931 3932 /* now we are okay to resume SMC/CP/SDMA */ 3933 r = amdgpu_device_ip_reinit_late_sriov(adev); 3934 if (r) 3935 goto error; 3936 3937 amdgpu_irq_gpu_reset_resume_helper(adev); 3938 r = amdgpu_ib_ring_tests(adev); 3939 amdgpu_amdkfd_post_reset(adev); 3940 3941 error: 3942 amdgpu_virt_release_full_gpu(adev, true); 3943 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3944 amdgpu_inc_vram_lost(adev); 3945 r = amdgpu_device_recover_vram(adev); 3946 } 3947 3948 return r; 3949 } 3950 3951 /** 3952 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 3953 * 3954 * @adev: amdgpu device pointer 3955 * 3956 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 3957 * a hung GPU. 3958 */ 3959 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 3960 { 3961 if (!amdgpu_device_ip_check_soft_reset(adev)) { 3962 DRM_INFO("Timeout, but no hardware hang detected.\n"); 3963 return false; 3964 } 3965 3966 if (amdgpu_gpu_recovery == 0) 3967 goto disabled; 3968 3969 if (amdgpu_sriov_vf(adev)) 3970 return true; 3971 3972 if (amdgpu_gpu_recovery == -1) { 3973 switch (adev->asic_type) { 3974 case CHIP_BONAIRE: 3975 case CHIP_HAWAII: 3976 case CHIP_TOPAZ: 3977 case CHIP_TONGA: 3978 case CHIP_FIJI: 3979 case CHIP_POLARIS10: 3980 case CHIP_POLARIS11: 3981 case CHIP_POLARIS12: 3982 case CHIP_VEGAM: 3983 case CHIP_VEGA20: 3984 case CHIP_VEGA10: 3985 case CHIP_VEGA12: 3986 case CHIP_RAVEN: 3987 case CHIP_ARCTURUS: 3988 case CHIP_RENOIR: 3989 case CHIP_NAVI10: 3990 case CHIP_NAVI14: 3991 case CHIP_NAVI12: 3992 break; 3993 default: 3994 goto disabled; 3995 } 3996 } 3997 3998 return true; 3999 4000 disabled: 4001 DRM_INFO("GPU recovery disabled.\n"); 4002 return false; 4003 } 4004 4005 4006 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4007 struct amdgpu_job *job, 4008 bool *need_full_reset_arg) 4009 { 4010 int i, r = 0; 4011 bool need_full_reset = *need_full_reset_arg; 4012 4013 /* block all schedulers and reset given job's ring */ 4014 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4015 struct amdgpu_ring *ring = adev->rings[i]; 4016 4017 if (!ring || !ring->sched.thread) 4018 continue; 4019 4020 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4021 amdgpu_fence_driver_force_completion(ring); 4022 } 4023 4024 if(job) 4025 drm_sched_increase_karma(&job->base); 4026 4027 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4028 if (!amdgpu_sriov_vf(adev)) { 4029 4030 if (!need_full_reset) 4031 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4032 4033 if (!need_full_reset) { 4034 amdgpu_device_ip_pre_soft_reset(adev); 4035 r = amdgpu_device_ip_soft_reset(adev); 4036 amdgpu_device_ip_post_soft_reset(adev); 4037 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4038 DRM_INFO("soft reset failed, will fallback to full reset!\n"); 4039 need_full_reset = true; 4040 } 4041 } 4042 4043 if 
(need_full_reset) 4044 r = amdgpu_device_ip_suspend(adev); 4045 4046 *need_full_reset_arg = need_full_reset; 4047 } 4048 4049 return r; 4050 } 4051 4052 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4053 struct list_head *device_list_handle, 4054 bool *need_full_reset_arg) 4055 { 4056 struct amdgpu_device *tmp_adev = NULL; 4057 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4058 int r = 0; 4059 4060 /* 4061 * ASIC reset has to be done on all HGMI hive nodes ASAP 4062 * to allow proper links negotiation in FW (within 1 sec) 4063 */ 4064 if (need_full_reset) { 4065 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4066 /* For XGMI run all resets in parallel to speed up the process */ 4067 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4068 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4069 r = -EALREADY; 4070 } else 4071 r = amdgpu_asic_reset(tmp_adev); 4072 4073 if (r) { 4074 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 4075 r, tmp_adev->ddev->unique); 4076 break; 4077 } 4078 } 4079 4080 /* For XGMI wait for all resets to complete before proceed */ 4081 if (!r) { 4082 list_for_each_entry(tmp_adev, device_list_handle, 4083 gmc.xgmi.head) { 4084 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4085 flush_work(&tmp_adev->xgmi_reset_work); 4086 r = tmp_adev->asic_reset_res; 4087 if (r) 4088 break; 4089 } 4090 } 4091 } 4092 } 4093 4094 if (!r && amdgpu_ras_intr_triggered()) 4095 amdgpu_ras_intr_cleared(); 4096 4097 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4098 if (need_full_reset) { 4099 /* post card */ 4100 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) 4101 DRM_WARN("asic atom init failed!"); 4102 4103 if (!r) { 4104 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4105 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4106 if (r) 4107 goto out; 4108 4109 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4110 if (vram_lost) { 4111 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4112 amdgpu_inc_vram_lost(tmp_adev); 4113 } 4114 4115 r = amdgpu_gtt_mgr_recover( 4116 &tmp_adev->mman.bdev.man[TTM_PL_TT]); 4117 if (r) 4118 goto out; 4119 4120 r = amdgpu_device_fw_loading(tmp_adev); 4121 if (r) 4122 return r; 4123 4124 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4125 if (r) 4126 goto out; 4127 4128 if (vram_lost) 4129 amdgpu_device_fill_reset_magic(tmp_adev); 4130 4131 /* 4132 * Add this ASIC as tracked as reset was already 4133 * complete successfully. 4134 */ 4135 amdgpu_register_gpu_instance(tmp_adev); 4136 4137 r = amdgpu_device_ip_late_init(tmp_adev); 4138 if (r) 4139 goto out; 4140 4141 /* must succeed. 
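 * amdgpu_ras_resume() below restores the RAS state that was parked by
 * amdgpu_ras_suspend() in amdgpu_device_gpu_recover() before the reset,
 * so RAS error handling is active again once the ASIC is back up.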
*/ 4142 amdgpu_ras_resume(tmp_adev); 4143 4144 /* Update PSP FW topology after reset */ 4145 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4146 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4147 } 4148 } 4149 4150 4151 out: 4152 if (!r) { 4153 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4154 r = amdgpu_ib_ring_tests(tmp_adev); 4155 if (r) { 4156 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4157 r = amdgpu_device_ip_suspend(tmp_adev); 4158 need_full_reset = true; 4159 r = -EAGAIN; 4160 goto end; 4161 } 4162 } 4163 4164 if (!r) 4165 r = amdgpu_device_recover_vram(tmp_adev); 4166 else 4167 tmp_adev->asic_reset_res = r; 4168 } 4169 4170 end: 4171 *need_full_reset_arg = need_full_reset; 4172 return r; 4173 } 4174 4175 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) 4176 { 4177 if (trylock) { 4178 if (!mutex_trylock(&adev->lock_reset)) 4179 return false; 4180 } else 4181 mutex_lock(&adev->lock_reset); 4182 4183 atomic_inc(&adev->gpu_reset_counter); 4184 adev->in_gpu_reset = true; 4185 switch (amdgpu_asic_reset_method(adev)) { 4186 case AMD_RESET_METHOD_MODE1: 4187 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4188 break; 4189 case AMD_RESET_METHOD_MODE2: 4190 adev->mp1_state = PP_MP1_STATE_RESET; 4191 break; 4192 default: 4193 adev->mp1_state = PP_MP1_STATE_NONE; 4194 break; 4195 } 4196 4197 return true; 4198 } 4199 4200 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4201 { 4202 amdgpu_vf_error_trans_all(adev); 4203 adev->mp1_state = PP_MP1_STATE_NONE; 4204 adev->in_gpu_reset = false; 4205 mutex_unlock(&adev->lock_reset); 4206 } 4207 4208 /** 4209 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4210 * 4211 * @adev: amdgpu device pointer 4212 * @job: which job trigger hang 4213 * 4214 * Attempt to reset the GPU if it has hung (all asics). 4215 * Attempt to do soft-reset or full-reset and reinitialize Asic 4216 * Returns 0 for success or an error on failure. 4217 */ 4218 4219 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4220 struct amdgpu_job *job) 4221 { 4222 struct list_head device_list, *device_list_handle = NULL; 4223 bool need_full_reset, job_signaled; 4224 struct amdgpu_hive_info *hive = NULL; 4225 struct amdgpu_device *tmp_adev = NULL; 4226 int i, r = 0; 4227 bool in_ras_intr = amdgpu_ras_intr_triggered(); 4228 bool use_baco = 4229 (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? 4230 true : false; 4231 4232 /* 4233 * Flush RAM to disk so that after reboot 4234 * the user can read log and see why the system rebooted. 4235 */ 4236 if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { 4237 4238 DRM_WARN("Emergency reboot."); 4239 4240 ksys_sync_helper(); 4241 emergency_restart(); 4242 } 4243 4244 need_full_reset = job_signaled = false; 4245 INIT_LIST_HEAD(&device_list); 4246 4247 dev_info(adev->dev, "GPU %s begin!\n", 4248 (in_ras_intr && !use_baco) ? "jobs stop":"reset"); 4249 4250 cancel_delayed_work_sync(&adev->delayed_init_work); 4251 4252 hive = amdgpu_get_xgmi_hive(adev, false); 4253 4254 /* 4255 * Here we trylock to avoid chain of resets executing from 4256 * either trigger by jobs on different adevs in XGMI hive or jobs on 4257 * different schedulers for same device while this TO handler is running. 4258 * We always reset all schedulers for device and all devices for XGMI 4259 * hive so that should take care of them too. 
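 * Concretely, the hive-wide hive->reset_lock is trylocked just below,
 * and each device's own lock is then taken via amdgpu_device_lock_adev();
 * the per-device lock is only trylocked when the device is not part of
 * a hive (the "!hive" argument), since the hive lock already serializes
 * recovery across the hive members.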
4260 */ 4261 4262 if (hive && !mutex_trylock(&hive->reset_lock)) { 4263 DRM_INFO("Bailing on TDR for s_job:%"PRIx64", hive: %"PRIx64" as another already in progress", 4264 job ? job->base.id : -1, hive->hive_id); 4265 return 0; 4266 } 4267 4268 /* Start with adev pre asic reset first for soft reset check.*/ 4269 if (!amdgpu_device_lock_adev(adev, !hive)) { 4270 DRM_INFO("Bailing on TDR for s_job:%"PRIx64", as another already in progress", 4271 job ? job->base.id : -1); 4272 return 0; 4273 } 4274 4275 /* Block kfd: SRIOV would do it separately */ 4276 if (!amdgpu_sriov_vf(adev)) 4277 amdgpu_amdkfd_pre_reset(adev); 4278 4279 /* Build list of devices to reset */ 4280 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4281 if (!hive) { 4282 /*unlock kfd: SRIOV would do it separately */ 4283 if (!amdgpu_sriov_vf(adev)) 4284 amdgpu_amdkfd_post_reset(adev); 4285 amdgpu_device_unlock_adev(adev); 4286 return -ENODEV; 4287 } 4288 4289 /* 4290 * In case we are in XGMI hive mode device reset is done for all the 4291 * nodes in the hive to retrain all XGMI links and hence the reset 4292 * sequence is executed in loop on all nodes. 4293 */ 4294 device_list_handle = &hive->device_list; 4295 } else { 4296 list_add_tail(&adev->gmc.xgmi.head, &device_list); 4297 device_list_handle = &device_list; 4298 } 4299 4300 /* block all schedulers and reset given job's ring */ 4301 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4302 if (tmp_adev != adev) { 4303 amdgpu_device_lock_adev(tmp_adev, false); 4304 if (!amdgpu_sriov_vf(tmp_adev)) 4305 amdgpu_amdkfd_pre_reset(tmp_adev); 4306 } 4307 4308 /* 4309 * Mark these ASICs to be reseted as untracked first 4310 * And add them back after reset completed 4311 */ 4312 amdgpu_unregister_gpu_instance(tmp_adev); 4313 4314 /* disable ras on ALL IPs */ 4315 if (!(in_ras_intr && !use_baco) && 4316 amdgpu_device_ip_need_full_reset(tmp_adev)) 4317 amdgpu_ras_suspend(tmp_adev); 4318 4319 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4320 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4321 4322 if (!ring || !ring->sched.thread) 4323 continue; 4324 4325 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4326 4327 if (in_ras_intr && !use_baco) 4328 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4329 } 4330 } 4331 4332 4333 if (in_ras_intr && !use_baco) 4334 goto skip_sched_resume; 4335 4336 /* 4337 * Must check guilty signal here since after this point all old 4338 * HW fences are force signaled. 4339 * 4340 * job->base holds a reference to parent fence 4341 */ 4342 if (job && job->base.s_fence->parent && 4343 dma_fence_is_signaled(job->base.s_fence->parent)) 4344 job_signaled = true; 4345 4346 if (job_signaled) { 4347 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4348 goto skip_hw_reset; 4349 } 4350 4351 4352 /* Guilty job will be freed after this*/ 4353 r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset); 4354 if (r) { 4355 /*TODO Should we stop ?*/ 4356 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 4357 r, adev->ddev->unique); 4358 adev->asic_reset_res = r; 4359 } 4360 4361 retry: /* Rest of adevs pre asic reset from XGMI hive. 
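 * If amdgpu_do_asic_reset() below returns -EAGAIN (the post-reset IB
 * ring tests failed), control jumps back to this label with
 * need_full_reset forced on, and the pre-reset pass over the remaining
 * hive members plus the ASIC reset itself are attempted again.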
*/ 4362 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4363 4364 if (tmp_adev == adev) 4365 continue; 4366 4367 r = amdgpu_device_pre_asic_reset(tmp_adev, 4368 NULL, 4369 &need_full_reset); 4370 /*TODO Should we stop ?*/ 4371 if (r) { 4372 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 4373 r, tmp_adev->ddev->unique); 4374 tmp_adev->asic_reset_res = r; 4375 } 4376 } 4377 4378 /* Actual ASIC resets if needed.*/ 4379 /* TODO Implement XGMI hive reset logic for SRIOV */ 4380 if (amdgpu_sriov_vf(adev)) { 4381 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4382 if (r) 4383 adev->asic_reset_res = r; 4384 } else { 4385 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); 4386 if (r && r == -EAGAIN) 4387 goto retry; 4388 } 4389 4390 skip_hw_reset: 4391 4392 /* Post ASIC reset for all devs .*/ 4393 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4394 4395 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4396 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4397 4398 if (!ring || !ring->sched.thread) 4399 continue; 4400 4401 /* No point to resubmit jobs if we didn't HW reset*/ 4402 if (!tmp_adev->asic_reset_res && !job_signaled) 4403 drm_sched_resubmit_jobs(&ring->sched); 4404 4405 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4406 } 4407 4408 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4409 drm_helper_resume_force_mode(tmp_adev->ddev); 4410 } 4411 4412 tmp_adev->asic_reset_res = 0; 4413 4414 if (r) { 4415 /* bad news, how to tell it to userspace ? */ 4416 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4417 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4418 } else { 4419 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4420 } 4421 } 4422 4423 skip_sched_resume: 4424 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4425 /*unlock kfd: SRIOV would do it separately */ 4426 if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) 4427 amdgpu_amdkfd_post_reset(tmp_adev); 4428 amdgpu_device_unlock_adev(tmp_adev); 4429 } 4430 4431 if (hive) 4432 mutex_unlock(&hive->reset_lock); 4433 4434 if (r) 4435 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4436 return r; 4437 } 4438 4439 /** 4440 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4441 * 4442 * @adev: amdgpu_device pointer 4443 * 4444 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4445 * and lanes) of the slot the device is in. Handles APUs and 4446 * virtualized environments where PCIE config space may not be available. 
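 *
 * The result is recorded as bitmasks of CAIL_* capability flags in
 * adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask.  As an illustrative
 * sketch only, a consumer that merely wants to know whether the link can
 * run at gen3 speed could test:
 *
 *	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *		// platform side of the link supports 8.0 GT/s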
4447 */ 4448 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4449 { 4450 struct pci_dev *pdev; 4451 enum pci_bus_speed speed_cap, platform_speed_cap; 4452 enum pcie_link_width platform_link_width; 4453 4454 if (amdgpu_pcie_gen_cap) 4455 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4456 4457 if (amdgpu_pcie_lane_cap) 4458 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4459 4460 /* covers APUs as well */ 4461 if (pci_is_root_bus(adev->pdev->bus)) { 4462 if (adev->pm.pcie_gen_mask == 0) 4463 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4464 if (adev->pm.pcie_mlw_mask == 0) 4465 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4466 return; 4467 } 4468 4469 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4470 return; 4471 4472 pcie_bandwidth_available(adev->pdev, NULL, 4473 &platform_speed_cap, &platform_link_width); 4474 4475 if (adev->pm.pcie_gen_mask == 0) { 4476 /* asic caps */ 4477 pdev = adev->pdev; 4478 speed_cap = pcie_get_speed_cap(pdev); 4479 if (speed_cap == PCI_SPEED_UNKNOWN) { 4480 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4481 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4482 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4483 } else { 4484 if (speed_cap == PCIE_SPEED_16_0GT) 4485 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4486 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4487 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4488 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 4489 else if (speed_cap == PCIE_SPEED_8_0GT) 4490 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4491 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4492 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4493 else if (speed_cap == PCIE_SPEED_5_0GT) 4494 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4495 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 4496 else 4497 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 4498 } 4499 /* platform caps */ 4500 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 4501 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4502 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4503 } else { 4504 if (platform_speed_cap == PCIE_SPEED_16_0GT) 4505 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4506 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4507 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 4508 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 4509 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 4510 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4511 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4512 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 4513 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 4514 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4515 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 4516 else 4517 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 4518 4519 } 4520 } 4521 if (adev->pm.pcie_mlw_mask == 0) { 4522 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 4523 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 4524 } else { 4525 switch (platform_link_width) { 4526 case PCIE_LNK_X32: 4527 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 4528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4531 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4532 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4534 break; 4535 case PCIE_LNK_X16: 4536 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 4537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 
4539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4542 break; 4543 case PCIE_LNK_X12: 4544 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 4545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4549 break; 4550 case PCIE_LNK_X8: 4551 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 4552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4555 break; 4556 case PCIE_LNK_X4: 4557 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 4558 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4559 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4560 break; 4561 case PCIE_LNK_X2: 4562 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 4563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 4564 break; 4565 case PCIE_LNK_X1: 4566 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 4567 break; 4568 default: 4569 break; 4570 } 4571 } 4572 } 4573 } 4574 4575 int amdgpu_device_baco_enter(struct drm_device *dev) 4576 { 4577 struct amdgpu_device *adev = dev->dev_private; 4578 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4579 4580 if (!amdgpu_device_supports_baco(adev->ddev)) 4581 return -ENOTSUPP; 4582 4583 if (ras && ras->supported) 4584 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 4585 4586 return amdgpu_dpm_baco_enter(adev); 4587 } 4588 4589 int amdgpu_device_baco_exit(struct drm_device *dev) 4590 { 4591 struct amdgpu_device *adev = dev->dev_private; 4592 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4593 int ret = 0; 4594 4595 if (!amdgpu_device_supports_baco(adev->ddev)) 4596 return -ENOTSUPP; 4597 4598 ret = amdgpu_dpm_baco_exit(adev); 4599 if (ret) 4600 return ret; 4601 4602 if (ras && ras->supported) 4603 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 4604 4605 return 0; 4606 } 4607
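/*
 * Usage note (illustrative sketch, not part of the driver): a caller
 * that wants to park the ASIC in BACO across an idle period, e.g. a
 * runtime power-management style suspend/resume pair, might do something
 * like the following, relying on the -ENOTSUPP return to cover ASICs
 * without BACO support:
 *
 *	int r = amdgpu_device_baco_enter(ddev);
 *	if (r)
 *		return r;	// BACO unsupported or SMU refused entry
 *	// ... device sits in BACO while idle ...
 *	r = amdgpu_device_baco_exit(ddev);
 */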