1 /* $NetBSD: amdgpu_device.c,v 1.19 2023/05/25 12:07:43 riastradh Exp $ */ 2 3 /* 4 * Copyright 2008 Advanced Micro Devices, Inc. 5 * Copyright 2008 Red Hat Inc. 6 * Copyright 2009 Jerome Glisse. 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in 16 * all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 22 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 23 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 * OTHER DEALINGS IN THE SOFTWARE. 25 * 26 * Authors: Dave Airlie 27 * Alex Deucher 28 * Jerome Glisse 29 */ 30 #include <sys/cdefs.h> 31 __KERNEL_RCSID(0, "$NetBSD: amdgpu_device.c,v 1.19 2023/05/25 12:07:43 riastradh Exp $"); 32 33 #include <linux/power_supply.h> 34 #include <linux/kthread.h> 35 #include <linux/module.h> 36 #include <linux/console.h> 37 #include <linux/slab.h> 38 #include <linux/reboot.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_probe_helper.h> 42 #include <drm/amdgpu_drm.h> 43 #include <linux/vgaarb.h> 44 #include <linux/vga_switcheroo.h> 45 #include <linux/efi.h> 46 #include "amdgpu.h" 47 #include "amdgpu_trace.h" 48 #include "amdgpu_i2c.h" 49 #include "atom.h" 50 #include "amdgpu_atombios.h" 51 #include "amdgpu_atomfirmware.h" 52 #include "amd_pcie.h" 53 #ifdef CONFIG_DRM_AMDGPU_SI 54 #include "si.h" 55 #endif 56 #ifdef CONFIG_DRM_AMDGPU_CIK 57 #include "cik.h" 58 #endif 59 #include "vi.h" 60 #include "soc15.h" 61 #include "nv.h" 62 #include "bif/bif_4_1_d.h" 63 #include <linux/pci.h> 64 #include <linux/firmware.h> 65 #include "amdgpu_vf_error.h" 66 67 #include "amdgpu_amdkfd.h" 68 #include "amdgpu_pm.h" 69 70 #include "amdgpu_xgmi.h" 71 #include "amdgpu_ras.h" 72 #include "amdgpu_pmu.h" 73 74 #include <linux/suspend.h> 75 #include <drm/task_barrier.h> 76 #include <linux/nbsd-namespace.h> 77 78 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 88 89 #define AMDGPU_RESUME_MS 2000 90 91 const char *amdgpu_asic_name[] = { 92 "TAHITI", 93 "PITCAIRN", 94 "VERDE", 95 "OLAND", 96 "HAINAN", 97 "BONAIRE", 98 "KAVERI", 99 "KABINI", 100 "HAWAII", 101 "MULLINS", 102 "TOPAZ", 103 "TONGA", 104 "FIJI", 105 "CARRIZO", 106 "STONEY", 107 "POLARIS10", 108 "POLARIS11", 109 "POLARIS12", 110 "VEGAM", 111 "VEGA10", 112 "VEGA12", 113 "VEGA20", 114 "RAVEN", 115 "ARCTURUS", 116 
"RENOIR", 117 "NAVI10", 118 "NAVI14", 119 "NAVI12", 120 "LAST", 121 }; 122 123 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 124 125 /** 126 * DOC: pcie_replay_count 127 * 128 * The amdgpu driver provides a sysfs API for reporting the total number 129 * of PCIe replays (NAKs) 130 * The file pcie_replay_count is used for this and returns the total 131 * number of replays as a sum of the NAKs generated and NAKs received 132 */ 133 134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 135 struct device_attribute *attr, char *buf) 136 { 137 struct drm_device *ddev = dev_get_drvdata(dev); 138 struct amdgpu_device *adev = ddev->dev_private; 139 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 140 141 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 142 } 143 144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 145 amdgpu_device_get_pcie_replay_count, NULL); 146 147 #endif /* __NetBSD__ */ 148 149 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 150 151 /** 152 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control 153 * 154 * @dev: drm_device pointer 155 * 156 * Returns true if the device is a dGPU with HG/PX power control, 157 * otherwise return false. 158 */ 159 bool amdgpu_device_supports_boco(struct drm_device *dev) 160 { 161 struct amdgpu_device *adev = dev->dev_private; 162 163 if (adev->flags & AMD_IS_PX) 164 return true; 165 return false; 166 } 167 168 /** 169 * amdgpu_device_supports_baco - Does the device support BACO 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device supporte BACO, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_baco(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = dev->dev_private; 179 180 return amdgpu_asic_supports_baco(adev); 181 } 182 183 /** 184 * VRAM access helper functions. 185 * 186 * amdgpu_device_vram_access - read/write a buffer in vram 187 * 188 * @adev: amdgpu_device pointer 189 * @pos: offset of the buffer in vram 190 * @buf: virtual address of the buffer in system memory 191 * @size: read/write size, sizeof(@buf) must > @size 192 * @write: true - write to vram, otherwise - read from vram 193 */ 194 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 195 uint32_t *buf, size_t size, bool write) 196 { 197 uint64_t last; 198 unsigned long flags; 199 200 last = size - 4; 201 for (last += pos; pos <= last; pos += 4) { 202 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 203 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 204 WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31); 205 if (write) 206 WREG32_NO_KIQ(mmMM_DATA, *buf++); 207 else 208 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 209 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 210 } 211 } 212 213 /* 214 * MMIO register access helper functions. 215 */ 216 /** 217 * amdgpu_mm_rreg - read a memory mapped IO register 218 * 219 * @adev: amdgpu_device pointer 220 * @reg: dword aligned register offset 221 * @acc_flags: access flags which require special behavior 222 * 223 * Returns the 32 bit value from the offset specified. 
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
		return amdgpu_kiq_rreg(adev, reg);

	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
#ifdef __NetBSD__
		return bus_space_read_4(adev->rmmiot, adev->rmmioh, 4*reg);
#else
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
#endif
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
#ifdef __NetBSD__
		bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
		    4*reg);
		ret = bus_space_read_4(adev->rmmiot, adev->rmmioh,
		    4*mmMM_DATA);
#else
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
#endif
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with byte offset helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
	if (offset < adev->rmmio_size)
#ifdef __NetBSD__
		return bus_space_read_1(adev->rmmiot, adev->rmmioh, offset);
#else
		return (readb(adev->rmmio + offset));
#endif
	BUG();
}

/*
 * MMIO register write with byte offset helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
	if (offset < adev->rmmio_size)
#ifdef __NetBSD__
		bus_space_write_1(adev->rmmiot, adev->rmmioh, offset, value);
#else
		writeb(value, adev->rmmio + offset);
#endif
	else
		BUG();
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
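 * Registers that fall outside the directly mapped MMIO window, or accesses
 * flagged with AMDGPU_REGS_IDX, go through the mmMM_INDEX/mmMM_DATA
 * indirection below, serialized by mmio_idx_lock.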
317 */ 318 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 319 uint32_t acc_flags) 320 { 321 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 322 323 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { 324 adev->last_mm_index = v; 325 } 326 327 if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) 328 return amdgpu_kiq_wreg(adev, reg, v); 329 330 if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) 331 #ifdef __NetBSD__ 332 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*reg, v); 333 #else 334 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 335 #endif 336 else { 337 unsigned long flags; 338 339 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 340 #ifdef __NetBSD__ 341 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX, 342 reg*4); 343 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_DATA, v); 344 #else 345 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 346 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 347 #endif 348 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 349 } 350 351 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { 352 udelay(500); 353 } 354 } 355 356 /** 357 * amdgpu_io_rreg - read an IO register 358 * 359 * @adev: amdgpu_device pointer 360 * @reg: dword aligned register offset 361 * 362 * Returns the 32 bit value from the offset specified. 363 */ 364 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 365 { 366 if ((reg * 4) < adev->rio_mem_size) 367 #ifdef __NetBSD__ 368 return bus_space_read_4(adev->rio_memt, adev->rio_memh, 4*reg); 369 #else 370 return ioread32(adev->rio_mem + (reg * 4)); 371 #endif 372 else { 373 #ifdef __NetBSD__ 374 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX, 375 4*reg); 376 return bus_space_read_4(adev->rio_memt, adev->rio_memh, 377 4*mmMM_DATA); 378 #else 379 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 380 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 381 #endif 382 } 383 } 384 385 /** 386 * amdgpu_io_wreg - write to an IO register 387 * 388 * @adev: amdgpu_device pointer 389 * @reg: dword aligned register offset 390 * @v: 32 bit value to write to the register 391 * 392 * Writes the value specified to the offset specified. 393 */ 394 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 395 { 396 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { 397 adev->last_mm_index = v; 398 } 399 400 if ((reg * 4) < adev->rio_mem_size) 401 #ifdef __NetBSD__ 402 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*reg, v); 403 #else 404 iowrite32(v, adev->rio_mem + (reg * 4)); 405 #endif 406 else { 407 #ifdef __NetBSD__ 408 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX, 409 4*reg); 410 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_DATA, 411 v); 412 #else 413 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 414 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 415 #endif 416 } 417 418 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { 419 udelay(500); 420 } 421 } 422 423 /** 424 * amdgpu_mm_rdoorbell - read a doorbell dword 425 * 426 * @adev: amdgpu_device pointer 427 * @index: doorbell index 428 * 429 * Returns the value in the doorbell aperture at the 430 * requested doorbell index (CIK). 
431 */ 432 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 433 { 434 if (index < adev->doorbell.num_doorbells) { 435 #ifdef __NetBSD__ 436 return bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 437 4*index); 438 #else 439 return readl(adev->doorbell.ptr + index); 440 #endif 441 } else { 442 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 443 return 0; 444 } 445 } 446 447 /** 448 * amdgpu_mm_wdoorbell - write a doorbell dword 449 * 450 * @adev: amdgpu_device pointer 451 * @index: doorbell index 452 * @v: value to write 453 * 454 * Writes @v to the doorbell aperture at the 455 * requested doorbell index (CIK). 456 */ 457 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 458 { 459 if (index < adev->doorbell.num_doorbells) { 460 #ifdef __NetBSD__ 461 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh, 462 4*index, v); 463 #else 464 writel(v, adev->doorbell.ptr + index); 465 #endif 466 } else { 467 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 468 } 469 } 470 471 /** 472 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 473 * 474 * @adev: amdgpu_device pointer 475 * @index: doorbell index 476 * 477 * Returns the value in the doorbell aperture at the 478 * requested doorbell index (VEGA10+). 479 */ 480 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 481 { 482 if (index < adev->doorbell.num_doorbells) { 483 #ifdef __NetBSD__ 484 #ifdef _LP64 485 return bus_space_read_8(adev->doorbell.bst, adev->doorbell.bsh, 486 4*index); 487 #else 488 uint64_t lo, hi; 489 #if _BYTE_ORDER == _LITTLE_ENDIAN 490 lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 491 4*index); 492 hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 493 4*index + 4); 494 #else 495 hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 496 4*index); 497 lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh, 498 4*index + 4); 499 #endif 500 return lo | (hi << 32); 501 #endif 502 #else 503 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 504 #endif 505 } else { 506 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 507 return 0; 508 } 509 } 510 511 /** 512 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 513 * 514 * @adev: amdgpu_device pointer 515 * @index: doorbell index 516 * @v: value to write 517 * 518 * Writes @v to the doorbell aperture at the 519 * requested doorbell index (VEGA10+). 520 */ 521 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 522 { 523 if (index < adev->doorbell.num_doorbells) { 524 #ifdef __NetBSD__ 525 #ifdef _LP64 526 bus_space_write_8(adev->doorbell.bst, adev->doorbell.bsh, 527 4*index, v); 528 #else 529 /* 530 * XXX This might not be as atomic as one might hope... 
		 */
#if _BYTE_ORDER == _LITTLE_ENDIAN
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v & 0xffffffffU);
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4, v >> 32);
#else
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index, v >> 32);
		bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
		    4*index + 4, v & 0xffffffffU);
#endif
#endif
#else
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
#endif
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08"PRIX64"\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)__UNVOLATILE(&adev->vram_scratch.ptr));
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
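 * On NetBSD the doorbell BAR is mapped with bus_space_map(); the Linux
 * path uses ioremap().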
751 */ 752 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 753 { 754 755 /* No doorbell on SI hardware generation */ 756 if (adev->asic_type < CHIP_BONAIRE) { 757 adev->doorbell.base = 0; 758 adev->doorbell.size = 0; 759 adev->doorbell.num_doorbells = 0; 760 #ifndef __NetBSD__ 761 adev->doorbell.ptr = NULL; 762 #endif 763 return 0; 764 } 765 766 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 767 return -EINVAL; 768 769 amdgpu_asic_init_doorbell_index(adev); 770 771 /* doorbell bar mapping */ 772 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 773 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 774 775 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 776 adev->doorbell_index.max_assignment+1); 777 if (adev->doorbell.num_doorbells == 0) 778 return -EINVAL; 779 780 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 781 * paging queue doorbell use the second page. The 782 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 783 * doorbells are in the first page. So with paging queue enabled, 784 * the max num_doorbells should + 1 page (0x400 in dword) 785 */ 786 if (adev->asic_type >= CHIP_VEGA10) 787 adev->doorbell.num_doorbells += 0x400; 788 789 #ifdef __NetBSD__ 790 int r; 791 adev->doorbell.bst = adev->pdev->pd_pa.pa_memt; 792 /* XXX errno NetBSD->Linux */ 793 r = -bus_space_map(adev->doorbell.bst, adev->doorbell.base, 794 adev->doorbell.num_doorbells * sizeof(u32), 0, 795 &adev->doorbell.bsh); 796 if (r) 797 return r; 798 #else 799 adev->doorbell.ptr = ioremap(adev->doorbell.base, 800 adev->doorbell.num_doorbells * 801 sizeof(u32)); 802 if (adev->doorbell.ptr == NULL) 803 return -ENOMEM; 804 #endif 805 806 return 0; 807 } 808 809 /** 810 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 811 * 812 * @adev: amdgpu_device pointer 813 * 814 * Tear down doorbell driver information (CIK) 815 */ 816 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 817 { 818 #ifdef __NetBSD__ 819 if (adev->doorbell.num_doorbells) { 820 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh, 821 adev->doorbell.num_doorbells * sizeof(u32)); 822 adev->doorbell.num_doorbells = 0; 823 } 824 #else 825 iounmap(adev->doorbell.ptr); 826 adev->doorbell.ptr = NULL; 827 #endif 828 } 829 830 831 832 /* 833 * amdgpu_device_wb_*() 834 * Writeback is the method by which the GPU updates special pages in memory 835 * with the status of certain GPU events (fences, ring pointers,etc.). 836 */ 837 838 /** 839 * amdgpu_device_wb_fini - Disable Writeback and free memory 840 * 841 * @adev: amdgpu_device pointer 842 * 843 * Disables Writeback and frees the Writeback memory (all asics). 844 * Used at driver shutdown. 845 */ 846 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 847 { 848 if (adev->wb.wb_obj) { 849 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 850 &adev->wb.gpu_addr, 851 (void **)__UNVOLATILE(&adev->wb.wb)); 852 adev->wb.wb_obj = NULL; 853 } 854 } 855 856 /** 857 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 858 * 859 * @adev: amdgpu_device pointer 860 * 861 * Initializes writeback and allocates writeback memory (all asics). 862 * Used at driver startup. 863 * Returns 0 on success or an -error on failure. 
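 * The writeback buffer provides AMDGPU_MAX_WB slots of 256 bits (8 dwords)
 * each; amdgpu_device_wb_get() hands out slot indices already converted to
 * dword offsets.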
864 */ 865 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 866 { 867 int r; 868 869 if (adev->wb.wb_obj == NULL) { 870 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 871 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 872 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 873 &adev->wb.wb_obj, &adev->wb.gpu_addr, 874 (void **)__UNVOLATILE(&adev->wb.wb)); 875 if (r) { 876 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 877 return r; 878 } 879 880 adev->wb.num_wb = AMDGPU_MAX_WB; 881 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 882 883 /* clear wb memory */ 884 memset(__UNVOLATILE(adev->wb.wb), 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 885 } 886 887 return 0; 888 } 889 890 /** 891 * amdgpu_device_wb_get - Allocate a wb entry 892 * 893 * @adev: amdgpu_device pointer 894 * @wb: wb index 895 * 896 * Allocate a wb slot for use by the driver (all asics). 897 * Returns 0 on success or -EINVAL on failure. 898 */ 899 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 900 { 901 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 902 903 if (offset < adev->wb.num_wb) { 904 __set_bit(offset, adev->wb.used); 905 *wb = offset << 3; /* convert to dw offset */ 906 return 0; 907 } else { 908 return -EINVAL; 909 } 910 } 911 912 /** 913 * amdgpu_device_wb_free - Free a wb entry 914 * 915 * @adev: amdgpu_device pointer 916 * @wb: wb index 917 * 918 * Free a wb slot allocated for use by the driver (all asics) 919 */ 920 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 921 { 922 wb >>= 3; 923 if (wb < adev->wb.num_wb) 924 __clear_bit(wb, adev->wb.used); 925 } 926 927 /** 928 * amdgpu_device_resize_fb_bar - try to resize FB BAR 929 * 930 * @adev: amdgpu_device pointer 931 * 932 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 933 * to fail, but if any of the BARs is not accessible after the size we abort 934 * driver loading by returning -ENODEV. 935 */ 936 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 937 { 938 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 939 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 940 struct pci_bus *root; 941 struct resource *res; 942 unsigned i; 943 u16 cmd; 944 int r; 945 946 /* Bypass for VF */ 947 if (amdgpu_sriov_vf(adev)) 948 return 0; 949 950 #ifdef __NetBSD__ /* XXX amdgpu fb resize */ 951 __USE(space_needed); 952 __USE(rbar_size); 953 __USE(root); 954 __USE(res); 955 __USE(i); 956 __USE(cmd); 957 __USE(r); 958 #else 959 960 /* Check if the root BUS has 64bit memory resources */ 961 root = adev->pdev->bus; 962 while (root->parent) 963 root = root->parent; 964 965 pci_bus_for_each_resource(root, res, i) { 966 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 967 res->start > 0x100000000ull) 968 break; 969 } 970 971 /* Trying to resize is pointless without a root hub window above 4GB */ 972 if (!res) 973 return 0; 974 975 /* Disable memory decoding while we change the BAR addresses and size */ 976 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 977 pci_write_config_word(adev->pdev, PCI_COMMAND, 978 cmd & ~PCI_COMMAND_MEMORY); 979 980 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 981 amdgpu_device_doorbell_fini(adev); 982 if (adev->asic_type >= CHIP_BONAIRE) 983 pci_release_resource(adev->pdev, 2); 984 985 pci_release_resource(adev->pdev, 0); 986 987 r = pci_resize_resource(adev->pdev, 0, rbar_size); 988 if (r == -ENOSPC) 989 DRM_INFO("Not enough PCI address space for a large BAR."); 990 else if (r && r != -ENOTSUPP) 991 DRM_ERROR("Problem resizing BAR0 (%d).", r); 992 993 pci_assign_unassigned_bus_resources(adev->pdev->bus); 994 995 /* When the doorbell or fb BAR isn't available we have no chance of 996 * using the device. 997 */ 998 r = amdgpu_device_doorbell_init(adev); 999 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1000 return -ENODEV; 1001 1002 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1003 1004 #endif 1005 1006 return 0; 1007 } 1008 1009 /* 1010 * GPU helpers function. 1011 */ 1012 /** 1013 * amdgpu_device_need_post - check if the hw need post or not 1014 * 1015 * @adev: amdgpu_device pointer 1016 * 1017 * Check if the asic has been initialized (all asics) at driver startup 1018 * or post is needed if hw reset is performed. 1019 * Returns true if need or false if not. 1020 */ 1021 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1022 { 1023 uint32_t reg; 1024 1025 if (amdgpu_sriov_vf(adev)) 1026 return false; 1027 1028 if (amdgpu_passthrough(adev)) { 1029 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1030 * some old smc fw still need driver do vPost otherwise gpu hang, while 1031 * those smc fw version above 22.15 doesn't have this flaw, so we force 1032 * vpost executed for smc version below 22.15 1033 */ 1034 if (adev->asic_type == CHIP_FIJI) { 1035 int err; 1036 uint32_t fw_ver; 1037 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1038 /* force vPost if error occured */ 1039 if (err) 1040 return true; 1041 1042 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1043 if (fw_ver < 0x00160e00) 1044 return true; 1045 } 1046 } 1047 1048 if (adev->has_hw_reset) { 1049 adev->has_hw_reset = false; 1050 return true; 1051 } 1052 1053 /* bios scratch used on CIK+ */ 1054 if (adev->asic_type >= CHIP_BONAIRE) 1055 return amdgpu_atombios_scratch_need_asic_init(adev); 1056 1057 /* check MEM_SIZE for older asics */ 1058 reg = amdgpu_asic_get_config_memsize(adev); 1059 1060 if ((reg != 0) && (reg != 0xffffffff)) 1061 return false; 1062 1063 return true; 1064 } 1065 1066 #ifndef __NetBSD__ /* XXX amdgpu vga */ 1067 /* if we get transitioned to only one device, take VGA back */ 1068 /** 1069 * amdgpu_device_vga_set_decode - enable/disable vga decode 1070 * 1071 * @cookie: amdgpu_device pointer 1072 * @state: enable/disable vga decode 1073 * 1074 * Enable/disable vga decode (all asics). 1075 * Returns VGA resource flags. 1076 */ 1077 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1078 { 1079 struct amdgpu_device *adev = cookie; 1080 amdgpu_asic_set_vga_state(adev, state); 1081 if (state) 1082 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1083 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1084 else 1085 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1086 } 1087 #endif /* __NetBSD__ */ 1088 1089 /** 1090 * amdgpu_device_check_block_size - validate the vm block size 1091 * 1092 * @adev: amdgpu_device pointer 1093 * 1094 * Validates the vm block size specified via module parameter. 
1095 * The vm block size defines number of bits in page table versus page directory, 1096 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1097 * page table and the remaining bits are in the page directory. 1098 */ 1099 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1100 { 1101 /* defines number of bits in page table versus page directory, 1102 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1103 * page table and the remaining bits are in the page directory */ 1104 if (amdgpu_vm_block_size == -1) 1105 return; 1106 1107 if (amdgpu_vm_block_size < 9) { 1108 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1109 amdgpu_vm_block_size); 1110 amdgpu_vm_block_size = -1; 1111 } 1112 } 1113 1114 /** 1115 * amdgpu_device_check_vm_size - validate the vm size 1116 * 1117 * @adev: amdgpu_device pointer 1118 * 1119 * Validates the vm size in GB specified via module parameter. 1120 * The VM size is the size of the GPU virtual memory space in GB. 1121 */ 1122 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1123 { 1124 /* no need to check the default value */ 1125 if (amdgpu_vm_size == -1) 1126 return; 1127 1128 if (amdgpu_vm_size < 1) { 1129 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1130 amdgpu_vm_size); 1131 amdgpu_vm_size = -1; 1132 } 1133 } 1134 1135 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1136 { 1137 struct sysinfo si; 1138 bool is_os_64 = (sizeof(void *) == 8); 1139 uint64_t total_memory; 1140 uint64_t dram_size_seven_GB = 0x1B8000000; 1141 uint64_t dram_size_three_GB = 0xB8000000; 1142 1143 if (amdgpu_smu_memory_pool_size == 0) 1144 return; 1145 1146 if (!is_os_64) { 1147 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1148 goto def_value; 1149 } 1150 si_meminfo(&si); 1151 total_memory = (uint64_t)si.totalram * si.mem_unit; 1152 1153 if ((amdgpu_smu_memory_pool_size == 1) || 1154 (amdgpu_smu_memory_pool_size == 2)) { 1155 if (total_memory < dram_size_three_GB) 1156 goto def_value1; 1157 } else if ((amdgpu_smu_memory_pool_size == 4) || 1158 (amdgpu_smu_memory_pool_size == 8)) { 1159 if (total_memory < dram_size_seven_GB) 1160 goto def_value1; 1161 } else { 1162 DRM_WARN("Smu memory pool size not supported\n"); 1163 goto def_value; 1164 } 1165 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1166 1167 return; 1168 1169 def_value1: 1170 DRM_WARN("No enough system memory\n"); 1171 def_value: 1172 adev->pm.smu_prv_buffer_size = 0; 1173 } 1174 1175 /** 1176 * amdgpu_device_check_arguments - validate module params 1177 * 1178 * @adev: amdgpu_device pointer 1179 * 1180 * Validates certain module parameters and updates 1181 * the associated values used by the driver (all asics). 
1182 */ 1183 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1184 { 1185 if (amdgpu_sched_jobs < 4) { 1186 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1187 amdgpu_sched_jobs); 1188 amdgpu_sched_jobs = 4; 1189 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1190 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1191 amdgpu_sched_jobs); 1192 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1193 } 1194 1195 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1196 /* gart size must be greater or equal to 32M */ 1197 dev_warn(adev->dev, "gart size (%d) too small\n", 1198 amdgpu_gart_size); 1199 amdgpu_gart_size = -1; 1200 } 1201 1202 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1203 /* gtt size must be greater or equal to 32M */ 1204 dev_warn(adev->dev, "gtt size (%d) too small\n", 1205 amdgpu_gtt_size); 1206 amdgpu_gtt_size = -1; 1207 } 1208 1209 /* valid range is between 4 and 9 inclusive */ 1210 if (amdgpu_vm_fragment_size != -1 && 1211 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1212 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1213 amdgpu_vm_fragment_size = -1; 1214 } 1215 1216 amdgpu_device_check_smu_prv_buffer_size(adev); 1217 1218 amdgpu_device_check_vm_size(adev); 1219 1220 amdgpu_device_check_block_size(adev); 1221 1222 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1223 1224 return 0; 1225 } 1226 1227 #ifndef __NetBSD__ /* XXX amdgpu vga */ 1228 /** 1229 * amdgpu_switcheroo_set_state - set switcheroo state 1230 * 1231 * @pdev: pci dev pointer 1232 * @state: vga_switcheroo state 1233 * 1234 * Callback for the switcheroo driver. Suspends or resumes the 1235 * the asics before or after it is powered up using ACPI methods. 1236 */ 1237 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) 1238 { 1239 struct drm_device *dev = pci_get_drvdata(pdev); 1240 int r; 1241 1242 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) 1243 return; 1244 1245 if (state == VGA_SWITCHEROO_ON) { 1246 pr_info("amdgpu: switched on\n"); 1247 /* don't suspend or resume card normally */ 1248 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1249 1250 #ifndef __NetBSD__ /* pmf handles this for us. */ 1251 pci_set_power_state(dev->pdev, PCI_D0); 1252 pci_restore_state(dev->pdev); 1253 r = pci_enable_device(dev->pdev); 1254 if (r) 1255 DRM_WARN("pci_enable_device failed (%d)\n", r); 1256 #endif 1257 amdgpu_device_resume(dev, true); 1258 1259 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1260 drm_kms_helper_poll_enable(dev); 1261 } else { 1262 pr_info("amdgpu: switched off\n"); 1263 drm_kms_helper_poll_disable(dev); 1264 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1265 amdgpu_device_suspend(dev, true); 1266 #ifndef __NetBSD__ /* pmf handles this for us. */ 1267 pci_save_state(dev->pdev); 1268 /* Shut down the device */ 1269 pci_disable_device(dev->pdev); 1270 pci_set_power_state(dev->pdev, PCI_D3cold); 1271 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1272 #endif 1273 } 1274 } 1275 1276 /** 1277 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1278 * 1279 * @pdev: pci dev pointer 1280 * 1281 * Callback for the switcheroo driver. Check of the switcheroo 1282 * state can be changed. 1283 * Returns true if the state can be changed, false if not. 
1284 */ 1285 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1286 { 1287 struct drm_device *dev = pci_get_drvdata(pdev); 1288 1289 /* 1290 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1291 * locking inversion with the driver load path. And the access here is 1292 * completely racy anyway. So don't bother with locking for now. 1293 */ 1294 return dev->open_count == 0; 1295 } 1296 1297 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1298 .set_gpu_state = amdgpu_switcheroo_set_state, 1299 .reprobe = NULL, 1300 .can_switch = amdgpu_switcheroo_can_switch, 1301 }; 1302 #endif /* __NetBSD__ */ 1303 1304 /** 1305 * amdgpu_device_ip_set_clockgating_state - set the CG state 1306 * 1307 * @dev: amdgpu_device pointer 1308 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1309 * @state: clockgating state (gate or ungate) 1310 * 1311 * Sets the requested clockgating state for all instances of 1312 * the hardware IP specified. 1313 * Returns the error code from the last instance. 1314 */ 1315 int amdgpu_device_ip_set_clockgating_state(void *dev, 1316 enum amd_ip_block_type block_type, 1317 enum amd_clockgating_state state) 1318 { 1319 struct amdgpu_device *adev = dev; 1320 int i, r = 0; 1321 1322 for (i = 0; i < adev->num_ip_blocks; i++) { 1323 if (!adev->ip_blocks[i].status.valid) 1324 continue; 1325 if (adev->ip_blocks[i].version->type != block_type) 1326 continue; 1327 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1328 continue; 1329 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1330 (void *)adev, state); 1331 if (r) 1332 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1333 adev->ip_blocks[i].version->funcs->name, r); 1334 } 1335 return r; 1336 } 1337 1338 /** 1339 * amdgpu_device_ip_set_powergating_state - set the PG state 1340 * 1341 * @dev: amdgpu_device pointer 1342 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1343 * @state: powergating state (gate or ungate) 1344 * 1345 * Sets the requested powergating state for all instances of 1346 * the hardware IP specified. 1347 * Returns the error code from the last instance. 1348 */ 1349 int amdgpu_device_ip_set_powergating_state(void *dev, 1350 enum amd_ip_block_type block_type, 1351 enum amd_powergating_state state) 1352 { 1353 struct amdgpu_device *adev = dev; 1354 int i, r = 0; 1355 1356 for (i = 0; i < adev->num_ip_blocks; i++) { 1357 if (!adev->ip_blocks[i].status.valid) 1358 continue; 1359 if (adev->ip_blocks[i].version->type != block_type) 1360 continue; 1361 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1362 continue; 1363 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1364 (void *)adev, state); 1365 if (r) 1366 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1367 adev->ip_blocks[i].version->funcs->name, r); 1368 } 1369 return r; 1370 } 1371 1372 /** 1373 * amdgpu_device_ip_get_clockgating_state - get the CG state 1374 * 1375 * @adev: amdgpu_device pointer 1376 * @flags: clockgating feature flags 1377 * 1378 * Walks the list of IPs on the device and updates the clockgating 1379 * flags for each IP. 1380 * Updates @flags with the feature flags for each hardware IP where 1381 * clockgating is enabled. 
1382 */ 1383 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1384 u32 *flags) 1385 { 1386 int i; 1387 1388 for (i = 0; i < adev->num_ip_blocks; i++) { 1389 if (!adev->ip_blocks[i].status.valid) 1390 continue; 1391 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1392 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1393 } 1394 } 1395 1396 /** 1397 * amdgpu_device_ip_wait_for_idle - wait for idle 1398 * 1399 * @adev: amdgpu_device pointer 1400 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1401 * 1402 * Waits for the request hardware IP to be idle. 1403 * Returns 0 for success or a negative error code on failure. 1404 */ 1405 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1406 enum amd_ip_block_type block_type) 1407 { 1408 int i, r; 1409 1410 for (i = 0; i < adev->num_ip_blocks; i++) { 1411 if (!adev->ip_blocks[i].status.valid) 1412 continue; 1413 if (adev->ip_blocks[i].version->type == block_type) { 1414 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1415 if (r) 1416 return r; 1417 break; 1418 } 1419 } 1420 return 0; 1421 1422 } 1423 1424 /** 1425 * amdgpu_device_ip_is_idle - is the hardware IP idle 1426 * 1427 * @adev: amdgpu_device pointer 1428 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1429 * 1430 * Check if the hardware IP is idle or not. 1431 * Returns true if it the IP is idle, false if not. 1432 */ 1433 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1434 enum amd_ip_block_type block_type) 1435 { 1436 int i; 1437 1438 for (i = 0; i < adev->num_ip_blocks; i++) { 1439 if (!adev->ip_blocks[i].status.valid) 1440 continue; 1441 if (adev->ip_blocks[i].version->type == block_type) 1442 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1443 } 1444 return true; 1445 1446 } 1447 1448 /** 1449 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1450 * 1451 * @adev: amdgpu_device pointer 1452 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1453 * 1454 * Returns a pointer to the hardware IP block structure 1455 * if it exists for the asic, otherwise NULL. 1456 */ 1457 struct amdgpu_ip_block * 1458 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1459 enum amd_ip_block_type type) 1460 { 1461 int i; 1462 1463 for (i = 0; i < adev->num_ip_blocks; i++) 1464 if (adev->ip_blocks[i].version->type == type) 1465 return &adev->ip_blocks[i]; 1466 1467 return NULL; 1468 } 1469 1470 /** 1471 * amdgpu_device_ip_block_version_cmp 1472 * 1473 * @adev: amdgpu_device pointer 1474 * @type: enum amd_ip_block_type 1475 * @major: major version 1476 * @minor: minor version 1477 * 1478 * return 0 if equal or greater 1479 * return 1 if smaller or the ip_block doesn't exist 1480 */ 1481 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1482 enum amd_ip_block_type type, 1483 u32 major, u32 minor) 1484 { 1485 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1486 1487 if (ip_block && ((ip_block->version->major > major) || 1488 ((ip_block->version->major == major) && 1489 (ip_block->version->minor >= minor)))) 1490 return 0; 1491 1492 return 1; 1493 } 1494 1495 /** 1496 * amdgpu_device_ip_block_add 1497 * 1498 * @adev: amdgpu_device pointer 1499 * @ip_block_version: pointer to the IP to add 1500 * 1501 * Adds the IP block driver information to the collection of IPs 1502 * on the asic. 
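 * Blocks are subsequently walked in the order they were added during the
 * early, sw and hw init passes.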
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev->ddev;
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
1584 */ 1585 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1586 { 1587 const char *chip_name; 1588 char fw_name[30]; 1589 int err; 1590 const struct gpu_info_firmware_header_v1_0 *hdr; 1591 1592 adev->firmware.gpu_info_fw = NULL; 1593 1594 switch (adev->asic_type) { 1595 case CHIP_TOPAZ: 1596 case CHIP_TONGA: 1597 case CHIP_FIJI: 1598 case CHIP_POLARIS10: 1599 case CHIP_POLARIS11: 1600 case CHIP_POLARIS12: 1601 case CHIP_VEGAM: 1602 case CHIP_CARRIZO: 1603 case CHIP_STONEY: 1604 #ifdef CONFIG_DRM_AMDGPU_SI 1605 case CHIP_VERDE: 1606 case CHIP_TAHITI: 1607 case CHIP_PITCAIRN: 1608 case CHIP_OLAND: 1609 case CHIP_HAINAN: 1610 #endif 1611 #ifdef CONFIG_DRM_AMDGPU_CIK 1612 case CHIP_BONAIRE: 1613 case CHIP_HAWAII: 1614 case CHIP_KAVERI: 1615 case CHIP_KABINI: 1616 case CHIP_MULLINS: 1617 #endif 1618 case CHIP_VEGA20: 1619 default: 1620 return 0; 1621 case CHIP_VEGA10: 1622 chip_name = "vega10"; 1623 break; 1624 case CHIP_VEGA12: 1625 chip_name = "vega12"; 1626 break; 1627 case CHIP_RAVEN: 1628 if (adev->rev_id >= 8) 1629 chip_name = "raven2"; 1630 else if (adev->pdev->device == 0x15d8) 1631 chip_name = "picasso"; 1632 else 1633 chip_name = "raven"; 1634 break; 1635 case CHIP_ARCTURUS: 1636 chip_name = "arcturus"; 1637 break; 1638 case CHIP_RENOIR: 1639 chip_name = "renoir"; 1640 break; 1641 case CHIP_NAVI10: 1642 chip_name = "navi10"; 1643 break; 1644 case CHIP_NAVI14: 1645 chip_name = "navi14"; 1646 break; 1647 case CHIP_NAVI12: 1648 chip_name = "navi12"; 1649 break; 1650 } 1651 1652 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1653 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1654 if (err) { 1655 dev_err(adev->dev, 1656 "Failed to load gpu_info firmware \"%s\"\n", 1657 fw_name); 1658 goto out; 1659 } 1660 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1661 if (err) { 1662 dev_err(adev->dev, 1663 "Failed to validate gpu_info firmware \"%s\"\n", 1664 fw_name); 1665 goto out; 1666 } 1667 1668 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1669 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1670 1671 switch (hdr->version_major) { 1672 case 1: 1673 { 1674 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1675 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1676 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1677 1678 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 1679 goto parse_soc_bounding_box; 1680 1681 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1682 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1683 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1684 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1685 adev->gfx.config.max_texture_channel_caches = 1686 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1687 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1688 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1689 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1690 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1691 adev->gfx.config.double_offchip_lds_buf = 1692 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1693 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1694 adev->gfx.cu_info.max_waves_per_simd = 1695 
le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1696 adev->gfx.cu_info.max_scratch_slots_per_cu = 1697 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1698 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1699 if (hdr->version_minor >= 1) { 1700 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1701 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1702 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1703 adev->gfx.config.num_sc_per_sh = 1704 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1705 adev->gfx.config.num_packer_per_sc = 1706 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1707 } 1708 1709 parse_soc_bounding_box: 1710 /* 1711 * soc bounding box info is not integrated in disocovery table, 1712 * we always need to parse it from gpu info firmware. 1713 */ 1714 if (hdr->version_minor == 2) { 1715 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1716 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1717 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1718 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1719 } 1720 break; 1721 } 1722 default: 1723 dev_err(adev->dev, 1724 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1725 err = -EINVAL; 1726 goto out; 1727 } 1728 out: 1729 return err; 1730 } 1731 1732 /** 1733 * amdgpu_device_ip_early_init - run early init for hardware IPs 1734 * 1735 * @adev: amdgpu_device pointer 1736 * 1737 * Early initialization pass for hardware IPs. The hardware IPs that make 1738 * up each asic are discovered each IP's early_init callback is run. This 1739 * is the first stage in initializing the asic. 1740 * Returns 0 on success, negative error code on failure. 1741 */ 1742 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1743 { 1744 int i, r; 1745 1746 amdgpu_device_enable_virtual_display(adev); 1747 1748 switch (adev->asic_type) { 1749 case CHIP_TOPAZ: 1750 case CHIP_TONGA: 1751 case CHIP_FIJI: 1752 case CHIP_POLARIS10: 1753 case CHIP_POLARIS11: 1754 case CHIP_POLARIS12: 1755 case CHIP_VEGAM: 1756 case CHIP_CARRIZO: 1757 case CHIP_STONEY: 1758 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) 1759 adev->family = AMDGPU_FAMILY_CZ; 1760 else 1761 adev->family = AMDGPU_FAMILY_VI; 1762 1763 r = vi_set_ip_blocks(adev); 1764 if (r) 1765 return r; 1766 break; 1767 #ifdef CONFIG_DRM_AMDGPU_SI 1768 case CHIP_VERDE: 1769 case CHIP_TAHITI: 1770 case CHIP_PITCAIRN: 1771 case CHIP_OLAND: 1772 case CHIP_HAINAN: 1773 adev->family = AMDGPU_FAMILY_SI; 1774 r = si_set_ip_blocks(adev); 1775 if (r) 1776 return r; 1777 break; 1778 #endif 1779 #ifdef CONFIG_DRM_AMDGPU_CIK 1780 case CHIP_BONAIRE: 1781 case CHIP_HAWAII: 1782 case CHIP_KAVERI: 1783 case CHIP_KABINI: 1784 case CHIP_MULLINS: 1785 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII)) 1786 adev->family = AMDGPU_FAMILY_CI; 1787 else 1788 adev->family = AMDGPU_FAMILY_KV; 1789 1790 r = cik_set_ip_blocks(adev); 1791 if (r) 1792 return r; 1793 break; 1794 #endif 1795 case CHIP_VEGA10: 1796 case CHIP_VEGA12: 1797 case CHIP_VEGA20: 1798 case CHIP_RAVEN: 1799 case CHIP_ARCTURUS: 1800 case CHIP_RENOIR: 1801 if (adev->asic_type == CHIP_RAVEN || 1802 adev->asic_type == CHIP_RENOIR) 1803 adev->family = AMDGPU_FAMILY_RV; 1804 else 1805 adev->family = AMDGPU_FAMILY_AI; 1806 1807 r = soc15_set_ip_blocks(adev); 1808 if (r) 1809 return r; 1810 break; 1811 case CHIP_NAVI10: 1812 case CHIP_NAVI14: 1813 case CHIP_NAVI12: 1814 adev->family = AMDGPU_FAMILY_NV; 1815 1816 r = 
nv_set_ip_blocks(adev); 1817 if (r) 1818 return r; 1819 break; 1820 default: 1821 /* FIXME: not supported yet */ 1822 return -EINVAL; 1823 } 1824 1825 r = amdgpu_device_parse_gpu_info_fw(adev); 1826 if (r) 1827 return r; 1828 1829 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 1830 amdgpu_discovery_get_gfx_info(adev); 1831 1832 amdgpu_amdkfd_device_probe(adev); 1833 1834 if (amdgpu_sriov_vf(adev)) { 1835 r = amdgpu_virt_request_full_gpu(adev, true); 1836 if (r) 1837 return -EAGAIN; 1838 } 1839 1840 adev->pm.pp_feature = amdgpu_pp_feature_mask; 1841 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 1842 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 1843 1844 for (i = 0; i < adev->num_ip_blocks; i++) { 1845 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 1846 DRM_ERROR("disabled ip block: %d <%s>\n", 1847 i, adev->ip_blocks[i].version->funcs->name); 1848 adev->ip_blocks[i].status.valid = false; 1849 } else { 1850 if (adev->ip_blocks[i].version->funcs->early_init) { 1851 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 1852 if (r == -ENOENT) { 1853 adev->ip_blocks[i].status.valid = false; 1854 } else if (r) { 1855 DRM_ERROR("early_init of IP block <%s> failed %d\n", 1856 adev->ip_blocks[i].version->funcs->name, r); 1857 return r; 1858 } else { 1859 adev->ip_blocks[i].status.valid = true; 1860 } 1861 } else { 1862 adev->ip_blocks[i].status.valid = true; 1863 } 1864 } 1865 /* get the vbios after the asic_funcs are set up */ 1866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 1867 /* Read BIOS */ 1868 if (!amdgpu_get_bios(adev)) 1869 return -EINVAL; 1870 1871 r = amdgpu_atombios_init(adev); 1872 if (r) { 1873 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 1874 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 1875 return r; 1876 } 1877 } 1878 } 1879 1880 adev->cg_flags &= amdgpu_cg_mask; 1881 adev->pg_flags &= amdgpu_pg_mask; 1882 1883 return 0; 1884 } 1885 1886 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1887 { 1888 int i, r; 1889 1890 for (i = 0; i < adev->num_ip_blocks; i++) { 1891 if (!adev->ip_blocks[i].status.sw) 1892 continue; 1893 if (adev->ip_blocks[i].status.hw) 1894 continue; 1895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1896 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1898 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1899 if (r) { 1900 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1901 adev->ip_blocks[i].version->funcs->name, r); 1902 return r; 1903 } 1904 adev->ip_blocks[i].status.hw = true; 1905 } 1906 } 1907 1908 return 0; 1909 } 1910 1911 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1912 { 1913 int i, r; 1914 1915 for (i = 0; i < adev->num_ip_blocks; i++) { 1916 if (!adev->ip_blocks[i].status.sw) 1917 continue; 1918 if (adev->ip_blocks[i].status.hw) 1919 continue; 1920 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1921 if (r) { 1922 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1923 adev->ip_blocks[i].version->funcs->name, r); 1924 return r; 1925 } 1926 adev->ip_blocks[i].status.hw = true; 1927 } 1928 1929 return 0; 1930 } 1931 1932 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1933 { 1934 int r = 0; 1935 int i; 1936 uint32_t smu_version; 1937 1938 if (adev->asic_type >= CHIP_VEGA10) { 1939 for (i = 0; i < adev->num_ip_blocks; i++) { 1940 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1941 continue; 1942 1943 /* no need to do the fw loading again if already done*/ 1944 if (adev->ip_blocks[i].status.hw == true) 1945 break; 1946 1947 if (adev->in_gpu_reset || adev->in_suspend) { 1948 r = adev->ip_blocks[i].version->funcs->resume(adev); 1949 if (r) { 1950 DRM_ERROR("resume of IP block <%s> failed %d\n", 1951 adev->ip_blocks[i].version->funcs->name, r); 1952 return r; 1953 } 1954 } else { 1955 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1956 if (r) { 1957 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1958 adev->ip_blocks[i].version->funcs->name, r); 1959 return r; 1960 } 1961 } 1962 1963 adev->ip_blocks[i].status.hw = true; 1964 break; 1965 } 1966 } 1967 1968 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 1969 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1970 1971 return r; 1972 } 1973 1974 /** 1975 * amdgpu_device_ip_init - run init for hardware IPs 1976 * 1977 * @adev: amdgpu_device pointer 1978 * 1979 * Main initialization pass for hardware IPs. The list of all the hardware 1980 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 1981 * are run. sw_init initializes the software state associated with each IP 1982 * and hw_init initializes the hardware associated with each IP. 1983 * Returns 0 on success, negative error code on failure. 1984 */ 1985 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 1986 { 1987 int i, r; 1988 1989 r = amdgpu_ras_init(adev); 1990 if (r) 1991 return r; 1992 1993 for (i = 0; i < adev->num_ip_blocks; i++) { 1994 if (!adev->ip_blocks[i].status.valid) 1995 continue; 1996 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 1997 if (r) { 1998 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 1999 adev->ip_blocks[i].version->funcs->name, r); 2000 goto init_failed; 2001 } 2002 adev->ip_blocks[i].status.sw = true; 2003 2004 /* need to do gmc hw init early so we can allocate gpu mem */ 2005 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2006 r = amdgpu_device_vram_scratch_init(adev); 2007 if (r) { 2008 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2009 goto init_failed; 2010 } 2011 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2012 if (r) { 2013 DRM_ERROR("hw_init %d failed %d\n", i, r); 2014 goto init_failed; 2015 } 2016 r = amdgpu_device_wb_init(adev); 2017 if (r) { 2018 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2019 goto init_failed; 2020 } 2021 adev->ip_blocks[i].status.hw = true; 2022 2023 /* right after GMC hw init, we create CSA */ 2024 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2025 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2026 AMDGPU_GEM_DOMAIN_VRAM, 2027 AMDGPU_CSA_SIZE); 2028 if (r) { 2029 DRM_ERROR("allocate CSA failed %d\n", r); 2030 goto init_failed; 2031 } 2032 } 2033 } 2034 } 2035 2036 if (amdgpu_sriov_vf(adev)) 2037 amdgpu_virt_init_data_exchange(adev); 2038 2039 r = amdgpu_ib_pool_init(adev); 2040 if (r) { 2041 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2043 goto init_failed; 2044 } 2045 2046 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2047 if (r) 2048 goto init_failed; 2049 2050 r = amdgpu_device_ip_hw_init_phase1(adev); 2051 if (r) 2052 goto init_failed; 2053 2054 r = amdgpu_device_fw_loading(adev); 2055 if (r) 2056 goto init_failed; 2057 2058 r = amdgpu_device_ip_hw_init_phase2(adev); 2059 if (r) 2060 goto 
init_failed;
2061
2062 /*
2063 * retired pages will be loaded from eeprom and reserved here,
2064 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2065 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2066 * for I2C communication, which is only true at this point.
2067 * recovery_init may fail, but it can free all resources allocated by
2068 * itself and its failure should not stop the amdgpu init process.
2069 *
2070 * Note: theoretically, this should be called before all vram allocations
2071 * to protect retired pages from being used again.
2072 */
2073 amdgpu_ras_recovery_init(adev);
2074
2075 if (adev->gmc.xgmi.num_physical_nodes > 1)
2076 amdgpu_xgmi_add_device(adev);
2077 amdgpu_amdkfd_device_init(adev);
2078
2079 init_failed:
2080 if (amdgpu_sriov_vf(adev))
2081 amdgpu_virt_release_full_gpu(adev, true);
2082
2083 return r;
2084 }
2085
2086 /**
2087 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2088 *
2089 * @adev: amdgpu_device pointer
2090 *
2091 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2092 * this function before a GPU reset. If the value is retained after a
2093 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2094 */
2095 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2096 {
2097 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2098 }
2099
2100 /**
2101 * amdgpu_device_check_vram_lost - check if vram is valid
2102 *
2103 * @adev: amdgpu_device pointer
2104 *
2105 * Checks the reset magic value written to the gart pointer in VRAM.
2106 * The driver calls this after a GPU reset to see if the contents of
2107 * VRAM have been lost or not.
2108 * Returns true if vram is lost, false if not.
2109 */
2110 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2111 {
2112 return !!memcmp(adev->gart.ptr, adev->reset_magic,
2113 AMDGPU_RESET_MAGIC_NUM);
2114 }
2115
2116 /**
2117 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2118 *
2119 * @adev: amdgpu_device pointer
2120 * @state: clockgating state (gate or ungate)
2121 *
2122 * The list of all the hardware IPs that make up the asic is walked and the
2123 * set_clockgating_state callbacks are run. During late init this is used to
2124 * enable clockgating for the hardware IPs; during fini or suspend it is used
2125 * to disable it again.
2126 * Returns 0 on success, negative error code on failure.
2127 */
2128
2129 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2130 enum amd_clockgating_state state)
2131 {
2132 int i, j, r;
2133
2134 if (amdgpu_emu_mode == 1)
2135 return 0;
2136
2137 for (j = 0; j < adev->num_ip_blocks; j++) {
2138 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1;
2139 if (!adev->ip_blocks[i].status.late_initialized)
2140 continue;
2141 /* skip CG for VCE/UVD, it's handled specially */
2142 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2145 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2146 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2147 /* enable clockgating to save power */
2148 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2149 state);
2150 if (r) {
2151 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2152 adev->ip_blocks[i].version->funcs->name, r);
2153 return r;
2154 }
2155 }
2156 }
2157
2158 return 0;
2159 }
2160
2161 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2162 {
2163 int i, j, r;
2164
2165 if (amdgpu_emu_mode == 1)
2166 return 0;
2167
2168 for (j = 0; j < adev->num_ip_blocks; j++) {
2169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2170 if (!adev->ip_blocks[i].status.late_initialized)
2171 continue;
2172 /* skip PG for VCE/UVD, it's handled specially */
2173 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2174 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2177 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2178 /* enable powergating to save power */
2179 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2180 state);
2181 if (r) {
2182 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2183 adev->ip_blocks[i].version->funcs->name, r);
2184 return r;
2185 }
2186 }
2187 }
2188 return 0;
2189 }
2190
2191 static int amdgpu_device_enable_mgpu_fan_boost(void)
2192 {
2193 struct amdgpu_gpu_instance *gpu_ins;
2194 struct amdgpu_device *adev;
2195 int i, ret = 0;
2196
2197 mutex_lock(&mgpu_info.mutex);
2198
2199 /*
2200 * MGPU fan boost feature should be enabled
2201 * only when there are two or more dGPUs in
2202 * the system
2203 */
2204 if (mgpu_info.num_dgpu < 2)
2205 goto out;
2206
2207 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2208 gpu_ins = &(mgpu_info.gpu_ins[i]);
2209 adev = gpu_ins->adev;
2210 if (!(adev->flags & AMD_IS_APU) &&
2211 !gpu_ins->mgpu_fan_enabled &&
2212 adev->powerplay.pp_funcs &&
2213 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2214 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2215 if (ret)
2216 break;
2217
2218 gpu_ins->mgpu_fan_enabled = 1;
2219 }
2220 }
2221
2222 out:
2223 mutex_unlock(&mgpu_info.mutex);
2224
2225 return ret;
2226 }
2227
2228 /**
2229 * amdgpu_device_ip_late_init - run late init for hardware IPs
2230 *
2231 * @adev: amdgpu_device pointer
2232 *
2233 * Late initialization pass for hardware IPs. The list of all the hardware
2234 * IPs that make up the asic is walked and the late_init callbacks are run.
2235 * late_init covers any special initialization that an IP requires
2236 * after all of them have been initialized or something that needs to happen
2237 * late in the init process.
2238 * Returns 0 on success, negative error code on failure.
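 * After the per-IP late_init callbacks have run, this function also enables
 * clockgating and powergating, records the VRAM reset magic, turns on the
 * mGPU fan boost, and lowers the XGMI p-state once every device in the hive
 * has been initialized.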
2239 */
2240 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2241 {
2242 struct amdgpu_gpu_instance *gpu_instance;
2243 int i = 0, r;
2244
2245 for (i = 0; i < adev->num_ip_blocks; i++) {
2246 if (!adev->ip_blocks[i].status.hw)
2247 continue;
2248 if (adev->ip_blocks[i].version->funcs->late_init) {
2249 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2250 if (r) {
2251 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2252 adev->ip_blocks[i].version->funcs->name, r);
2253 return r;
2254 }
2255 }
2256 adev->ip_blocks[i].status.late_initialized = true;
2257 }
2258
2259 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2260 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2261
2262 amdgpu_device_fill_reset_magic(adev);
2263
2264 r = amdgpu_device_enable_mgpu_fan_boost();
2265 if (r)
2266 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2267
2268
2269 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2270 mutex_lock(&mgpu_info.mutex);
2271
2272 /*
2273 * Reset the device p-state to low, as it was booted with the high p-state.
2274 *
2275 * This should be performed only after all devices from the same
2276 * hive have been initialized.
2277 *
2278 * However, the number of devices in the hive is not known in advance,
2279 * as it is counted one by one while the devices initialize.
2280 *
2281 * So we wait for all XGMI-interlinked devices to be initialized.
2282 * This may bring some delays as those devices may come from
2283 * different hives. But that should be OK.
2284 */
2285 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2286 for (i = 0; i < mgpu_info.num_gpu; i++) {
2287 gpu_instance = &(mgpu_info.gpu_ins[i]);
2288 if (gpu_instance->adev->flags & AMD_IS_APU)
2289 continue;
2290
2291 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
2292 if (r) {
2293 DRM_ERROR("pstate setting failed (%d).\n", r);
2294 break;
2295 }
2296 }
2297 }
2298
2299 mutex_unlock(&mgpu_info.mutex);
2300 }
2301
2302 return 0;
2303 }
2304
2305 /**
2306 * amdgpu_device_ip_fini - run fini for hardware IPs
2307 *
2308 * @adev: amdgpu_device pointer
2309 *
2310 * Main teardown pass for hardware IPs. The list of all the hardware
2311 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2312 * are run. hw_fini tears down the hardware associated with each IP
2313 * and sw_fini tears down any software state associated with each IP.
2314 * Returns 0 on success, negative error code on failure.
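 * The SMC block is brought down first; the remaining blocks then have their
 * hw_fini, sw_fini and late_fini callbacks run in reverse order of
 * initialization, after clockgating and powergating have been ungated.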
2315 */ 2316 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2317 { 2318 int i, r; 2319 2320 amdgpu_ras_pre_fini(adev); 2321 2322 if (adev->gmc.xgmi.num_physical_nodes > 1) 2323 amdgpu_xgmi_remove_device(adev); 2324 2325 amdgpu_amdkfd_device_fini(adev); 2326 2327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2328 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2329 2330 /* need to disable SMC first */ 2331 for (i = 0; i < adev->num_ip_blocks; i++) { 2332 if (!adev->ip_blocks[i].status.hw) 2333 continue; 2334 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2335 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2336 /* XXX handle errors */ 2337 if (r) { 2338 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2339 adev->ip_blocks[i].version->funcs->name, r); 2340 } 2341 adev->ip_blocks[i].status.hw = false; 2342 break; 2343 } 2344 } 2345 2346 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2347 if (!adev->ip_blocks[i].status.hw) 2348 continue; 2349 2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2351 /* XXX handle errors */ 2352 if (r) { 2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2354 adev->ip_blocks[i].version->funcs->name, r); 2355 } 2356 2357 adev->ip_blocks[i].status.hw = false; 2358 } 2359 2360 2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2362 if (!adev->ip_blocks[i].status.sw) 2363 continue; 2364 2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2366 amdgpu_ucode_free_bo(adev); 2367 amdgpu_free_static_csa(&adev->virt.csa_obj); 2368 amdgpu_device_wb_fini(adev); 2369 amdgpu_device_vram_scratch_fini(adev); 2370 amdgpu_ib_pool_fini(adev); 2371 } 2372 2373 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2374 /* XXX handle errors */ 2375 if (r) { 2376 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2377 adev->ip_blocks[i].version->funcs->name, r); 2378 } 2379 adev->ip_blocks[i].status.sw = false; 2380 adev->ip_blocks[i].status.valid = false; 2381 } 2382 2383 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2384 if (!adev->ip_blocks[i].status.late_initialized) 2385 continue; 2386 if (adev->ip_blocks[i].version->funcs->late_fini) 2387 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2388 adev->ip_blocks[i].status.late_initialized = false; 2389 } 2390 2391 amdgpu_ras_fini(adev); 2392 2393 if (amdgpu_sriov_vf(adev)) 2394 if (amdgpu_virt_release_full_gpu(adev, false)) 2395 DRM_ERROR("failed to release exclusive mode on fini\n"); 2396 2397 return 0; 2398 } 2399 2400 /** 2401 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2402 * 2403 * @work: work_struct. 
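 * Runs the IB ring tests that amdgpu_device_init() and amdgpu_device_resume()
 * defer by queueing delayed_init_work with an AMDGPU_RESUME_MS delay.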
2404 */ 2405 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2406 { 2407 struct amdgpu_device *adev = 2408 container_of(work, struct amdgpu_device, delayed_init_work.work); 2409 int r; 2410 2411 r = amdgpu_ib_ring_tests(adev); 2412 if (r) 2413 DRM_ERROR("ib ring test failed (%d).\n", r); 2414 } 2415 2416 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2417 { 2418 struct amdgpu_device *adev = 2419 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2420 2421 mutex_lock(&adev->gfx.gfx_off_mutex); 2422 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2423 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2424 adev->gfx.gfx_off_state = true; 2425 } 2426 mutex_unlock(&adev->gfx.gfx_off_mutex); 2427 } 2428 2429 /** 2430 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2431 * 2432 * @adev: amdgpu_device pointer 2433 * 2434 * Main suspend function for hardware IPs. The list of all the hardware 2435 * IPs that make up the asic is walked, clockgating is disabled and the 2436 * suspend callbacks are run. suspend puts the hardware and software state 2437 * in each IP into a state suitable for suspend. 2438 * Returns 0 on success, negative error code on failure. 2439 */ 2440 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2441 { 2442 int i, r; 2443 2444 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2445 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2446 2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2448 if (!adev->ip_blocks[i].status.valid) 2449 continue; 2450 /* displays are handled separately */ 2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 2452 /* XXX handle errors */ 2453 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2454 /* XXX handle errors */ 2455 if (r) { 2456 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2457 adev->ip_blocks[i].version->funcs->name, r); 2458 return r; 2459 } 2460 adev->ip_blocks[i].status.hw = false; 2461 } 2462 } 2463 2464 return 0; 2465 } 2466 2467 /** 2468 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2469 * 2470 * @adev: amdgpu_device pointer 2471 * 2472 * Main suspend function for hardware IPs. The list of all the hardware 2473 * IPs that make up the asic is walked, clockgating is disabled and the 2474 * suspend callbacks are run. suspend puts the hardware and software state 2475 * in each IP into a state suitable for suspend. 2476 * Returns 0 on success, negative error code on failure. 
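 * Phase 2 suspends every block other than the display (DCE) blocks, which
 * were already handled in phase 1, and puts the SMC into the requested
 * mp1 state.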
2477 */ 2478 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2479 { 2480 int i, r __unused; 2481 2482 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2483 if (!adev->ip_blocks[i].status.valid) 2484 continue; 2485 /* displays are handled in phase1 */ 2486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2487 continue; 2488 /* PSP lost connection when err_event_athub occurs */ 2489 if (amdgpu_ras_intr_triggered() && 2490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2491 adev->ip_blocks[i].status.hw = false; 2492 continue; 2493 } 2494 /* XXX handle errors */ 2495 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2496 /* XXX handle errors */ 2497 if (r) { 2498 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2499 adev->ip_blocks[i].version->funcs->name, r); 2500 } 2501 adev->ip_blocks[i].status.hw = false; 2502 /* handle putting the SMC in the appropriate state */ 2503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2504 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2505 if (r) { 2506 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2507 adev->mp1_state, r); 2508 return r; 2509 } 2510 } 2511 2512 adev->ip_blocks[i].status.hw = false; 2513 } 2514 2515 return 0; 2516 } 2517 2518 /** 2519 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2520 * 2521 * @adev: amdgpu_device pointer 2522 * 2523 * Main suspend function for hardware IPs. The list of all the hardware 2524 * IPs that make up the asic is walked, clockgating is disabled and the 2525 * suspend callbacks are run. suspend puts the hardware and software state 2526 * in each IP into a state suitable for suspend. 2527 * Returns 0 on success, negative error code on failure. 2528 */ 2529 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2530 { 2531 int r; 2532 2533 if (amdgpu_sriov_vf(adev)) 2534 amdgpu_virt_request_full_gpu(adev, false); 2535 2536 r = amdgpu_device_ip_suspend_phase1(adev); 2537 if (r) 2538 return r; 2539 r = amdgpu_device_ip_suspend_phase2(adev); 2540 2541 if (amdgpu_sriov_vf(adev)) 2542 amdgpu_virt_release_full_gpu(adev, false); 2543 2544 return r; 2545 } 2546 2547 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2548 { 2549 int i, r; 2550 2551 static enum amd_ip_block_type ip_order[] = { 2552 AMD_IP_BLOCK_TYPE_GMC, 2553 AMD_IP_BLOCK_TYPE_COMMON, 2554 AMD_IP_BLOCK_TYPE_PSP, 2555 AMD_IP_BLOCK_TYPE_IH, 2556 }; 2557 2558 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2559 int j; 2560 struct amdgpu_ip_block *block; 2561 2562 for (j = 0; j < adev->num_ip_blocks; j++) { 2563 block = &adev->ip_blocks[j]; 2564 2565 block->status.hw = false; 2566 if (block->version->type != ip_order[i] || 2567 !block->status.valid) 2568 continue; 2569 2570 r = block->version->funcs->hw_init(adev); 2571 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2572 if (r) 2573 return r; 2574 block->status.hw = true; 2575 } 2576 } 2577 2578 return 0; 2579 } 2580 2581 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2582 { 2583 int i, r; 2584 2585 static enum amd_ip_block_type ip_order[] = { 2586 AMD_IP_BLOCK_TYPE_SMC, 2587 AMD_IP_BLOCK_TYPE_DCE, 2588 AMD_IP_BLOCK_TYPE_GFX, 2589 AMD_IP_BLOCK_TYPE_SDMA, 2590 AMD_IP_BLOCK_TYPE_UVD, 2591 AMD_IP_BLOCK_TYPE_VCE, 2592 AMD_IP_BLOCK_TYPE_VCN 2593 }; 2594 2595 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2596 int j; 2597 struct amdgpu_ip_block *block; 2598 2599 for (j = 0; j < adev->num_ip_blocks; j++) { 2600 block = &adev->ip_blocks[j]; 
2601
2602 if (block->version->type != ip_order[i] ||
2603 !block->status.valid ||
2604 block->status.hw)
2605 continue;
2606
2607 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2608 r = block->version->funcs->resume(adev);
2609 else
2610 r = block->version->funcs->hw_init(adev);
2611
2612 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2613 if (r)
2614 return r;
2615 block->status.hw = true;
2616 }
2617 }
2618
2619 return 0;
2620 }
2621
2622 /**
2623 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2624 *
2625 * @adev: amdgpu_device pointer
2626 *
2627 * First resume function for hardware IPs. The list of all the hardware
2628 * IPs that make up the asic is walked and the resume callbacks are run for
2629 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2630 * after a suspend and updates the software state as necessary. This
2631 * function is also used for restoring the GPU after a GPU reset.
2632 * Returns 0 on success, negative error code on failure.
2633 */
2634 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2635 {
2636 int i, r;
2637
2638 for (i = 0; i < adev->num_ip_blocks; i++) {
2639 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2640 continue;
2641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2642 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2643 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2644
2645 r = adev->ip_blocks[i].version->funcs->resume(adev);
2646 if (r) {
2647 DRM_ERROR("resume of IP block <%s> failed %d\n",
2648 adev->ip_blocks[i].version->funcs->name, r);
2649 return r;
2650 }
2651 adev->ip_blocks[i].status.hw = true;
2652 }
2653 }
2654
2655 return 0;
2656 }
2657
2658 /**
2659 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2660 *
2661 * @adev: amdgpu_device pointer
2662 *
2663 * Second resume function for hardware IPs. The list of all the hardware
2664 * IPs that make up the asic is walked and the resume callbacks are run for
2665 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a
2666 * functional state after a suspend and updates the software state as
2667 * necessary. This function is also used for restoring the GPU after a GPU
2668 * reset.
2669 * Returns 0 on success, negative error code on failure.
2670 */
2671 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2672 {
2673 int i, r;
2674
2675 for (i = 0; i < adev->num_ip_blocks; i++) {
2676 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2677 continue;
2678 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2679 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2680 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2681 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2682 continue;
2683 r = adev->ip_blocks[i].version->funcs->resume(adev);
2684 if (r) {
2685 DRM_ERROR("resume of IP block <%s> failed %d\n",
2686 adev->ip_blocks[i].version->funcs->name, r);
2687 return r;
2688 }
2689 adev->ip_blocks[i].status.hw = true;
2690 }
2691
2692 return 0;
2693 }
2694
2695 /**
2696 * amdgpu_device_ip_resume - run resume for hardware IPs
2697 *
2698 * @adev: amdgpu_device pointer
2699 *
2700 * Main resume function for hardware IPs. The hardware IPs
2701 * are split into two resume functions because they are
2702 * also used in recovering from a GPU reset, and some additional
2703 * steps need to be taken between them.
In this case (S3/S4) they are 2704 * run sequentially. 2705 * Returns 0 on success, negative error code on failure. 2706 */ 2707 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2708 { 2709 int r; 2710 2711 r = amdgpu_device_ip_resume_phase1(adev); 2712 if (r) 2713 return r; 2714 2715 r = amdgpu_device_fw_loading(adev); 2716 if (r) 2717 return r; 2718 2719 r = amdgpu_device_ip_resume_phase2(adev); 2720 2721 return r; 2722 } 2723 2724 /** 2725 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2726 * 2727 * @adev: amdgpu_device pointer 2728 * 2729 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2730 */ 2731 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2732 { 2733 if (amdgpu_sriov_vf(adev)) { 2734 if (adev->is_atom_fw) { 2735 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2736 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2737 } else { 2738 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2739 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2740 } 2741 2742 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2744 } 2745 } 2746 2747 /** 2748 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2749 * 2750 * @asic_type: AMD asic type 2751 * 2752 * Check if there is DC (new modesetting infrastructre) support for an asic. 2753 * returns true if DC has support, false if not. 2754 */ 2755 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2756 { 2757 switch (asic_type) { 2758 #if defined(CONFIG_DRM_AMD_DC) 2759 case CHIP_BONAIRE: 2760 case CHIP_KAVERI: 2761 case CHIP_KABINI: 2762 case CHIP_MULLINS: 2763 /* 2764 * We have systems in the wild with these ASICs that require 2765 * LVDS and VGA support which is not supported with DC. 2766 * 2767 * Fallback to the non-DC driver here by default so as not to 2768 * cause regressions. 2769 */ 2770 return amdgpu_dc > 0; 2771 case CHIP_HAWAII: 2772 case CHIP_CARRIZO: 2773 case CHIP_STONEY: 2774 case CHIP_POLARIS10: 2775 case CHIP_POLARIS11: 2776 case CHIP_POLARIS12: 2777 case CHIP_VEGAM: 2778 case CHIP_TONGA: 2779 case CHIP_FIJI: 2780 case CHIP_VEGA10: 2781 case CHIP_VEGA12: 2782 case CHIP_VEGA20: 2783 #if defined(CONFIG_DRM_AMD_DC_DCN) 2784 case CHIP_RAVEN: 2785 case CHIP_NAVI10: 2786 case CHIP_NAVI14: 2787 case CHIP_NAVI12: 2788 case CHIP_RENOIR: 2789 #endif 2790 return amdgpu_dc != 0; 2791 #endif 2792 default: 2793 if (amdgpu_dc > 0) 2794 DRM_INFO("Display Core has been requested via kernel parameter " 2795 "but isn't supported by ASIC, ignoring\n"); 2796 return false; 2797 } 2798 } 2799 2800 /** 2801 * amdgpu_device_has_dc_support - check if dc is supported 2802 * 2803 * @adev: amdgpu_device_pointer 2804 * 2805 * Returns true for supported, false for not supported 2806 */ 2807 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2808 { 2809 if (amdgpu_sriov_vf(adev)) 2810 return false; 2811 2812 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2813 } 2814 2815 2816 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2817 { 2818 struct amdgpu_device *adev = 2819 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2820 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); 2821 2822 /* It's a bug to not have a hive within this function */ 2823 if (WARN_ON(!hive)) 2824 return; 2825 2826 /* 2827 * Use task barrier to synchronize all xgmi reset works across the 2828 * hive. 
task_barrier_enter and task_barrier_exit will block 2829 * until all the threads running the xgmi reset works reach 2830 * those points. task_barrier_full will do both blocks. 2831 */ 2832 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 2833 2834 task_barrier_enter(&hive->tb); 2835 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); 2836 2837 if (adev->asic_reset_res) 2838 goto fail; 2839 2840 task_barrier_exit(&hive->tb); 2841 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); 2842 2843 if (adev->asic_reset_res) 2844 goto fail; 2845 } else { 2846 2847 task_barrier_full(&hive->tb); 2848 adev->asic_reset_res = amdgpu_asic_reset(adev); 2849 } 2850 2851 fail: 2852 if (adev->asic_reset_res) 2853 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2854 adev->asic_reset_res, adev->ddev->unique); 2855 } 2856 2857 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 2858 { 2859 char *input = amdgpu_lockup_timeout; 2860 char *timeout_setting = NULL; 2861 int index = 0; 2862 long timeout; 2863 int ret = 0; 2864 2865 /* 2866 * By default timeout for non compute jobs is 10000. 2867 * And there is no timeout enforced on compute jobs. 2868 * In SR-IOV or passthrough mode, timeout for compute 2869 * jobs are 10000 by default. 2870 */ 2871 adev->gfx_timeout = msecs_to_jiffies(10000); 2872 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2873 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2874 adev->compute_timeout = adev->gfx_timeout; 2875 else 2876 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 2877 2878 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2879 while ((timeout_setting = strsep(&input, ",")) && 2880 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 2881 ret = kstrtol(timeout_setting, 0, &timeout); 2882 if (ret) 2883 return ret; 2884 2885 if (timeout == 0) { 2886 index++; 2887 continue; 2888 } else if (timeout < 0) { 2889 timeout = MAX_SCHEDULE_TIMEOUT; 2890 } else { 2891 timeout = msecs_to_jiffies(timeout); 2892 } 2893 2894 switch (index++) { 2895 case 0: 2896 adev->gfx_timeout = timeout; 2897 break; 2898 case 1: 2899 adev->compute_timeout = timeout; 2900 break; 2901 case 2: 2902 adev->sdma_timeout = timeout; 2903 break; 2904 case 3: 2905 adev->video_timeout = timeout; 2906 break; 2907 default: 2908 break; 2909 } 2910 } 2911 /* 2912 * There is only one value specified and 2913 * it should apply to all non-compute jobs. 2914 */ 2915 if (index == 1) { 2916 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 2917 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 2918 adev->compute_timeout = adev->gfx_timeout; 2919 } 2920 } 2921 2922 return ret; 2923 } 2924 2925 /** 2926 * amdgpu_device_init - initialize the driver 2927 * 2928 * @adev: amdgpu_device pointer 2929 * @ddev: drm dev pointer 2930 * @pdev: pci dev pointer 2931 * @flags: driver flags 2932 * 2933 * Initializes the driver info and hw (all asics). 2934 * Returns 0 for success or an error on failure. 2935 * Called at driver startup. 
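 * The major steps are: map the MMIO and I/O BARs, run the per-IP early init,
 * post the card and read the clocks from the vBIOS if needed, bring up the
 * fence driver and the IP blocks, register sysfs/debugfs entries, and finally
 * run the late init pass with the IB ring tests deferred to a delayed work.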
2936 */ 2937 int amdgpu_device_init(struct amdgpu_device *adev, 2938 struct drm_device *ddev, 2939 struct pci_dev *pdev, 2940 uint32_t flags) 2941 { 2942 int r, i; 2943 bool boco = false; 2944 u32 max_MBps; 2945 2946 adev->shutdown = false; 2947 adev->dev = pci_dev_dev(pdev); 2948 adev->ddev = ddev; 2949 adev->pdev = pdev; 2950 adev->flags = flags; 2951 2952 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 2953 adev->asic_type = amdgpu_force_asic_type; 2954 else 2955 adev->asic_type = flags & AMD_ASIC_MASK; 2956 2957 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2958 if (amdgpu_emu_mode == 1) 2959 adev->usec_timeout *= 2; 2960 adev->gmc.gart_size = 512 * 1024 * 1024; 2961 adev->accel_working = false; 2962 adev->num_rings = 0; 2963 adev->mman.buffer_funcs = NULL; 2964 adev->mman.buffer_funcs_ring = NULL; 2965 adev->vm_manager.vm_pte_funcs = NULL; 2966 adev->vm_manager.vm_pte_num_scheds = 0; 2967 adev->gmc.gmc_funcs = NULL; 2968 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 2969 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 2970 2971 adev->smc_rreg = &amdgpu_invalid_rreg; 2972 adev->smc_wreg = &amdgpu_invalid_wreg; 2973 adev->pcie_rreg = &amdgpu_invalid_rreg; 2974 adev->pcie_wreg = &amdgpu_invalid_wreg; 2975 adev->pciep_rreg = &amdgpu_invalid_rreg; 2976 adev->pciep_wreg = &amdgpu_invalid_wreg; 2977 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 2978 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 2979 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 2980 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 2981 adev->didt_rreg = &amdgpu_invalid_rreg; 2982 adev->didt_wreg = &amdgpu_invalid_wreg; 2983 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 2984 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 2985 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 2986 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 2987 2988 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 2989 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 2990 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 2991 2992 /* mutex initialization are all done here so we 2993 * can recall function without having locking issues */ 2994 atomic_set(&adev->irq.ih.lock, 0); 2995 mutex_init(&adev->firmware.mutex); 2996 mutex_init(&adev->pm.mutex); 2997 mutex_init(&adev->gfx.gpu_clock_mutex); 2998 mutex_init(&adev->srbm_mutex); 2999 mutex_init(&adev->gfx.pipe_reserve_mutex); 3000 mutex_init(&adev->gfx.gfx_off_mutex); 3001 mutex_init(&adev->grbm_idx_mutex); 3002 mutex_init(&adev->mn_lock); 3003 mutex_init(&adev->virt.vf_errors.lock); 3004 hash_init(adev->mn_hash); 3005 mutex_init(&adev->lock_reset); 3006 mutex_init(&adev->psp.mutex); 3007 mutex_init(&adev->notifier_lock); 3008 3009 spin_lock_init(&adev->mmio_idx_lock); 3010 spin_lock_init(&adev->smc_idx_lock); 3011 spin_lock_init(&adev->pcie_idx_lock); 3012 spin_lock_init(&adev->uvd_ctx_idx_lock); 3013 spin_lock_init(&adev->didt_idx_lock); 3014 spin_lock_init(&adev->gc_cac_idx_lock); 3015 spin_lock_init(&adev->se_cac_idx_lock); 3016 spin_lock_init(&adev->audio_endpt_idx_lock); 3017 spin_lock_init(&adev->mm_stats.lock); 3018 3019 INIT_LIST_HEAD(&adev->shadow_list); 3020 mutex_init(&adev->shadow_list_lock); 3021 3022 INIT_LIST_HEAD(&adev->ring_lru_list); 3023 spin_lock_init(&adev->ring_lru_list_lock); 3024 3025 INIT_DELAYED_WORK(&adev->delayed_init_work, 3026 amdgpu_device_delayed_init_work_handler); 3027 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3028 
amdgpu_device_delay_enable_gfx_off); 3029 3030 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3031 3032 r = amdgpu_device_check_arguments(adev); 3033 if (r) 3034 return r; 3035 3036 adev->gfx.gfx_off_req_count = 1; 3037 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; 3038 3039 /* Registers mapping */ 3040 /* TODO: block userspace mapping of io register */ 3041 if (adev->asic_type >= CHIP_BONAIRE) { 3042 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3043 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3044 } else { 3045 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3046 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3047 } 3048 3049 #ifdef __NetBSD__ 3050 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(5), 3051 pci_mapreg_type(adev->pdev->pd_pa.pa_pc, 3052 adev->pdev->pd_pa.pa_tag, PCI_BAR(5)), 3053 0, 3054 &adev->rmmiot, &adev->rmmioh, 3055 &adev->rmmio_base, &adev->rmmio_size)) 3056 return -EIO; 3057 DRM_INFO("register mmio base: 0x%8"PRIXMAX"\n", 3058 (uintmax_t)adev->rmmio_base); 3059 DRM_INFO("register mmio size: %"PRIuMAX"\n", 3060 (uintmax_t)adev->rmmio_size); 3061 #else 3062 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3063 if (adev->rmmio == NULL) { 3064 return -ENOMEM; 3065 } 3066 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3067 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3068 #endif 3069 3070 /* io port mapping */ 3071 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 3072 #ifdef __NetBSD__ 3073 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(i), 3074 PCI_MAPREG_TYPE_IO, 0, 3075 &adev->rio_memt, &adev->rio_memh, 3076 NULL, &adev->rio_mem_size) == 0) 3077 break; 3078 #else 3079 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 3080 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 3081 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 3082 break; 3083 } 3084 #endif 3085 } 3086 #ifdef __NetBSD__ 3087 if (i == DEVICE_COUNT_RESOURCE) 3088 #else 3089 if (adev->rio_mem == NULL) 3090 #endif 3091 DRM_INFO("PCI I/O BAR is not found.\n"); 3092 3093 /* enable PCIE atomic ops */ 3094 #ifndef __NetBSD__ /* XXX amdgpu pcie atomics */ 3095 r = pci_enable_atomic_ops_to_root(adev->pdev, 3096 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3097 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3098 if (r) { 3099 adev->have_atomics_support = false; 3100 DRM_INFO("PCIE atomic ops is not supported\n"); 3101 } else { 3102 adev->have_atomics_support = true; 3103 } 3104 #endif 3105 3106 amdgpu_device_get_pcie_info(adev); 3107 3108 if (amdgpu_mcbp) 3109 DRM_INFO("MCBP is enabled\n"); 3110 3111 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3112 adev->enable_mes = true; 3113 3114 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { 3115 r = amdgpu_discovery_init(adev); 3116 if (r) { 3117 dev_err(adev->dev, "amdgpu_discovery_init failed\n"); 3118 return r; 3119 } 3120 } 3121 3122 /* early init functions */ 3123 r = amdgpu_device_ip_early_init(adev); 3124 if (r) 3125 return r; 3126 3127 r = amdgpu_device_get_job_timeout_settings(adev); 3128 if (r) { 3129 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3130 return r; 3131 } 3132 3133 /* doorbell bar mapping and doorbell index init*/ 3134 amdgpu_device_doorbell_init(adev); 3135 3136 #ifndef __NetBSD__ /* XXX amdgpu vga */ 3137 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3138 /* this will fail for cards that aren't VGA class devices, just 3139 * ignore it */ 3140 vga_client_register(adev->pdev, 
adev, NULL, amdgpu_device_vga_set_decode); 3141 3142 if (amdgpu_device_supports_boco(ddev)) 3143 boco = true; 3144 if (amdgpu_has_atpx() && 3145 (amdgpu_is_atpx_hybrid() || 3146 amdgpu_has_atpx_dgpu_power_cntl()) && 3147 !pci_is_thunderbolt_attached(adev->pdev)) 3148 vga_switcheroo_register_client(adev->pdev, 3149 &amdgpu_switcheroo_ops, boco); 3150 if (boco) 3151 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3152 #endif 3153 3154 if (amdgpu_emu_mode == 1) { 3155 /* post the asic on emulation mode */ 3156 emu_soc_asic_init(adev); 3157 goto fence_driver_init; 3158 } 3159 3160 /* detect if we are with an SRIOV vbios */ 3161 amdgpu_device_detect_sriov_bios(adev); 3162 3163 /* check if we need to reset the asic 3164 * E.g., driver was not cleanly unloaded previously, etc. 3165 */ 3166 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3167 r = amdgpu_asic_reset(adev); 3168 if (r) { 3169 dev_err(adev->dev, "asic reset on init failed\n"); 3170 goto failed; 3171 } 3172 } 3173 3174 /* Post card if necessary */ 3175 if (amdgpu_device_need_post(adev)) { 3176 if (!adev->bios) { 3177 dev_err(adev->dev, "no vBIOS found\n"); 3178 r = -EINVAL; 3179 goto failed; 3180 } 3181 DRM_INFO("GPU posting now...\n"); 3182 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3183 if (r) { 3184 dev_err(adev->dev, "gpu post error!\n"); 3185 goto failed; 3186 } 3187 } 3188 3189 if (adev->is_atom_fw) { 3190 /* Initialize clocks */ 3191 r = amdgpu_atomfirmware_get_clock_info(adev); 3192 if (r) { 3193 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3194 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3195 goto failed; 3196 } 3197 } else { 3198 /* Initialize clocks */ 3199 r = amdgpu_atombios_get_clock_info(adev); 3200 if (r) { 3201 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3202 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3203 goto failed; 3204 } 3205 /* init i2c buses */ 3206 if (!amdgpu_device_has_dc_support(adev)) 3207 amdgpu_atombios_i2c_init(adev); 3208 } 3209 3210 fence_driver_init: 3211 /* Fence driver */ 3212 r = amdgpu_fence_driver_init(adev); 3213 if (r) { 3214 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3215 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3216 goto failed; 3217 } 3218 3219 /* init the mode config */ 3220 drm_mode_config_init(adev->ddev); 3221 3222 r = amdgpu_device_ip_init(adev); 3223 if (r) { 3224 /* failed in exclusive mode due to timeout */ 3225 if (amdgpu_sriov_vf(adev) && 3226 !amdgpu_sriov_runtime(adev) && 3227 amdgpu_virt_mmio_blocked(adev) && 3228 !amdgpu_virt_wait_reset(adev)) { 3229 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3230 /* Don't send request since VF is inactive. */ 3231 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3232 adev->virt.ops = NULL; 3233 r = -EAGAIN; 3234 goto failed; 3235 } 3236 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3237 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3238 goto failed; 3239 } 3240 3241 DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3242 adev->gfx.config.max_shader_engines, 3243 adev->gfx.config.max_sh_per_se, 3244 adev->gfx.config.max_cu_per_sh, 3245 adev->gfx.cu_info.number); 3246 3247 amdgpu_ctx_init_sched(adev); 3248 3249 adev->accel_working = true; 3250 3251 amdgpu_vm_check_compute_bug(adev); 3252 3253 /* Initialize the buffer migration limit. 
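 * amdgpu_moverate is given in MB/s; a negative value selects the 8 MB/s
 * default. The limit is stored as log2(MB/s) so the later accounting can
 * divide with a simple shift.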
*/ 3254 if (amdgpu_moverate >= 0) 3255 max_MBps = amdgpu_moverate; 3256 else 3257 max_MBps = 8; /* Allow 8 MB/s. */ 3258 /* Get a log2 for easy divisions. */ 3259 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3260 3261 amdgpu_fbdev_init(adev); 3262 3263 r = amdgpu_pm_sysfs_init(adev); 3264 if (r) { 3265 adev->pm_sysfs_en = false; 3266 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3267 } else 3268 adev->pm_sysfs_en = true; 3269 3270 r = amdgpu_ucode_sysfs_init(adev); 3271 if (r) { 3272 adev->ucode_sysfs_en = false; 3273 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3274 } else 3275 adev->ucode_sysfs_en = true; 3276 3277 r = amdgpu_debugfs_gem_init(adev); 3278 if (r) 3279 DRM_ERROR("registering gem debugfs failed (%d).\n", r); 3280 3281 r = amdgpu_debugfs_regs_init(adev); 3282 if (r) 3283 DRM_ERROR("registering register debugfs failed (%d).\n", r); 3284 3285 r = amdgpu_debugfs_firmware_init(adev); 3286 if (r) 3287 DRM_ERROR("registering firmware debugfs failed (%d).\n", r); 3288 3289 r = amdgpu_debugfs_init(adev); 3290 if (r) 3291 DRM_ERROR("Creating debugfs files failed (%d).\n", r); 3292 3293 if ((amdgpu_testing & 1)) { 3294 if (adev->accel_working) 3295 amdgpu_test_moves(adev); 3296 else 3297 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3298 } 3299 if (amdgpu_benchmarking) { 3300 if (adev->accel_working) 3301 amdgpu_benchmark(adev, amdgpu_benchmarking); 3302 else 3303 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3304 } 3305 3306 /* 3307 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3308 * Otherwise the mgpu fan boost feature will be skipped due to the 3309 * gpu instance is counted less. 3310 */ 3311 amdgpu_register_gpu_instance(adev); 3312 3313 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3314 * explicit gating rather than handling it automatically. 3315 */ 3316 r = amdgpu_device_ip_late_init(adev); 3317 if (r) { 3318 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3319 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3320 goto failed; 3321 } 3322 3323 /* must succeed. */ 3324 amdgpu_ras_resume(adev); 3325 3326 queue_delayed_work(system_wq, &adev->delayed_init_work, 3327 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3328 3329 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 3330 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 3331 if (r) { 3332 dev_err(adev->dev, "Could not create pcie_replay_count"); 3333 return r; 3334 } 3335 #endif 3336 3337 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3338 r = amdgpu_pmu_init(adev); 3339 if (r) 3340 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3341 3342 return 0; 3343 3344 failed: 3345 amdgpu_vf_error_trans_all(adev); 3346 if (boco) 3347 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3348 3349 return r; 3350 } 3351 3352 /** 3353 * amdgpu_device_fini - tear down the driver 3354 * 3355 * @adev: amdgpu_device pointer 3356 * 3357 * Tear down the driver info (all asics). 3358 * Called at driver shutdown. 
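 * This reverses amdgpu_device_init(): interrupts are disabled, the displays
 * are shut down, the IP blocks and fence driver are torn down, and the MMIO,
 * I/O and doorbell mappings plus the locks are released.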
3359 */ 3360 void amdgpu_device_fini(struct amdgpu_device *adev) 3361 { 3362 int r __unused; 3363 3364 DRM_INFO("amdgpu: finishing device.\n"); 3365 flush_delayed_work(&adev->delayed_init_work); 3366 adev->shutdown = true; 3367 3368 /* disable all interrupts */ 3369 amdgpu_irq_disable_all(adev); 3370 if (adev->mode_info.mode_config_initialized){ 3371 if (!amdgpu_device_has_dc_support(adev)) 3372 drm_helper_force_disable_all(adev->ddev); 3373 else 3374 drm_atomic_helper_shutdown(adev->ddev); 3375 } 3376 amdgpu_fence_driver_fini(adev); 3377 if (adev->pm_sysfs_en) 3378 amdgpu_pm_sysfs_fini(adev); 3379 amdgpu_fbdev_fini(adev); 3380 r = amdgpu_device_ip_fini(adev); 3381 if (adev->firmware.gpu_info_fw) { 3382 release_firmware(adev->firmware.gpu_info_fw); 3383 adev->firmware.gpu_info_fw = NULL; 3384 } 3385 adev->accel_working = false; 3386 /* free i2c buses */ 3387 if (!amdgpu_device_has_dc_support(adev)) 3388 amdgpu_i2c_fini(adev); 3389 3390 if (amdgpu_emu_mode != 1) 3391 amdgpu_atombios_fini(adev); 3392 3393 kfree(adev->bios); 3394 adev->bios = NULL; 3395 #ifndef __NetBSD__ /* XXX amdgpu vga */ 3396 if (amdgpu_has_atpx() && 3397 (amdgpu_is_atpx_hybrid() || 3398 amdgpu_has_atpx_dgpu_power_cntl()) && 3399 !pci_is_thunderbolt_attached(adev->pdev)) 3400 vga_switcheroo_unregister_client(adev->pdev); 3401 if (amdgpu_device_supports_boco(adev->ddev)) 3402 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3403 vga_client_register(adev->pdev, NULL, NULL, NULL); 3404 #endif 3405 #ifdef __NetBSD__ 3406 if (adev->rio_mem_size) 3407 bus_space_unmap(adev->rio_memt, adev->rio_memh, 3408 adev->rio_mem_size); 3409 adev->rio_mem_size = 0; 3410 bus_space_unmap(adev->rmmiot, adev->rmmioh, adev->rmmio_size); 3411 #else 3412 if (adev->rio_mem) 3413 pci_iounmap(adev->pdev, adev->rio_mem); 3414 adev->rio_mem = NULL; 3415 iounmap(adev->rmmio); 3416 adev->rmmio = NULL; 3417 #endif 3418 amdgpu_device_doorbell_fini(adev); 3419 3420 amdgpu_debugfs_regs_cleanup(adev); 3421 #ifndef __NetBSD__ /* XXX amdgpu sysfs */ 3422 device_remove_file(adev->dev, &dev_attr_pcie_replay_count); 3423 #endif 3424 if (adev->ucode_sysfs_en) 3425 amdgpu_ucode_sysfs_fini(adev); 3426 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3427 amdgpu_pmu_fini(adev); 3428 amdgpu_debugfs_preempt_cleanup(adev); 3429 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 3430 amdgpu_discovery_fini(adev); 3431 spin_lock_destroy(&adev->ring_lru_list_lock); 3432 mutex_destroy(&adev->shadow_list_lock); 3433 spin_lock_destroy(&adev->mm_stats.lock); 3434 spin_lock_destroy(&adev->audio_endpt_idx_lock); 3435 spin_lock_destroy(&adev->se_cac_idx_lock); 3436 spin_lock_destroy(&adev->gc_cac_idx_lock); 3437 spin_lock_destroy(&adev->didt_idx_lock); 3438 spin_lock_destroy(&adev->uvd_ctx_idx_lock); 3439 spin_lock_destroy(&adev->pcie_idx_lock); 3440 spin_lock_destroy(&adev->smc_idx_lock); 3441 spin_lock_destroy(&adev->mmio_idx_lock); 3442 mutex_destroy(&adev->notifier_lock); 3443 mutex_destroy(&adev->psp.mutex); 3444 mutex_destroy(&adev->lock_reset); 3445 /* hash_destroy(adev->mn_hash)? */ 3446 mutex_destroy(&adev->virt.vf_errors.lock); 3447 mutex_destroy(&adev->mn_lock); 3448 mutex_destroy(&adev->grbm_idx_mutex); 3449 mutex_destroy(&adev->gfx.gfx_off_mutex); 3450 mutex_destroy(&adev->gfx.pipe_reserve_mutex); 3451 mutex_destroy(&adev->srbm_mutex); 3452 mutex_destroy(&adev->gfx.gpu_clock_mutex); 3453 mutex_destroy(&adev->pm.mutex); 3454 mutex_destroy(&adev->firmware.mutex); 3455 } 3456 3457 3458 /* 3459 * Suspend & resume. 
3460 */ 3461 /** 3462 * amdgpu_device_suspend - initiate device suspend 3463 * 3464 * @dev: drm dev pointer 3465 * @suspend: suspend state 3466 * @fbcon : notify the fbdev of suspend 3467 * 3468 * Puts the hw in the suspend state (all asics). 3469 * Returns 0 for success or an error on failure. 3470 * Called at driver suspend. 3471 */ 3472 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3473 { 3474 struct amdgpu_device *adev; 3475 struct drm_crtc *crtc; 3476 struct drm_connector *connector; 3477 struct drm_connector_list_iter iter; 3478 int r; 3479 3480 if (dev == NULL || dev->dev_private == NULL) { 3481 return -ENODEV; 3482 } 3483 3484 adev = dev->dev_private; 3485 3486 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3487 return 0; 3488 3489 adev->in_suspend = true; 3490 drm_kms_helper_poll_disable(dev); 3491 3492 if (fbcon) 3493 amdgpu_fbdev_set_suspend(adev, 1); 3494 3495 cancel_delayed_work_sync(&adev->delayed_init_work); 3496 3497 if (!amdgpu_device_has_dc_support(adev)) { 3498 /* turn off display hw */ 3499 drm_modeset_lock_all(dev); 3500 drm_connector_list_iter_begin(dev, &iter); 3501 drm_for_each_connector_iter(connector, &iter) 3502 drm_helper_connector_dpms(connector, 3503 DRM_MODE_DPMS_OFF); 3504 drm_connector_list_iter_end(&iter); 3505 drm_modeset_unlock_all(dev); 3506 /* unpin the front buffers and cursors */ 3507 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3508 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3509 struct drm_framebuffer *fb = crtc->primary->fb; 3510 struct amdgpu_bo *robj; 3511 3512 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3513 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3514 r = amdgpu_bo_reserve(aobj, true); 3515 if (r == 0) { 3516 amdgpu_bo_unpin(aobj); 3517 amdgpu_bo_unreserve(aobj); 3518 } 3519 } 3520 3521 if (fb == NULL || fb->obj[0] == NULL) { 3522 continue; 3523 } 3524 robj = gem_to_amdgpu_bo(fb->obj[0]); 3525 /* don't unpin kernel fb objects */ 3526 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3527 r = amdgpu_bo_reserve(robj, true); 3528 if (r == 0) { 3529 amdgpu_bo_unpin(robj); 3530 amdgpu_bo_unreserve(robj); 3531 } 3532 } 3533 } 3534 } 3535 3536 amdgpu_amdkfd_suspend(adev); 3537 3538 amdgpu_ras_suspend(adev); 3539 3540 r = amdgpu_device_ip_suspend_phase1(adev); 3541 3542 /* evict vram memory */ 3543 amdgpu_bo_evict_vram(adev); 3544 3545 amdgpu_fence_driver_suspend(adev); 3546 3547 r = amdgpu_device_ip_suspend_phase2(adev); 3548 3549 /* evict remaining vram memory 3550 * This second call to evict vram is to evict the gart page table 3551 * using the CPU. 3552 */ 3553 amdgpu_bo_evict_vram(adev); 3554 3555 return 0; 3556 } 3557 3558 /** 3559 * amdgpu_device_resume - initiate device resume 3560 * 3561 * @dev: drm dev pointer 3562 * @resume: resume state 3563 * @fbcon : notify the fbdev of resume 3564 * 3565 * Bring the hw back to operating state (all asics). 3566 * Returns 0 for success or an error on failure. 3567 * Called at driver resume. 
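 * Re-posts the ASIC if required, resumes the IP blocks and the fence driver,
 * re-runs late init, re-pins the cursor BOs, restores the display state and
 * re-enables hotplug polling.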
3568 */ 3569 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3570 { 3571 struct drm_connector *connector; 3572 struct drm_connector_list_iter iter; 3573 struct amdgpu_device *adev = dev->dev_private; 3574 struct drm_crtc *crtc; 3575 int r = 0; 3576 3577 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3578 return 0; 3579 3580 /* post card */ 3581 if (amdgpu_device_need_post(adev)) { 3582 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3583 if (r) 3584 DRM_ERROR("amdgpu asic init failed\n"); 3585 } 3586 3587 r = amdgpu_device_ip_resume(adev); 3588 if (r) { 3589 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3590 return r; 3591 } 3592 amdgpu_fence_driver_resume(adev); 3593 3594 3595 r = amdgpu_device_ip_late_init(adev); 3596 if (r) 3597 return r; 3598 3599 queue_delayed_work(system_wq, &adev->delayed_init_work, 3600 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3601 3602 if (!amdgpu_device_has_dc_support(adev)) { 3603 /* pin cursors */ 3604 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3605 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3606 3607 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3608 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3609 r = amdgpu_bo_reserve(aobj, true); 3610 if (r == 0) { 3611 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3612 if (r != 0) 3613 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3614 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3615 amdgpu_bo_unreserve(aobj); 3616 } 3617 } 3618 } 3619 } 3620 r = amdgpu_amdkfd_resume(adev); 3621 if (r) 3622 return r; 3623 3624 /* Make sure IB tests flushed */ 3625 flush_delayed_work(&adev->delayed_init_work); 3626 3627 /* blat the mode back in */ 3628 if (fbcon) { 3629 if (!amdgpu_device_has_dc_support(adev)) { 3630 /* pre DCE11 */ 3631 drm_helper_resume_force_mode(dev); 3632 3633 /* turn on display hw */ 3634 drm_modeset_lock_all(dev); 3635 3636 drm_connector_list_iter_begin(dev, &iter); 3637 drm_for_each_connector_iter(connector, &iter) 3638 drm_helper_connector_dpms(connector, 3639 DRM_MODE_DPMS_ON); 3640 drm_connector_list_iter_end(&iter); 3641 3642 drm_modeset_unlock_all(dev); 3643 } 3644 amdgpu_fbdev_set_suspend(adev, 0); 3645 } 3646 3647 drm_kms_helper_poll_enable(dev); 3648 3649 amdgpu_ras_resume(adev); 3650 3651 /* 3652 * Most of the connector probing functions try to acquire runtime pm 3653 * refs to ensure that the GPU is powered on when connector polling is 3654 * performed. Since we're calling this from a runtime PM callback, 3655 * trying to acquire rpm refs will cause us to deadlock. 3656 * 3657 * Since we're guaranteed to be holding the rpm lock, it's safe to 3658 * temporarily disable the rpm helpers so this doesn't deadlock us. 3659 */ 3660 #ifdef CONFIG_PM 3661 dev->dev->power.disable_depth++; 3662 #endif 3663 if (!amdgpu_device_has_dc_support(adev)) 3664 drm_helper_hpd_irq_event(dev); 3665 else 3666 drm_kms_helper_hotplug_event(dev); 3667 #ifdef CONFIG_PM 3668 dev->dev->power.disable_depth--; 3669 #endif 3670 adev->in_suspend = false; 3671 3672 return 0; 3673 } 3674 3675 /** 3676 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3677 * 3678 * @adev: amdgpu_device pointer 3679 * 3680 * The list of all the hardware IPs that make up the asic is walked and 3681 * the check_soft_reset callbacks are run. check_soft_reset determines 3682 * if the asic is still hung or not. 3683 * Returns true if any of the IPs are still in a hung state, false if not. 
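 * SR-IOV VFs and ASICs that report amdgpu_asic_need_full_reset() are treated
 * as hung without consulting the per-IP check_soft_reset callbacks.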
3684 */ 3685 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3686 { 3687 int i; 3688 bool asic_hang = false; 3689 3690 if (amdgpu_sriov_vf(adev)) 3691 return true; 3692 3693 if (amdgpu_asic_need_full_reset(adev)) 3694 return true; 3695 3696 for (i = 0; i < adev->num_ip_blocks; i++) { 3697 if (!adev->ip_blocks[i].status.valid) 3698 continue; 3699 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3700 adev->ip_blocks[i].status.hang = 3701 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3702 if (adev->ip_blocks[i].status.hang) { 3703 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3704 asic_hang = true; 3705 } 3706 } 3707 return asic_hang; 3708 } 3709 3710 /** 3711 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3712 * 3713 * @adev: amdgpu_device pointer 3714 * 3715 * The list of all the hardware IPs that make up the asic is walked and the 3716 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3717 * handles any IP specific hardware or software state changes that are 3718 * necessary for a soft reset to succeed. 3719 * Returns 0 on success, negative error code on failure. 3720 */ 3721 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3722 { 3723 int i, r = 0; 3724 3725 for (i = 0; i < adev->num_ip_blocks; i++) { 3726 if (!adev->ip_blocks[i].status.valid) 3727 continue; 3728 if (adev->ip_blocks[i].status.hang && 3729 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3730 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3731 if (r) 3732 return r; 3733 } 3734 } 3735 3736 return 0; 3737 } 3738 3739 /** 3740 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3741 * 3742 * @adev: amdgpu_device pointer 3743 * 3744 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3745 * reset is necessary to recover. 3746 * Returns true if a full asic reset is required, false if not. 3747 */ 3748 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3749 { 3750 int i; 3751 3752 if (amdgpu_asic_need_full_reset(adev)) 3753 return true; 3754 3755 for (i = 0; i < adev->num_ip_blocks; i++) { 3756 if (!adev->ip_blocks[i].status.valid) 3757 continue; 3758 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3759 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3760 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3761 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3763 if (adev->ip_blocks[i].status.hang) { 3764 DRM_INFO("Some block need full reset!\n"); 3765 return true; 3766 } 3767 } 3768 } 3769 return false; 3770 } 3771 3772 /** 3773 * amdgpu_device_ip_soft_reset - do a soft reset 3774 * 3775 * @adev: amdgpu_device pointer 3776 * 3777 * The list of all the hardware IPs that make up the asic is walked and the 3778 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3779 * IP specific hardware or software state changes that are necessary to soft 3780 * reset the IP. 3781 * Returns 0 on success, negative error code on failure. 

/**
 * amdgpu_device_ip_soft_reset - do a soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * soft_reset callbacks are run if the block is hung. soft_reset handles any
 * IP specific hardware or software state changes that are necessary to soft
 * reset the IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->soft_reset) {
			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
 * handles any IP specific hardware or software state changes that are
 * necessary after the IP has been soft reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].status.hang &&
		    adev->ip_blocks[i].version->funcs->post_soft_reset)
			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
		if (r)
			return r;
	}

	return 0;
}

/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
	struct dma_fence *fence = NULL, *next = NULL;
	struct amdgpu_bo *shadow;
	long r = 1, tmo;

	if (amdgpu_sriov_runtime(adev))
		tmo = msecs_to_jiffies(8000);
	else
		tmo = msecs_to_jiffies(100);

	DRM_INFO("recover vram bo from shadow start\n");
	mutex_lock(&adev->shadow_list_lock);
	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {

		/* No need to recover an evicted BO */
		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
			continue;

		r = amdgpu_bo_restore_shadow(shadow, &next);
		if (r)
			break;

		if (fence) {
			tmo = dma_fence_wait_timeout(fence, false, tmo);
			dma_fence_put(fence);
			fence = next;
			if (tmo == 0) {
				r = -ETIMEDOUT;
				break;
			} else if (tmo < 0) {
				r = tmo;
				break;
			}
		} else {
			fence = next;
		}
	}
	mutex_unlock(&adev->shadow_list_lock);

	if (fence)
		tmo = dma_fence_wait_timeout(fence, false, tmo);
	dma_fence_put(fence);

	if (r < 0 || tmo <= 0) {
		DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
		return -EIO;
	}

	DRM_INFO("recover vram bo from shadow done\n");
	return 0;
}
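
/*
 * Only buffers that still have a usable shadow are restored above: the
 * shadow must currently be resident in GTT and its parent BO must live in
 * VRAM, otherwise the entry is skipped.  Each restore is fenced, and the
 * remaining wait budget (8s under SR-IOV runtime, 100ms on bare metal) is
 * shared across all pending copies; running out of budget is reported to
 * the caller as -EIO.
 */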

/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu_device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
				     bool from_hypervisor)
{
	int r;

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
		r = amdgpu_virt_reset_gpu(adev);
	if (r)
		return r;

	/* Resume IP prior to SMC */
	r = amdgpu_device_ip_reinit_early_sriov(adev);
	if (r)
		goto error;

	amdgpu_virt_init_data_exchange(adev);
	/* we need recover gart prior to run SMC/CP/SDMA resume */
	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	/* now we are okay to resume SMC/CP/SDMA */
	r = amdgpu_device_ip_reinit_late_sriov(adev);
	if (r)
		goto error;

	amdgpu_irq_gpu_reset_resume_helper(adev);
	r = amdgpu_ib_ring_tests(adev);
	amdgpu_amdkfd_post_reset(adev);

error:
	amdgpu_virt_release_full_gpu(adev, true);
	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
		amdgpu_inc_vram_lost(adev);
		r = amdgpu_device_recover_vram(adev);
	}

	return r;
}

/**
 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
 *
 * @adev: amdgpu_device pointer
 *
 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
 * a hung GPU.
 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
	if (!amdgpu_device_ip_check_soft_reset(adev)) {
		DRM_INFO("Timeout, but no hardware hang detected.\n");
		return false;
	}

	if (amdgpu_gpu_recovery == 0)
		goto disabled;

	if (amdgpu_sriov_vf(adev))
		return true;

	if (amdgpu_gpu_recovery == -1) {
		switch (adev->asic_type) {
		case CHIP_BONAIRE:
		case CHIP_HAWAII:
		case CHIP_TOPAZ:
		case CHIP_TONGA:
		case CHIP_FIJI:
		case CHIP_POLARIS10:
		case CHIP_POLARIS11:
		case CHIP_POLARIS12:
		case CHIP_VEGAM:
		case CHIP_VEGA20:
		case CHIP_VEGA10:
		case CHIP_VEGA12:
		case CHIP_RAVEN:
		case CHIP_ARCTURUS:
		case CHIP_RENOIR:
		case CHIP_NAVI10:
		case CHIP_NAVI14:
		case CHIP_NAVI12:
			break;
		default:
			goto disabled;
		}
	}

	return true;

disabled:
	DRM_INFO("GPU recovery disabled.\n");
	return false;
}
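
/*
 * amdgpu_gpu_recovery above is the gpu_recovery module option: 0 disables
 * recovery entirely, the default of -1 enables it only for the ASICs
 * listed in the switch statement (SR-IOV VFs always attempt recovery),
 * and any other value forces recovery on.  The soft reset check at the
 * top filters out scheduler timeouts that are not backed by a real
 * hardware hang.
 */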

static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
					struct amdgpu_job *job,
					bool *need_full_reset_arg)
{
	int i, r = 0;
	bool need_full_reset = *need_full_reset_arg;

	/* block all schedulers and reset given job's ring */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
		amdgpu_fence_driver_force_completion(ring);
	}

	if (job)
		drm_sched_increase_karma(&job->base);

	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
	if (!amdgpu_sriov_vf(adev)) {

		if (!need_full_reset)
			need_full_reset = amdgpu_device_ip_need_full_reset(adev);

		if (!need_full_reset) {
			amdgpu_device_ip_pre_soft_reset(adev);
			r = amdgpu_device_ip_soft_reset(adev);
			amdgpu_device_ip_post_soft_reset(adev);
			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
				DRM_INFO("soft reset failed, will fallback to full reset!\n");
				need_full_reset = true;
			}
		}

		if (need_full_reset)
			r = amdgpu_device_ip_suspend(adev);

		*need_full_reset_arg = need_full_reset;
	}

	return r;
}
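
/*
 * amdgpu_do_asic_reset() performs the actual ASIC reset for every device
 * on the list and then brings the IP blocks back up.  For XGMI hives the
 * per-device resets are queued on system_unbound_wq so they run in
 * parallel (firmware expects the link retraining on all nodes to happen
 * within a narrow window), and the work items are flushed before
 * re-initialization continues.
 */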
static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
				struct list_head *device_list_handle,
				bool *need_full_reset_arg)
{
	struct amdgpu_device *tmp_adev = NULL;
	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
	int r = 0;

	/*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
	 * to allow proper links negotiation in FW (within 1 sec)
	 */
	if (need_full_reset) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			/* For XGMI run all resets in parallel to speed up the process */
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
					r = -EALREADY;
			} else
				r = amdgpu_asic_reset(tmp_adev);

			if (r) {
				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
					  r, tmp_adev->ddev->unique);
				break;
			}
		}

		/* For XGMI wait for all resets to complete before proceed */
		if (!r) {
			list_for_each_entry(tmp_adev, device_list_handle,
					    gmc.xgmi.head) {
				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
					flush_work(&tmp_adev->xgmi_reset_work);
					r = tmp_adev->asic_reset_res;
					if (r)
						break;
				}
			}
		}
	}

	if (!r && amdgpu_ras_intr_triggered())
		amdgpu_ras_intr_cleared();

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
			/* post card */
			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
				DRM_WARN("asic atom init failed!");

			if (!r) {
				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
				r = amdgpu_device_ip_resume_phase1(tmp_adev);
				if (r)
					goto out;

				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
				if (vram_lost) {
					DRM_INFO("VRAM is lost due to GPU reset!\n");
					amdgpu_inc_vram_lost(tmp_adev);
				}

				r = amdgpu_gtt_mgr_recover(
					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
				if (r)
					goto out;

				r = amdgpu_device_fw_loading(tmp_adev);
				if (r)
					return r;

				r = amdgpu_device_ip_resume_phase2(tmp_adev);
				if (r)
					goto out;

				if (vram_lost)
					amdgpu_device_fill_reset_magic(tmp_adev);

				/*
				 * Add this ASIC back to the tracked list since
				 * the reset completed successfully.
				 */
				amdgpu_register_gpu_instance(tmp_adev);

				r = amdgpu_device_ip_late_init(tmp_adev);
				if (r)
					goto out;

				/* must succeed. */
				amdgpu_ras_resume(tmp_adev);

				/* Update PSP FW topology after reset */
				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
			}
		}

out:
		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
				r = amdgpu_device_ip_suspend(tmp_adev);
				need_full_reset = true;
				r = -EAGAIN;
				goto end;
			}
		}

		if (!r)
			r = amdgpu_device_recover_vram(tmp_adev);
		else
			tmp_adev->asic_reset_res = r;
	}

end:
	*need_full_reset_arg = need_full_reset;
	return r;
}

static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
	if (trylock) {
		if (!mutex_trylock(&adev->lock_reset))
			return false;
	} else
		mutex_lock(&adev->lock_reset);

	atomic_inc(&adev->gpu_reset_counter);
	adev->in_gpu_reset = true;
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}

	return true;
}

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
	adev->in_gpu_reset = false;
	mutex_unlock(&adev->lock_reset);
}
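
/*
 * amdgpu_device_lock_adev()/amdgpu_device_unlock_adev() bracket a recovery
 * attempt.  The trylock variant is used when the device is not part of an
 * XGMI hive so that concurrent timeout handlers bail out instead of piling
 * up, and mp1_state records the chosen reset method (mode1 vs mode2),
 * presumably so the power code can put the SMU into the matching state
 * while the reset is in flight.
 */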

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool need_full_reset, job_signaled;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool in_ras_intr = amdgpu_ras_intr_triggered();
	bool use_baco =
		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
		true : false;

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {

		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);

	dev_info(adev->dev, "GPU %s begin!\n",
		 (in_ras_intr && !use_baco) ? "jobs stop" : "reset");

	cancel_delayed_work_sync(&adev->delayed_init_work);

	hive = amdgpu_get_xgmi_hive(adev, false);

	/*
	 * Here we trylock to avoid a chain of resets executing, triggered
	 * either by jobs on different adevs in the XGMI hive or by jobs on
	 * different schedulers for the same device, while this TO handler
	 * is running.  We always reset all schedulers for a device and all
	 * devices for an XGMI hive, so that should take care of them too.
	 */

	if (hive && !mutex_trylock(&hive->reset_lock)) {
		DRM_INFO("Bailing on TDR for s_job:%"PRIx64", hive: %"PRIx64" as another already in progress",
			 job ? job->base.id : -1, hive->hive_id);
		return 0;
	}

	/* Start with adev pre asic reset first for soft reset check. */
	if (!amdgpu_device_lock_adev(adev, !hive)) {
		DRM_INFO("Bailing on TDR for s_job:%"PRIx64", as another already in progress",
			 job ? job->base.id : -1);
		return 0;
	}

	/* Block kfd: SRIOV would do it separately */
	if (!amdgpu_sriov_vf(adev))
		amdgpu_amdkfd_pre_reset(adev);

	/* Build list of devices to reset */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			/* unlock kfd: SRIOV would do it separately */
			if (!amdgpu_sriov_vf(adev))
				amdgpu_amdkfd_post_reset(adev);
			amdgpu_device_unlock_adev(adev);
			return -ENODEV;
		}

		/*
		 * In case we are in XGMI hive mode device reset is done for all the
		 * nodes in the hive to retrain all XGMI links and hence the reset
		 * sequence is executed in loop on all nodes.
		 */
		device_list_handle = &hive->device_list;
	} else {
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (tmp_adev != adev) {
			amdgpu_device_lock_adev(tmp_adev, false);
			if (!amdgpu_sriov_vf(tmp_adev))
				amdgpu_amdkfd_pre_reset(tmp_adev);
		}

		/*
		 * Mark these ASICs as untracked first, and add them
		 * back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		/* disable ras on ALL IPs */
		if (!(in_ras_intr && !use_baco) &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (in_ras_intr && !use_baco)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}

	if (in_ras_intr && !use_baco)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent))
		job_signaled = true;

	if (job_signaled) {
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

	/* Guilty job will be freed after this */
	r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
	if (r) {
		/* TODO Should we stop ? */
		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
			  r, adev->ddev->unique);
		adev->asic_reset_res = r;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		if (tmp_adev == adev)
			continue;

		r = amdgpu_device_pre_asic_reset(tmp_adev,
						 NULL,
						 &need_full_reset);
		/* TODO Should we stop ? */
		if (r) {
			DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, tmp_adev->ddev->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point to resubmit jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(tmp_adev->ddev);
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/* unlock kfd: SRIOV would do it separately */
		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

	if (hive)
		mutex_unlock(&hive->reset_lock);

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}
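
/*
 * The PCIe gen/width masks below are only probed when they have not
 * already been forced: non-zero amdgpu_pcie_gen_cap or
 * amdgpu_pcie_lane_cap values (normally set via the pcie_gen_cap and
 * pcie_lane_cap module options, interpreted as CAIL_* bitmasks) override
 * whatever is detected from the device and the platform.
 */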

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(adev->ddev))
		return -ENOTSUPP;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = dev->dev_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(adev->ddev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && ras->supported)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	return 0;
}
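
/*
 * Example (illustrative): the BACO ("Bus Active, Chip Off") helpers above
 * are meant to be called in matched pairs, roughly
 *
 *	r = amdgpu_device_baco_enter(drm_dev);
 *	...GPU core powered down, bus interface kept alive...
 *	r = amdgpu_device_baco_exit(drm_dev);
 *
 * typically from the runtime-PM suspend/resume paths on ASICs where
 * amdgpu_device_supports_baco() reports support.  The doorbell interrupt
 * is masked across the transition when RAS is enabled.
 */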