1 /* $NetBSD: amdgpu_device.c,v 1.21 2024/07/01 12:09:52 riastradh Exp $ */
2
3 /*
4 * Copyright 2008 Advanced Micro Devices, Inc.
5 * Copyright 2008 Red Hat Inc.
6 * Copyright 2009 Jerome Glisse.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
22 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
23 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
25 *
26 * Authors: Dave Airlie
27 * Alex Deucher
28 * Jerome Glisse
29 */
30 #include <sys/cdefs.h>
31 __KERNEL_RCSID(0, "$NetBSD: amdgpu_device.c,v 1.21 2024/07/01 12:09:52 riastradh Exp $");
32
33 #include <linux/power_supply.h>
34 #include <linux/kthread.h>
35 #include <linux/module.h>
36 #include <linux/console.h>
37 #include <linux/slab.h>
38 #include <linux/reboot.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_probe_helper.h>
42 #include <drm/amdgpu_drm.h>
43 #include <linux/vgaarb.h>
44 #include <linux/vga_switcheroo.h>
45 #include <linux/efi.h>
46 #include "amdgpu.h"
47 #include "amdgpu_trace.h"
48 #include "amdgpu_i2c.h"
49 #include "atom.h"
50 #include "amdgpu_atombios.h"
51 #include "amdgpu_atomfirmware.h"
52 #include "amd_pcie.h"
53 #ifdef CONFIG_DRM_AMDGPU_SI
54 #include "si.h"
55 #endif
56 #ifdef CONFIG_DRM_AMDGPU_CIK
57 #include "cik.h"
58 #endif
59 #include "vi.h"
60 #include "soc15.h"
61 #include "nv.h"
62 #include "bif/bif_4_1_d.h"
63 #include <linux/pci.h>
64 #include <linux/firmware.h>
65 #include "amdgpu_vf_error.h"
66
67 #include "amdgpu_amdkfd.h"
68 #include "amdgpu_pm.h"
69
70 #include "amdgpu_xgmi.h"
71 #include "amdgpu_ras.h"
72 #include "amdgpu_pmu.h"
73
74 #include <linux/suspend.h>
75 #include <drm/task_barrier.h>
76 #include <linux/nbsd-namespace.h>
77
78 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
88
89 #define AMDGPU_RESUME_MS 2000
90
91 const char *amdgpu_asic_name[] = {
92 "TAHITI",
93 "PITCAIRN",
94 "VERDE",
95 "OLAND",
96 "HAINAN",
97 "BONAIRE",
98 "KAVERI",
99 "KABINI",
100 "HAWAII",
101 "MULLINS",
102 "TOPAZ",
103 "TONGA",
104 "FIJI",
105 "CARRIZO",
106 "STONEY",
107 "POLARIS10",
108 "POLARIS11",
109 "POLARIS12",
110 "VEGAM",
111 "VEGA10",
112 "VEGA12",
113 "VEGA20",
114 "RAVEN",
115 "ARCTURUS",
116 "RENOIR",
117 "NAVI10",
118 "NAVI14",
119 "NAVI12",
120 "LAST",
121 };
122
123 #ifndef __NetBSD__ /* XXX amdgpu sysfs */
124
125 /**
126 * DOC: pcie_replay_count
127 *
128 * The amdgpu driver provides a sysfs API for reporting the total number
129  * of PCIe replays (NAKs).
130  * The file pcie_replay_count is used for this and returns the total
131  * number of replays as the sum of the NAKs generated and NAKs received.
132 */
133
134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
135 struct device_attribute *attr, char *buf)
136 {
137 struct drm_device *ddev = dev_get_drvdata(dev);
138 struct amdgpu_device *adev = ddev->dev_private;
139 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
140
141 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
142 }
143
144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
145 amdgpu_device_get_pcie_replay_count, NULL);
146
147 #endif /* __NetBSD__ */
148
149 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
150
151 /**
152 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
153 *
154 * @dev: drm_device pointer
155 *
156 * Returns true if the device is a dGPU with HG/PX power control,
157 * otherwise return false.
158 */
159 bool amdgpu_device_supports_boco(struct drm_device *dev)
160 {
161 struct amdgpu_device *adev = dev->dev_private;
162
163 if (adev->flags & AMD_IS_PX)
164 return true;
165 return false;
166 }
167
168 /**
169 * amdgpu_device_supports_baco - Does the device support BACO
170 *
171 * @dev: drm_device pointer
172 *
173  * Returns true if the device supports BACO,
174 * otherwise return false.
175 */
176 bool amdgpu_device_supports_baco(struct drm_device *dev)
177 {
178 struct amdgpu_device *adev = dev->dev_private;
179
180 return amdgpu_asic_supports_baco(adev);
181 }
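
/*
 * Note: BOCO is commonly expanded as "Bus Off, Chip Off" (the dGPU is
 * powered down entirely via ACPI/Hybrid Graphics or PowerXpress), while
 * BACO is "Bus Active, Chip Off" (the PCIe link stays up while the chip
 * itself is powered off).  These expansions are added here for
 * readability and are not definitions taken from this file.
 */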
182
183 /**
184 * VRAM access helper functions.
185 *
186 * amdgpu_device_vram_access - read/write a buffer in vram
187 *
188 * @adev: amdgpu_device pointer
189 * @pos: offset of the buffer in vram
190 * @buf: virtual address of the buffer in system memory
191  * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
192 * @write: true - write to vram, otherwise - read from vram
193 */
194 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
195 uint32_t *buf, size_t size, bool write)
196 {
197 uint64_t last;
198 unsigned long flags;
199
200 last = size - 4;
201 for (last += pos; pos <= last; pos += 4) {
202 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
203 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
204 WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31);
205 if (write)
206 WREG32_NO_KIQ(mmMM_DATA, *buf++);
207 else
208 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
209 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
210 }
211 }
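
/*
 * Illustrative usage sketch (not called from this file): copy a small,
 * dword-aligned buffer into VRAM through the MM_INDEX/MM_DATA window and
 * read it back.  The offset and buffer below are made up for the example.
 *
 *	uint32_t tmp[4] = { 0, 1, 2, 3 };
 *
 *	amdgpu_device_vram_access(adev, 0x1000, tmp, sizeof(tmp), true);
 *	amdgpu_device_vram_access(adev, 0x1000, tmp, sizeof(tmp), false);
 */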
212
213 /*
214 * MMIO register access helper functions.
215 */
216 /**
217 * amdgpu_mm_rreg - read a memory mapped IO register
218 *
219 * @adev: amdgpu_device pointer
220 * @reg: dword aligned register offset
221 * @acc_flags: access flags which require special behavior
222 *
223 * Returns the 32 bit value from the offset specified.
224 */
225 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
226 uint32_t acc_flags)
227 {
228 uint32_t ret;
229
230 if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
231 return amdgpu_kiq_rreg(adev, reg);
232
233 if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
234 #ifdef __NetBSD__
235 return bus_space_read_4(adev->rmmiot, adev->rmmioh, 4*reg);
236 #else
237 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
238 #endif
239 else {
240 unsigned long flags;
241
242 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
243 #ifdef __NetBSD__
244 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
245 4*reg);
246 ret = bus_space_read_4(adev->rmmiot, adev->rmmioh,
247 4*mmMM_DATA);
248 #else
249 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
250 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
251 #endif
252 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
253 }
254 trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
255 return ret;
256 }
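
/*
 * Callers normally reach this function through the RREG32()/WREG32()
 * macro family rather than calling it directly; a direct call would look
 * like the illustrative sketch below, where AMDGPU_REGS_NO_KIQ forces the
 * plain MMIO path even when running under SR-IOV:
 *
 *	uint32_t val = amdgpu_mm_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);
 */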
257
258 /*
259  * MMIO register byte-read helper function
260  * @offset: byte offset from MMIO start
261 *
262 */
263
264 /**
265 * amdgpu_mm_rreg8 - read a memory mapped IO register
266 *
267 * @adev: amdgpu_device pointer
268 * @offset: byte aligned register offset
269 *
270 * Returns the 8 bit value from the offset specified.
271 */
272 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
273 if (offset < adev->rmmio_size)
274 #ifdef __NetBSD__
275 return bus_space_read_1(adev->rmmiot, adev->rmmioh, offset);
276 #else
277 return (readb(adev->rmmio + offset));
278 #endif
279 BUG();
280 }
281
282 /*
283  * MMIO register byte-write helper function
284  * @offset: byte offset from MMIO start
285  * @value: the value to be written to the register
286 *
287 */
288 /**
289  * amdgpu_mm_wreg8 - write to a memory mapped IO register
290 *
291 * @adev: amdgpu_device pointer
292 * @offset: byte aligned register offset
293 * @value: 8 bit value to write
294 *
295 * Writes the value specified to the offset specified.
296 */
297 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
298 if (offset < adev->rmmio_size)
299 #ifdef __NetBSD__
300 bus_space_write_1(adev->rmmiot, adev->rmmioh, offset, value);
301 #else
302 writeb(value, adev->rmmio + offset);
303 #endif
304 else
305 BUG();
306 }
307
308 /**
309 * amdgpu_mm_wreg - write to a memory mapped IO register
310 *
311 * @adev: amdgpu_device pointer
312 * @reg: dword aligned register offset
313 * @v: 32 bit value to write to the register
314 * @acc_flags: access flags which require special behavior
315 *
316 * Writes the value specified to the offset specified.
317 */
318 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
319 uint32_t acc_flags)
320 {
321 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
322
323 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
324 adev->last_mm_index = v;
325 }
326
327 if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
328 return amdgpu_kiq_wreg(adev, reg, v);
329
330 if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
331 #ifdef __NetBSD__
332 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*reg, v);
333 #else
334 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
335 #endif
336 else {
337 unsigned long flags;
338
339 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
340 #ifdef __NetBSD__
341 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_INDEX,
342 reg*4);
343 bus_space_write_4(adev->rmmiot, adev->rmmioh, 4*mmMM_DATA, v);
344 #else
345 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
346 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
347 #endif
348 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
349 }
350
351 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
352 udelay(500);
353 }
354 }
355
356 /**
357 * amdgpu_io_rreg - read an IO register
358 *
359 * @adev: amdgpu_device pointer
360 * @reg: dword aligned register offset
361 *
362 * Returns the 32 bit value from the offset specified.
363 */
364 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
365 {
366 if ((reg * 4) < adev->rio_mem_size)
367 #ifdef __NetBSD__
368 return bus_space_read_4(adev->rio_memt, adev->rio_memh, 4*reg);
369 #else
370 return ioread32(adev->rio_mem + (reg * 4));
371 #endif
372 else {
373 #ifdef __NetBSD__
374 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX,
375 4*reg);
376 return bus_space_read_4(adev->rio_memt, adev->rio_memh,
377 4*mmMM_DATA);
378 #else
379 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
380 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
381 #endif
382 }
383 }
384
385 /**
386 * amdgpu_io_wreg - write to an IO register
387 *
388 * @adev: amdgpu_device pointer
389 * @reg: dword aligned register offset
390 * @v: 32 bit value to write to the register
391 *
392 * Writes the value specified to the offset specified.
393 */
394 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
395 {
396 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
397 adev->last_mm_index = v;
398 }
399
400 if ((reg * 4) < adev->rio_mem_size)
401 #ifdef __NetBSD__
402 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*reg, v);
403 #else
404 iowrite32(v, adev->rio_mem + (reg * 4));
405 #endif
406 else {
407 #ifdef __NetBSD__
408 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_INDEX,
409 4*reg);
410 bus_space_write_4(adev->rio_memt, adev->rio_memh, 4*mmMM_DATA,
411 v);
412 #else
413 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
414 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
415 #endif
416 }
417
418 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
419 udelay(500);
420 }
421 }
422
423 /**
424 * amdgpu_mm_rdoorbell - read a doorbell dword
425 *
426 * @adev: amdgpu_device pointer
427 * @index: doorbell index
428 *
429 * Returns the value in the doorbell aperture at the
430 * requested doorbell index (CIK).
431 */
432 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
433 {
434 if (index < adev->doorbell.num_doorbells) {
435 #ifdef __NetBSD__
436 return bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
437 4*index);
438 #else
439 return readl(adev->doorbell.ptr + index);
440 #endif
441 } else {
442 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
443 return 0;
444 }
445 }
446
447 /**
448 * amdgpu_mm_wdoorbell - write a doorbell dword
449 *
450 * @adev: amdgpu_device pointer
451 * @index: doorbell index
452 * @v: value to write
453 *
454 * Writes @v to the doorbell aperture at the
455 * requested doorbell index (CIK).
456 */
457 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
458 {
459 if (index < adev->doorbell.num_doorbells) {
460 #ifdef __NetBSD__
461 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
462 4*index, v);
463 #else
464 writel(v, adev->doorbell.ptr + index);
465 #endif
466 } else {
467 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
468 }
469 }
470
471 /**
472 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
473 *
474 * @adev: amdgpu_device pointer
475 * @index: doorbell index
476 *
477 * Returns the value in the doorbell aperture at the
478 * requested doorbell index (VEGA10+).
479 */
480 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
481 {
482 if (index < adev->doorbell.num_doorbells) {
483 #ifdef __NetBSD__
484 #ifdef _LP64
485 return bus_space_read_8(adev->doorbell.bst, adev->doorbell.bsh,
486 4*index);
487 #else
488 uint64_t lo, hi;
489 #if _BYTE_ORDER == _LITTLE_ENDIAN
490 lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
491 4*index);
492 hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
493 4*index + 4);
494 #else
495 hi = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
496 4*index);
497 lo = bus_space_read_4(adev->doorbell.bst, adev->doorbell.bsh,
498 4*index + 4);
499 #endif
500 return lo | (hi << 32);
501 #endif
502 #else
503 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
504 #endif
505 } else {
506 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
507 return 0;
508 }
509 }
510
511 /**
512 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
513 *
514 * @adev: amdgpu_device pointer
515 * @index: doorbell index
516 * @v: value to write
517 *
518 * Writes @v to the doorbell aperture at the
519 * requested doorbell index (VEGA10+).
520 */
521 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
522 {
523 if (index < adev->doorbell.num_doorbells) {
524 #ifdef __NetBSD__
525 #ifdef _LP64
526 bus_space_write_8(adev->doorbell.bst, adev->doorbell.bsh,
527 4*index, v);
528 #else
529 /*
530 * XXX This might not be as atomic as one might hope...
531 */
532 #if _BYTE_ORDER == _LITTLE_ENDIAN
533 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
534 4*index, v & 0xffffffffU);
535 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
536 4*index + 4, v >> 32);
537 #else
538 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
539 4*index, v >> 32);
540 bus_space_write_4(adev->doorbell.bst, adev->doorbell.bsh,
541 4*index + 4, v & 0xffffffffU);
542 #endif
543 #endif
544 #else
545 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
546 #endif
547 } else {
548 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
549 }
550 }
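
/*
 * Hedged usage sketch: ring code typically rings a 64-bit doorbell by
 * writing the new write pointer at the ring's doorbell index, roughly:
 *
 *	amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 *
 * The ring fields shown are assumptions for illustration; see the ring
 * implementations for the exact call sites.
 */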
551
552 /**
553 * amdgpu_invalid_rreg - dummy reg read function
554 *
555 * @adev: amdgpu device pointer
556 * @reg: offset of register
557 *
558 * Dummy register read function. Used for register blocks
559 * that certain asics don't have (all asics).
560 * Returns the value in the register.
561 */
562 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
563 {
564 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
565 BUG();
566 return 0;
567 }
568
569 /**
570 * amdgpu_invalid_wreg - dummy reg write function
571 *
572 * @adev: amdgpu device pointer
573 * @reg: offset of register
574 * @v: value to write to the register
575 *
576  * Dummy register write function. Used for register blocks
577 * that certain asics don't have (all asics).
578 */
579 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
580 {
581 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
582 reg, v);
583 BUG();
584 }
585
586 /**
587 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
588 *
589 * @adev: amdgpu device pointer
590 * @reg: offset of register
591 *
592 * Dummy register read function. Used for register blocks
593 * that certain asics don't have (all asics).
594 * Returns the value in the register.
595 */
596 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
597 {
598 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
599 BUG();
600 return 0;
601 }
602
603 /**
604 * amdgpu_invalid_wreg64 - dummy reg write function
605 *
606 * @adev: amdgpu device pointer
607 * @reg: offset of register
608 * @v: value to write to the register
609 *
610  * Dummy register write function. Used for register blocks
611 * that certain asics don't have (all asics).
612 */
613 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
614 {
615 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08"PRIX64"\n",
616 reg, v);
617 BUG();
618 }
619
620 /**
621 * amdgpu_block_invalid_rreg - dummy reg read function
622 *
623 * @adev: amdgpu device pointer
624 * @block: offset of instance
625 * @reg: offset of register
626 *
627 * Dummy register read function. Used for register blocks
628 * that certain asics don't have (all asics).
629 * Returns the value in the register.
630 */
631 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
632 uint32_t block, uint32_t reg)
633 {
634 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
635 reg, block);
636 BUG();
637 return 0;
638 }
639
640 /**
641 * amdgpu_block_invalid_wreg - dummy reg write function
642 *
643 * @adev: amdgpu device pointer
644 * @block: offset of instance
645 * @reg: offset of register
646 * @v: value to write to the register
647 *
648  * Dummy register write function. Used for register blocks
649 * that certain asics don't have (all asics).
650 */
651 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
652 uint32_t block,
653 uint32_t reg, uint32_t v)
654 {
655 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
656 reg, block, v);
657 BUG();
658 }
659
660 /**
661 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
662 *
663 * @adev: amdgpu device pointer
664 *
665 * Allocates a scratch page of VRAM for use by various things in the
666 * driver.
667 */
668 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
669 {
670 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
671 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
672 &adev->vram_scratch.robj,
673 &adev->vram_scratch.gpu_addr,
674 (void **)__UNVOLATILE(&adev->vram_scratch.ptr));
675 }
676
677 /**
678 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
679 *
680 * @adev: amdgpu device pointer
681 *
682 * Frees the VRAM scratch page.
683 */
684 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
685 {
686 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
687 }
688
689 /**
690 * amdgpu_device_program_register_sequence - program an array of registers.
691 *
692 * @adev: amdgpu_device pointer
693 * @registers: pointer to the register array
694 * @array_size: size of the register array
695 *
696  * Programs an array of registers with AND and OR masks.
697 * This is a helper for setting golden registers.
698 */
699 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
700 const u32 *registers,
701 const u32 array_size)
702 {
703 u32 tmp, reg, and_mask, or_mask;
704 int i;
705
706 if (array_size % 3)
707 return;
708
709 for (i = 0; i < array_size; i +=3) {
710 reg = registers[i + 0];
711 and_mask = registers[i + 1];
712 or_mask = registers[i + 2];
713
714 if (and_mask == 0xffffffff) {
715 tmp = or_mask;
716 } else {
717 tmp = RREG32(reg);
718 tmp &= ~and_mask;
719 if (adev->family >= AMDGPU_FAMILY_AI)
720 tmp |= (or_mask & and_mask);
721 else
722 tmp |= or_mask;
723 }
724 WREG32(reg, tmp);
725 }
726 }
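
/*
 * Usage sketch with a hypothetical golden-register list; each entry is a
 * { register offset, AND mask, OR mask } triplet, so array_size must be a
 * multiple of 3 (the register offsets below are invented):
 *
 *	static const u32 golden_settings_example[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *		0x1238, 0x0000ff00, 0x00002000,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *	    ARRAY_SIZE(golden_settings_example));
 */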
727
728 /**
729 * amdgpu_device_pci_config_reset - reset the GPU
730 *
731 * @adev: amdgpu_device pointer
732 *
733 * Resets the GPU using the pci config reset sequence.
734 * Only applicable to asics prior to vega10.
735 */
736 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
737 {
738 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
739 }
740
741 /*
742 * GPU doorbell aperture helpers function.
743 */
744 /**
745 * amdgpu_device_doorbell_init - Init doorbell driver information.
746 *
747 * @adev: amdgpu_device pointer
748 *
749 * Init doorbell driver information (CIK)
750 * Returns 0 on success, error on failure.
751 */
752 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
753 {
754
755 /* No doorbell on SI hardware generation */
756 if (adev->asic_type < CHIP_BONAIRE) {
757 adev->doorbell.base = 0;
758 adev->doorbell.size = 0;
759 adev->doorbell.num_doorbells = 0;
760 #ifndef __NetBSD__
761 adev->doorbell.ptr = NULL;
762 #endif
763 return 0;
764 }
765
766 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
767 return -EINVAL;
768
769 amdgpu_asic_init_doorbell_index(adev);
770
771 /* doorbell bar mapping */
772 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
773 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
774
775 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
776 adev->doorbell_index.max_assignment+1);
777 if (adev->doorbell.num_doorbells == 0)
778 return -EINVAL;
779
780 /* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
781 * paging queue doorbell uses the second page.  The
782 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
783 * doorbells are in the first page, so with the paging queue enabled
784 * the max num_doorbells grows by one page (0x400 in dwords).
785 */
786 if (adev->asic_type >= CHIP_VEGA10)
787 adev->doorbell.num_doorbells += 0x400;
788
789 #ifdef __NetBSD__
790 int r;
791 adev->doorbell.bst = adev->pdev->pd_pa.pa_memt;
792 /* XXX errno NetBSD->Linux */
793 r = -bus_space_map(adev->doorbell.bst, adev->doorbell.base,
794 adev->doorbell.num_doorbells * sizeof(u32), 0,
795 &adev->doorbell.bsh);
796 if (r)
797 return r;
798 #else
799 adev->doorbell.ptr = ioremap(adev->doorbell.base,
800 adev->doorbell.num_doorbells *
801 sizeof(u32));
802 if (adev->doorbell.ptr == NULL)
803 return -ENOMEM;
804 #endif
805
806 return 0;
807 }
808
809 /**
810 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
811 *
812 * @adev: amdgpu_device pointer
813 *
814 * Tear down doorbell driver information (CIK)
815 */
816 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
817 {
818 #ifdef __NetBSD__
819 if (adev->doorbell.num_doorbells) {
820 bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
821 adev->doorbell.num_doorbells * sizeof(u32));
822 adev->doorbell.num_doorbells = 0;
823 }
824 #else
825 iounmap(adev->doorbell.ptr);
826 adev->doorbell.ptr = NULL;
827 #endif
828 }
829
830
831
832 /*
833 * amdgpu_device_wb_*()
834 * Writeback is the method by which the GPU updates special pages in memory
835  * with the status of certain GPU events (fences, ring pointers, etc.).
836 */
837
838 /**
839 * amdgpu_device_wb_fini - Disable Writeback and free memory
840 *
841 * @adev: amdgpu_device pointer
842 *
843 * Disables Writeback and frees the Writeback memory (all asics).
844 * Used at driver shutdown.
845 */
846 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
847 {
848 if (adev->wb.wb_obj) {
849 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
850 &adev->wb.gpu_addr,
851 (void **)__UNVOLATILE(&adev->wb.wb));
852 adev->wb.wb_obj = NULL;
853 }
854 }
855
856 /**
857 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
858 *
859 * @adev: amdgpu_device pointer
860 *
861 * Initializes writeback and allocates writeback memory (all asics).
862 * Used at driver startup.
863  * Returns 0 on success or a negative error code on failure.
864 */
865 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
866 {
867 int r;
868
869 if (adev->wb.wb_obj == NULL) {
870 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
871 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
872 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
873 &adev->wb.wb_obj, &adev->wb.gpu_addr,
874 (void **)__UNVOLATILE(&adev->wb.wb));
875 if (r) {
876 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
877 return r;
878 }
879
880 adev->wb.num_wb = AMDGPU_MAX_WB;
881 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
882
883 /* clear wb memory */
884 memset(__UNVOLATILE(adev->wb.wb), 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
885 }
886
887 return 0;
888 }
889
890 /**
891 * amdgpu_device_wb_get - Allocate a wb entry
892 *
893 * @adev: amdgpu_device pointer
894 * @wb: wb index
895 *
896 * Allocate a wb slot for use by the driver (all asics).
897 * Returns 0 on success or -EINVAL on failure.
898 */
899 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
900 {
901 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
902
903 if (offset < adev->wb.num_wb) {
904 __set_bit(offset, adev->wb.used);
905 *wb = offset << 3; /* convert to dw offset */
906 return 0;
907 } else {
908 return -EINVAL;
909 }
910 }
911
912 /**
913 * amdgpu_device_wb_free - Free a wb entry
914 *
915 * @adev: amdgpu_device pointer
916 * @wb: wb index
917 *
918 * Free a wb slot allocated for use by the driver (all asics)
919 */
920 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
921 {
922 wb >>= 3;
923 if (wb < adev->wb.num_wb)
924 __clear_bit(wb, adev->wb.used);
925 }
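
/*
 * Hedged usage sketch for the writeback helpers: a caller reserves a slot,
 * derives the CPU and GPU addresses of that slot from adev->wb, and frees
 * the slot again when done.  The surrounding code is illustrative only.
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb) == 0) {
 *		uint64_t gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		uint32_t last = adev->wb.wb[wb];
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */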
926
927 /**
928 * amdgpu_device_resize_fb_bar - try to resize FB BAR
929 *
930 * @adev: amdgpu_device pointer
931 *
932 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
933  * to fail, but if any of the BARs is not accessible after the resize we abort
934 * driver loading by returning -ENODEV.
935 */
936 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
937 {
938 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
939 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
940 struct pci_bus *root;
941 struct resource *res;
942 unsigned i;
943 u16 cmd;
944 int r;
945
946 /* Bypass for VF */
947 if (amdgpu_sriov_vf(adev))
948 return 0;
949
950 #ifdef __NetBSD__ /* XXX amdgpu fb resize */
951 __USE(space_needed);
952 __USE(rbar_size);
953 __USE(root);
954 __USE(res);
955 __USE(i);
956 __USE(cmd);
957 __USE(r);
958 #else
959
960 /* Check if the root BUS has 64bit memory resources */
961 root = adev->pdev->bus;
962 while (root->parent)
963 root = root->parent;
964
965 pci_bus_for_each_resource(root, res, i) {
966 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
967 res->start > 0x100000000ull)
968 break;
969 }
970
971 /* Trying to resize is pointless without a root hub window above 4GB */
972 if (!res)
973 return 0;
974
975 /* Disable memory decoding while we change the BAR addresses and size */
976 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
977 pci_write_config_word(adev->pdev, PCI_COMMAND,
978 cmd & ~PCI_COMMAND_MEMORY);
979
980 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
981 amdgpu_device_doorbell_fini(adev);
982 if (adev->asic_type >= CHIP_BONAIRE)
983 pci_release_resource(adev->pdev, 2);
984
985 pci_release_resource(adev->pdev, 0);
986
987 r = pci_resize_resource(adev->pdev, 0, rbar_size);
988 if (r == -ENOSPC)
989 DRM_INFO("Not enough PCI address space for a large BAR.");
990 else if (r && r != -ENOTSUPP)
991 DRM_ERROR("Problem resizing BAR0 (%d).", r);
992
993 pci_assign_unassigned_bus_resources(adev->pdev->bus);
994
995 /* When the doorbell or fb BAR isn't available we have no chance of
996 * using the device.
997 */
998 r = amdgpu_device_doorbell_init(adev);
999 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1000 return -ENODEV;
1001
1002 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1003
1004 #endif
1005
1006 return 0;
1007 }
1008
1009 /*
1010 * GPU helpers function.
1011 */
1012 /**
1013  * amdgpu_device_need_post - check if the hw needs to be posted
1014 *
1015 * @adev: amdgpu_device pointer
1016 *
1017 * Check if the asic has been initialized (all asics) at driver startup
1018 * or post is needed if hw reset is performed.
1019  * Returns true if post is needed, false if not.
1020 */
1021 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1022 {
1023 uint32_t reg;
1024
1025 if (amdgpu_sriov_vf(adev))
1026 return false;
1027
1028 if (amdgpu_passthrough(adev)) {
1029 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1030 * reboot some old SMC firmware still needs the driver to do vPost or the
1031 * GPU hangs.  SMC firmware versions above 22.15 don't have this flaw, so
1032 * force vPost for SMC firmware versions below 22.15.
1033 */
1034 if (adev->asic_type == CHIP_FIJI) {
1035 int err;
1036 uint32_t fw_ver;
1037 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1038 /* force vPost if an error occurred */
1039 if (err)
1040 return true;
1041
1042 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1043 if (fw_ver < 0x00160e00)
1044 return true;
1045 }
1046 }
1047
1048 if (adev->has_hw_reset) {
1049 adev->has_hw_reset = false;
1050 return true;
1051 }
1052
1053 /* bios scratch used on CIK+ */
1054 if (adev->asic_type >= CHIP_BONAIRE)
1055 return amdgpu_atombios_scratch_need_asic_init(adev);
1056
1057 /* check MEM_SIZE for older asics */
1058 reg = amdgpu_asic_get_config_memsize(adev);
1059
1060 if ((reg != 0) && (reg != 0xffffffff))
1061 return false;
1062
1063 return true;
1064 }
1065
1066 #ifndef __NetBSD__ /* XXX amdgpu vga */
1067 /* if we get transitioned to only one device, take VGA back */
1068 /**
1069 * amdgpu_device_vga_set_decode - enable/disable vga decode
1070 *
1071 * @cookie: amdgpu_device pointer
1072 * @state: enable/disable vga decode
1073 *
1074 * Enable/disable vga decode (all asics).
1075 * Returns VGA resource flags.
1076 */
1077 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1078 {
1079 struct amdgpu_device *adev = cookie;
1080 amdgpu_asic_set_vga_state(adev, state);
1081 if (state)
1082 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1083 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1084 else
1085 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1086 }
1087 #endif /* __NetBSD__ */
1088
1089 /**
1090 * amdgpu_device_check_block_size - validate the vm block size
1091 *
1092 * @adev: amdgpu_device pointer
1093 *
1094 * Validates the vm block size specified via module parameter.
1095 * The vm block size defines number of bits in page table versus page directory,
1096 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1097 * page table and the remaining bits are in the page directory.
1098 */
1099 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1100 {
1101 /* defines number of bits in page table versus page directory,
1102 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1103 * page table and the remaining bits are in the page directory */
1104 if (amdgpu_vm_block_size == -1)
1105 return;
1106
1107 if (amdgpu_vm_block_size < 9) {
1108 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1109 amdgpu_vm_block_size);
1110 amdgpu_vm_block_size = -1;
1111 }
1112 }
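
/*
 * Worked example for the limits above: with 4 KiB pages (12 bits of page
 * offset) and the minimum block size of 9 bits, a single page-table block
 * covers 2^(12+9) = 2 MiB of GPU virtual address space; the remaining
 * address bits are resolved by the page-directory levels.
 */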
1113
1114 /**
1115 * amdgpu_device_check_vm_size - validate the vm size
1116 *
1117 * @adev: amdgpu_device pointer
1118 *
1119 * Validates the vm size in GB specified via module parameter.
1120 * The VM size is the size of the GPU virtual memory space in GB.
1121 */
1122 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1123 {
1124 /* no need to check the default value */
1125 if (amdgpu_vm_size == -1)
1126 return;
1127
1128 if (amdgpu_vm_size < 1) {
1129 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1130 amdgpu_vm_size);
1131 amdgpu_vm_size = -1;
1132 }
1133 }
1134
1135 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1136 {
1137 struct sysinfo si;
1138 bool is_os_64 = (sizeof(void *) == 8);
1139 uint64_t total_memory;
1140 uint64_t dram_size_seven_GB = 0x1B8000000;
1141 uint64_t dram_size_three_GB = 0xB8000000;
1142
1143 if (amdgpu_smu_memory_pool_size == 0)
1144 return;
1145
1146 if (!is_os_64) {
1147 DRM_WARN("Not 64-bit OS, feature not supported\n");
1148 goto def_value;
1149 }
1150 si_meminfo(&si);
1151 total_memory = (uint64_t)si.totalram * si.mem_unit;
1152
1153 if ((amdgpu_smu_memory_pool_size == 1) ||
1154 (amdgpu_smu_memory_pool_size == 2)) {
1155 if (total_memory < dram_size_three_GB)
1156 goto def_value1;
1157 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1158 (amdgpu_smu_memory_pool_size == 8)) {
1159 if (total_memory < dram_size_seven_GB)
1160 goto def_value1;
1161 } else {
1162 DRM_WARN("Smu memory pool size not supported\n");
1163 goto def_value;
1164 }
1165 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1166
1167 return;
1168
1169 def_value1:
1170 DRM_WARN("Not enough system memory\n");
1171 def_value:
1172 adev->pm.smu_prv_buffer_size = 0;
1173 }
1174
1175 /**
1176 * amdgpu_device_check_arguments - validate module params
1177 *
1178 * @adev: amdgpu_device pointer
1179 *
1180 * Validates certain module parameters and updates
1181 * the associated values used by the driver (all asics).
1182 */
1183 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1184 {
1185 if (amdgpu_sched_jobs < 4) {
1186 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1187 amdgpu_sched_jobs);
1188 amdgpu_sched_jobs = 4;
1189 } else if (!is_power_of_2(amdgpu_sched_jobs)){
1190 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1191 amdgpu_sched_jobs);
1192 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1193 }
1194
1195 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1196 /* gart size must be greater or equal to 32M */
1197 dev_warn(adev->dev, "gart size (%d) too small\n",
1198 amdgpu_gart_size);
1199 amdgpu_gart_size = -1;
1200 }
1201
1202 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1203 /* gtt size must be greater or equal to 32M */
1204 dev_warn(adev->dev, "gtt size (%d) too small\n",
1205 amdgpu_gtt_size);
1206 amdgpu_gtt_size = -1;
1207 }
1208
1209 /* valid range is between 4 and 9 inclusive */
1210 if (amdgpu_vm_fragment_size != -1 &&
1211 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1212 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1213 amdgpu_vm_fragment_size = -1;
1214 }
1215
1216 amdgpu_device_check_smu_prv_buffer_size(adev);
1217
1218 amdgpu_device_check_vm_size(adev);
1219
1220 amdgpu_device_check_block_size(adev);
1221
1222 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1223
1224 return 0;
1225 }
1226
1227 #ifndef __NetBSD__ /* XXX amdgpu vga */
1228 /**
1229 * amdgpu_switcheroo_set_state - set switcheroo state
1230 *
1231 * @pdev: pci dev pointer
1232 * @state: vga_switcheroo state
1233 *
1234  * Callback for the switcheroo driver. Suspends or resumes the
1235  * asics before or after it is powered up using ACPI methods.
1236 */
1237 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1238 {
1239 struct drm_device *dev = pci_get_drvdata(pdev);
1240 int r;
1241
1242 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1243 return;
1244
1245 if (state == VGA_SWITCHEROO_ON) {
1246 pr_info("amdgpu: switched on\n");
1247 /* don't suspend or resume card normally */
1248 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1249
1250 #ifndef __NetBSD__ /* pmf handles this for us. */
1251 pci_set_power_state(dev->pdev, PCI_D0);
1252 pci_restore_state(dev->pdev);
1253 r = pci_enable_device(dev->pdev);
1254 if (r)
1255 DRM_WARN("pci_enable_device failed (%d)\n", r);
1256 #endif
1257 amdgpu_device_resume(dev, true);
1258
1259 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1260 drm_kms_helper_poll_enable(dev);
1261 } else {
1262 pr_info("amdgpu: switched off\n");
1263 drm_kms_helper_poll_disable(dev);
1264 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1265 amdgpu_device_suspend(dev, true);
1266 #ifndef __NetBSD__ /* pmf handles this for us. */
1267 pci_save_state(dev->pdev);
1268 /* Shut down the device */
1269 pci_disable_device(dev->pdev);
1270 pci_set_power_state(dev->pdev, PCI_D3cold);
1271 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1272 #endif
1273 }
1274 }
1275
1276 /**
1277 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1278 *
1279 * @pdev: pci dev pointer
1280 *
1281  * Callback for the switcheroo driver. Checks whether the switcheroo
1282  * state can be changed.
1283 * Returns true if the state can be changed, false if not.
1284 */
1285 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1286 {
1287 struct drm_device *dev = pci_get_drvdata(pdev);
1288
1289 /*
1290 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1291 * locking inversion with the driver load path. And the access here is
1292 * completely racy anyway. So don't bother with locking for now.
1293 */
1294 return dev->open_count == 0;
1295 }
1296
1297 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1298 .set_gpu_state = amdgpu_switcheroo_set_state,
1299 .reprobe = NULL,
1300 .can_switch = amdgpu_switcheroo_can_switch,
1301 };
1302 #endif /* __NetBSD__ */
1303
1304 /**
1305 * amdgpu_device_ip_set_clockgating_state - set the CG state
1306 *
1307 * @dev: amdgpu_device pointer
1308 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1309 * @state: clockgating state (gate or ungate)
1310 *
1311 * Sets the requested clockgating state for all instances of
1312 * the hardware IP specified.
1313 * Returns the error code from the last instance.
1314 */
1315 int amdgpu_device_ip_set_clockgating_state(void *dev,
1316 enum amd_ip_block_type block_type,
1317 enum amd_clockgating_state state)
1318 {
1319 struct amdgpu_device *adev = dev;
1320 int i, r = 0;
1321
1322 for (i = 0; i < adev->num_ip_blocks; i++) {
1323 if (!adev->ip_blocks[i].status.valid)
1324 continue;
1325 if (adev->ip_blocks[i].version->type != block_type)
1326 continue;
1327 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1328 continue;
1329 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1330 (void *)adev, state);
1331 if (r)
1332 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1333 adev->ip_blocks[i].version->funcs->name, r);
1334 }
1335 return r;
1336 }
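
/*
 * Hedged usage sketch: gate the clocks of all GFX IP instances.  The enum
 * values come from amd_shared.h and are assumed here rather than defined
 * in this file.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *	    AMD_CG_STATE_GATE);
 */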
1337
1338 /**
1339 * amdgpu_device_ip_set_powergating_state - set the PG state
1340 *
1341 * @dev: amdgpu_device pointer
1342 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1343 * @state: powergating state (gate or ungate)
1344 *
1345 * Sets the requested powergating state for all instances of
1346 * the hardware IP specified.
1347 * Returns the error code from the last instance.
1348 */
1349 int amdgpu_device_ip_set_powergating_state(void *dev,
1350 enum amd_ip_block_type block_type,
1351 enum amd_powergating_state state)
1352 {
1353 struct amdgpu_device *adev = dev;
1354 int i, r = 0;
1355
1356 for (i = 0; i < adev->num_ip_blocks; i++) {
1357 if (!adev->ip_blocks[i].status.valid)
1358 continue;
1359 if (adev->ip_blocks[i].version->type != block_type)
1360 continue;
1361 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1362 continue;
1363 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1364 (void *)adev, state);
1365 if (r)
1366 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1367 adev->ip_blocks[i].version->funcs->name, r);
1368 }
1369 return r;
1370 }
1371
1372 /**
1373 * amdgpu_device_ip_get_clockgating_state - get the CG state
1374 *
1375 * @adev: amdgpu_device pointer
1376 * @flags: clockgating feature flags
1377 *
1378 * Walks the list of IPs on the device and updates the clockgating
1379 * flags for each IP.
1380 * Updates @flags with the feature flags for each hardware IP where
1381 * clockgating is enabled.
1382 */
1383 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1384 u32 *flags)
1385 {
1386 int i;
1387
1388 for (i = 0; i < adev->num_ip_blocks; i++) {
1389 if (!adev->ip_blocks[i].status.valid)
1390 continue;
1391 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1392 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1393 }
1394 }
1395
1396 /**
1397 * amdgpu_device_ip_wait_for_idle - wait for idle
1398 *
1399 * @adev: amdgpu_device pointer
1400 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1401 *
1402  * Waits for the requested hardware IP to be idle.
1403 * Returns 0 for success or a negative error code on failure.
1404 */
1405 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1406 enum amd_ip_block_type block_type)
1407 {
1408 int i, r;
1409
1410 for (i = 0; i < adev->num_ip_blocks; i++) {
1411 if (!adev->ip_blocks[i].status.valid)
1412 continue;
1413 if (adev->ip_blocks[i].version->type == block_type) {
1414 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1415 if (r)
1416 return r;
1417 break;
1418 }
1419 }
1420 return 0;
1421
1422 }
1423
1424 /**
1425 * amdgpu_device_ip_is_idle - is the hardware IP idle
1426 *
1427 * @adev: amdgpu_device pointer
1428 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1429 *
1430 * Check if the hardware IP is idle or not.
1431  * Returns true if the IP is idle, false if not.
1432 */
1433 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1434 enum amd_ip_block_type block_type)
1435 {
1436 int i;
1437
1438 for (i = 0; i < adev->num_ip_blocks; i++) {
1439 if (!adev->ip_blocks[i].status.valid)
1440 continue;
1441 if (adev->ip_blocks[i].version->type == block_type)
1442 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1443 }
1444 return true;
1445
1446 }
1447
1448 /**
1449 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1450 *
1451 * @adev: amdgpu_device pointer
1452 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1453 *
1454 * Returns a pointer to the hardware IP block structure
1455 * if it exists for the asic, otherwise NULL.
1456 */
1457 struct amdgpu_ip_block *
1458 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1459 enum amd_ip_block_type type)
1460 {
1461 int i;
1462
1463 for (i = 0; i < adev->num_ip_blocks; i++)
1464 if (adev->ip_blocks[i].version->type == type)
1465 return &adev->ip_blocks[i];
1466
1467 return NULL;
1468 }
1469
1470 /**
1471 * amdgpu_device_ip_block_version_cmp
1472 *
1473 * @adev: amdgpu_device pointer
1474 * @type: enum amd_ip_block_type
1475 * @major: major version
1476 * @minor: minor version
1477 *
1478 * return 0 if equal or greater
1479 * return 1 if smaller or the ip_block doesn't exist
1480 */
1481 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1482 enum amd_ip_block_type type,
1483 u32 major, u32 minor)
1484 {
1485 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1486
1487 if (ip_block && ((ip_block->version->major > major) ||
1488 ((ip_block->version->major == major) &&
1489 (ip_block->version->minor >= minor))))
1490 return 0;
1491
1492 return 1;
1493 }
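
/*
 * Illustrative usage sketch: check that the IH block is at least version
 * 4.0 before relying on a newer-only feature (the version numbers are
 * hypothetical):
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_IH,
 *	    4, 0) == 0) {
 *		... at least IH 4.0 is present ...
 *	}
 */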
1494
1495 /**
1496 * amdgpu_device_ip_block_add
1497 *
1498 * @adev: amdgpu_device pointer
1499 * @ip_block_version: pointer to the IP to add
1500 *
1501 * Adds the IP block driver information to the collection of IPs
1502 * on the asic.
1503 */
1504 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1505 const struct amdgpu_ip_block_version *ip_block_version)
1506 {
1507 if (!ip_block_version)
1508 return -EINVAL;
1509
1510 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1511 ip_block_version->funcs->name);
1512
1513 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1514
1515 return 0;
1516 }
1517
1518 /**
1519 * amdgpu_device_enable_virtual_display - enable virtual display feature
1520 *
1521 * @adev: amdgpu_device pointer
1522 *
1523  * Enables the virtual display feature if the user has enabled it via
1524 * the module parameter virtual_display. This feature provides a virtual
1525 * display hardware on headless boards or in virtualized environments.
1526 * This function parses and validates the configuration string specified by
1527  * the user and configures the virtual display configuration (number of
1528 * virtual connectors, crtcs, etc.) specified.
1529 */
1530 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1531 {
1532 adev->enable_virtual_display = false;
1533
1534 if (amdgpu_virtual_display) {
1535 struct drm_device *ddev = adev->ddev;
1536 const char *pci_address_name = pci_name(ddev->pdev);
1537 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1538
1539 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1540 pciaddstr_tmp = pciaddstr;
1541 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1542 pciaddname = strsep(&pciaddname_tmp, ",");
1543 if (!strcmp("all", pciaddname)
1544 || !strcmp(pci_address_name, pciaddname)) {
1545 long num_crtc;
1546 int res = -1;
1547
1548 adev->enable_virtual_display = true;
1549
1550 if (pciaddname_tmp)
1551 res = kstrtol(pciaddname_tmp, 10,
1552 &num_crtc);
1553
1554 if (!res) {
1555 if (num_crtc < 1)
1556 num_crtc = 1;
1557 if (num_crtc > 6)
1558 num_crtc = 6;
1559 adev->mode_info.num_crtc = num_crtc;
1560 } else {
1561 adev->mode_info.num_crtc = 1;
1562 }
1563 break;
1564 }
1565 }
1566
1567 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1568 amdgpu_virtual_display, pci_address_name,
1569 adev->enable_virtual_display, adev->mode_info.num_crtc);
1570
1571 kfree(pciaddstr);
1572 }
1573 }
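
/*
 * Example (hypothetical) values accepted by the parser above: "all"
 * enables virtual display on every device, while
 * "0000:01:00.0,2;0000:02:00.0,1" enables it on two specific PCI addresses
 * with 2 and 1 virtual CRTCs respectively.
 */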
1574
1575 /**
1576 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1577 *
1578 * @adev: amdgpu_device pointer
1579 *
1580 * Parses the asic configuration parameters specified in the gpu info
1581  * firmware and makes them available to the driver for use in configuring
1582 * the asic.
1583 * Returns 0 on success, -EINVAL on failure.
1584 */
1585 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1586 {
1587 const char *chip_name;
1588 char fw_name[30];
1589 int err;
1590 const struct gpu_info_firmware_header_v1_0 *hdr;
1591
1592 adev->firmware.gpu_info_fw = NULL;
1593
1594 switch (adev->asic_type) {
1595 case CHIP_TOPAZ:
1596 case CHIP_TONGA:
1597 case CHIP_FIJI:
1598 case CHIP_POLARIS10:
1599 case CHIP_POLARIS11:
1600 case CHIP_POLARIS12:
1601 case CHIP_VEGAM:
1602 case CHIP_CARRIZO:
1603 case CHIP_STONEY:
1604 #ifdef CONFIG_DRM_AMDGPU_SI
1605 case CHIP_VERDE:
1606 case CHIP_TAHITI:
1607 case CHIP_PITCAIRN:
1608 case CHIP_OLAND:
1609 case CHIP_HAINAN:
1610 #endif
1611 #ifdef CONFIG_DRM_AMDGPU_CIK
1612 case CHIP_BONAIRE:
1613 case CHIP_HAWAII:
1614 case CHIP_KAVERI:
1615 case CHIP_KABINI:
1616 case CHIP_MULLINS:
1617 #endif
1618 case CHIP_VEGA20:
1619 default:
1620 return 0;
1621 case CHIP_VEGA10:
1622 chip_name = "vega10";
1623 break;
1624 case CHIP_VEGA12:
1625 chip_name = "vega12";
1626 break;
1627 case CHIP_RAVEN:
1628 if (adev->rev_id >= 8)
1629 chip_name = "raven2";
1630 else if (adev->pdev->device == 0x15d8)
1631 chip_name = "picasso";
1632 else
1633 chip_name = "raven";
1634 break;
1635 case CHIP_ARCTURUS:
1636 chip_name = "arcturus";
1637 break;
1638 case CHIP_RENOIR:
1639 chip_name = "renoir";
1640 break;
1641 case CHIP_NAVI10:
1642 chip_name = "navi10";
1643 break;
1644 case CHIP_NAVI14:
1645 chip_name = "navi14";
1646 break;
1647 case CHIP_NAVI12:
1648 chip_name = "navi12";
1649 break;
1650 }
1651
1652 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1653 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1654 if (err) {
1655 dev_err(adev->dev,
1656 "Failed to load gpu_info firmware \"%s\"\n",
1657 fw_name);
1658 goto out;
1659 }
1660 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1661 if (err) {
1662 dev_err(adev->dev,
1663 "Failed to validate gpu_info firmware \"%s\"\n",
1664 fw_name);
1665 goto out;
1666 }
1667
1668 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1669 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1670
1671 switch (hdr->version_major) {
1672 case 1:
1673 {
1674 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1675 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1676 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1677
1678 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
1679 goto parse_soc_bounding_box;
1680
1681 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1682 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1683 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1684 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1685 adev->gfx.config.max_texture_channel_caches =
1686 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1687 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1688 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1689 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1690 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1691 adev->gfx.config.double_offchip_lds_buf =
1692 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1693 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1694 adev->gfx.cu_info.max_waves_per_simd =
1695 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1696 adev->gfx.cu_info.max_scratch_slots_per_cu =
1697 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1698 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1699 if (hdr->version_minor >= 1) {
1700 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1701 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1702 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1703 adev->gfx.config.num_sc_per_sh =
1704 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1705 adev->gfx.config.num_packer_per_sc =
1706 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1707 }
1708
1709 parse_soc_bounding_box:
1710 /*
1711  * soc bounding box info is not integrated in the discovery table,
1712 * we always need to parse it from gpu info firmware.
1713 */
1714 if (hdr->version_minor == 2) {
1715 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1716 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1717 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1718 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1719 }
1720 break;
1721 }
1722 default:
1723 dev_err(adev->dev,
1724 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1725 err = -EINVAL;
1726 goto out;
1727 }
1728 out:
1729 return err;
1730 }
1731
1732 /**
1733 * amdgpu_device_ip_early_init - run early init for hardware IPs
1734 *
1735 * @adev: amdgpu_device pointer
1736 *
1737 * Early initialization pass for hardware IPs. The hardware IPs that make
1738  * up each asic are discovered and each IP's early_init callback is run. This
1739 * is the first stage in initializing the asic.
1740 * Returns 0 on success, negative error code on failure.
1741 */
1742 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1743 {
1744 int i, r;
1745
1746 amdgpu_device_enable_virtual_display(adev);
1747
1748 switch (adev->asic_type) {
1749 case CHIP_TOPAZ:
1750 case CHIP_TONGA:
1751 case CHIP_FIJI:
1752 case CHIP_POLARIS10:
1753 case CHIP_POLARIS11:
1754 case CHIP_POLARIS12:
1755 case CHIP_VEGAM:
1756 case CHIP_CARRIZO:
1757 case CHIP_STONEY:
1758 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
1759 adev->family = AMDGPU_FAMILY_CZ;
1760 else
1761 adev->family = AMDGPU_FAMILY_VI;
1762
1763 r = vi_set_ip_blocks(adev);
1764 if (r)
1765 return r;
1766 break;
1767 #ifdef CONFIG_DRM_AMDGPU_SI
1768 case CHIP_VERDE:
1769 case CHIP_TAHITI:
1770 case CHIP_PITCAIRN:
1771 case CHIP_OLAND:
1772 case CHIP_HAINAN:
1773 adev->family = AMDGPU_FAMILY_SI;
1774 r = si_set_ip_blocks(adev);
1775 if (r)
1776 return r;
1777 break;
1778 #endif
1779 #ifdef CONFIG_DRM_AMDGPU_CIK
1780 case CHIP_BONAIRE:
1781 case CHIP_HAWAII:
1782 case CHIP_KAVERI:
1783 case CHIP_KABINI:
1784 case CHIP_MULLINS:
1785 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
1786 adev->family = AMDGPU_FAMILY_CI;
1787 else
1788 adev->family = AMDGPU_FAMILY_KV;
1789
1790 r = cik_set_ip_blocks(adev);
1791 if (r)
1792 return r;
1793 break;
1794 #endif
1795 case CHIP_VEGA10:
1796 case CHIP_VEGA12:
1797 case CHIP_VEGA20:
1798 case CHIP_RAVEN:
1799 case CHIP_ARCTURUS:
1800 case CHIP_RENOIR:
1801 if (adev->asic_type == CHIP_RAVEN ||
1802 adev->asic_type == CHIP_RENOIR)
1803 adev->family = AMDGPU_FAMILY_RV;
1804 else
1805 adev->family = AMDGPU_FAMILY_AI;
1806
1807 r = soc15_set_ip_blocks(adev);
1808 if (r)
1809 return r;
1810 break;
1811 case CHIP_NAVI10:
1812 case CHIP_NAVI14:
1813 case CHIP_NAVI12:
1814 adev->family = AMDGPU_FAMILY_NV;
1815
1816 r = nv_set_ip_blocks(adev);
1817 if (r)
1818 return r;
1819 break;
1820 default:
1821 /* FIXME: not supported yet */
1822 return -EINVAL;
1823 }
1824
1825 r = amdgpu_device_parse_gpu_info_fw(adev);
1826 if (r)
1827 return r;
1828
1829 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
1830 amdgpu_discovery_get_gfx_info(adev);
1831
1832 amdgpu_amdkfd_device_probe(adev);
1833
1834 if (amdgpu_sriov_vf(adev)) {
1835 r = amdgpu_virt_request_full_gpu(adev, true);
1836 if (r)
1837 return -EAGAIN;
1838 }
1839
1840 adev->pm.pp_feature = amdgpu_pp_feature_mask;
1841 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1842 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1843
1844 for (i = 0; i < adev->num_ip_blocks; i++) {
1845 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1846 DRM_ERROR("disabled ip block: %d <%s>\n",
1847 i, adev->ip_blocks[i].version->funcs->name);
1848 adev->ip_blocks[i].status.valid = false;
1849 } else {
1850 if (adev->ip_blocks[i].version->funcs->early_init) {
1851 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1852 if (r == -ENOENT) {
1853 adev->ip_blocks[i].status.valid = false;
1854 } else if (r) {
1855 DRM_ERROR("early_init of IP block <%s> failed %d\n",
1856 adev->ip_blocks[i].version->funcs->name, r);
1857 return r;
1858 } else {
1859 adev->ip_blocks[i].status.valid = true;
1860 }
1861 } else {
1862 adev->ip_blocks[i].status.valid = true;
1863 }
1864 }
1865 /* get the vbios after the asic_funcs are set up */
1866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1867 /* Read BIOS */
1868 if (!amdgpu_get_bios(adev))
1869 return -EINVAL;
1870
1871 r = amdgpu_atombios_init(adev);
1872 if (r) {
1873 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1874 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1875 return r;
1876 }
1877 }
1878 }
1879
1880 adev->cg_flags &= amdgpu_cg_mask;
1881 adev->pg_flags &= amdgpu_pg_mask;
1882
1883 return 0;
1884 }
1885
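/**
 * amdgpu_device_ip_hw_init_phase1 - hardware init for early IP blocks
 *
 * @adev: amdgpu_device pointer
 *
 * Runs hw_init for the IP blocks that must come up before firmware loading:
 * COMMON, IH and, under SR-IOV, PSP.  The remaining blocks are brought up in
 * phase 2 after the firmware has been loaded.
 * Returns 0 on success, negative error code on failure.
 */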
1886 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1887 {
1888 int i, r;
1889
1890 for (i = 0; i < adev->num_ip_blocks; i++) {
1891 if (!adev->ip_blocks[i].status.sw)
1892 continue;
1893 if (adev->ip_blocks[i].status.hw)
1894 continue;
1895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1896 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1898 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1899 if (r) {
1900 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1901 adev->ip_blocks[i].version->funcs->name, r);
1902 return r;
1903 }
1904 adev->ip_blocks[i].status.hw = true;
1905 }
1906 }
1907
1908 return 0;
1909 }
1910
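/**
 * amdgpu_device_ip_hw_init_phase2 - hardware init for the remaining IP blocks
 *
 * @adev: amdgpu_device pointer
 *
 * Runs hw_init for every IP block whose software state is initialized but
 * whose hardware was not already brought up by phase 1 or firmware loading.
 * Returns 0 on success, negative error code on failure.
 */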
1911 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1912 {
1913 int i, r;
1914
1915 for (i = 0; i < adev->num_ip_blocks; i++) {
1916 if (!adev->ip_blocks[i].status.sw)
1917 continue;
1918 if (adev->ip_blocks[i].status.hw)
1919 continue;
1920 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1921 if (r) {
1922 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1923 adev->ip_blocks[i].version->funcs->name, r);
1924 return r;
1925 }
1926 adev->ip_blocks[i].status.hw = true;
1927 }
1928
1929 return 0;
1930 }
1931
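/**
 * amdgpu_device_fw_loading - bring up PSP and load SMU firmware
 *
 * @adev: amdgpu_device pointer
 *
 * On VEGA10 and newer, initializes (or resumes, on the reset/suspend paths)
 * the PSP block so firmware can be loaded, then loads the SMU firmware on
 * bare metal (and on Tonga under SR-IOV).
 * Returns 0 on success, negative error code on failure.
 */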
1932 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1933 {
1934 int r = 0;
1935 int i;
1936 uint32_t smu_version;
1937
1938 if (adev->asic_type >= CHIP_VEGA10) {
1939 for (i = 0; i < adev->num_ip_blocks; i++) {
1940 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1941 continue;
1942
1943 /* no need to do the fw loading again if already done*/
1944 if (adev->ip_blocks[i].status.hw == true)
1945 break;
1946
1947 if (adev->in_gpu_reset || adev->in_suspend) {
1948 r = adev->ip_blocks[i].version->funcs->resume(adev);
1949 if (r) {
1950 DRM_ERROR("resume of IP block <%s> failed %d\n",
1951 adev->ip_blocks[i].version->funcs->name, r);
1952 return r;
1953 }
1954 } else {
1955 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1956 if (r) {
1957 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1958 adev->ip_blocks[i].version->funcs->name, r);
1959 return r;
1960 }
1961 }
1962
1963 adev->ip_blocks[i].status.hw = true;
1964 break;
1965 }
1966 }
1967
1968 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1969 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1970
1971 return r;
1972 }
1973
1974 /**
1975 * amdgpu_device_ip_init - run init for hardware IPs
1976 *
1977 * @adev: amdgpu_device pointer
1978 *
1979 * Main initialization pass for hardware IPs. The list of all the hardware
1980 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1981 * are run. sw_init initializes the software state associated with each IP
1982 * and hw_init initializes the hardware associated with each IP.
1983 * Returns 0 on success, negative error code on failure.
1984 */
1985 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1986 {
1987 int i, r;
1988
1989 r = amdgpu_ras_init(adev);
1990 if (r)
1991 return r;
1992
1993 for (i = 0; i < adev->num_ip_blocks; i++) {
1994 if (!adev->ip_blocks[i].status.valid)
1995 continue;
1996 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1997 if (r) {
1998 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1999 adev->ip_blocks[i].version->funcs->name, r);
2000 goto init_failed;
2001 }
2002 adev->ip_blocks[i].status.sw = true;
2003
2004 /* need to do gmc hw init early so we can allocate gpu mem */
2005 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2006 r = amdgpu_device_vram_scratch_init(adev);
2007 if (r) {
2008 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2009 goto init_failed;
2010 }
2011 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2012 if (r) {
2013 DRM_ERROR("hw_init %d failed %d\n", i, r);
2014 goto init_failed;
2015 }
2016 r = amdgpu_device_wb_init(adev);
2017 if (r) {
2018 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2019 goto init_failed;
2020 }
2021 adev->ip_blocks[i].status.hw = true;
2022
2023 /* right after GMC hw init, we create CSA */
2024 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2025 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2026 AMDGPU_GEM_DOMAIN_VRAM,
2027 AMDGPU_CSA_SIZE);
2028 if (r) {
2029 DRM_ERROR("allocate CSA failed %d\n", r);
2030 goto init_failed;
2031 }
2032 }
2033 }
2034 }
2035
2036 if (amdgpu_sriov_vf(adev))
2037 amdgpu_virt_init_data_exchange(adev);
2038
2039 r = amdgpu_ib_pool_init(adev);
2040 if (r) {
2041 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2042 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2043 goto init_failed;
2044 }
2045
2046 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2047 if (r)
2048 goto init_failed;
2049
2050 r = amdgpu_device_ip_hw_init_phase1(adev);
2051 if (r)
2052 goto init_failed;
2053
2054 r = amdgpu_device_fw_loading(adev);
2055 if (r)
2056 goto init_failed;
2057
2058 r = amdgpu_device_ip_hw_init_phase2(adev);
2059 if (r)
2060 goto init_failed;
2061
2062 /*
2063 * retired pages will be loaded from eeprom and reserved here;
2064 * this should be called after amdgpu_device_ip_hw_init_phase2 since,
2065 * for some ASICs, the RAS EEPROM code relies on the SMU being fully
2066 * functional for I2C communication, which is only true at this point.
2067 * recovery_init may fail, but it can free all resources allocated by
2068 * itself and its failure should not stop the amdgpu init process.
2069 *
2070 * Note: theoretically, this should be called before all vram allocations
2071 * to protect retired pages from being reused.
2072 */
2073 amdgpu_ras_recovery_init(adev);
2074
2075 if (adev->gmc.xgmi.num_physical_nodes > 1)
2076 amdgpu_xgmi_add_device(adev);
2077 amdgpu_amdkfd_device_init(adev);
2078
2079 init_failed:
2080 if (amdgpu_sriov_vf(adev))
2081 amdgpu_virt_release_full_gpu(adev, true);
2082
2083 return r;
2084 }
2085
2086 /**
2087 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2088 *
2089 * @adev: amdgpu_device pointer
2090 *
2091 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2092 * this function before a GPU reset. If the value is retained after a
2093 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2094 */
2095 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2096 {
2097 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2098 }
2099
2100 /**
2101 * amdgpu_device_check_vram_lost - check if vram is valid
2102 *
2103 * @adev: amdgpu_device pointer
2104 *
2105 * Checks the reset magic value written to the gart pointer in VRAM.
2106 * The driver calls this after a GPU reset to see if the contents of
2107 * VRAM have been lost or not.
2108 * Returns true if VRAM is lost, false if not.
2109 */
2110 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2111 {
2112 return !!memcmp(adev->gart.ptr, adev->reset_magic,
2113 AMDGPU_RESET_MAGIC_NUM);
2114 }
2115
2116 /**
2117 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2118 *
2119 * @adev: amdgpu_device pointer
2120 * @state: clockgating state (gate or ungate)
2121 *
2122 * The list of all the hardware IPs that make up the asic is walked and the
2123 * set_clockgating_state callbacks are run.
2124 * During the late-init pass this enables clockgating for hardware IPs;
2125 * during fini or suspend it disables clockgating.
2126 * Returns 0 on success, negative error code on failure.
2127 */
2128
2129 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2130 enum amd_clockgating_state state)
2131 {
2132 int i, j, r;
2133
2134 if (amdgpu_emu_mode == 1)
2135 return 0;
2136
2137 for (j = 0; j < adev->num_ip_blocks; j++) {
2138 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2139 if (!adev->ip_blocks[i].status.late_initialized)
2140 continue;
2141 /* skip CG for UVD/VCE/VCN/JPEG, they're handled specially */
2142 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2145 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2146 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2147 /* enable clockgating to save power */
2148 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2149 state);
2150 if (r) {
2151 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2152 adev->ip_blocks[i].version->funcs->name, r);
2153 return r;
2154 }
2155 }
2156 }
2157
2158 return 0;
2159 }
2160
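/**
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: powergating state (gate or ungate)
 *
 * Walks the IP blocks (forward when gating, in reverse when ungating) and
 * runs each block's set_powergating_state callback, skipping UVD, VCE, VCN
 * and JPEG which are handled separately.
 * Returns 0 on success, negative error code on failure.
 */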
2161 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2162 {
2163 int i, j, r;
2164
2165 if (amdgpu_emu_mode == 1)
2166 return 0;
2167
2168 for (j = 0; j < adev->num_ip_blocks; j++) {
2169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2170 if (!adev->ip_blocks[i].status.late_initialized)
2171 continue;
2172 /* skip PG for UVD/VCE/VCN/JPEG, they're handled specially */
2173 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2174 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2177 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2178 /* enable powergating to save power */
2179 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2180 state);
2181 if (r) {
2182 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2183 adev->ip_blocks[i].version->funcs->name, r);
2184 return r;
2185 }
2186 }
2187 }
2188 return 0;
2189 }
2190
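/**
 * amdgpu_device_enable_mgpu_fan_boost - enable fan boost on multi-dGPU systems
 *
 * Enables the MGPU fan boost feature on every registered discrete GPU whose
 * powerplay backend supports it, but only when two or more dGPUs are present.
 * Returns 0 on success, negative error code on failure.
 */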
2191 static int amdgpu_device_enable_mgpu_fan_boost(void)
2192 {
2193 struct amdgpu_gpu_instance *gpu_ins;
2194 struct amdgpu_device *adev;
2195 int i, ret = 0;
2196
2197 mutex_lock(&mgpu_info.mutex);
2198
2199 /*
2200 * MGPU fan boost feature should be enabled
2201 * only when there are two or more dGPUs in
2202 * the system
2203 */
2204 if (mgpu_info.num_dgpu < 2)
2205 goto out;
2206
2207 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2208 gpu_ins = &(mgpu_info.gpu_ins[i]);
2209 adev = gpu_ins->adev;
2210 if (!(adev->flags & AMD_IS_APU) &&
2211 !gpu_ins->mgpu_fan_enabled &&
2212 adev->powerplay.pp_funcs &&
2213 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2214 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2215 if (ret)
2216 break;
2217
2218 gpu_ins->mgpu_fan_enabled = 1;
2219 }
2220 }
2221
2222 out:
2223 mutex_unlock(&mgpu_info.mutex);
2224
2225 return ret;
2226 }
2227
2228 /**
2229 * amdgpu_device_ip_late_init - run late init for hardware IPs
2230 *
2231 * @adev: amdgpu_device pointer
2232 *
2233 * Late initialization pass for hardware IPs. The list of all the hardware
2234 * IPs that make up the asic is walked and the late_init callbacks are run.
2235 * late_init covers any special initialization that an IP requires
2236 * after all of the IP blocks have been initialized or something that needs to happen
2237 * late in the init process.
2238 * Returns 0 on success, negative error code on failure.
2239 */
2240 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2241 {
2242 struct amdgpu_gpu_instance *gpu_instance;
2243 int i = 0, r;
2244
2245 for (i = 0; i < adev->num_ip_blocks; i++) {
2246 if (!adev->ip_blocks[i].status.hw)
2247 continue;
2248 if (adev->ip_blocks[i].version->funcs->late_init) {
2249 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2250 if (r) {
2251 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2252 adev->ip_blocks[i].version->funcs->name, r);
2253 return r;
2254 }
2255 }
2256 adev->ip_blocks[i].status.late_initialized = true;
2257 }
2258
2259 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2260 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2261
2262 amdgpu_device_fill_reset_magic(adev);
2263
2264 r = amdgpu_device_enable_mgpu_fan_boost();
2265 if (r)
2266 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2267
2268
2269 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2270 mutex_lock(&mgpu_info.mutex);
2271
2272 /*
2273 * Reset device p-state to low as this was booted with high.
2274 *
2275 * This should be performed only after all devices from the same
2276 * hive get initialized.
2277 *
2278 * However, the number of devices in a hive is not known in advance;
2279 * it is counted one by one as each device initializes.
2280 *
2281 * So we wait until all XGMI-interlinked devices have initialized.
2282 * This may bring some delays as those devices may come from
2283 * different hives. But that should be OK.
2284 */
2285 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2286 for (i = 0; i < mgpu_info.num_gpu; i++) {
2287 gpu_instance = &(mgpu_info.gpu_ins[i]);
2288 if (gpu_instance->adev->flags & AMD_IS_APU)
2289 continue;
2290
2291 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
2292 if (r) {
2293 DRM_ERROR("pstate setting failed (%d).\n", r);
2294 break;
2295 }
2296 }
2297 }
2298
2299 mutex_unlock(&mgpu_info.mutex);
2300 }
2301
2302 return 0;
2303 }
2304
2305 /**
2306 * amdgpu_device_ip_fini - run fini for hardware IPs
2307 *
2308 * @adev: amdgpu_device pointer
2309 *
2310 * Main teardown pass for hardware IPs. The list of all the hardware
2311 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2312 * are run. hw_fini tears down the hardware associated with each IP
2313 * and sw_fini tears down any software state associated with each IP.
2314 * Returns 0 on success, negative error code on failure.
2315 */
2316 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2317 {
2318 int i, r;
2319
2320 amdgpu_ras_pre_fini(adev);
2321
2322 if (adev->gmc.xgmi.num_physical_nodes > 1)
2323 amdgpu_xgmi_remove_device(adev);
2324
2325 amdgpu_amdkfd_device_fini(adev);
2326
2327 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2328 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2329
2330 /* need to disable SMC first */
2331 for (i = 0; i < adev->num_ip_blocks; i++) {
2332 if (!adev->ip_blocks[i].status.hw)
2333 continue;
2334 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2335 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2336 /* XXX handle errors */
2337 if (r) {
2338 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2339 adev->ip_blocks[i].version->funcs->name, r);
2340 }
2341 adev->ip_blocks[i].status.hw = false;
2342 break;
2343 }
2344 }
2345
2346 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2347 if (!adev->ip_blocks[i].status.hw)
2348 continue;
2349
2350 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2351 /* XXX handle errors */
2352 if (r) {
2353 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2354 adev->ip_blocks[i].version->funcs->name, r);
2355 }
2356
2357 adev->ip_blocks[i].status.hw = false;
2358 }
2359
2360
2361 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2362 if (!adev->ip_blocks[i].status.sw)
2363 continue;
2364
2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2366 amdgpu_ucode_free_bo(adev);
2367 amdgpu_free_static_csa(&adev->virt.csa_obj);
2368 amdgpu_device_wb_fini(adev);
2369 amdgpu_device_vram_scratch_fini(adev);
2370 amdgpu_ib_pool_fini(adev);
2371 }
2372
2373 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2374 /* XXX handle errors */
2375 if (r) {
2376 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2377 adev->ip_blocks[i].version->funcs->name, r);
2378 }
2379 adev->ip_blocks[i].status.sw = false;
2380 adev->ip_blocks[i].status.valid = false;
2381 }
2382
2383 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2384 if (!adev->ip_blocks[i].status.late_initialized)
2385 continue;
2386 if (adev->ip_blocks[i].version->funcs->late_fini)
2387 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2388 adev->ip_blocks[i].status.late_initialized = false;
2389 }
2390
2391 amdgpu_ras_fini(adev);
2392
2393 if (amdgpu_sriov_vf(adev))
2394 if (amdgpu_virt_release_full_gpu(adev, false))
2395 DRM_ERROR("failed to release exclusive mode on fini\n");
2396
2397 return 0;
2398 }
2399
2400 /**
2401 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2402 *
2403 * @work: work_struct.
2404 */
2405 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2406 {
2407 struct amdgpu_device *adev =
2408 container_of(work, struct amdgpu_device, delayed_init_work.work);
2409 int r;
2410
2411 r = amdgpu_ib_ring_tests(adev);
2412 if (r)
2413 DRM_ERROR("ib ring test failed (%d).\n", r);
2414 }
2415
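/*
 * Delayed work that actually enables GFXOFF once nothing holds a gfx_off
 * disable request any more (gfx_off_req_count == 0) and GFXOFF is not
 * already engaged.
 */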
2416 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2417 {
2418 struct amdgpu_device *adev =
2419 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2420
2421 mutex_lock(&adev->gfx.gfx_off_mutex);
2422 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2423 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2424 adev->gfx.gfx_off_state = true;
2425 }
2426 mutex_unlock(&adev->gfx.gfx_off_mutex);
2427 }
2428
2429 /**
2430 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2431 *
2432 * @adev: amdgpu_device pointer
2433 *
2434 * Main suspend function for hardware IPs. The list of all the hardware
2435 * IPs that make up the asic is walked, clockgating is disabled and the
2436 * suspend callbacks are run. suspend puts the hardware and software state
2437 * in each IP into a state suitable for suspend.
2438 * Returns 0 on success, negative error code on failure.
2439 */
2440 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2441 {
2442 int i, r;
2443
2444 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2445 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2446
2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2448 if (!adev->ip_blocks[i].status.valid)
2449 continue;
2450 /* displays are handled separately */
2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
2452 /* XXX handle errors */
2453 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2454 /* XXX handle errors */
2455 if (r) {
2456 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2457 adev->ip_blocks[i].version->funcs->name, r);
2458 return r;
2459 }
2460 adev->ip_blocks[i].status.hw = false;
2461 }
2462 }
2463
2464 return 0;
2465 }
2466
2467 /**
2468 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2469 *
2470 * @adev: amdgpu_device pointer
2471 *
2472 * Main suspend function for hardware IPs. The list of all the hardware
2473 * IPs that make up the asic is walked, clockgating is disabled and the
2474 * suspend callbacks are run. suspend puts the hardware and software state
2475 * in each IP into a state suitable for suspend.
2476 * Returns 0 on success, negative error code on failure.
2477 */
2478 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2479 {
2480 int i, r __unused;
2481
2482 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2483 if (!adev->ip_blocks[i].status.valid)
2484 continue;
2485 /* displays are handled in phase1 */
2486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2487 continue;
2488 /* PSP lost connection when err_event_athub occurs */
2489 if (amdgpu_ras_intr_triggered() &&
2490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2491 adev->ip_blocks[i].status.hw = false;
2492 continue;
2493 }
2494 /* XXX handle errors */
2495 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2496 /* XXX handle errors */
2497 if (r) {
2498 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2499 adev->ip_blocks[i].version->funcs->name, r);
2500 }
2501 adev->ip_blocks[i].status.hw = false;
2502 /* handle putting the SMC in the appropriate state */
2503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2504 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2505 if (r) {
2506 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2507 adev->mp1_state, r);
2508 return r;
2509 }
2510 }
2511
2512 adev->ip_blocks[i].status.hw = false;
2513 }
2514
2515 return 0;
2516 }
2517
2518 /**
2519 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2520 *
2521 * @adev: amdgpu_device pointer
2522 *
2523 * Main suspend function for hardware IPs. The list of all the hardware
2524 * IPs that make up the asic is walked, clockgating is disabled and the
2525 * suspend callbacks are run. suspend puts the hardware and software state
2526 * in each IP into a state suitable for suspend.
2527 * Returns 0 on success, negative error code on failure.
2528 */
2529 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2530 {
2531 int r;
2532
2533 if (amdgpu_sriov_vf(adev))
2534 amdgpu_virt_request_full_gpu(adev, false);
2535
2536 r = amdgpu_device_ip_suspend_phase1(adev);
2537 if (r)
2538 return r;
2539 r = amdgpu_device_ip_suspend_phase2(adev);
2540
2541 if (amdgpu_sriov_vf(adev))
2542 amdgpu_virt_release_full_gpu(adev, false);
2543
2544 return r;
2545 }
2546
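/*
 * Re-initialize the basic IP blocks (GMC, COMMON, PSP, IH) in a fixed order
 * after an SR-IOV reset, before the remaining blocks are brought back up by
 * amdgpu_device_ip_reinit_late_sriov().
 */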
2547 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2548 {
2549 int i, r;
2550
2551 static enum amd_ip_block_type ip_order[] = {
2552 AMD_IP_BLOCK_TYPE_GMC,
2553 AMD_IP_BLOCK_TYPE_COMMON,
2554 AMD_IP_BLOCK_TYPE_PSP,
2555 AMD_IP_BLOCK_TYPE_IH,
2556 };
2557
2558 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2559 int j;
2560 struct amdgpu_ip_block *block;
2561
2562 for (j = 0; j < adev->num_ip_blocks; j++) {
2563 block = &adev->ip_blocks[j];
2564
2565 block->status.hw = false;
2566 if (block->version->type != ip_order[i] ||
2567 !block->status.valid)
2568 continue;
2569
2570 r = block->version->funcs->hw_init(adev);
2571 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2572 if (r)
2573 return r;
2574 block->status.hw = true;
2575 }
2576 }
2577
2578 return 0;
2579 }
2580
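/*
 * Re-initialize the remaining IP blocks (SMC, DCE, GFX, SDMA and the
 * multimedia blocks) after an SR-IOV reset.  SMC is resumed rather than
 * re-initialized from scratch.
 */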
2581 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2582 {
2583 int i, r;
2584
2585 static enum amd_ip_block_type ip_order[] = {
2586 AMD_IP_BLOCK_TYPE_SMC,
2587 AMD_IP_BLOCK_TYPE_DCE,
2588 AMD_IP_BLOCK_TYPE_GFX,
2589 AMD_IP_BLOCK_TYPE_SDMA,
2590 AMD_IP_BLOCK_TYPE_UVD,
2591 AMD_IP_BLOCK_TYPE_VCE,
2592 AMD_IP_BLOCK_TYPE_VCN
2593 };
2594
2595 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2596 int j;
2597 struct amdgpu_ip_block *block;
2598
2599 for (j = 0; j < adev->num_ip_blocks; j++) {
2600 block = &adev->ip_blocks[j];
2601
2602 if (block->version->type != ip_order[i] ||
2603 !block->status.valid ||
2604 block->status.hw)
2605 continue;
2606
2607 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2608 r = block->version->funcs->resume(adev);
2609 else
2610 r = block->version->funcs->hw_init(adev);
2611
2612 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2613 if (r)
2614 return r;
2615 block->status.hw = true;
2616 }
2617 }
2618
2619 return 0;
2620 }
2621
2622 /**
2623 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2624 *
2625 * @adev: amdgpu_device pointer
2626 *
2627 * First resume function for hardware IPs. The list of all the hardware
2628 * IPs that make up the asic is walked and the resume callbacks are run for
2629 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2630 * after a suspend and updates the software state as necessary. This
2631 * function is also used for restoring the GPU after a GPU reset.
2632 * Returns 0 on success, negative error code on failure.
2633 */
2634 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2635 {
2636 int i, r;
2637
2638 for (i = 0; i < adev->num_ip_blocks; i++) {
2639 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2640 continue;
2641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2642 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2643 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2644
2645 r = adev->ip_blocks[i].version->funcs->resume(adev);
2646 if (r) {
2647 DRM_ERROR("resume of IP block <%s> failed %d\n",
2648 adev->ip_blocks[i].version->funcs->name, r);
2649 return r;
2650 }
2651 adev->ip_blocks[i].status.hw = true;
2652 }
2653 }
2654
2655 return 0;
2656 }
2657
2658 /**
2659 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2660 *
2661 * @adev: amdgpu_device pointer
2662 *
2663 * Second resume function for hardware IPs. The list of all the hardware
2664 * IPs that make up the asic is walked and the resume callbacks are run for
2665 * all blocks except COMMON, GMC, IH and PSP. resume puts the hardware into a
2666 * functional state after a suspend and updates the software state as
2667 * necessary. This function is also used for restoring the GPU after a GPU
2668 * reset.
2669 * Returns 0 on success, negative error code on failure.
2670 */
2671 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2672 {
2673 int i, r;
2674
2675 for (i = 0; i < adev->num_ip_blocks; i++) {
2676 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2677 continue;
2678 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2679 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2680 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2681 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2682 continue;
2683 r = adev->ip_blocks[i].version->funcs->resume(adev);
2684 if (r) {
2685 DRM_ERROR("resume of IP block <%s> failed %d\n",
2686 adev->ip_blocks[i].version->funcs->name, r);
2687 return r;
2688 }
2689 adev->ip_blocks[i].status.hw = true;
2690 }
2691
2692 return 0;
2693 }
2694
2695 /**
2696 * amdgpu_device_ip_resume - run resume for hardware IPs
2697 *
2698 * @adev: amdgpu_device pointer
2699 *
2700 * Main resume function for hardware IPs. The hardware IPs
2701 * are split into two resume functions because they are
2702 * also used in recovering from a GPU reset and some additional
2703 * steps need to be taken between them. In this case (S3/S4) they are
2704 * run sequentially.
2705 * Returns 0 on success, negative error code on failure.
2706 */
2707 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2708 {
2709 int r;
2710
2711 r = amdgpu_device_ip_resume_phase1(adev);
2712 if (r)
2713 return r;
2714
2715 r = amdgpu_device_fw_loading(adev);
2716 if (r)
2717 return r;
2718
2719 r = amdgpu_device_ip_resume_phase2(adev);
2720
2721 return r;
2722 }
2723
2724 /**
2725 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2726 *
2727 * @adev: amdgpu_device pointer
2728 *
2729 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2730 */
2731 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2732 {
2733 if (amdgpu_sriov_vf(adev)) {
2734 if (adev->is_atom_fw) {
2735 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2736 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2737 } else {
2738 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2739 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2740 }
2741
2742 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2744 }
2745 }
2746
2747 /**
2748 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2749 *
2750 * @asic_type: AMD asic type
2751 *
2752 * Check if there is DC (new modesetting infrastructure) support for an asic.
2753 * Returns true if DC has support, false if not.
2754 */
2755 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2756 {
2757 switch (asic_type) {
2758 #if defined(CONFIG_DRM_AMD_DC)
2759 case CHIP_BONAIRE:
2760 case CHIP_KAVERI:
2761 case CHIP_KABINI:
2762 case CHIP_MULLINS:
2763 /*
2764 * We have systems in the wild with these ASICs that require
2765 * LVDS and VGA support which is not supported with DC.
2766 *
2767 * Fallback to the non-DC driver here by default so as not to
2768 * cause regressions.
2769 */
2770 return amdgpu_dc > 0;
2771 case CHIP_HAWAII:
2772 case CHIP_CARRIZO:
2773 case CHIP_STONEY:
2774 case CHIP_POLARIS10:
2775 case CHIP_POLARIS11:
2776 case CHIP_POLARIS12:
2777 case CHIP_VEGAM:
2778 case CHIP_TONGA:
2779 case CHIP_FIJI:
2780 case CHIP_VEGA10:
2781 case CHIP_VEGA12:
2782 case CHIP_VEGA20:
2783 #if defined(CONFIG_DRM_AMD_DC_DCN)
2784 case CHIP_RAVEN:
2785 case CHIP_NAVI10:
2786 case CHIP_NAVI14:
2787 case CHIP_NAVI12:
2788 case CHIP_RENOIR:
2789 #endif
2790 return amdgpu_dc != 0;
2791 #endif
2792 default:
2793 if (amdgpu_dc > 0)
2794 DRM_INFO("Display Core has been requested via kernel parameter "
2795 "but isn't supported by ASIC, ignoring\n");
2796 return false;
2797 }
2798 }
2799
2800 /**
2801 * amdgpu_device_has_dc_support - check if dc is supported
2802 *
2803 * @adev: amdgpu_device pointer
2804 *
2805 * Returns true for supported, false for not supported
2806 */
2807 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2808 {
2809 if (amdgpu_sriov_vf(adev))
2810 return false;
2811
2812 return amdgpu_device_asic_has_dc_support(adev->asic_type);
2813 }
2814
2815
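/*
 * Per-device XGMI reset work.  Every device in a hive schedules this work and
 * synchronizes on the hive's task barrier so that BACO entry/exit (or a full
 * ASIC reset) happens in lock step across the whole hive.
 */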
2816 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2817 {
2818 struct amdgpu_device *adev =
2819 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2820 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2821
2822 /* It's a bug to not have a hive within this function */
2823 if (WARN_ON(!hive))
2824 return;
2825
2826 /*
2827 * Use task barrier to synchronize all xgmi reset works across the
2828 * hive. task_barrier_enter and task_barrier_exit will block
2829 * until all the threads running the xgmi reset works reach
2830 * those points. task_barrier_full will do both blocks.
2831 */
2832 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2833
2834 task_barrier_enter(&hive->tb);
2835 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2836
2837 if (adev->asic_reset_res)
2838 goto fail;
2839
2840 task_barrier_exit(&hive->tb);
2841 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2842
2843 if (adev->asic_reset_res)
2844 goto fail;
2845 } else {
2846
2847 task_barrier_full(&hive->tb);
2848 adev->asic_reset_res = amdgpu_asic_reset(adev);
2849 }
2850
2851 fail:
2852 if (adev->asic_reset_res)
2853 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2854 adev->asic_reset_res, adev->ddev->unique);
2855 }
2856
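/*
 * Parse the lockup_timeout parameter string (amdgpu_lockup_timeout).  The
 * comma-separated values apply, in order, to gfx, compute, sdma and video
 * jobs; a single value applies to all non-compute jobs.  A value of 0 keeps
 * the default and a negative value disables the timeout, e.g. a setting of
 * "10000,60000,10000,10000" (values in ms).
 */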
2857 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2858 {
2859 char *input = amdgpu_lockup_timeout;
2860 char *timeout_setting = NULL;
2861 int index = 0;
2862 long timeout;
2863 int ret = 0;
2864
2865 /*
2866 * By default the timeout for non-compute jobs is 10000 ms
2867 * and there is no timeout enforced on compute jobs.
2868 * In SR-IOV or passthrough mode, the timeout for compute
2869 * jobs is also 10000 ms by default.
2870 */
2871 adev->gfx_timeout = msecs_to_jiffies(10000);
2872 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2873 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2874 adev->compute_timeout = adev->gfx_timeout;
2875 else
2876 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2877
2878 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2879 while ((timeout_setting = strsep(&input, ",")) &&
2880 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2881 ret = kstrtol(timeout_setting, 0, &timeout);
2882 if (ret)
2883 return ret;
2884
2885 if (timeout == 0) {
2886 index++;
2887 continue;
2888 } else if (timeout < 0) {
2889 timeout = MAX_SCHEDULE_TIMEOUT;
2890 } else {
2891 timeout = msecs_to_jiffies(timeout);
2892 }
2893
2894 switch (index++) {
2895 case 0:
2896 adev->gfx_timeout = timeout;
2897 break;
2898 case 1:
2899 adev->compute_timeout = timeout;
2900 break;
2901 case 2:
2902 adev->sdma_timeout = timeout;
2903 break;
2904 case 3:
2905 adev->video_timeout = timeout;
2906 break;
2907 default:
2908 break;
2909 }
2910 }
2911 /*
2912 * There is only one value specified and
2913 * it should apply to all non-compute jobs.
2914 */
2915 if (index == 1) {
2916 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2917 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2918 adev->compute_timeout = adev->gfx_timeout;
2919 }
2920 }
2921
2922 return ret;
2923 }
2924
2925 /**
2926 * amdgpu_device_init - initialize the driver
2927 *
2928 * @adev: amdgpu_device pointer
2929 * @ddev: drm dev pointer
2930 * @pdev: pci dev pointer
2931 * @flags: driver flags
2932 *
2933 * Initializes the driver info and hw (all asics).
2934 * Returns 0 for success or an error on failure.
2935 * Called at driver startup.
2936 */
2937 int amdgpu_device_init(struct amdgpu_device *adev,
2938 struct drm_device *ddev,
2939 struct pci_dev *pdev,
2940 uint32_t flags)
2941 {
2942 int r, i;
2943 bool boco = false;
2944 u32 max_MBps;
2945
2946 adev->shutdown = false;
2947 adev->dev = pci_dev_dev(pdev);
2948 adev->ddev = ddev;
2949 adev->pdev = pdev;
2950 adev->flags = flags;
2951
2952 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2953 adev->asic_type = amdgpu_force_asic_type;
2954 else
2955 adev->asic_type = flags & AMD_ASIC_MASK;
2956
2957 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2958 if (amdgpu_emu_mode == 1)
2959 adev->usec_timeout *= 2;
2960 adev->gmc.gart_size = 512 * 1024 * 1024;
2961 adev->accel_working = false;
2962 adev->num_rings = 0;
2963 adev->mman.buffer_funcs = NULL;
2964 adev->mman.buffer_funcs_ring = NULL;
2965 adev->vm_manager.vm_pte_funcs = NULL;
2966 adev->vm_manager.vm_pte_num_scheds = 0;
2967 adev->gmc.gmc_funcs = NULL;
2968 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2969 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
2970
2971 adev->smc_rreg = &amdgpu_invalid_rreg;
2972 adev->smc_wreg = &amdgpu_invalid_wreg;
2973 adev->pcie_rreg = &amdgpu_invalid_rreg;
2974 adev->pcie_wreg = &amdgpu_invalid_wreg;
2975 adev->pciep_rreg = &amdgpu_invalid_rreg;
2976 adev->pciep_wreg = &amdgpu_invalid_wreg;
2977 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
2978 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
2979 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
2980 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
2981 adev->didt_rreg = &amdgpu_invalid_rreg;
2982 adev->didt_wreg = &amdgpu_invalid_wreg;
2983 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
2984 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
2985 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
2986 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
2987
2988 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
2989 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
2990 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
2991
2992 /* mutex initializations are all done here so we
2993 * can call these functions again without locking issues */
2994 atomic_set(&adev->irq.ih.lock, 0);
2995 mutex_init(&adev->firmware.mutex);
2996 mutex_init(&adev->pm.mutex);
2997 mutex_init(&adev->gfx.gpu_clock_mutex);
2998 mutex_init(&adev->srbm_mutex);
2999 mutex_init(&adev->gfx.pipe_reserve_mutex);
3000 mutex_init(&adev->gfx.gfx_off_mutex);
3001 mutex_init(&adev->grbm_idx_mutex);
3002 mutex_init(&adev->mn_lock);
3003 mutex_init(&adev->virt.vf_errors.lock);
3004 hash_init(adev->mn_hash);
3005 mutex_init(&adev->lock_reset);
3006 mutex_init(&adev->psp.mutex);
3007 mutex_init(&adev->notifier_lock);
3008
3009 spin_lock_init(&adev->mmio_idx_lock);
3010 spin_lock_init(&adev->smc_idx_lock);
3011 spin_lock_init(&adev->pcie_idx_lock);
3012 spin_lock_init(&adev->uvd_ctx_idx_lock);
3013 spin_lock_init(&adev->didt_idx_lock);
3014 spin_lock_init(&adev->gc_cac_idx_lock);
3015 spin_lock_init(&adev->se_cac_idx_lock);
3016 spin_lock_init(&adev->audio_endpt_idx_lock);
3017 spin_lock_init(&adev->mm_stats.lock);
3018
3019 INIT_LIST_HEAD(&adev->shadow_list);
3020 mutex_init(&adev->shadow_list_lock);
3021
3022 INIT_LIST_HEAD(&adev->ring_lru_list);
3023 spin_lock_init(&adev->ring_lru_list_lock);
3024
3025 INIT_DELAYED_WORK(&adev->delayed_init_work,
3026 amdgpu_device_delayed_init_work_handler);
3027 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3028 amdgpu_device_delay_enable_gfx_off);
3029
3030 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3031
3032 r = amdgpu_device_check_arguments(adev);
3033 if (r)
3034 return r;
3035
3036 adev->gfx.gfx_off_req_count = 1;
3037 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
3038
3039 /* Registers mapping */
3040 /* TODO: block userspace mapping of io register */
3041 if (adev->asic_type >= CHIP_BONAIRE) {
3042 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3043 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3044 } else {
3045 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3046 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3047 }
3048
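/*
 * NetBSD: map the register BAR with bus_space(9) via pci_mapreg_map()
 * instead of ioremap(), recording the bus space tag/handle along with
 * the base address and size of the mapping.
 */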
3049 #ifdef __NetBSD__
3050 const int bar = (adev->asic_type >= CHIP_BONAIRE ? 5 : 2);
3051 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(bar),
3052 pci_mapreg_type(adev->pdev->pd_pa.pa_pc,
3053 adev->pdev->pd_pa.pa_tag, PCI_BAR(bar)),
3054 0,
3055 &adev->rmmiot, &adev->rmmioh,
3056 &adev->rmmio_base, &adev->rmmio_size))
3057 return -EIO;
3058 DRM_INFO("register mmio base: 0x%8"PRIXMAX"\n",
3059 (uintmax_t)adev->rmmio_base);
3060 DRM_INFO("register mmio size: %"PRIuMAX"\n",
3061 (uintmax_t)adev->rmmio_size);
3062 #else
3063 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3064 if (adev->rmmio == NULL) {
3065 return -ENOMEM;
3066 }
3067 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3068 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3069 #endif
3070
3071 /* io port mapping */
3072 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3073 #ifdef __NetBSD__
3074 if (pci_mapreg_map(&adev->pdev->pd_pa, PCI_BAR(i),
3075 PCI_MAPREG_TYPE_IO, 0,
3076 &adev->rio_memt, &adev->rio_memh,
3077 NULL, &adev->rio_mem_size) == 0)
3078 break;
3079 #else
3080 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3081 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3082 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3083 break;
3084 }
3085 #endif
3086 }
3087 #ifdef __NetBSD__
3088 if (i == DEVICE_COUNT_RESOURCE)
3089 #else
3090 if (adev->rio_mem == NULL)
3091 #endif
3092 DRM_INFO("PCI I/O BAR is not found.\n");
3093
3094 /* enable PCIE atomic ops */
3095 r = pci_enable_atomic_ops_to_root(adev->pdev,
3096 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3097 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3098 if (r) {
3099 adev->have_atomics_support = false;
3100 DRM_INFO("PCIE atomic ops is not supported\n");
3101 } else {
3102 adev->have_atomics_support = true;
3103 }
3104
3105 amdgpu_device_get_pcie_info(adev);
3106
3107 if (amdgpu_mcbp)
3108 DRM_INFO("MCBP is enabled\n");
3109
3110 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3111 adev->enable_mes = true;
3112
3113 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
3114 r = amdgpu_discovery_init(adev);
3115 if (r) {
3116 dev_err(adev->dev, "amdgpu_discovery_init failed\n");
3117 return r;
3118 }
3119 }
3120
3121 /* early init functions */
3122 r = amdgpu_device_ip_early_init(adev);
3123 if (r)
3124 return r;
3125
3126 r = amdgpu_device_get_job_timeout_settings(adev);
3127 if (r) {
3128 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3129 return r;
3130 }
3131
3132 /* doorbell bar mapping and doorbell index init*/
3133 amdgpu_device_doorbell_init(adev);
3134
3135 #ifndef __NetBSD__ /* XXX amdgpu vga */
3136 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3137 /* this will fail for cards that aren't VGA class devices, just
3138 * ignore it */
3139 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3140
3141 if (amdgpu_device_supports_boco(ddev))
3142 boco = true;
3143 if (amdgpu_has_atpx() &&
3144 (amdgpu_is_atpx_hybrid() ||
3145 amdgpu_has_atpx_dgpu_power_cntl()) &&
3146 !pci_is_thunderbolt_attached(adev->pdev))
3147 vga_switcheroo_register_client(adev->pdev,
3148 &amdgpu_switcheroo_ops, boco);
3149 if (boco)
3150 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3151 #endif
3152
3153 if (amdgpu_emu_mode == 1) {
3154 /* post the asic on emulation mode */
3155 emu_soc_asic_init(adev);
3156 goto fence_driver_init;
3157 }
3158
3159 /* detect if we are with an SRIOV vbios */
3160 amdgpu_device_detect_sriov_bios(adev);
3161
3162 /* check if we need to reset the asic
3163 * E.g., driver was not cleanly unloaded previously, etc.
3164 */
3165 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3166 r = amdgpu_asic_reset(adev);
3167 if (r) {
3168 dev_err(adev->dev, "asic reset on init failed\n");
3169 goto failed;
3170 }
3171 }
3172
3173 /* Post card if necessary */
3174 if (amdgpu_device_need_post(adev)) {
3175 if (!adev->bios) {
3176 dev_err(adev->dev, "no vBIOS found\n");
3177 r = -EINVAL;
3178 goto failed;
3179 }
3180 DRM_INFO("GPU posting now...\n");
3181 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3182 if (r) {
3183 dev_err(adev->dev, "gpu post error!\n");
3184 goto failed;
3185 }
3186 }
3187
3188 if (adev->is_atom_fw) {
3189 /* Initialize clocks */
3190 r = amdgpu_atomfirmware_get_clock_info(adev);
3191 if (r) {
3192 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3193 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3194 goto failed;
3195 }
3196 } else {
3197 /* Initialize clocks */
3198 r = amdgpu_atombios_get_clock_info(adev);
3199 if (r) {
3200 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3201 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3202 goto failed;
3203 }
3204 /* init i2c buses */
3205 if (!amdgpu_device_has_dc_support(adev))
3206 amdgpu_atombios_i2c_init(adev);
3207 }
3208
3209 fence_driver_init:
3210 /* Fence driver */
3211 r = amdgpu_fence_driver_init(adev);
3212 if (r) {
3213 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3214 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3215 goto failed;
3216 }
3217
3218 /* init the mode config */
3219 drm_mode_config_init(adev->ddev);
3220
3221 r = amdgpu_device_ip_init(adev);
3222 if (r) {
3223 /* failed in exclusive mode due to timeout */
3224 if (amdgpu_sriov_vf(adev) &&
3225 !amdgpu_sriov_runtime(adev) &&
3226 amdgpu_virt_mmio_blocked(adev) &&
3227 !amdgpu_virt_wait_reset(adev)) {
3228 dev_err(adev->dev, "VF exclusive mode timeout\n");
3229 /* Don't send request since VF is inactive. */
3230 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3231 adev->virt.ops = NULL;
3232 r = -EAGAIN;
3233 goto failed;
3234 }
3235 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3236 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3237 goto failed;
3238 }
3239
3240 DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3241 adev->gfx.config.max_shader_engines,
3242 adev->gfx.config.max_sh_per_se,
3243 adev->gfx.config.max_cu_per_sh,
3244 adev->gfx.cu_info.number);
3245
3246 amdgpu_ctx_init_sched(adev);
3247
3248 adev->accel_working = true;
3249
3250 amdgpu_vm_check_compute_bug(adev);
3251
3252 /* Initialize the buffer migration limit. */
3253 if (amdgpu_moverate >= 0)
3254 max_MBps = amdgpu_moverate;
3255 else
3256 max_MBps = 8; /* Allow 8 MB/s. */
3257 /* Get a log2 for easy divisions. */
3258 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3259
3260 amdgpu_fbdev_init(adev);
3261
3262 r = amdgpu_pm_sysfs_init(adev);
3263 if (r) {
3264 adev->pm_sysfs_en = false;
3265 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3266 } else
3267 adev->pm_sysfs_en = true;
3268
3269 r = amdgpu_ucode_sysfs_init(adev);
3270 if (r) {
3271 adev->ucode_sysfs_en = false;
3272 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3273 } else
3274 adev->ucode_sysfs_en = true;
3275
3276 r = amdgpu_debugfs_gem_init(adev);
3277 if (r)
3278 DRM_ERROR("registering gem debugfs failed (%d).\n", r);
3279
3280 r = amdgpu_debugfs_regs_init(adev);
3281 if (r)
3282 DRM_ERROR("registering register debugfs failed (%d).\n", r);
3283
3284 r = amdgpu_debugfs_firmware_init(adev);
3285 if (r)
3286 DRM_ERROR("registering firmware debugfs failed (%d).\n", r);
3287
3288 r = amdgpu_debugfs_init(adev);
3289 if (r)
3290 DRM_ERROR("Creating debugfs files failed (%d).\n", r);
3291
3292 if ((amdgpu_testing & 1)) {
3293 if (adev->accel_working)
3294 amdgpu_test_moves(adev);
3295 else
3296 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3297 }
3298 if (amdgpu_benchmarking) {
3299 if (adev->accel_working)
3300 amdgpu_benchmark(adev, amdgpu_benchmarking);
3301 else
3302 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3303 }
3304
3305 /*
3306 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3307 * Otherwise the mgpu fan boost feature will be skipped because the
3308 * gpu instance count would be too low.
3309 */
3310 amdgpu_register_gpu_instance(adev);
3311
3312 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3313 * explicit gating rather than handling it automatically.
3314 */
3315 r = amdgpu_device_ip_late_init(adev);
3316 if (r) {
3317 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3318 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3319 goto failed;
3320 }
3321
3322 /* must succeed. */
3323 amdgpu_ras_resume(adev);
3324
3325 queue_delayed_work(system_wq, &adev->delayed_init_work,
3326 msecs_to_jiffies(AMDGPU_RESUME_MS));
3327
3328 #ifndef __NetBSD__ /* XXX amdgpu sysfs */
3329 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
3330 if (r) {
3331 dev_err(adev->dev, "Could not create pcie_replay_count");
3332 return r;
3333 }
3334 #endif
3335
3336 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3337 r = amdgpu_pmu_init(adev);
3338 if (r)
3339 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3340
3341 return 0;
3342
3343 failed:
3344 amdgpu_vf_error_trans_all(adev);
3345 if (boco)
3346 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3347
3348 return r;
3349 }
3350
3351 /**
3352 * amdgpu_device_fini - tear down the driver
3353 *
3354 * @adev: amdgpu_device pointer
3355 *
3356 * Tear down the driver info (all asics).
3357 * Called at driver shutdown.
3358 */
3359 void amdgpu_device_fini(struct amdgpu_device *adev)
3360 {
3361 int r __unused;
3362
3363 DRM_INFO("amdgpu: finishing device.\n");
3364 flush_delayed_work(&adev->delayed_init_work);
3365 adev->shutdown = true;
3366
3367 /* disable all interrupts */
3368 amdgpu_irq_disable_all(adev);
3369 if (adev->mode_info.mode_config_initialized){
3370 if (!amdgpu_device_has_dc_support(adev))
3371 drm_helper_force_disable_all(adev->ddev);
3372 else
3373 drm_atomic_helper_shutdown(adev->ddev);
3374 }
3375 amdgpu_fence_driver_fini(adev);
3376 if (adev->pm_sysfs_en)
3377 amdgpu_pm_sysfs_fini(adev);
3378 amdgpu_fbdev_fini(adev);
3379 r = amdgpu_device_ip_fini(adev);
3380 if (adev->firmware.gpu_info_fw) {
3381 release_firmware(adev->firmware.gpu_info_fw);
3382 adev->firmware.gpu_info_fw = NULL;
3383 }
3384 adev->accel_working = false;
3385 /* free i2c buses */
3386 if (!amdgpu_device_has_dc_support(adev))
3387 amdgpu_i2c_fini(adev);
3388
3389 if (amdgpu_emu_mode != 1)
3390 amdgpu_atombios_fini(adev);
3391
3392 kfree(adev->bios);
3393 adev->bios = NULL;
3394 #ifndef __NetBSD__ /* XXX amdgpu vga */
3395 if (amdgpu_has_atpx() &&
3396 (amdgpu_is_atpx_hybrid() ||
3397 amdgpu_has_atpx_dgpu_power_cntl()) &&
3398 !pci_is_thunderbolt_attached(adev->pdev))
3399 vga_switcheroo_unregister_client(adev->pdev);
3400 if (amdgpu_device_supports_boco(adev->ddev))
3401 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3402 vga_client_register(adev->pdev, NULL, NULL, NULL);
3403 #endif
3404 #ifdef __NetBSD__
3405 if (adev->rio_mem_size)
3406 bus_space_unmap(adev->rio_memt, adev->rio_memh,
3407 adev->rio_mem_size);
3408 adev->rio_mem_size = 0;
3409 bus_space_unmap(adev->rmmiot, adev->rmmioh, adev->rmmio_size);
3410 #else
3411 if (adev->rio_mem)
3412 pci_iounmap(adev->pdev, adev->rio_mem);
3413 adev->rio_mem = NULL;
3414 iounmap(adev->rmmio);
3415 adev->rmmio = NULL;
3416 #endif
3417 amdgpu_device_doorbell_fini(adev);
3418
3419 amdgpu_debugfs_regs_cleanup(adev);
3420 #ifndef __NetBSD__ /* XXX amdgpu sysfs */
3421 device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
3422 #endif
3423 if (adev->ucode_sysfs_en)
3424 amdgpu_ucode_sysfs_fini(adev);
3425 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3426 amdgpu_pmu_fini(adev);
3427 amdgpu_debugfs_preempt_cleanup(adev);
3428 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
3429 amdgpu_discovery_fini(adev);
3430 spin_lock_destroy(&adev->ring_lru_list_lock);
3431 mutex_destroy(&adev->shadow_list_lock);
3432 spin_lock_destroy(&adev->mm_stats.lock);
3433 spin_lock_destroy(&adev->audio_endpt_idx_lock);
3434 spin_lock_destroy(&adev->se_cac_idx_lock);
3435 spin_lock_destroy(&adev->gc_cac_idx_lock);
3436 spin_lock_destroy(&adev->didt_idx_lock);
3437 spin_lock_destroy(&adev->uvd_ctx_idx_lock);
3438 spin_lock_destroy(&adev->pcie_idx_lock);
3439 spin_lock_destroy(&adev->smc_idx_lock);
3440 spin_lock_destroy(&adev->mmio_idx_lock);
3441 mutex_destroy(&adev->notifier_lock);
3442 mutex_destroy(&adev->psp.mutex);
3443 mutex_destroy(&adev->lock_reset);
3444 /* hash_destroy(adev->mn_hash)? */
3445 mutex_destroy(&adev->virt.vf_errors.lock);
3446 mutex_destroy(&adev->mn_lock);
3447 mutex_destroy(&adev->grbm_idx_mutex);
3448 mutex_destroy(&adev->gfx.gfx_off_mutex);
3449 mutex_destroy(&adev->gfx.pipe_reserve_mutex);
3450 mutex_destroy(&adev->srbm_mutex);
3451 mutex_destroy(&adev->gfx.gpu_clock_mutex);
3452 mutex_destroy(&adev->pm.mutex);
3453 mutex_destroy(&adev->firmware.mutex);
3454 }
3455
3456
3457 /*
3458 * Suspend & resume.
3459 */
3460 /**
3461 * amdgpu_device_suspend - initiate device suspend
3462 *
3463 * @dev: drm dev pointer
3464 * @fbcon: notify the fbdev of suspend
3466 *
3467 * Puts the hw in the suspend state (all asics).
3468 * Returns 0 for success or an error on failure.
3469 * Called at driver suspend.
3470 */
3471 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3472 {
3473 struct amdgpu_device *adev;
3474 struct drm_crtc *crtc;
3475 struct drm_connector *connector;
3476 struct drm_connector_list_iter iter;
3477 int r;
3478
3479 if (dev == NULL || dev->dev_private == NULL) {
3480 return -ENODEV;
3481 }
3482
3483 adev = dev->dev_private;
3484
3485 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3486 return 0;
3487
3488 adev->in_suspend = true;
3489 drm_kms_helper_poll_disable(dev);
3490
3491 if (fbcon)
3492 amdgpu_fbdev_set_suspend(adev, 1);
3493
3494 cancel_delayed_work_sync(&adev->delayed_init_work);
3495
3496 if (!amdgpu_device_has_dc_support(adev)) {
3497 /* turn off display hw */
3498 drm_modeset_lock_all(dev);
3499 drm_connector_list_iter_begin(dev, &iter);
3500 drm_for_each_connector_iter(connector, &iter)
3501 drm_helper_connector_dpms(connector,
3502 DRM_MODE_DPMS_OFF);
3503 drm_connector_list_iter_end(&iter);
3504 drm_modeset_unlock_all(dev);
3505 /* unpin the front buffers and cursors */
3506 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3507 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3508 struct drm_framebuffer *fb = crtc->primary->fb;
3509 struct amdgpu_bo *robj;
3510
3511 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3512 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3513 r = amdgpu_bo_reserve(aobj, true);
3514 if (r == 0) {
3515 amdgpu_bo_unpin(aobj);
3516 amdgpu_bo_unreserve(aobj);
3517 }
3518 }
3519
3520 if (fb == NULL || fb->obj[0] == NULL) {
3521 continue;
3522 }
3523 robj = gem_to_amdgpu_bo(fb->obj[0]);
3524 /* don't unpin kernel fb objects */
3525 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3526 r = amdgpu_bo_reserve(robj, true);
3527 if (r == 0) {
3528 amdgpu_bo_unpin(robj);
3529 amdgpu_bo_unreserve(robj);
3530 }
3531 }
3532 }
3533 }
3534
3535 amdgpu_amdkfd_suspend(adev);
3536
3537 amdgpu_ras_suspend(adev);
3538
3539 r = amdgpu_device_ip_suspend_phase1(adev);
3540
3541 /* evict vram memory */
3542 amdgpu_bo_evict_vram(adev);
3543
3544 amdgpu_fence_driver_suspend(adev);
3545
3546 r = amdgpu_device_ip_suspend_phase2(adev);
3547
3548 /* evict remaining vram memory
3549 * This second call to evict vram is to evict the gart page table
3550 * using the CPU.
3551 */
3552 amdgpu_bo_evict_vram(adev);
3553
3554 return 0;
3555 }
3556
3557 /**
3558 * amdgpu_device_resume - initiate device resume
3559 *
3560 * @dev: drm dev pointer
3562  * @fbcon: notify the fbdev of resume
3563 *
3564 * Bring the hw back to operating state (all asics).
3565 * Returns 0 for success or an error on failure.
3566 * Called at driver resume.
3567 */
3568 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3569 {
3570 struct drm_connector *connector;
3571 struct drm_connector_list_iter iter;
3572 struct amdgpu_device *adev = dev->dev_private;
3573 struct drm_crtc *crtc;
3574 int r = 0;
3575
3576 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3577 return 0;
3578
3579 /* post card */
3580 if (amdgpu_device_need_post(adev)) {
3581 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3582 if (r)
3583 DRM_ERROR("amdgpu asic init failed\n");
3584 }
3585
3586 r = amdgpu_device_ip_resume(adev);
3587 if (r) {
3588 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3589 return r;
3590 }
3591 amdgpu_fence_driver_resume(adev);
3592
3593
3594 r = amdgpu_device_ip_late_init(adev);
3595 if (r)
3596 return r;
3597
3598 queue_delayed_work(system_wq, &adev->delayed_init_work,
3599 msecs_to_jiffies(AMDGPU_RESUME_MS));
3600
3601 if (!amdgpu_device_has_dc_support(adev)) {
3602 /* pin cursors */
3603 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3604 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3605
3606 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3607 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3608 r = amdgpu_bo_reserve(aobj, true);
3609 if (r == 0) {
3610 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3611 if (r != 0)
3612 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3613 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3614 amdgpu_bo_unreserve(aobj);
3615 }
3616 }
3617 }
3618 }
3619 r = amdgpu_amdkfd_resume(adev);
3620 if (r)
3621 return r;
3622
3623 /* Make sure IB tests flushed */
3624 flush_delayed_work(&adev->delayed_init_work);
3625
3626 /* blat the mode back in */
3627 if (fbcon) {
3628 if (!amdgpu_device_has_dc_support(adev)) {
3629 /* pre DCE11 */
3630 drm_helper_resume_force_mode(dev);
3631
3632 /* turn on display hw */
3633 drm_modeset_lock_all(dev);
3634
3635 drm_connector_list_iter_begin(dev, &iter);
3636 drm_for_each_connector_iter(connector, &iter)
3637 drm_helper_connector_dpms(connector,
3638 DRM_MODE_DPMS_ON);
3639 drm_connector_list_iter_end(&iter);
3640
3641 drm_modeset_unlock_all(dev);
3642 }
3643 amdgpu_fbdev_set_suspend(adev, 0);
3644 }
3645
3646 drm_kms_helper_poll_enable(dev);
3647
3648 amdgpu_ras_resume(adev);
3649
3650 /*
3651 * Most of the connector probing functions try to acquire runtime pm
3652 * refs to ensure that the GPU is powered on when connector polling is
3653 * performed. Since we're calling this from a runtime PM callback,
3654 * trying to acquire rpm refs will cause us to deadlock.
3655 *
3656 * Since we're guaranteed to be holding the rpm lock, it's safe to
3657 * temporarily disable the rpm helpers so this doesn't deadlock us.
3658 */
3659 #ifdef CONFIG_PM
3660 dev->dev->power.disable_depth++;
3661 #endif
3662 if (!amdgpu_device_has_dc_support(adev))
3663 drm_helper_hpd_irq_event(dev);
3664 else
3665 drm_kms_helper_hotplug_event(dev);
3666 #ifdef CONFIG_PM
3667 dev->dev->power.disable_depth--;
3668 #endif
3669 adev->in_suspend = false;
3670
3671 return 0;
3672 }
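/*
 * Illustrative sketch (hypothetical PM hooks, not part of this driver):
 * amdgpu_device_suspend() and amdgpu_device_resume() above are meant to
 * be called as a pair from the driver's power-management entry points,
 * with fbcon = true when the fbdev console should be notified as well.
 * The example_pm_* wrappers below are assumptions for illustration only.
 *
 *	static int example_pm_suspend(struct drm_device *dev)
 *	{
 *		return amdgpu_device_suspend(dev, true);
 *	}
 *
 *	static int example_pm_resume(struct drm_device *dev)
 *	{
 *		return amdgpu_device_resume(dev, true);
 *	}
 */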
3673
3674 /**
3675 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3676 *
3677 * @adev: amdgpu_device pointer
3678 *
3679 * The list of all the hardware IPs that make up the asic is walked and
3680 * the check_soft_reset callbacks are run. check_soft_reset determines
3681 * if the asic is still hung or not.
3682 * Returns true if any of the IPs are still in a hung state, false if not.
3683 */
3684 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3685 {
3686 int i;
3687 bool asic_hang = false;
3688
3689 if (amdgpu_sriov_vf(adev))
3690 return true;
3691
3692 if (amdgpu_asic_need_full_reset(adev))
3693 return true;
3694
3695 for (i = 0; i < adev->num_ip_blocks; i++) {
3696 if (!adev->ip_blocks[i].status.valid)
3697 continue;
3698 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3699 adev->ip_blocks[i].status.hang =
3700 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3701 if (adev->ip_blocks[i].status.hang) {
3702 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3703 asic_hang = true;
3704 }
3705 }
3706 return asic_hang;
3707 }
3708
3709 /**
3710 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3711 *
3712 * @adev: amdgpu_device pointer
3713 *
3714 * The list of all the hardware IPs that make up the asic is walked and the
3715 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3716 * handles any IP specific hardware or software state changes that are
3717 * necessary for a soft reset to succeed.
3718 * Returns 0 on success, negative error code on failure.
3719 */
3720 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3721 {
3722 int i, r = 0;
3723
3724 for (i = 0; i < adev->num_ip_blocks; i++) {
3725 if (!adev->ip_blocks[i].status.valid)
3726 continue;
3727 if (adev->ip_blocks[i].status.hang &&
3728 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3729 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3730 if (r)
3731 return r;
3732 }
3733 }
3734
3735 return 0;
3736 }
3737
3738 /**
3739 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3740 *
3741 * @adev: amdgpu_device pointer
3742 *
3743 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3744 * reset is necessary to recover.
3745 * Returns true if a full asic reset is required, false if not.
3746 */
3747 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3748 {
3749 int i;
3750
3751 if (amdgpu_asic_need_full_reset(adev))
3752 return true;
3753
3754 for (i = 0; i < adev->num_ip_blocks; i++) {
3755 if (!adev->ip_blocks[i].status.valid)
3756 continue;
3757 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3758 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3759 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3760 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3762 if (adev->ip_blocks[i].status.hang) {
3763 DRM_INFO("Some blocks need a full reset!\n");
3764 return true;
3765 }
3766 }
3767 }
3768 return false;
3769 }
3770
3771 /**
3772 * amdgpu_device_ip_soft_reset - do a soft reset
3773 *
3774 * @adev: amdgpu_device pointer
3775 *
3776 * The list of all the hardware IPs that make up the asic is walked and the
3777 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3778 * IP specific hardware or software state changes that are necessary to soft
3779 * reset the IP.
3780 * Returns 0 on success, negative error code on failure.
3781 */
3782 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3783 {
3784 int i, r = 0;
3785
3786 for (i = 0; i < adev->num_ip_blocks; i++) {
3787 if (!adev->ip_blocks[i].status.valid)
3788 continue;
3789 if (adev->ip_blocks[i].status.hang &&
3790 adev->ip_blocks[i].version->funcs->soft_reset) {
3791 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3792 if (r)
3793 return r;
3794 }
3795 }
3796
3797 return 0;
3798 }
3799
3800 /**
3801 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3802 *
3803 * @adev: amdgpu_device pointer
3804 *
3805 * The list of all the hardware IPs that make up the asic is walked and the
3806 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3807 * handles any IP specific hardware or software state changes that are
3808 * necessary after the IP has been soft reset.
3809 * Returns 0 on success, negative error code on failure.
3810 */
3811 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3812 {
3813 int i, r = 0;
3814
3815 for (i = 0; i < adev->num_ip_blocks; i++) {
3816 if (!adev->ip_blocks[i].status.valid)
3817 continue;
3818 if (adev->ip_blocks[i].status.hang &&
3819 adev->ip_blocks[i].version->funcs->post_soft_reset)
3820 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3821 if (r)
3822 return r;
3823 }
3824
3825 return 0;
3826 }
3827
3828 /**
3829 * amdgpu_device_recover_vram - Recover some VRAM contents
3830 *
3831 * @adev: amdgpu_device pointer
3832 *
3833 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3834 * restore things like GPUVM page tables after a GPU reset where
3835 * the contents of VRAM might be lost.
3836 *
3837 * Returns:
3838 * 0 on success, negative error code on failure.
3839 */
3840 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3841 {
3842 struct dma_fence *fence = NULL, *next = NULL;
3843 struct amdgpu_bo *shadow;
3844 long r = 1, tmo;
3845
3846 if (amdgpu_sriov_runtime(adev))
3847 tmo = msecs_to_jiffies(8000);
3848 else
3849 tmo = msecs_to_jiffies(100);
3850
3851 DRM_INFO("recover vram bo from shadow start\n");
3852 mutex_lock(&adev->shadow_list_lock);
3853 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3854
3855 /* No need to recover an evicted BO */
3856 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3857 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3858 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3859 continue;
3860
3861 r = amdgpu_bo_restore_shadow(shadow, &next);
3862 if (r)
3863 break;
3864
3865 if (fence) {
3866 tmo = dma_fence_wait_timeout(fence, false, tmo);
3867 dma_fence_put(fence);
3868 fence = next;
3869 if (tmo == 0) {
3870 r = -ETIMEDOUT;
3871 break;
3872 } else if (tmo < 0) {
3873 r = tmo;
3874 break;
3875 }
3876 } else {
3877 fence = next;
3878 }
3879 }
3880 mutex_unlock(&adev->shadow_list_lock);
3881
3882 if (fence)
3883 tmo = dma_fence_wait_timeout(fence, false, tmo);
3884 dma_fence_put(fence);
3885
3886 if (r < 0 || tmo <= 0) {
3887 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3888 return -EIO;
3889 }
3890
3891 DRM_INFO("recover vram bo from shadow done\n");
3892 return 0;
3893 }
3894
3895
3896 /**
3897 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3898 *
3899 * @adev: amdgpu device pointer
3900 * @from_hypervisor: request from hypervisor
3901 *
3902  * Do VF FLR and reinitialize the ASIC.
3903  * Returns 0 on success, negative error code on failure.
3904 */
3905 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3906 bool from_hypervisor)
3907 {
3908 int r;
3909
3910 if (from_hypervisor)
3911 r = amdgpu_virt_request_full_gpu(adev, true);
3912 else
3913 r = amdgpu_virt_reset_gpu(adev);
3914 if (r)
3915 return r;
3916
3917 /* Resume IP prior to SMC */
3918 r = amdgpu_device_ip_reinit_early_sriov(adev);
3919 if (r)
3920 goto error;
3921
3922 amdgpu_virt_init_data_exchange(adev);
3923 /* we need to recover the GART prior to running SMC/CP/SDMA resume */
3924 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3925
3926 r = amdgpu_device_fw_loading(adev);
3927 if (r)
3928 return r;
3929
3930 /* now we are okay to resume SMC/CP/SDMA */
3931 r = amdgpu_device_ip_reinit_late_sriov(adev);
3932 if (r)
3933 goto error;
3934
3935 amdgpu_irq_gpu_reset_resume_helper(adev);
3936 r = amdgpu_ib_ring_tests(adev);
3937 amdgpu_amdkfd_post_reset(adev);
3938
3939 error:
3940 amdgpu_virt_release_full_gpu(adev, true);
3941 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3942 amdgpu_inc_vram_lost(adev);
3943 r = amdgpu_device_recover_vram(adev);
3944 }
3945
3946 return r;
3947 }
3948
3949 /**
3950 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3951 *
3952 * @adev: amdgpu device pointer
3953 *
3954 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3955 * a hung GPU.
3956 */
3957 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3958 {
3959 if (!amdgpu_device_ip_check_soft_reset(adev)) {
3960 DRM_INFO("Timeout, but no hardware hang detected.\n");
3961 return false;
3962 }
3963
3964 if (amdgpu_gpu_recovery == 0)
3965 goto disabled;
3966
3967 if (amdgpu_sriov_vf(adev))
3968 return true;
3969
3970 if (amdgpu_gpu_recovery == -1) {
3971 switch (adev->asic_type) {
3972 case CHIP_BONAIRE:
3973 case CHIP_HAWAII:
3974 case CHIP_TOPAZ:
3975 case CHIP_TONGA:
3976 case CHIP_FIJI:
3977 case CHIP_POLARIS10:
3978 case CHIP_POLARIS11:
3979 case CHIP_POLARIS12:
3980 case CHIP_VEGAM:
3981 case CHIP_VEGA20:
3982 case CHIP_VEGA10:
3983 case CHIP_VEGA12:
3984 case CHIP_RAVEN:
3985 case CHIP_ARCTURUS:
3986 case CHIP_RENOIR:
3987 case CHIP_NAVI10:
3988 case CHIP_NAVI14:
3989 case CHIP_NAVI12:
3990 break;
3991 default:
3992 goto disabled;
3993 }
3994 }
3995
3996 return true;
3997
3998 disabled:
3999 DRM_INFO("GPU recovery disabled.\n");
4000 return false;
4001 }
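/*
 * Illustrative sketch (hypothetical caller, not part of this driver): a
 * job timeout handler would typically gate recovery on the check above
 * before invoking amdgpu_device_gpu_recover() (defined further below):
 *
 *	if (amdgpu_device_should_recover_gpu(ring->adev))
 *		amdgpu_device_gpu_recover(ring->adev, job);
 *	// otherwise the timeout is treated as spurious and no reset is done
 */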
4002
4003
4004 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4005 struct amdgpu_job *job,
4006 bool *need_full_reset_arg)
4007 {
4008 int i, r = 0;
4009 bool need_full_reset = *need_full_reset_arg;
4010
4011 /* block all schedulers and reset given job's ring */
4012 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4013 struct amdgpu_ring *ring = adev->rings[i];
4014
4015 if (!ring || !ring->sched.thread)
4016 continue;
4017
4018 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4019 amdgpu_fence_driver_force_completion(ring);
4020 }
4021
4022 if (job)
4023 drm_sched_increase_karma(&job->base);
4024
4025 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4026 if (!amdgpu_sriov_vf(adev)) {
4027
4028 if (!need_full_reset)
4029 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4030
4031 if (!need_full_reset) {
4032 amdgpu_device_ip_pre_soft_reset(adev);
4033 r = amdgpu_device_ip_soft_reset(adev);
4034 amdgpu_device_ip_post_soft_reset(adev);
4035 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4036 DRM_INFO("soft reset failed, will fallback to full reset!\n");
4037 need_full_reset = true;
4038 }
4039 }
4040
4041 if (need_full_reset)
4042 r = amdgpu_device_ip_suspend(adev);
4043
4044 *need_full_reset_arg = need_full_reset;
4045 }
4046
4047 return r;
4048 }
4049
4050 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4051 struct list_head *device_list_handle,
4052 bool *need_full_reset_arg)
4053 {
4054 struct amdgpu_device *tmp_adev = NULL;
4055 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4056 int r = 0;
4057
4058 /*
4059 * ASIC reset has to be done on all XGMI hive nodes ASAP
4060 * to allow proper link negotiation in FW (within 1 sec)
4061 */
4062 if (need_full_reset) {
4063 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4064 /* For XGMI run all resets in parallel to speed up the process */
4065 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4066 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4067 r = -EALREADY;
4068 } else
4069 r = amdgpu_asic_reset(tmp_adev);
4070
4071 if (r) {
4072 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
4073 r, tmp_adev->ddev->unique);
4074 break;
4075 }
4076 }
4077
4078 /* For XGMI wait for all resets to complete before proceeding */
4079 if (!r) {
4080 list_for_each_entry(tmp_adev, device_list_handle,
4081 gmc.xgmi.head) {
4082 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4083 flush_work(&tmp_adev->xgmi_reset_work);
4084 r = tmp_adev->asic_reset_res;
4085 if (r)
4086 break;
4087 }
4088 }
4089 }
4090 }
4091
4092 if (!r && amdgpu_ras_intr_triggered())
4093 amdgpu_ras_intr_cleared();
4094
4095 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4096 if (need_full_reset) {
4097 /* post card */
4098 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
4099 DRM_WARN("asic atom init failed!");
4100
4101 if (!r) {
4102 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4103 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4104 if (r)
4105 goto out;
4106
4107 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4108 if (vram_lost) {
4109 DRM_INFO("VRAM is lost due to GPU reset!\n");
4110 amdgpu_inc_vram_lost(tmp_adev);
4111 }
4112
4113 r = amdgpu_gtt_mgr_recover(
4114 &tmp_adev->mman.bdev.man[TTM_PL_TT]);
4115 if (r)
4116 goto out;
4117
4118 r = amdgpu_device_fw_loading(tmp_adev);
4119 if (r)
4120 return r;
4121
4122 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4123 if (r)
4124 goto out;
4125
4126 if (vram_lost)
4127 amdgpu_device_fill_reset_magic(tmp_adev);
4128
4129 /*
4130 * Add this ASIC back as tracked since the reset
4131 * already completed successfully.
4132 */
4133 amdgpu_register_gpu_instance(tmp_adev);
4134
4135 r = amdgpu_device_ip_late_init(tmp_adev);
4136 if (r)
4137 goto out;
4138
4139 /* must succeed. */
4140 amdgpu_ras_resume(tmp_adev);
4141
4142 /* Update PSP FW topology after reset */
4143 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4144 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4145 }
4146 }
4147
4148
4149 out:
4150 if (!r) {
4151 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4152 r = amdgpu_ib_ring_tests(tmp_adev);
4153 if (r) {
4154 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4155 r = amdgpu_device_ip_suspend(tmp_adev);
4156 need_full_reset = true;
4157 r = -EAGAIN;
4158 goto end;
4159 }
4160 }
4161
4162 if (!r)
4163 r = amdgpu_device_recover_vram(tmp_adev);
4164 else
4165 tmp_adev->asic_reset_res = r;
4166 }
4167
4168 end:
4169 *need_full_reset_arg = need_full_reset;
4170 return r;
4171 }
4172
4173 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
4174 {
4175 if (trylock) {
4176 if (!mutex_trylock(&adev->lock_reset))
4177 return false;
4178 } else
4179 mutex_lock(&adev->lock_reset);
4180
4181 atomic_inc(&adev->gpu_reset_counter);
4182 adev->in_gpu_reset = true;
4183 switch (amdgpu_asic_reset_method(adev)) {
4184 case AMD_RESET_METHOD_MODE1:
4185 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4186 break;
4187 case AMD_RESET_METHOD_MODE2:
4188 adev->mp1_state = PP_MP1_STATE_RESET;
4189 break;
4190 default:
4191 adev->mp1_state = PP_MP1_STATE_NONE;
4192 break;
4193 }
4194
4195 return true;
4196 }
4197
4198 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4199 {
4200 amdgpu_vf_error_trans_all(adev);
4201 adev->mp1_state = PP_MP1_STATE_NONE;
4202 adev->in_gpu_reset = false;
4203 mutex_unlock(&adev->lock_reset);
4204 }
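/*
 * Illustrative sketch (not part of this driver): the two helpers above
 * bracket a reset critical section; a trylock caller simply bails out if
 * another reset is already in flight.
 *
 *	if (!amdgpu_device_lock_adev(adev, true))
 *		return 0;	// another reset is already in progress
 *	...perform the reset while adev->in_gpu_reset is set...
 *	amdgpu_device_unlock_adev(adev);
 */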
4205
4206 /**
4207 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4208 *
4209 * @adev: amdgpu device pointer
4210  * @job: which job triggered the hang
4211 *
4212 * Attempt to reset the GPU if it has hung (all asics).
4213  * Attempt a soft reset or full reset and reinitialize the ASIC.
4214 * Returns 0 for success or an error on failure.
4215 */
4216
4217 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4218 struct amdgpu_job *job)
4219 {
4220 struct list_head device_list, *device_list_handle = NULL;
4221 bool need_full_reset, job_signaled;
4222 struct amdgpu_hive_info *hive = NULL;
4223 struct amdgpu_device *tmp_adev = NULL;
4224 int i, r = 0;
4225 bool in_ras_intr = amdgpu_ras_intr_triggered();
4226 bool use_baco = (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO);
4229
4230 /*
4231 * Flush RAM to disk so that after reboot
4232 * the user can read log and see why the system rebooted.
4233 */
4234 if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
4235
4236 DRM_WARN("Emergency reboot.");
4237
4238 ksys_sync_helper();
4239 emergency_restart();
4240 }
4241
4242 need_full_reset = job_signaled = false;
4243 INIT_LIST_HEAD(&device_list);
4244
4245 dev_info(adev->dev, "GPU %s begin!\n",
4246 (in_ras_intr && !use_baco) ? "jobs stop":"reset");
4247
4248 cancel_delayed_work_sync(&adev->delayed_init_work);
4249
4250 hive = amdgpu_get_xgmi_hive(adev, false);
4251
4252 /*
4253 * Here we trylock to avoid a chain of resets executing, whether
4254 * triggered by jobs on different adevs in the XGMI hive or by jobs on
4255 * different schedulers for the same device, while this timeout handler
4256 * is running. We always reset all schedulers for a device and all
4257 * devices in an XGMI hive, so that should take care of them too.
4258 */
4259
4260 if (hive && !mutex_trylock(&hive->reset_lock)) {
4261 DRM_INFO("Bailing on TDR for s_job:%"PRIx64", hive: %"PRIx64" as another already in progress",
4262 job ? job->base.id : -1, hive->hive_id);
4263 return 0;
4264 }
4265
4266 /* Start with adev pre asic reset first for soft reset check.*/
4267 if (!amdgpu_device_lock_adev(adev, !hive)) {
4268 DRM_INFO("Bailing on TDR for s_job:%"PRIx64", as another already in progress",
4269 job ? job->base.id : -1);
4270 return 0;
4271 }
4272
4273 /* Block kfd: SRIOV would do it separately */
4274 if (!amdgpu_sriov_vf(adev))
4275 amdgpu_amdkfd_pre_reset(adev);
4276
4277 /* Build list of devices to reset */
4278 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4279 if (!hive) {
4280 /*unlock kfd: SRIOV would do it separately */
4281 if (!amdgpu_sriov_vf(adev))
4282 amdgpu_amdkfd_post_reset(adev);
4283 amdgpu_device_unlock_adev(adev);
4284 return -ENODEV;
4285 }
4286
4287 /*
4288 * In XGMI hive mode the device reset is done for all nodes in the
4289 * hive to retrain all XGMI links, and hence the reset sequence is
4290 * executed in a loop over all nodes.
4291 */
4292 device_list_handle = &hive->device_list;
4293 } else {
4294 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4295 device_list_handle = &device_list;
4296 }
4297
4298 /* block all schedulers and reset given job's ring */
4299 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4300 if (tmp_adev != adev) {
4301 amdgpu_device_lock_adev(tmp_adev, false);
4302 if (!amdgpu_sriov_vf(tmp_adev))
4303 amdgpu_amdkfd_pre_reset(tmp_adev);
4304 }
4305
4306 /*
4307 * Mark these ASICs to be reset as untracked first
4308 * and add them back after the reset has completed
4309 */
4310 amdgpu_unregister_gpu_instance(tmp_adev);
4311
4312 /* disable ras on ALL IPs */
4313 if (!(in_ras_intr && !use_baco) &&
4314 amdgpu_device_ip_need_full_reset(tmp_adev))
4315 amdgpu_ras_suspend(tmp_adev);
4316
4317 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4318 struct amdgpu_ring *ring = tmp_adev->rings[i];
4319
4320 if (!ring || !ring->sched.thread)
4321 continue;
4322
4323 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4324
4325 if (in_ras_intr && !use_baco)
4326 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4327 }
4328 }
4329
4330
4331 if (in_ras_intr && !use_baco)
4332 goto skip_sched_resume;
4333
4334 /*
4335 * Must check guilty signal here since after this point all old
4336 * HW fences are force signaled.
4337 *
4338 * job->base holds a reference to parent fence
4339 */
4340 if (job && job->base.s_fence->parent &&
4341 dma_fence_is_signaled(job->base.s_fence->parent))
4342 job_signaled = true;
4343
4344 if (job_signaled) {
4345 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4346 goto skip_hw_reset;
4347 }
4348
4349
4350 /* Guilty job will be freed after this */
4351 r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
4352 if (r) {
4353 /*TODO Should we stop ?*/
4354 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4355 r, adev->ddev->unique);
4356 adev->asic_reset_res = r;
4357 }
4358
4359 retry: /* Rest of adevs pre asic reset from XGMI hive. */
4360 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4361
4362 if (tmp_adev == adev)
4363 continue;
4364
4365 r = amdgpu_device_pre_asic_reset(tmp_adev,
4366 NULL,
4367 &need_full_reset);
4368 /*TODO Should we stop ?*/
4369 if (r) {
4370 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4371 r, tmp_adev->ddev->unique);
4372 tmp_adev->asic_reset_res = r;
4373 }
4374 }
4375
4376 /* Actual ASIC resets if needed.*/
4377 /* TODO Implement XGMI hive reset logic for SRIOV */
4378 if (amdgpu_sriov_vf(adev)) {
4379 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4380 if (r)
4381 adev->asic_reset_res = r;
4382 } else {
4383 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4384 if (r == -EAGAIN)
4385 goto retry;
4386 }
4387
4388 skip_hw_reset:
4389
4390 /* Post ASIC reset for all devs. */
4391 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4392
4393 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4394 struct amdgpu_ring *ring = tmp_adev->rings[i];
4395
4396 if (!ring || !ring->sched.thread)
4397 continue;
4398
4399 /* No point in resubmitting jobs if we didn't do a HW reset */
4400 if (!tmp_adev->asic_reset_res && !job_signaled)
4401 drm_sched_resubmit_jobs(&ring->sched);
4402
4403 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4404 }
4405
4406 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4407 drm_helper_resume_force_mode(tmp_adev->ddev);
4408 }
4409
4410 tmp_adev->asic_reset_res = 0;
4411
4412 if (r) {
4413 /* bad news, how to tell it to userspace? */
4414 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4415 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4416 } else {
4417 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4418 }
4419 }
4420
4421 skip_sched_resume:
4422 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4423 /*unlock kfd: SRIOV would do it separately */
4424 if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
4425 amdgpu_amdkfd_post_reset(tmp_adev);
4426 amdgpu_device_unlock_adev(tmp_adev);
4427 }
4428
4429 if (hive)
4430 mutex_unlock(&hive->reset_lock);
4431
4432 if (r)
4433 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4434 return r;
4435 }
4436
4437 /**
4438 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
4439 *
4440 * @adev: amdgpu_device pointer
4441 *
4442 * Fetches and stores in the driver the PCIe capabilities (gen speed
4443 * and lanes) of the slot the device is in. Handles APUs and
4444 * virtualized environments where PCIE config space may not be available.
4445 */
4446 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4447 {
4448 struct pci_dev *pdev;
4449 enum pci_bus_speed speed_cap, platform_speed_cap;
4450 enum pcie_link_width platform_link_width;
4451
4452 if (amdgpu_pcie_gen_cap)
4453 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4454
4455 if (amdgpu_pcie_lane_cap)
4456 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4457
4458 /* covers APUs as well */
4459 if (pci_is_root_bus(adev->pdev->bus)) {
4460 if (adev->pm.pcie_gen_mask == 0)
4461 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4462 if (adev->pm.pcie_mlw_mask == 0)
4463 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4464 return;
4465 }
4466
4467 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4468 return;
4469
4470 pcie_bandwidth_available(adev->pdev, NULL,
4471 &platform_speed_cap, &platform_link_width);
4472
4473 if (adev->pm.pcie_gen_mask == 0) {
4474 /* asic caps */
4475 pdev = adev->pdev;
4476 speed_cap = pcie_get_speed_cap(pdev);
4477 if (speed_cap == PCI_SPEED_UNKNOWN) {
4478 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4479 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4480 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4481 } else {
4482 if (speed_cap == PCIE_SPEED_16_0GT)
4483 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4484 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4485 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4486 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4487 else if (speed_cap == PCIE_SPEED_8_0GT)
4488 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4489 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4490 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4491 else if (speed_cap == PCIE_SPEED_5_0GT)
4492 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4493 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4494 else
4495 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4496 }
4497 /* platform caps */
4498 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4499 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4500 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4501 } else {
4502 if (platform_speed_cap == PCIE_SPEED_16_0GT)
4503 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4504 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4505 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4506 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4507 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4508 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4509 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4510 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4511 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4512 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4513 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4514 else
4515 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4516
4517 }
4518 }
4519 if (adev->pm.pcie_mlw_mask == 0) {
4520 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4521 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4522 } else {
4523 switch (platform_link_width) {
4524 case PCIE_LNK_X32:
4525 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4526 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4531 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4532 break;
4533 case PCIE_LNK_X16:
4534 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4540 break;
4541 case PCIE_LNK_X12:
4542 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4543 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4547 break;
4548 case PCIE_LNK_X8:
4549 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4553 break;
4554 case PCIE_LNK_X4:
4555 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4558 break;
4559 case PCIE_LNK_X2:
4560 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4562 break;
4563 case PCIE_LNK_X1:
4564 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4565 break;
4566 default:
4567 break;
4568 }
4569 }
4570 }
4571 }
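/*
 * Illustrative sketch (not part of this driver): once the masks above are
 * populated, power-management code can test individual capability bits,
 * for example (gen3_ok/x16_ok are hypothetical locals):
 *
 *	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *		gen3_ok = true;	// platform link supports PCIe gen3 speeds
 *	if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X16)
 *		x16_ok = true;	// a x16 link width is supported
 */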
4572
4573 int amdgpu_device_baco_enter(struct drm_device *dev)
4574 {
4575 struct amdgpu_device *adev = dev->dev_private;
4576 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4577
4578 if (!amdgpu_device_supports_baco(adev->ddev))
4579 return -ENOTSUPP;
4580
4581 if (ras && ras->supported)
4582 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4583
4584 return amdgpu_dpm_baco_enter(adev);
4585 }
4586
4587 int amdgpu_device_baco_exit(struct drm_device *dev)
4588 {
4589 struct amdgpu_device *adev = dev->dev_private;
4590 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4591 int ret = 0;
4592
4593 if (!amdgpu_device_supports_baco(adev->ddev))
4594 return -ENOTSUPP;
4595
4596 ret = amdgpu_dpm_baco_exit(adev);
4597 if (ret)
4598 return ret;
4599
4600 if (ras && ras->supported)
4601 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4602
4603 return 0;
4604 }
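/*
 * Illustrative sketch (hypothetical caller, not part of this driver):
 * BACO entry and exit are expected to be paired around a period where
 * the device is kept powered down:
 *
 *	int r = amdgpu_device_baco_enter(dev);
 *	if (r)
 *		return r;	// -ENOTSUPP when the ASIC has no BACO support
 *	...device sits in BACO...
 *	r = amdgpu_device_baco_exit(dev);
 */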
4605