xref: /openbsd-src/sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c (revision 0be5e928845b741c9c0f7c1669bc1b98078d4dae)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/pci.h>
34 
35 #include <drm/drm_atomic_helper.h>
36 #include <drm/drm_probe_helper.h>
37 #include <drm/amdgpu_drm.h>
38 #include <linux/vgaarb.h>
39 #include <linux/vga_switcheroo.h>
40 #include <linux/efi.h>
41 #include "amdgpu.h"
42 #include "amdgpu_trace.h"
43 #include "amdgpu_i2c.h"
44 #include "atom.h"
45 #include "amdgpu_atombios.h"
46 #include "amdgpu_atomfirmware.h"
47 #include "amd_pcie.h"
48 #ifdef CONFIG_DRM_AMDGPU_SI
49 #include "si.h"
50 #endif
51 #ifdef CONFIG_DRM_AMDGPU_CIK
52 #include "cik.h"
53 #endif
54 #include "vi.h"
55 #include "soc15.h"
56 #include "nv.h"
57 #include "bif/bif_4_1_d.h"
59 #include <linux/firmware.h>
60 #include "amdgpu_vf_error.h"
61 
62 #include "amdgpu_amdkfd.h"
63 #include "amdgpu_pm.h"
64 
65 #include "amdgpu_xgmi.h"
66 #include "amdgpu_ras.h"
67 #include "amdgpu_pmu.h"
68 #include "amdgpu_fru_eeprom.h"
69 #include "amdgpu_reset.h"
70 
71 #include <linux/suspend.h>
72 #include <drm/task_barrier.h>
73 #include <linux/pm_runtime.h>
74 
75 #include <drm/drm_drv.h>
76 
77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
88 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
89 
90 #define AMDGPU_RESUME_MS		2000
91 
92 const char *amdgpu_asic_name[] = {
93 	"TAHITI",
94 	"PITCAIRN",
95 	"VERDE",
96 	"OLAND",
97 	"HAINAN",
98 	"BONAIRE",
99 	"KAVERI",
100 	"KABINI",
101 	"HAWAII",
102 	"MULLINS",
103 	"TOPAZ",
104 	"TONGA",
105 	"FIJI",
106 	"CARRIZO",
107 	"STONEY",
108 	"POLARIS10",
109 	"POLARIS11",
110 	"POLARIS12",
111 	"VEGAM",
112 	"VEGA10",
113 	"VEGA12",
114 	"VEGA20",
115 	"RAVEN",
116 	"ARCTURUS",
117 	"RENOIR",
118 	"ALDEBARAN",
119 	"NAVI10",
120 	"CYAN_SKILLFISH",
121 	"NAVI14",
122 	"NAVI12",
123 	"SIENNA_CICHLID",
124 	"NAVY_FLOUNDER",
125 	"VANGOGH",
126 	"DIMGREY_CAVEFISH",
127 	"BEIGE_GOBY",
128 	"YELLOW_CARP",
129 	"LAST",
130 };
131 
132 /**
133  * DOC: pcie_replay_count
134  *
135  * The amdgpu driver provides a sysfs API for reporting the total number
136  * of PCIe replays (NAKs).
137  * The file pcie_replay_count is used for this and returns the total
138  * number of replays as the sum of NAKs generated and NAKs received.
139  */
140 
141 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
142 		struct device_attribute *attr, char *buf)
143 {
144 	struct drm_device *ddev = dev_get_drvdata(dev);
145 	struct amdgpu_device *adev = drm_to_adev(ddev);
146 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
147 
148 	return sysfs_emit(buf, "%llu\n", cnt);
149 }
150 
151 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
152 		amdgpu_device_get_pcie_replay_count, NULL);
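
/*
 * Usage sketch (illustrative, not part of the driver): userspace reads the
 * attribute as a plain file under the device's sysfs directory; the exact
 * path depends on the card index, error handling omitted:
 *
 *	int fd = open("/sys/class/drm/card0/device/pcie_replay_count", O_RDONLY);
 *	char val[32];
 *	ssize_t n = read(fd, val, sizeof(val) - 1);
 */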
153 
154 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
155 
156 /**
157  * DOC: product_name
158  *
159  * The amdgpu driver provides a sysfs API for reporting the product name
160  * for the device.
161  * The file product_name is used for this and returns the product name
162  * as returned from the FRU.
163  * NOTE: This is only available for certain server cards
164  */
165 
166 static ssize_t amdgpu_device_get_product_name(struct device *dev,
167 		struct device_attribute *attr, char *buf)
168 {
169 	struct drm_device *ddev = dev_get_drvdata(dev);
170 	struct amdgpu_device *adev = drm_to_adev(ddev);
171 
172 	return sysfs_emit(buf, "%s\n", adev->product_name);
173 }
174 
175 static DEVICE_ATTR(product_name, S_IRUGO,
176 		amdgpu_device_get_product_name, NULL);
177 
178 /**
179  * DOC: product_number
180  *
181  * The amdgpu driver provides a sysfs API for reporting the part number
182  * for the device.
183  * The file product_number is used for this and returns the part number
184  * as returned from the FRU.
185  * NOTE: This is only available for certain server cards
186  */
187 
188 static ssize_t amdgpu_device_get_product_number(struct device *dev,
189 		struct device_attribute *attr, char *buf)
190 {
191 	struct drm_device *ddev = dev_get_drvdata(dev);
192 	struct amdgpu_device *adev = drm_to_adev(ddev);
193 
194 	return sysfs_emit(buf, "%s\n", adev->product_number);
195 }
196 
197 static DEVICE_ATTR(product_number, S_IRUGO,
198 		amdgpu_device_get_product_number, NULL);
199 
200 /**
201  * DOC: serial_number
202  *
203  * The amdgpu driver provides a sysfs API for reporting the serial number
204  * for the device.
205  * The file serial_number is used for this and returns the serial number
206  * as returned from the FRU.
207  * NOTE: This is only available for certain server cards
208  */
209 
210 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
211 		struct device_attribute *attr, char *buf)
212 {
213 	struct drm_device *ddev = dev_get_drvdata(dev);
214 	struct amdgpu_device *adev = drm_to_adev(ddev);
215 
216 	return sysfs_emit(buf, "%s\n", adev->serial);
217 }
218 
219 static DEVICE_ATTR(serial_number, S_IRUGO,
220 		amdgpu_device_get_serial_number, NULL);
221 
222 /**
223  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
224  *
225  * @dev: drm_device pointer
226  *
227  * Returns true if the device is a dGPU with ATPX power control,
228  * otherwise returns false.
229  */
230 bool amdgpu_device_supports_px(struct drm_device *dev)
231 {
232 	struct amdgpu_device *adev = drm_to_adev(dev);
233 
234 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
235 		return true;
236 	return false;
237 }
238 
239 /**
240  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
241  *
242  * @dev: drm_device pointer
243  *
244  * Returns true if the device is a dGPU with ACPI power control,
245  * otherwise returns false.
246  */
247 bool amdgpu_device_supports_boco(struct drm_device *dev)
248 {
249 	struct amdgpu_device *adev = drm_to_adev(dev);
250 
251 	if (adev->has_pr3 ||
252 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
253 		return true;
254 	return false;
255 }
256 
257 /**
258  * amdgpu_device_supports_baco - Does the device support BACO
259  *
260  * @dev: drm_device pointer
261  *
262  * Returns true if the device supports BACO,
263  * otherwise returns false.
264  */
265 bool amdgpu_device_supports_baco(struct drm_device *dev)
266 {
267 	struct amdgpu_device *adev = drm_to_adev(dev);
268 
269 	return amdgpu_asic_supports_baco(adev);
270 }
271 
272 /**
273  * amdgpu_device_supports_smart_shift - Is the device dGPU with
274  * smart shift support
275  *
276  * @dev: drm_device pointer
277  *
278  * Returns true if the device is a dGPU with Smart Shift support,
279  * otherwise returns false.
280  */
281 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
282 {
283 	return (amdgpu_device_supports_boco(dev) &&
284 		amdgpu_acpi_is_power_shift_control_supported());
285 }
286 
287 /*
288  * VRAM access helper functions
289  */
290 
291 /**
292  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
293  *
294  * @adev: amdgpu_device pointer
295  * @pos: offset of the buffer in vram
296  * @buf: virtual address of the buffer in system memory
297  * @size: read/write size; the buffer at @buf must be at least @size bytes
298  * @write: true - write to vram, otherwise - read from vram
299  */
300 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
301 			     void *buf, size_t size, bool write)
302 {
303 	unsigned long flags;
304 	uint32_t hi = ~0, tmp = 0;
305 	uint32_t *data = buf;
306 	uint64_t last;
307 	int idx;
308 
309 	if (!drm_dev_enter(&adev->ddev, &idx))
310 		return;
311 
312 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
313 
314 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
315 	for (last = pos + size; pos < last; pos += 4) {
316 		tmp = pos >> 31;
317 
318 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
319 		if (tmp != hi) {
320 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
321 			hi = tmp;
322 		}
323 		if (write)
324 			WREG32_NO_KIQ(mmMM_DATA, *data++);
325 		else
326 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
327 	}
328 
329 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
330 	drm_dev_exit(idx);
331 }
332 
333 /**
334  * amdgpu_device_aper_access - access vram via the vram aperture
335  *
336  * @adev: amdgpu_device pointer
337  * @pos: offset of the buffer in vram
338  * @buf: virtual address of the buffer in system memory
339  * @size: read/write size; the buffer at @buf must be at least @size bytes
340  * @write: true - write to vram, otherwise - read from vram
341  *
342  * Returns the number of bytes transferred.
343  */
344 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
345 				 void *buf, size_t size, bool write)
346 {
347 #ifdef CONFIG_64BIT
348 	void __iomem *addr;
349 	size_t count = 0;
350 	uint64_t last;
351 
352 	if (!adev->mman.aper_base_kaddr)
353 		return 0;
354 
355 	last = min(pos + size, adev->gmc.visible_vram_size);
356 	if (last > pos) {
357 		addr = adev->mman.aper_base_kaddr + pos;
358 		count = last - pos;
359 
360 		if (write) {
361 			memcpy_toio(addr, buf, count);
362 			mb();
363 			amdgpu_device_flush_hdp(adev, NULL);
364 		} else {
365 			amdgpu_device_invalidate_hdp(adev, NULL);
366 			mb();
367 			memcpy_fromio(buf, addr, count);
368 		}
369 
370 	}
371 
372 	return count;
373 #else
374 	return 0;
375 #endif
376 }
377 
378 /**
379  * amdgpu_device_vram_access - read/write a buffer in vram
380  *
381  * @adev: amdgpu_device pointer
382  * @pos: offset of the buffer in vram
383  * @buf: virtual address of the buffer in system memory
384  * @size: read/write size; the buffer at @buf must be at least @size bytes
385  * @write: true - write to vram, otherwise - read from vram
386  */
387 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
388 			       void *buf, size_t size, bool write)
389 {
390 	size_t count;
391 
392 	/* try using the vram aperture to access vram first */
393 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
394 	size -= count;
395 	if (size) {
396 		/* use MM access for the rest of vram */
397 		pos += count;
398 		buf += count;
399 		amdgpu_device_mm_access(adev, pos, buf, size, write);
400 	}
401 }
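
/*
 * Usage sketch (illustrative): copy a few dwords out of VRAM. The helper
 * prefers the CPU-visible aperture and transparently falls back to
 * MM_INDEX/MM_DATA for any range beyond it; @pos and @size must be dword
 * aligned:
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 */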
402 
403 /*
404  * register access helper functions.
405  */
406 
407 /* Check if hw access should be skipped because of hotplug or device error */
408 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
409 {
410 	if (adev->no_hw_access)
411 		return true;
412 
413 #ifdef CONFIG_LOCKDEP
414 	/*
415 	 * This is a bit complicated to understand, so worth a comment. What we assert
416 	 * here is that the GPU reset is not running on another thread in parallel.
417 	 *
418 	 * For this we trylock the read side of the reset semaphore; if that succeeds
419 	 * we know that the reset is not running in parallel.
420 	 *
421 	 * If the trylock fails we assert that we are either already holding the read
422 	 * side of the lock or are the reset thread itself and hold the write side of
423 	 * the lock.
424 	 */
425 	if (in_task()) {
426 		if (down_read_trylock(&adev->reset_sem))
427 			up_read(&adev->reset_sem);
428 		else
429 			lockdep_assert_held(&adev->reset_sem);
430 	}
431 #endif
432 	return false;
433 }
434 
435 /**
436  * amdgpu_device_rreg - read a memory mapped IO or indirect register
437  *
438  * @adev: amdgpu_device pointer
439  * @reg: dword aligned register offset
440  * @acc_flags: access flags which require special behavior
441  *
442  * Returns the 32 bit value from the offset specified.
443  */
444 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
445 			    uint32_t reg, uint32_t acc_flags)
446 {
447 	uint32_t ret;
448 
449 	if (amdgpu_device_skip_hw_access(adev))
450 		return 0;
451 
452 	if ((reg * 4) < adev->rmmio_size) {
453 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
454 		    amdgpu_sriov_runtime(adev) &&
455 		    down_read_trylock(&adev->reset_sem)) {
456 			ret = amdgpu_kiq_rreg(adev, reg);
457 			up_read(&adev->reset_sem);
458 		} else {
459 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
460 		}
461 	} else {
462 		ret = adev->pcie_rreg(adev, reg * 4);
463 	}
464 
465 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
466 
467 	return ret;
468 }
469 
470 /*
471  * MMIO register read helper, byte granularity.
472  * @offset: byte offset from the start of the MMIO region
473  *
474 */
475 
476 /**
477  * amdgpu_mm_rreg8 - read a memory mapped IO register
478  *
479  * @adev: amdgpu_device pointer
480  * @offset: byte aligned register offset
481  *
482  * Returns the 8 bit value from the offset specified.
483  */
484 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return 0;
488 
489 	if (offset < adev->rmmio_size)
490 		return (readb(adev->rmmio + offset));
491 	BUG();
492 }
493 
494 /*
495  * MMIO register write helper, byte granularity.
496  * @offset: byte offset from the start of the MMIO region
497  * @value: the value to be written to the register
498  *
499 */
500 /**
501  * amdgpu_mm_wreg8 - write a memory mapped IO register
502  *
503  * @adev: amdgpu_device pointer
504  * @offset: byte aligned register offset
505  * @value: 8 bit value to write
506  *
507  * Writes the value specified to the offset specified.
508  */
509 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
510 {
511 	if (amdgpu_device_skip_hw_access(adev))
512 		return;
513 
514 	if (offset < adev->rmmio_size)
515 		writeb(value, adev->rmmio + offset);
516 	else
517 		BUG();
518 }
519 
520 /**
521  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
522  *
523  * @adev: amdgpu_device pointer
524  * @reg: dword aligned register offset
525  * @v: 32 bit value to write to the register
526  * @acc_flags: access flags which require special behavior
527  *
528  * Writes the value specified to the offset specified.
529  */
530 void amdgpu_device_wreg(struct amdgpu_device *adev,
531 			uint32_t reg, uint32_t v,
532 			uint32_t acc_flags)
533 {
534 	if (amdgpu_device_skip_hw_access(adev))
535 		return;
536 
537 	if ((reg * 4) < adev->rmmio_size) {
538 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
539 		    amdgpu_sriov_runtime(adev) &&
540 		    down_read_trylock(&adev->reset_sem)) {
541 			amdgpu_kiq_wreg(adev, reg, v);
542 			up_read(&adev->reset_sem);
543 		} else {
544 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
545 		}
546 	} else {
547 		adev->pcie_wreg(adev, reg * 4, v);
548 	}
549 
550 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
551 }
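
/*
 * Note (sketch): driver code rarely calls these helpers directly; the
 * RREG32()/WREG32() macro family in amdgpu.h wraps them. A typical
 * read-modify-write looks like this (register and bit names illustrative):
 *
 *	tmp = RREG32(mmFOO_CNTL);
 *	tmp |= FOO_CNTL__ENABLE_MASK;
 *	WREG32(mmFOO_CNTL, tmp);
 */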
552 
553 /*
554  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if the offset is in range
555  *
556  * this function is invoked only for debugfs register access
557  */
558 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
559 			     uint32_t reg, uint32_t v)
560 {
561 	if (amdgpu_device_skip_hw_access(adev))
562 		return;
563 
564 	if (amdgpu_sriov_fullaccess(adev) &&
565 	    adev->gfx.rlc.funcs &&
566 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
567 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
568 			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
569 	} else {
570 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
571 	}
572 }
573 
574 /**
575  * amdgpu_mm_rdoorbell - read a doorbell dword
576  *
577  * @adev: amdgpu_device pointer
578  * @index: doorbell index
579  *
580  * Returns the value in the doorbell aperture at the
581  * requested doorbell index (CIK).
582  */
583 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
584 {
585 	if (amdgpu_device_skip_hw_access(adev))
586 		return 0;
587 
588 	if (index < adev->doorbell.num_doorbells) {
589 		return readl(adev->doorbell.ptr + index);
590 	} else {
591 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
592 		return 0;
593 	}
594 }
595 
596 /**
597  * amdgpu_mm_wdoorbell - write a doorbell dword
598  *
599  * @adev: amdgpu_device pointer
600  * @index: doorbell index
601  * @v: value to write
602  *
603  * Writes @v to the doorbell aperture at the
604  * requested doorbell index (CIK).
605  */
606 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
607 {
608 	if (amdgpu_device_skip_hw_access(adev))
609 		return;
610 
611 	if (index < adev->doorbell.num_doorbells) {
612 		writel(v, adev->doorbell.ptr + index);
613 	} else {
614 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
615 	}
616 }
617 
618 /**
619  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
620  *
621  * @adev: amdgpu_device pointer
622  * @index: doorbell index
623  *
624  * Returns the value in the doorbell aperture at the
625  * requested doorbell index (VEGA10+).
626  */
627 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
628 {
629 	if (amdgpu_device_skip_hw_access(adev))
630 		return 0;
631 
632 	if (index < adev->doorbell.num_doorbells) {
633 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
634 	} else {
635 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
636 		return 0;
637 	}
638 }
639 
640 /**
641  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
642  *
643  * @adev: amdgpu_device pointer
644  * @index: doorbell index
645  * @v: value to write
646  *
647  * Writes @v to the doorbell aperture at the
648  * requested doorbell index (VEGA10+).
649  */
650 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
651 {
652 	if (amdgpu_device_skip_hw_access(adev))
653 		return;
654 
655 	if (index < adev->doorbell.num_doorbells) {
656 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
657 	} else {
658 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
659 	}
660 }
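
/*
 * Usage sketch (illustrative): ring code typically publishes its write
 * pointer through these helpers, e.g.
 *
 *	amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 */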
661 
662 /**
663  * amdgpu_device_indirect_rreg - read an indirect register
664  *
665  * @adev: amdgpu_device pointer
666  * @pcie_index: mmio offset of the index register
667  * @pcie_data: mmio offset of the data register
668  * @reg_addr: indirect register address to read from
669  *
670  * Returns the value of indirect register @reg_addr
671  */
672 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
673 				u32 pcie_index, u32 pcie_data,
674 				u32 reg_addr)
675 {
676 	unsigned long flags;
677 	u32 r;
678 	void __iomem *pcie_index_offset;
679 	void __iomem *pcie_data_offset;
680 
681 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
682 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
683 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
684 
685 	writel(reg_addr, pcie_index_offset);
686 	readl(pcie_index_offset);
687 	r = readl(pcie_data_offset);
688 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
689 
690 	return r;
691 }
692 
693 /**
694  * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
695  *
696  * @adev: amdgpu_device pointer
697  * @pcie_index: mmio offset of the index register
698  * @pcie_data: mmio offset of the data register
699  * @reg_addr: indirect register address to read from
700  *
701  * Returns the value of indirect register @reg_addr
702  */
703 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
704 				  u32 pcie_index, u32 pcie_data,
705 				  u32 reg_addr)
706 {
707 	unsigned long flags;
708 	u64 r;
709 	void __iomem *pcie_index_offset;
710 	void __iomem *pcie_data_offset;
711 
712 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
713 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
714 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
715 
716 	/* read low 32 bits */
717 	writel(reg_addr, pcie_index_offset);
718 	readl(pcie_index_offset);
719 	r = readl(pcie_data_offset);
720 	/* read high 32 bits */
721 	writel(reg_addr + 4, pcie_index_offset);
722 	readl(pcie_index_offset);
723 	r |= ((u64)readl(pcie_data_offset) << 32);
724 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
725 
726 	return r;
727 }
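
/*
 * Sketch (assumed register names): an asic typically binds its pcie_rreg
 * callback to these helpers with its own index/data register pair, e.g.
 *
 *	static u32 foo_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmFOO_INDEX,
 *						   mmFOO_DATA, reg);
 *	}
 */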
728 
729 /**
730  * amdgpu_device_indirect_wreg - write an indirect register
731  *
732  * @adev: amdgpu_device pointer
733  * @pcie_index: mmio offset of the index register
734  * @pcie_data: mmio offset of the data register
735  * @reg_addr: indirect register offset
736  * @reg_data: indirect register data
737  *
738  */
739 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
740 				 u32 pcie_index, u32 pcie_data,
741 				 u32 reg_addr, u32 reg_data)
742 {
743 	unsigned long flags;
744 	void __iomem *pcie_index_offset;
745 	void __iomem *pcie_data_offset;
746 
747 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
748 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
749 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
750 
751 	writel(reg_addr, pcie_index_offset);
752 	readl(pcie_index_offset);
753 	writel(reg_data, pcie_data_offset);
754 	readl(pcie_data_offset);
755 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
756 }
757 
758 /**
759  * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
760  *
761  * @adev: amdgpu_device pointer
762  * @pcie_index: mmio offset of the index register
763  * @pcie_data: mmio offset of the data register
764  * @reg_addr: indirect register offset
765  * @reg_data: indirect register data
766  *
767  */
768 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
769 				   u32 pcie_index, u32 pcie_data,
770 				   u32 reg_addr, u64 reg_data)
771 {
772 	unsigned long flags;
773 	void __iomem *pcie_index_offset;
774 	void __iomem *pcie_data_offset;
775 
776 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
777 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
778 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
779 
780 	/* write low 32 bits */
781 	writel(reg_addr, pcie_index_offset);
782 	readl(pcie_index_offset);
783 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
784 	readl(pcie_data_offset);
785 	/* write high 32 bits */
786 	writel(reg_addr + 4, pcie_index_offset);
787 	readl(pcie_index_offset);
788 	writel((u32)(reg_data >> 32), pcie_data_offset);
789 	readl(pcie_data_offset);
790 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
791 }
792 
793 /**
794  * amdgpu_invalid_rreg - dummy reg read function
795  *
796  * @adev: amdgpu_device pointer
797  * @reg: offset of register
798  *
799  * Dummy register read function.  Used for register blocks
800  * that certain asics don't have (all asics).
801  * Returns the value in the register.
802  */
803 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
804 {
805 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
806 	BUG();
807 	return 0;
808 }
809 
810 /**
811  * amdgpu_invalid_wreg - dummy reg write function
812  *
813  * @adev: amdgpu_device pointer
814  * @reg: offset of register
815  * @v: value to write to the register
816  *
817  * Dummy register write function.  Used for register blocks
818  * that certain asics don't have (all asics).
819  */
820 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
821 {
822 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
823 		  reg, v);
824 	BUG();
825 }
826 
827 /**
828  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
829  *
830  * @adev: amdgpu_device pointer
831  * @reg: offset of register
832  *
833  * Dummy register read function.  Used for register blocks
834  * that certain asics don't have (all asics).
835  * Returns the value in the register.
836  */
837 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
838 {
839 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
840 	BUG();
841 	return 0;
842 }
843 
844 /**
845  * amdgpu_invalid_wreg64 - dummy reg write function
846  *
847  * @adev: amdgpu_device pointer
848  * @reg: offset of register
849  * @v: value to write to the register
850  *
851  * Dummy register write function.  Used for register blocks
852  * that certain asics don't have (all asics).
853  */
854 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
855 {
856 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
857 		  reg, v);
858 	BUG();
859 }
860 
861 /**
862  * amdgpu_block_invalid_rreg - dummy reg read function
863  *
864  * @adev: amdgpu_device pointer
865  * @block: offset of instance
866  * @reg: offset of register
867  *
868  * Dummy register read function.  Used for register blocks
869  * that certain asics don't have (all asics).
870  * Returns the value in the register.
871  */
872 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
873 					  uint32_t block, uint32_t reg)
874 {
875 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
876 		  reg, block);
877 	BUG();
878 	return 0;
879 }
880 
881 /**
882  * amdgpu_block_invalid_wreg - dummy reg write function
883  *
884  * @adev: amdgpu_device pointer
885  * @block: offset of instance
886  * @reg: offset of register
887  * @v: value to write to the register
888  *
889  * Dummy register write function.  Used for register blocks
890  * that certain asics don't have (all asics).
891  */
892 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
893 				      uint32_t block,
894 				      uint32_t reg, uint32_t v)
895 {
896 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
897 		  reg, block, v);
898 	BUG();
899 }
900 
901 /**
902  * amdgpu_device_asic_init - Wrapper for atom asic_init
903  *
904  * @adev: amdgpu_device pointer
905  *
906  * Does any asic specific work and then calls atom asic init.
907  */
908 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
909 {
910 	amdgpu_asic_pre_asic_init(adev);
911 
912 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
913 }
914 
915 /**
916  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
917  *
918  * @adev: amdgpu_device pointer
919  *
920  * Allocates a scratch page of VRAM for use by various things in the
921  * driver.
922  */
923 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
924 {
925 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
926 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
927 				       &adev->vram_scratch.robj,
928 				       &adev->vram_scratch.gpu_addr,
929 				       (void **)&adev->vram_scratch.ptr);
930 }
931 
932 /**
933  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
934  *
935  * @adev: amdgpu_device pointer
936  *
937  * Frees the VRAM scratch page.
938  */
939 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
940 {
941 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
942 }
943 
944 /**
945  * amdgpu_device_program_register_sequence - program an array of registers.
946  *
947  * @adev: amdgpu_device pointer
948  * @registers: pointer to the register array
949  * @array_size: size of the register array
950  *
951  * Programs an array of registers with AND and OR masks.
952  * This is a helper for setting golden registers.
953  */
954 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
955 					     const u32 *registers,
956 					     const u32 array_size)
957 {
958 	u32 tmp, reg, and_mask, or_mask;
959 	int i;
960 
961 	if (array_size % 3)
962 		return;
963 
964 	for (i = 0; i < array_size; i += 3) {
965 		reg = registers[i + 0];
966 		and_mask = registers[i + 1];
967 		or_mask = registers[i + 2];
968 
969 		if (and_mask == 0xffffffff) {
970 			tmp = or_mask;
971 		} else {
972 			tmp = RREG32(reg);
973 			tmp &= ~and_mask;
974 			if (adev->family >= AMDGPU_FAMILY_AI)
975 				tmp |= (or_mask & and_mask);
976 			else
977 				tmp |= or_mask;
978 		}
979 		WREG32(reg, tmp);
980 	}
981 }
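
/*
 * Illustrative golden-register table (offsets and masks made up): entries
 * are {offset, and_mask, or_mask} triples; and_mask selects the bits to
 * clear and or_mask supplies the new bits (on AMDGPU_FAMILY_AI and newer
 * the or_mask is additionally masked by and_mask):
 *
 *	static const u32 foo_golden_settings[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *		0x5678, 0x0000000f, 0x00000004,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, foo_golden_settings,
 *						ARRAY_SIZE(foo_golden_settings));
 */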
982 
983 /**
984  * amdgpu_device_pci_config_reset - reset the GPU
985  *
986  * @adev: amdgpu_device pointer
987  *
988  * Resets the GPU using the pci config reset sequence.
989  * Only applicable to asics prior to vega10.
990  */
991 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
992 {
993 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
994 }
995 
996 /**
997  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
998  *
999  * @adev: amdgpu_device pointer
1000  *
1001  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1002  */
1003 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1004 {
1005 	STUB();
1006 	return -ENOSYS;
1007 #ifdef notyet
1008 	return pci_reset_function(adev->pdev);
1009 #endif
1010 }
1011 
1012 /*
1013  * GPU doorbell aperture helper functions.
1014  */
1015 /**
1016  * amdgpu_device_doorbell_init - Init doorbell driver information.
1017  *
1018  * @adev: amdgpu_device pointer
1019  *
1020  * Init doorbell driver information (CIK)
1021  * Returns 0 on success, error on failure.
1022  */
1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1024 {
1025 
1026 	/* No doorbell on SI hardware generation */
1027 	if (adev->asic_type < CHIP_BONAIRE) {
1028 		adev->doorbell.base = 0;
1029 		adev->doorbell.size = 0;
1030 		adev->doorbell.num_doorbells = 0;
1031 		adev->doorbell.ptr = NULL;
1032 		return 0;
1033 	}
1034 
1035 #ifdef __linux__
1036 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1037 		return -EINVAL;
1038 #endif
1039 
1040 	amdgpu_asic_init_doorbell_index(adev);
1041 
1042 	/* doorbell bar mapping */
1043 #ifdef __linux__
1044 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1045 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1046 #endif
1047 
1048 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
1049 					     adev->doorbell_index.max_assignment+1);
1050 	if (adev->doorbell.num_doorbells == 0)
1051 		return -EINVAL;
1052 
1053 	/* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
1054 	 * paging queue doorbell uses the second page. The
1055 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1056 	 * doorbells are in the first page. So with the paging queue enabled,
1057 	 * the max num_doorbells must grow by one page (0x400 in dwords)
1058 	 */
1059 	if (adev->asic_type >= CHIP_VEGA10)
1060 		adev->doorbell.num_doorbells += 0x400;
1061 
1062 #ifdef __linux__
1063 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
1064 				     adev->doorbell.num_doorbells *
1065 				     sizeof(u32));
1066 	if (adev->doorbell.ptr == NULL)
1067 		return -ENOMEM;
1068 #endif
1069 
1070 	return 0;
1071 }
1072 
1073 /**
1074  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1075  *
1076  * @adev: amdgpu_device pointer
1077  *
1078  * Tear down doorbell driver information (CIK)
1079  */
1080 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1081 {
1082 #ifdef __linux__
1083 	iounmap(adev->doorbell.ptr);
1084 #else
1085 	if (adev->doorbell.size > 0)
1086 		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
1087 		    adev->doorbell.size);
1088 #endif
1089 	adev->doorbell.ptr = NULL;
1090 }
1091 
1092 
1093 
1094 /*
1095  * amdgpu_device_wb_*()
1096  * Writeback is the method by which the GPU updates special pages in memory
1097  * with the status of certain GPU events (fences, ring pointers, etc.).
1098  */
1099 
1100 /**
1101  * amdgpu_device_wb_fini - Disable Writeback and free memory
1102  *
1103  * @adev: amdgpu_device pointer
1104  *
1105  * Disables Writeback and frees the Writeback memory (all asics).
1106  * Used at driver shutdown.
1107  */
1108 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1109 {
1110 	if (adev->wb.wb_obj) {
1111 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1112 				      &adev->wb.gpu_addr,
1113 				      (void **)&adev->wb.wb);
1114 		adev->wb.wb_obj = NULL;
1115 	}
1116 }
1117 
1118 /**
1119  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1120  *
1121  * @adev: amdgpu_device pointer
1122  *
1123  * Initializes writeback and allocates writeback memory (all asics).
1124  * Used at driver startup.
1125  * Returns 0 on success or a negative error code on failure.
1126  */
1127 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1128 {
1129 	int r;
1130 
1131 	if (adev->wb.wb_obj == NULL) {
1132 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1133 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1134 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1135 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1136 					    (void **)&adev->wb.wb);
1137 		if (r) {
1138 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1139 			return r;
1140 		}
1141 
1142 		adev->wb.num_wb = AMDGPU_MAX_WB;
1143 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1144 
1145 		/* clear wb memory */
1146 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1147 	}
1148 
1149 	return 0;
1150 }
1151 
1152 /**
1153  * amdgpu_device_wb_get - Allocate a wb entry
1154  *
1155  * @adev: amdgpu_device pointer
1156  * @wb: wb index
1157  *
1158  * Allocate a wb slot for use by the driver (all asics).
1159  * Returns 0 on success or -EINVAL on failure.
1160  */
1161 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1162 {
1163 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1164 
1165 	if (offset < adev->wb.num_wb) {
1166 		__set_bit(offset, adev->wb.used);
1167 		*wb = offset << 3; /* convert to dw offset */
1168 		return 0;
1169 	} else {
1170 		return -EINVAL;
1171 	}
1172 }
1173 
1174 /**
1175  * amdgpu_device_wb_free - Free a wb entry
1176  *
1177  * @adev: amdgpu_device pointer
1178  * @wb: wb index
1179  *
1180  * Free a wb slot allocated for use by the driver (all asics)
1181  */
1182 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1183 {
1184 	wb >>= 3;
1185 	if (wb < adev->wb.num_wb)
1186 		__clear_bit(wb, adev->wb.used);
1187 }
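
/*
 * Usage sketch (illustrative): a caller allocates a wb slot, derives the
 * GPU and CPU addresses from the returned dword offset, and releases the
 * slot on teardown:
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *	gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *	cpu_addr = &adev->wb.wb[wb];
 *	...
 *	amdgpu_device_wb_free(adev, wb);
 */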
1188 
1189 /**
1190  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1191  *
1192  * @adev: amdgpu_device pointer
1193  *
1194  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1195  * to fail, but if any of the BARs is not accessible after the resize we abort
1196  * driver loading by returning -ENODEV.
1197  */
1198 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1199 {
1200 #ifdef __linux__
1201 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1202 	struct pci_bus *root;
1203 	struct resource *res;
1204 	unsigned i;
1205 	u16 cmd;
1206 	int r;
1207 
1208 	/* Bypass for VF */
1209 	if (amdgpu_sriov_vf(adev))
1210 		return 0;
1211 
1212 	/* skip if the bios has already enabled large BAR */
1213 	if (adev->gmc.real_vram_size &&
1214 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1215 		return 0;
1216 
1217 	/* Check if the root BUS has 64bit memory resources */
1218 	root = adev->pdev->bus;
1219 	while (root->parent)
1220 		root = root->parent;
1221 
1222 	pci_bus_for_each_resource(root, res, i) {
1223 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1224 		    res->start > 0x100000000ull)
1225 			break;
1226 	}
1227 
1228 	/* Trying to resize is pointless without a root hub window above 4GB */
1229 	if (!res)
1230 		return 0;
1231 
1232 	/* Limit the BAR size to what is available */
1233 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1234 			rbar_size);
1235 
1236 	/* Disable memory decoding while we change the BAR addresses and size */
1237 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1238 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1239 			      cmd & ~PCI_COMMAND_MEMORY);
1240 
1241 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1242 	amdgpu_device_doorbell_fini(adev);
1243 	if (adev->asic_type >= CHIP_BONAIRE)
1244 		pci_release_resource(adev->pdev, 2);
1245 
1246 	pci_release_resource(adev->pdev, 0);
1247 
1248 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1249 	if (r == -ENOSPC)
1250 		DRM_INFO("Not enough PCI address space for a large BAR.");
1251 	else if (r && r != -ENOTSUPP)
1252 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1253 
1254 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1255 
1256 	/* When the doorbell or fb BAR isn't available we have no chance of
1257 	 * using the device.
1258 	 */
1259 	r = amdgpu_device_doorbell_init(adev);
1260 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1261 		return -ENODEV;
1262 
1263 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1264 #endif /* __linux__ */
1265 
1266 	return 0;
1267 }
1268 
1269 /*
1270  * GPU helper functions.
1271  */
1272 /**
1273  * amdgpu_device_need_post - check whether the hw needs to be posted
1274  *
1275  * @adev: amdgpu_device pointer
1276  *
1277  * Check if the asic needs to be posted, either because it has not been
1278  * initialized at driver startup (all asics) or because a hw reset was performed.
1279  * Returns true if post is needed, false if not.
1280  */
1281 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1282 {
1283 	uint32_t reg;
1284 
1285 	if (amdgpu_sriov_vf(adev))
1286 		return false;
1287 
1288 	if (amdgpu_passthrough(adev)) {
1289 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1290 		 * reboot some old SMC firmware still needs the driver to perform vPost,
1291 		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
1292 		 * this flaw, so we force vPost for SMC versions below 22.15
1293 		 */
1294 		if (adev->asic_type == CHIP_FIJI) {
1295 			int err;
1296 			uint32_t fw_ver;
1297 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1298 			/* force vPost if an error occurred */
1299 			if (err)
1300 				return true;
1301 
1302 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1303 			if (fw_ver < 0x00160e00)
1304 				return true;
1305 		}
1306 	}
1307 
1308 	/* Don't post if we need to reset whole hive on init */
1309 	if (adev->gmc.xgmi.pending_reset)
1310 		return false;
1311 
1312 	if (adev->has_hw_reset) {
1313 		adev->has_hw_reset = false;
1314 		return true;
1315 	}
1316 
1317 	/* bios scratch used on CIK+ */
1318 	if (adev->asic_type >= CHIP_BONAIRE)
1319 		return amdgpu_atombios_scratch_need_asic_init(adev);
1320 
1321 	/* check MEM_SIZE for older asics */
1322 	reg = amdgpu_asic_get_config_memsize(adev);
1323 
1324 	if ((reg != 0) && (reg != 0xffffffff))
1325 		return false;
1326 
1327 	return true;
1328 }
1329 
1330 /* if we get transitioned to only one device, take VGA back */
1331 /**
1332  * amdgpu_device_vga_set_decode - enable/disable vga decode
1333  *
1334  * @pdev: PCI device pointer
1335  * @state: enable/disable vga decode
1336  *
1337  * Enable/disable vga decode (all asics).
1338  * Returns VGA resource flags.
1339  */
1340 #ifdef notyet
1341 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1342 		bool state)
1343 {
1344 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1345 	amdgpu_asic_set_vga_state(adev, state);
1346 	if (state)
1347 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1348 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1349 	else
1350 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1351 }
1352 #endif
1353 
1354 /**
1355  * amdgpu_device_check_block_size - validate the vm block size
1356  *
1357  * @adev: amdgpu_device pointer
1358  *
1359  * Validates the vm block size specified via module parameter.
1360  * The vm block size defines number of bits in page table versus page directory,
1361  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1362  * page table and the remaining bits are in the page directory.
1363  */
1364 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1365 {
1366 	/* defines number of bits in page table versus page directory,
1367 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1368 	 * page table and the remaining bits are in the page directory */
1369 	if (amdgpu_vm_block_size == -1)
1370 		return;
1371 
1372 	if (amdgpu_vm_block_size < 9) {
1373 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1374 			 amdgpu_vm_block_size);
1375 		amdgpu_vm_block_size = -1;
1376 	}
1377 }
1378 
1379 /**
1380  * amdgpu_device_check_vm_size - validate the vm size
1381  *
1382  * @adev: amdgpu_device pointer
1383  *
1384  * Validates the vm size in GB specified via module parameter.
1385  * The VM size is the size of the GPU virtual memory space in GB.
1386  */
1387 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1388 {
1389 	/* no need to check the default value */
1390 	if (amdgpu_vm_size == -1)
1391 		return;
1392 
1393 	if (amdgpu_vm_size < 1) {
1394 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1395 			 amdgpu_vm_size);
1396 		amdgpu_vm_size = -1;
1397 	}
1398 }
1399 
1400 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1401 {
1402 #ifdef __linux__
1403 	struct sysinfo si;
1404 #endif
1405 	bool is_os_64 = (sizeof(void *) == 8);
1406 	uint64_t total_memory;
1407 	uint64_t dram_size_seven_GB = 0x1B8000000;
1408 	uint64_t dram_size_three_GB = 0xB8000000;
1409 
1410 	if (amdgpu_smu_memory_pool_size == 0)
1411 		return;
1412 
1413 	if (!is_os_64) {
1414 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1415 		goto def_value;
1416 	}
1417 #ifdef __linux__
1418 	si_meminfo(&si);
1419 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1420 #else
1421 	total_memory = ptoa(physmem);
1422 #endif
1423 
1424 	if ((amdgpu_smu_memory_pool_size == 1) ||
1425 		(amdgpu_smu_memory_pool_size == 2)) {
1426 		if (total_memory < dram_size_three_GB)
1427 			goto def_value1;
1428 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1429 		(amdgpu_smu_memory_pool_size == 8)) {
1430 		if (total_memory < dram_size_seven_GB)
1431 			goto def_value1;
1432 	} else {
1433 		DRM_WARN("Smu memory pool size not supported\n");
1434 		goto def_value;
1435 	}
1436 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1437 
1438 	return;
1439 
1440 def_value1:
1441 	DRM_WARN("Not enough system memory\n");
1442 def_value:
1443 	adev->pm.smu_prv_buffer_size = 0;
1444 }
1445 
1446 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1447 {
1448 	if (!(adev->flags & AMD_IS_APU) ||
1449 	    adev->asic_type < CHIP_RAVEN)
1450 		return 0;
1451 
1452 	switch (adev->asic_type) {
1453 	case CHIP_RAVEN:
1454 		if (adev->pdev->device == 0x15dd)
1455 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1456 		if (adev->pdev->device == 0x15d8)
1457 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1458 		break;
1459 	case CHIP_RENOIR:
1460 		if ((adev->pdev->device == 0x1636) ||
1461 		    (adev->pdev->device == 0x164c))
1462 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1463 		else
1464 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1465 		break;
1466 	case CHIP_VANGOGH:
1467 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1468 		break;
1469 	case CHIP_YELLOW_CARP:
1470 		break;
1471 	case CHIP_CYAN_SKILLFISH:
1472 		if (adev->pdev->device == 0x13FE)
1473 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1474 		break;
1475 	default:
1476 		return -EINVAL;
1477 	}
1478 
1479 	return 0;
1480 }
1481 
1482 /**
1483  * amdgpu_device_check_arguments - validate module params
1484  *
1485  * @adev: amdgpu_device pointer
1486  *
1487  * Validates certain module parameters and updates
1488  * the associated values used by the driver (all asics).
1489  */
1490 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1491 {
1492 	if (amdgpu_sched_jobs < 4) {
1493 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1494 			 amdgpu_sched_jobs);
1495 		amdgpu_sched_jobs = 4;
1496 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1497 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1498 			 amdgpu_sched_jobs);
1499 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1500 	}
1501 
1502 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1503 		/* gart size must be greater than or equal to 32M */
1504 		dev_warn(adev->dev, "gart size (%d) too small\n",
1505 			 amdgpu_gart_size);
1506 		amdgpu_gart_size = -1;
1507 	}
1508 
1509 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1510 		/* gtt size must be greater than or equal to 32M */
1511 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1512 				 amdgpu_gtt_size);
1513 		amdgpu_gtt_size = -1;
1514 	}
1515 
1516 	/* valid range is between 4 and 9 inclusive */
1517 	if (amdgpu_vm_fragment_size != -1 &&
1518 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1519 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1520 		amdgpu_vm_fragment_size = -1;
1521 	}
1522 
1523 	if (amdgpu_sched_hw_submission < 2) {
1524 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1525 			 amdgpu_sched_hw_submission);
1526 		amdgpu_sched_hw_submission = 2;
1527 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1528 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1529 			 amdgpu_sched_hw_submission);
1530 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1531 	}
1532 
1533 	amdgpu_device_check_smu_prv_buffer_size(adev);
1534 
1535 	amdgpu_device_check_vm_size(adev);
1536 
1537 	amdgpu_device_check_block_size(adev);
1538 
1539 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1540 
1541 	amdgpu_gmc_tmz_set(adev);
1542 
1543 	amdgpu_gmc_noretry_set(adev);
1544 
1545 	return 0;
1546 }
1547 
1548 #ifdef __linux__
1549 /**
1550  * amdgpu_switcheroo_set_state - set switcheroo state
1551  *
1552  * @pdev: pci dev pointer
1553  * @state: vga_switcheroo state
1554  *
1555  * Callback for the switcheroo driver.  Suspends or resumes the
1556  * asic before or after it is powered up using ACPI methods.
1557  */
1558 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1559 					enum vga_switcheroo_state state)
1560 {
1561 	struct drm_device *dev = pci_get_drvdata(pdev);
1562 	int r;
1563 
1564 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1565 		return;
1566 
1567 	if (state == VGA_SWITCHEROO_ON) {
1568 		pr_info("switched on\n");
1569 		/* don't suspend or resume card normally */
1570 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 
1572 		pci_set_power_state(pdev, PCI_D0);
1573 		amdgpu_device_load_pci_state(pdev);
1574 		r = pci_enable_device(pdev);
1575 		if (r)
1576 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1577 		amdgpu_device_resume(dev, true);
1578 
1579 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1580 	} else {
1581 		pr_info("switched off\n");
1582 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1583 		amdgpu_device_suspend(dev, true);
1584 		amdgpu_device_cache_pci_state(pdev);
1585 		/* Shut down the device */
1586 		pci_disable_device(pdev);
1587 		pci_set_power_state(pdev, PCI_D3cold);
1588 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1589 	}
1590 }
1591 
1592 /**
1593  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1594  *
1595  * @pdev: pci dev pointer
1596  *
1597  * Callback for the switcheroo driver.  Checks if the switcheroo
1598  * state can be changed.
1599  * Returns true if the state can be changed, false if not.
1600  */
1601 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1602 {
1603 	struct drm_device *dev = pci_get_drvdata(pdev);
1604 
1605 	/*
1606 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1607 	* locking inversion with the driver load path. And the access here is
1608 	* completely racy anyway. So don't bother with locking for now.
1609 	*/
1610 	return atomic_read(&dev->open_count) == 0;
1611 }
1612 #endif /* __linux__ */
1613 
1614 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1615 #ifdef notyet
1616 	.set_gpu_state = amdgpu_switcheroo_set_state,
1617 	.reprobe = NULL,
1618 	.can_switch = amdgpu_switcheroo_can_switch,
1619 #endif
1620 };
1621 
1622 /**
1623  * amdgpu_device_ip_set_clockgating_state - set the CG state
1624  *
1625  * @dev: amdgpu_device pointer
1626  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1627  * @state: clockgating state (gate or ungate)
1628  *
1629  * Sets the requested clockgating state for all instances of
1630  * the hardware IP specified.
1631  * Returns the error code from the last instance.
1632  */
1633 int amdgpu_device_ip_set_clockgating_state(void *dev,
1634 					   enum amd_ip_block_type block_type,
1635 					   enum amd_clockgating_state state)
1636 {
1637 	struct amdgpu_device *adev = dev;
1638 	int i, r = 0;
1639 
1640 	for (i = 0; i < adev->num_ip_blocks; i++) {
1641 		if (!adev->ip_blocks[i].status.valid)
1642 			continue;
1643 		if (adev->ip_blocks[i].version->type != block_type)
1644 			continue;
1645 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1646 			continue;
1647 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1648 			(void *)adev, state);
1649 		if (r)
1650 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1651 				  adev->ip_blocks[i].version->funcs->name, r);
1652 	}
1653 	return r;
1654 }
1655 
1656 /**
1657  * amdgpu_device_ip_set_powergating_state - set the PG state
1658  *
1659  * @dev: amdgpu_device pointer
1660  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1661  * @state: powergating state (gate or ungate)
1662  *
1663  * Sets the requested powergating state for all instances of
1664  * the hardware IP specified.
1665  * Returns the error code from the last instance.
1666  */
1667 int amdgpu_device_ip_set_powergating_state(void *dev,
1668 					   enum amd_ip_block_type block_type,
1669 					   enum amd_powergating_state state)
1670 {
1671 	struct amdgpu_device *adev = dev;
1672 	int i, r = 0;
1673 
1674 	for (i = 0; i < adev->num_ip_blocks; i++) {
1675 		if (!adev->ip_blocks[i].status.valid)
1676 			continue;
1677 		if (adev->ip_blocks[i].version->type != block_type)
1678 			continue;
1679 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1680 			continue;
1681 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1682 			(void *)adev, state);
1683 		if (r)
1684 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1685 				  adev->ip_blocks[i].version->funcs->name, r);
1686 	}
1687 	return r;
1688 }
1689 
1690 /**
1691  * amdgpu_device_ip_get_clockgating_state - get the CG state
1692  *
1693  * @adev: amdgpu_device pointer
1694  * @flags: clockgating feature flags
1695  *
1696  * Walks the list of IPs on the device and updates the clockgating
1697  * flags for each IP.
1698  * Updates @flags with the feature flags for each hardware IP where
1699  * clockgating is enabled.
1700  */
1701 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1702 					    u32 *flags)
1703 {
1704 	int i;
1705 
1706 	for (i = 0; i < adev->num_ip_blocks; i++) {
1707 		if (!adev->ip_blocks[i].status.valid)
1708 			continue;
1709 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1710 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1711 	}
1712 }
1713 
1714 /**
1715  * amdgpu_device_ip_wait_for_idle - wait for idle
1716  *
1717  * @adev: amdgpu_device pointer
1718  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1719  *
1720  * Waits for the requested hardware IP to be idle.
1721  * Returns 0 for success or a negative error code on failure.
1722  */
1723 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1724 				   enum amd_ip_block_type block_type)
1725 {
1726 	int i, r;
1727 
1728 	for (i = 0; i < adev->num_ip_blocks; i++) {
1729 		if (!adev->ip_blocks[i].status.valid)
1730 			continue;
1731 		if (adev->ip_blocks[i].version->type == block_type) {
1732 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1733 			if (r)
1734 				return r;
1735 			break;
1736 		}
1737 	}
1738 	return 0;
1739 
1740 }
1741 
1742 /**
1743  * amdgpu_device_ip_is_idle - is the hardware IP idle
1744  *
1745  * @adev: amdgpu_device pointer
1746  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1747  *
1748  * Check if the hardware IP is idle or not.
1749  * Returns true if the IP is idle, false if not.
1750  */
1751 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1752 			      enum amd_ip_block_type block_type)
1753 {
1754 	int i;
1755 
1756 	for (i = 0; i < adev->num_ip_blocks; i++) {
1757 		if (!adev->ip_blocks[i].status.valid)
1758 			continue;
1759 		if (adev->ip_blocks[i].version->type == block_type)
1760 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1761 	}
1762 	return true;
1763 
1764 }
1765 
1766 /**
1767  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1768  *
1769  * @adev: amdgpu_device pointer
1770  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1771  *
1772  * Returns a pointer to the hardware IP block structure
1773  * if it exists for the asic, otherwise NULL.
1774  */
1775 struct amdgpu_ip_block *
1776 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1777 			      enum amd_ip_block_type type)
1778 {
1779 	int i;
1780 
1781 	for (i = 0; i < adev->num_ip_blocks; i++)
1782 		if (adev->ip_blocks[i].version->type == type)
1783 			return &adev->ip_blocks[i];
1784 
1785 	return NULL;
1786 }
1787 
1788 /**
1789  * amdgpu_device_ip_block_version_cmp
1790  *
1791  * @adev: amdgpu_device pointer
1792  * @type: enum amd_ip_block_type
1793  * @major: major version
1794  * @minor: minor version
1795  *
1796  * Returns 0 if the asic's IP block version is equal to or greater than the
1797  * requested version, 1 if it is smaller or the ip_block doesn't exist.
1798  */
1799 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1800 				       enum amd_ip_block_type type,
1801 				       u32 major, u32 minor)
1802 {
1803 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1804 
1805 	if (ip_block && ((ip_block->version->major > major) ||
1806 			((ip_block->version->major == major) &&
1807 			(ip_block->version->minor >= minor))))
1808 		return 0;
1809 
1810 	return 1;
1811 }
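
/*
 * Usage sketch (illustrative): gate a code path on a minimum IP version,
 * here GFX 9.0 or newer:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       9, 0) == 0) {
 *		...	version is 9.0 or greater
 *	}
 */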
1812 
1813 /**
1814  * amdgpu_device_ip_block_add
1815  *
1816  * @adev: amdgpu_device pointer
1817  * @ip_block_version: pointer to the IP to add
1818  *
1819  * Adds the IP block driver information to the collection of IPs
1820  * on the asic.
1821  */
1822 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1823 			       const struct amdgpu_ip_block_version *ip_block_version)
1824 {
1825 	if (!ip_block_version)
1826 		return -EINVAL;
1827 
1828 	switch (ip_block_version->type) {
1829 	case AMD_IP_BLOCK_TYPE_VCN:
1830 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1831 			return 0;
1832 		break;
1833 	case AMD_IP_BLOCK_TYPE_JPEG:
1834 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1835 			return 0;
1836 		break;
1837 	default:
1838 		break;
1839 	}
1840 
1841 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1842 		  ip_block_version->funcs->name);
1843 
1844 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1845 
1846 	return 0;
1847 }
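
/*
 * Illustrative usage sketch: the per-asic *_set_ip_blocks() helpers
 * register their blocks with a sequence of calls along these lines
 * (vi_common_ip_block is one of the real block tables, see vi.c):
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 */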
1848 
1849 /**
1850  * amdgpu_device_enable_virtual_display - enable virtual display feature
1851  *
1852  * @adev: amdgpu_device pointer
1853  *
1854  * Enables the virtual display feature if the user has enabled it via
1855  * the module parameter virtual_display.  This feature provides virtual
1856  * display hardware on headless boards or in virtualized environments.
1857  * This function parses and validates the configuration string specified by
1858  * the user and configures the virtual display configuration (number of
1859  * virtual connectors, crtcs, etc.) specified.
1860  */
1861 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1862 {
1863 	adev->enable_virtual_display = false;
1864 
1865 #ifdef notyet
1866 	if (amdgpu_virtual_display) {
1867 		const char *pci_address_name = pci_name(adev->pdev);
1868 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1869 
1870 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1871 		pciaddstr_tmp = pciaddstr;
1872 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1873 			pciaddname = strsep(&pciaddname_tmp, ",");
1874 			if (!strcmp("all", pciaddname)
1875 			    || !strcmp(pci_address_name, pciaddname)) {
1876 				long num_crtc;
1877 				int res = -1;
1878 
1879 				adev->enable_virtual_display = true;
1880 
1881 				if (pciaddname_tmp)
1882 					res = kstrtol(pciaddname_tmp, 10,
1883 						      &num_crtc);
1884 
1885 				if (!res) {
1886 					if (num_crtc < 1)
1887 						num_crtc = 1;
1888 					if (num_crtc > 6)
1889 						num_crtc = 6;
1890 					adev->mode_info.num_crtc = num_crtc;
1891 				} else {
1892 					adev->mode_info.num_crtc = 1;
1893 				}
1894 				break;
1895 			}
1896 		}
1897 
1898 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1899 			 amdgpu_virtual_display, pci_address_name,
1900 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1901 
1902 		kfree(pciaddstr);
1903 	}
1904 #endif
1905 }
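
/*
 * Illustrative parameter format, as parsed by the (currently disabled)
 * block above: a semicolon-separated list of <pci address>[,<num_crtc>]
 * entries, where "all" matches every device and num_crtc is clamped to
 * the range [1, 6]. The addresses below are made up:
 *
 *	amdgpu.virtual_display=0000:01:00.0,2;0000:02:00.0,1
 *	amdgpu.virtual_display=all,1
 */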
1906 
1907 /**
1908  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1909  *
1910  * @adev: amdgpu_device pointer
1911  *
1912  * Parses the asic configuration parameters specified in the gpu info
1913  * firmware and makes them available to the driver for use in configuring
1914  * the asic.
1915  * Returns 0 on success, -EINVAL on failure.
1916  */
1917 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1918 {
1919 	const char *chip_name;
1920 	char fw_name[40];
1921 	int err;
1922 	const struct gpu_info_firmware_header_v1_0 *hdr;
1923 
1924 	adev->firmware.gpu_info_fw = NULL;
1925 
1926 	if (adev->mman.discovery_bin) {
1927 		amdgpu_discovery_get_gfx_info(adev);
1928 
1929 		/*
1930 		 * FIXME: The bounding box is still needed by Navi12, so
1931 		 * temporarily read it from gpu_info firmware. Should be dropped
1932 		 * when DAL no longer needs it.
1933 		 */
1934 		if (adev->asic_type != CHIP_NAVI12)
1935 			return 0;
1936 	}
1937 
1938 	switch (adev->asic_type) {
1939 #ifdef CONFIG_DRM_AMDGPU_SI
1940 	case CHIP_VERDE:
1941 	case CHIP_TAHITI:
1942 	case CHIP_PITCAIRN:
1943 	case CHIP_OLAND:
1944 	case CHIP_HAINAN:
1945 #endif
1946 #ifdef CONFIG_DRM_AMDGPU_CIK
1947 	case CHIP_BONAIRE:
1948 	case CHIP_HAWAII:
1949 	case CHIP_KAVERI:
1950 	case CHIP_KABINI:
1951 	case CHIP_MULLINS:
1952 #endif
1953 	case CHIP_TOPAZ:
1954 	case CHIP_TONGA:
1955 	case CHIP_FIJI:
1956 	case CHIP_POLARIS10:
1957 	case CHIP_POLARIS11:
1958 	case CHIP_POLARIS12:
1959 	case CHIP_VEGAM:
1960 	case CHIP_CARRIZO:
1961 	case CHIP_STONEY:
1962 	case CHIP_VEGA20:
1963 	case CHIP_ALDEBARAN:
1964 	case CHIP_SIENNA_CICHLID:
1965 	case CHIP_NAVY_FLOUNDER:
1966 	case CHIP_DIMGREY_CAVEFISH:
1967 	case CHIP_BEIGE_GOBY:
1968 	default:
1969 		return 0;
1970 	case CHIP_VEGA10:
1971 		chip_name = "vega10";
1972 		break;
1973 	case CHIP_VEGA12:
1974 		chip_name = "vega12";
1975 		break;
1976 	case CHIP_RAVEN:
1977 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1978 			chip_name = "raven2";
1979 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1980 			chip_name = "picasso";
1981 		else
1982 			chip_name = "raven";
1983 		break;
1984 	case CHIP_ARCTURUS:
1985 		chip_name = "arcturus";
1986 		break;
1987 	case CHIP_RENOIR:
1988 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1989 			chip_name = "renoir";
1990 		else
1991 			chip_name = "green_sardine";
1992 		break;
1993 	case CHIP_NAVI10:
1994 		chip_name = "navi10";
1995 		break;
1996 	case CHIP_NAVI14:
1997 		chip_name = "navi14";
1998 		break;
1999 	case CHIP_NAVI12:
2000 		chip_name = "navi12";
2001 		break;
2002 	case CHIP_VANGOGH:
2003 		chip_name = "vangogh";
2004 		break;
2005 	case CHIP_YELLOW_CARP:
2006 		chip_name = "yellow_carp";
2007 		break;
2008 	}
2009 
2010 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
2011 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
2012 	if (err) {
2013 		dev_err(adev->dev,
2014 			"Failed to load gpu_info firmware \"%s\"\n",
2015 			fw_name);
2016 		goto out;
2017 	}
2018 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
2019 	if (err) {
2020 		dev_err(adev->dev,
2021 			"Failed to validate gpu_info firmware \"%s\"\n",
2022 			fw_name);
2023 		goto out;
2024 	}
2025 
2026 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2027 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2028 
2029 	switch (hdr->version_major) {
2030 	case 1:
2031 	{
2032 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2033 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2034 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2035 
2036 		/*
2037 		 * Should be dropped when DAL no longer needs it.
2038 		 */
2039 		if (adev->asic_type == CHIP_NAVI12)
2040 			goto parse_soc_bounding_box;
2041 
2042 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2043 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2044 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2045 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2046 		adev->gfx.config.max_texture_channel_caches =
2047 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
2048 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2049 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2050 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2051 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2052 		adev->gfx.config.double_offchip_lds_buf =
2053 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2054 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2055 		adev->gfx.cu_info.max_waves_per_simd =
2056 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2057 		adev->gfx.cu_info.max_scratch_slots_per_cu =
2058 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2059 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2060 		if (hdr->version_minor >= 1) {
2061 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2062 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2063 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2064 			adev->gfx.config.num_sc_per_sh =
2065 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2066 			adev->gfx.config.num_packer_per_sc =
2067 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2068 		}
2069 
2070 parse_soc_bounding_box:
2071 		/*
2072 		 * soc bounding box info is not integrated in the discovery table,
2073 		 * so we always need to parse it from the gpu_info firmware.
2074 		 */
2075 		if (hdr->version_minor == 2) {
2076 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2077 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2078 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2079 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2080 		}
2081 		break;
2082 	}
2083 	default:
2084 		dev_err(adev->dev,
2085 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2086 		err = -EINVAL;
2087 		goto out;
2088 	}
2089 out:
2090 	return err;
2091 }
2092 
2093 /**
2094  * amdgpu_device_ip_early_init - run early init for hardware IPs
2095  *
2096  * @adev: amdgpu_device pointer
2097  *
2098  * Early initialization pass for hardware IPs.  The hardware IPs that make
2099  * up each asic are discovered and each IP's early_init callback is run.  This
2100  * is the first stage in initializing the asic.
2101  * Returns 0 on success, negative error code on failure.
2102  */
2103 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2104 {
2105 	struct drm_device *dev = adev_to_drm(adev);
2106 	struct pci_dev *parent;
2107 	int i, r;
2108 
2109 	amdgpu_device_enable_virtual_display(adev);
2110 
2111 	if (amdgpu_sriov_vf(adev)) {
2112 		r = amdgpu_virt_request_full_gpu(adev, true);
2113 		if (r)
2114 			return r;
2115 	}
2116 
2117 	switch (adev->asic_type) {
2118 #ifdef CONFIG_DRM_AMDGPU_SI
2119 	case CHIP_VERDE:
2120 	case CHIP_TAHITI:
2121 	case CHIP_PITCAIRN:
2122 	case CHIP_OLAND:
2123 	case CHIP_HAINAN:
2124 		adev->family = AMDGPU_FAMILY_SI;
2125 		r = si_set_ip_blocks(adev);
2126 		if (r)
2127 			return r;
2128 		break;
2129 #endif
2130 #ifdef CONFIG_DRM_AMDGPU_CIK
2131 	case CHIP_BONAIRE:
2132 	case CHIP_HAWAII:
2133 	case CHIP_KAVERI:
2134 	case CHIP_KABINI:
2135 	case CHIP_MULLINS:
2136 		if (adev->flags & AMD_IS_APU)
2137 			adev->family = AMDGPU_FAMILY_KV;
2138 		else
2139 			adev->family = AMDGPU_FAMILY_CI;
2140 
2141 		r = cik_set_ip_blocks(adev);
2142 		if (r)
2143 			return r;
2144 		break;
2145 #endif
2146 	case CHIP_TOPAZ:
2147 	case CHIP_TONGA:
2148 	case CHIP_FIJI:
2149 	case CHIP_POLARIS10:
2150 	case CHIP_POLARIS11:
2151 	case CHIP_POLARIS12:
2152 	case CHIP_VEGAM:
2153 	case CHIP_CARRIZO:
2154 	case CHIP_STONEY:
2155 		if (adev->flags & AMD_IS_APU)
2156 			adev->family = AMDGPU_FAMILY_CZ;
2157 		else
2158 			adev->family = AMDGPU_FAMILY_VI;
2159 
2160 		r = vi_set_ip_blocks(adev);
2161 		if (r)
2162 			return r;
2163 		break;
2164 	case CHIP_VEGA10:
2165 	case CHIP_VEGA12:
2166 	case CHIP_VEGA20:
2167 	case CHIP_RAVEN:
2168 	case CHIP_ARCTURUS:
2169 	case CHIP_RENOIR:
2170 	case CHIP_ALDEBARAN:
2171 		if (adev->flags & AMD_IS_APU)
2172 			adev->family = AMDGPU_FAMILY_RV;
2173 		else
2174 			adev->family = AMDGPU_FAMILY_AI;
2175 
2176 		r = soc15_set_ip_blocks(adev);
2177 		if (r)
2178 			return r;
2179 		break;
2180 	case  CHIP_NAVI10:
2181 	case  CHIP_NAVI14:
2182 	case  CHIP_NAVI12:
2183 	case  CHIP_SIENNA_CICHLID:
2184 	case  CHIP_NAVY_FLOUNDER:
2185 	case  CHIP_DIMGREY_CAVEFISH:
2186 	case  CHIP_BEIGE_GOBY:
2187 	case CHIP_VANGOGH:
2188 	case CHIP_YELLOW_CARP:
2189 	case CHIP_CYAN_SKILLFISH:
2190 		if (adev->asic_type == CHIP_VANGOGH)
2191 			adev->family = AMDGPU_FAMILY_VGH;
2192 		else if (adev->asic_type == CHIP_YELLOW_CARP)
2193 			adev->family = AMDGPU_FAMILY_YC;
2194 		else
2195 			adev->family = AMDGPU_FAMILY_NV;
2196 
2197 		r = nv_set_ip_blocks(adev);
2198 		if (r)
2199 			return r;
2200 		break;
2201 	default:
2202 		/* FIXME: not supported yet */
2203 		return -EINVAL;
2204 	}
2205 
2206 	if (amdgpu_has_atpx() &&
2207 	    (amdgpu_is_atpx_hybrid() ||
2208 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2209 	    ((adev->flags & AMD_IS_APU) == 0) &&
2210 	    !pci_is_thunderbolt_attached(dev->pdev))
2211 		adev->flags |= AMD_IS_PX;
2212 
2213 	if (!(adev->flags & AMD_IS_APU)) {
2214 		parent = pci_upstream_bridge(adev->pdev);
2215 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2216 	}
2217 
2218 	amdgpu_amdkfd_device_probe(adev);
2219 
2220 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2221 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2222 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2223 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2224 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2225 
2226 	for (i = 0; i < adev->num_ip_blocks; i++) {
2227 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2228 			DRM_ERROR("disabled ip block: %d <%s>\n",
2229 				  i, adev->ip_blocks[i].version->funcs->name);
2230 			adev->ip_blocks[i].status.valid = false;
2231 		} else {
2232 			if (adev->ip_blocks[i].version->funcs->early_init) {
2233 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2234 				if (r == -ENOENT) {
2235 					adev->ip_blocks[i].status.valid = false;
2236 				} else if (r) {
2237 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2238 						  adev->ip_blocks[i].version->funcs->name, r);
2239 					return r;
2240 				} else {
2241 					adev->ip_blocks[i].status.valid = true;
2242 				}
2243 			} else {
2244 				adev->ip_blocks[i].status.valid = true;
2245 			}
2246 		}
2247 		/* get the vbios after the asic_funcs are set up */
2248 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2249 			r = amdgpu_device_parse_gpu_info_fw(adev);
2250 			if (r)
2251 				return r;
2252 
2253 			/* Read BIOS */
2254 			if (!amdgpu_get_bios(adev))
2255 				return -EINVAL;
2256 
2257 			r = amdgpu_atombios_init(adev);
2258 			if (r) {
2259 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2260 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2261 				return r;
2262 			}
2263 
2264 			/* get pf2vf msg info at its earliest time */
2265 			if (amdgpu_sriov_vf(adev))
2266 				amdgpu_virt_init_data_exchange(adev);
2267 
2268 		}
2269 	}
2270 
2271 	adev->cg_flags &= amdgpu_cg_mask;
2272 	adev->pg_flags &= amdgpu_pg_mask;
2273 
2274 	return 0;
2275 }
2276 
2277 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2278 {
2279 	int i, r;
2280 
2281 	for (i = 0; i < adev->num_ip_blocks; i++) {
2282 		if (!adev->ip_blocks[i].status.sw)
2283 			continue;
2284 		if (adev->ip_blocks[i].status.hw)
2285 			continue;
2286 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2287 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2288 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2289 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2290 			if (r) {
2291 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2292 					  adev->ip_blocks[i].version->funcs->name, r);
2293 				return r;
2294 			}
2295 			adev->ip_blocks[i].status.hw = true;
2296 		}
2297 	}
2298 
2299 	return 0;
2300 }
2301 
2302 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2303 {
2304 	int i, r;
2305 
2306 	for (i = 0; i < adev->num_ip_blocks; i++) {
2307 		if (!adev->ip_blocks[i].status.sw)
2308 			continue;
2309 		if (adev->ip_blocks[i].status.hw)
2310 			continue;
2311 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2312 		if (r) {
2313 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2314 				  adev->ip_blocks[i].version->funcs->name, r);
2315 			return r;
2316 		}
2317 		adev->ip_blocks[i].status.hw = true;
2318 	}
2319 
2320 	return 0;
2321 }
2322 
2323 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2324 {
2325 	int r = 0;
2326 	int i;
2327 	uint32_t smu_version;
2328 
2329 	if (adev->asic_type >= CHIP_VEGA10) {
2330 		for (i = 0; i < adev->num_ip_blocks; i++) {
2331 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2332 				continue;
2333 
2334 			if (!adev->ip_blocks[i].status.sw)
2335 				continue;
2336 
2337 			/* no need to do the fw loading again if already done */
2338 			if (adev->ip_blocks[i].status.hw == true)
2339 				break;
2340 
2341 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2342 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2343 				if (r) {
2344 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2345 							  adev->ip_blocks[i].version->funcs->name, r);
2346 					return r;
2347 				}
2348 			} else {
2349 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2350 				if (r) {
2351 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2352 							  adev->ip_blocks[i].version->funcs->name, r);
2353 					return r;
2354 				}
2355 			}
2356 
2357 			adev->ip_blocks[i].status.hw = true;
2358 			break;
2359 		}
2360 	}
2361 
2362 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2363 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2364 
2365 	return r;
2366 }
2367 
2368 /**
2369  * amdgpu_device_ip_init - run init for hardware IPs
2370  *
2371  * @adev: amdgpu_device pointer
2372  *
2373  * Main initialization pass for hardware IPs.  The list of all the hardware
2374  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2375  * are run.  sw_init initializes the software state associated with each IP
2376  * and hw_init initializes the hardware associated with each IP.
2377  * Returns 0 on success, negative error code on failure.
2378  */
2379 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2380 {
2381 	int i, r;
2382 
2383 	r = amdgpu_ras_init(adev);
2384 	if (r)
2385 		return r;
2386 
2387 	for (i = 0; i < adev->num_ip_blocks; i++) {
2388 		if (!adev->ip_blocks[i].status.valid)
2389 			continue;
2390 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2391 		if (r) {
2392 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2393 				  adev->ip_blocks[i].version->funcs->name, r);
2394 			goto init_failed;
2395 		}
2396 		adev->ip_blocks[i].status.sw = true;
2397 
2398 		/* need to do gmc hw init early so we can allocate gpu mem */
2399 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2400 			r = amdgpu_device_vram_scratch_init(adev);
2401 			if (r) {
2402 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2403 				goto init_failed;
2404 			}
2405 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2406 			if (r) {
2407 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2408 				goto init_failed;
2409 			}
2410 			r = amdgpu_device_wb_init(adev);
2411 			if (r) {
2412 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2413 				goto init_failed;
2414 			}
2415 			adev->ip_blocks[i].status.hw = true;
2416 
2417 			/* right after GMC hw init, we create CSA */
2418 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2419 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2420 								AMDGPU_GEM_DOMAIN_VRAM,
2421 								AMDGPU_CSA_SIZE);
2422 				if (r) {
2423 					DRM_ERROR("allocate CSA failed %d\n", r);
2424 					goto init_failed;
2425 				}
2426 			}
2427 		}
2428 	}
2429 
2430 	if (amdgpu_sriov_vf(adev))
2431 		amdgpu_virt_init_data_exchange(adev);
2432 
2433 	r = amdgpu_ib_pool_init(adev);
2434 	if (r) {
2435 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2436 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2437 		goto init_failed;
2438 	}
2439 
2440 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2441 	if (r)
2442 		goto init_failed;
2443 
2444 	r = amdgpu_amdkfd_resume_iommu(adev);
2445 	if (r)
2446 		goto init_failed;
2447 
2448 	r = amdgpu_device_ip_hw_init_phase1(adev);
2449 	if (r)
2450 		goto init_failed;
2451 
2452 	r = amdgpu_device_fw_loading(adev);
2453 	if (r)
2454 		goto init_failed;
2455 
2456 	r = amdgpu_device_ip_hw_init_phase2(adev);
2457 	if (r)
2458 		goto init_failed;
2459 
2460 	/*
2461 	 * Retired pages will be loaded from eeprom and reserved here; this
2462 	 * should be called after amdgpu_device_ip_hw_init_phase2 since
2463 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2464 	 * functional for I2C communication, which is only true at this point.
2465 	 *
2466 	 * amdgpu_ras_recovery_init may fail, but the upper level only cares
2467 	 * about failures caused by a bad gpu situation and stops the amdgpu
2468 	 * init process accordingly. For other failure cases, it still releases
2469 	 * all the resources and prints an error message, rather than returning
2470 	 * a negative value to the upper level.
2471 	 *
2472 	 * Note: theoretically, this should be called before all vram allocations
2473 	 * to keep retired pages from being allocated and reused.
2474 	 */
2475 	r = amdgpu_ras_recovery_init(adev);
2476 	if (r)
2477 		goto init_failed;
2478 
2479 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2480 		amdgpu_xgmi_add_device(adev);
2481 
2482 	/* Don't init kfd if the whole hive needs to be reset during init */
2483 	if (!adev->gmc.xgmi.pending_reset)
2484 		amdgpu_amdkfd_device_init(adev);
2485 
2486 	amdgpu_fru_get_product_info(adev);
2487 
2488 init_failed:
2489 	if (amdgpu_sriov_vf(adev))
2490 		amdgpu_virt_release_full_gpu(adev, true);
2491 
2492 	return r;
2493 }
2494 
2495 /**
2496  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2497  *
2498  * @adev: amdgpu_device pointer
2499  *
2500  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2501  * this function before a GPU reset.  If the value is retained after a
2502  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2503  */
2504 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2505 {
2506 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2507 }
2508 
2509 /**
2510  * amdgpu_device_check_vram_lost - check if vram is valid
2511  *
2512  * @adev: amdgpu_device pointer
2513  *
2514  * Checks the reset magic value written to the gart pointer in VRAM.
2515  * The driver calls this after a GPU reset to see if the contents of
2516  * VRAM have been lost or not.
2517  * Returns true if vram is lost, false if not.
2518  */
2519 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2520 {
2521 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2522 			AMDGPU_RESET_MAGIC_NUM))
2523 		return true;
2524 
2525 	if (!amdgpu_in_reset(adev))
2526 		return false;
2527 
2528 	/*
2529 	 * For all ASICs with baco/mode1 reset, the VRAM is
2530 	 * always assumed to be lost.
2531 	 */
2532 	switch (amdgpu_asic_reset_method(adev)) {
2533 	case AMD_RESET_METHOD_BACO:
2534 	case AMD_RESET_METHOD_MODE1:
2535 		return true;
2536 	default:
2537 		return false;
2538 	}
2539 }
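
/*
 * Illustrative round trip (a sketch of how the two helpers above pair up;
 * the real call sites are elsewhere in this file): late init snapshots the
 * magic with amdgpu_device_fill_reset_magic(), and after a reset
 * amdgpu_device_check_vram_lost() compares the GART copy against it:
 *
 *	if (amdgpu_device_check_vram_lost(adev))
 *		DRM_INFO("VRAM is lost due to GPU reset!\n");
 */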
2540 
2541 /**
2542  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2543  *
2544  * @adev: amdgpu_device pointer
2545  * @state: clockgating state (gate or ungate)
2546  *
2547  * The list of all the hardware IPs that make up the asic is walked and the
2548  * set_clockgating_state callbacks are run.  The late init pass enables
2549  * clockgating for the hardware IPs, while the fini and suspend passes
2550  * disable it.
2551  * Returns 0 on success, negative error code on failure.
2552  */
2554 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2555 			       enum amd_clockgating_state state)
2556 {
2557 	int i, j, r;
2558 
2559 	if (amdgpu_emu_mode == 1)
2560 		return 0;
2561 
2562 	for (j = 0; j < adev->num_ip_blocks; j++) {
2563 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2564 		if (!adev->ip_blocks[i].status.late_initialized)
2565 			continue;
2566 		/* skip CG for GFX on S0ix */
2567 		if (adev->in_s0ix &&
2568 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2569 			continue;
2570 		/* skip CG for VCE/UVD, it's handled specially */
2571 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2572 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2573 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2574 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2575 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2576 			/* enable clockgating to save power */
2577 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2578 										     state);
2579 			if (r) {
2580 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2581 					  adev->ip_blocks[i].version->funcs->name, r);
2582 				return r;
2583 			}
2584 		}
2585 	}
2586 
2587 	return 0;
2588 }
2589 
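/**
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: powergating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_powergating_state callbacks are run.  The late init pass enables
 * powergating, while the fini and suspend passes disable it.
 * Returns 0 on success, negative error code on failure.
 */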
2590 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2591 			       enum amd_powergating_state state)
2592 {
2593 	int i, j, r;
2594 
2595 	if (amdgpu_emu_mode == 1)
2596 		return 0;
2597 
2598 	for (j = 0; j < adev->num_ip_blocks; j++) {
2599 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2600 		if (!adev->ip_blocks[i].status.late_initialized)
2601 			continue;
2602 		/* skip PG for GFX on S0ix */
2603 		if (adev->in_s0ix &&
2604 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2605 			continue;
2606 		/* skip PG for VCE/UVD, it's handled specially */
2607 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2608 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2609 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2610 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2611 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2612 			/* enable powergating to save power */
2613 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2614 											state);
2615 			if (r) {
2616 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2617 					  adev->ip_blocks[i].version->funcs->name, r);
2618 				return r;
2619 			}
2620 		}
2621 	}
2622 	return 0;
2623 }
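
/*
 * Illustrative usage: late init gates both states to save power, and the
 * suspend/fini paths ungate them first, exactly as done by
 * amdgpu_device_ip_late_init() and amdgpu_device_ip_suspend_phase1()
 * below:
 *
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 */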
2624 
2625 static int amdgpu_device_enable_mgpu_fan_boost(void)
2626 {
2627 	struct amdgpu_gpu_instance *gpu_ins;
2628 	struct amdgpu_device *adev;
2629 	int i, ret = 0;
2630 
2631 	mutex_lock(&mgpu_info.mutex);
2632 
2633 	/*
2634 	 * MGPU fan boost feature should be enabled
2635 	 * only when there are two or more dGPUs in
2636 	 * the system
2637 	 */
2638 	if (mgpu_info.num_dgpu < 2)
2639 		goto out;
2640 
2641 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2642 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2643 		adev = gpu_ins->adev;
2644 		if (!(adev->flags & AMD_IS_APU) &&
2645 		    !gpu_ins->mgpu_fan_enabled) {
2646 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2647 			if (ret)
2648 				break;
2649 
2650 			gpu_ins->mgpu_fan_enabled = 1;
2651 		}
2652 	}
2653 
2654 out:
2655 	mutex_unlock(&mgpu_info.mutex);
2656 
2657 	return ret;
2658 }
2659 
2660 /**
2661  * amdgpu_device_ip_late_init - run late init for hardware IPs
2662  *
2663  * @adev: amdgpu_device pointer
2664  *
2665  * Late initialization pass for hardware IPs.  The list of all the hardware
2666  * IPs that make up the asic is walked and the late_init callbacks are run.
2667  * late_init covers any special initialization that an IP requires
2668  * after all of the IPs have been initialized or something that needs to happen
2669  * late in the init process.
2670  * Returns 0 on success, negative error code on failure.
2671  */
2672 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2673 {
2674 	struct amdgpu_gpu_instance *gpu_instance;
2675 	int i = 0, r;
2676 
2677 	for (i = 0; i < adev->num_ip_blocks; i++) {
2678 		if (!adev->ip_blocks[i].status.hw)
2679 			continue;
2680 		if (adev->ip_blocks[i].version->funcs->late_init) {
2681 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2682 			if (r) {
2683 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2684 					  adev->ip_blocks[i].version->funcs->name, r);
2685 				return r;
2686 			}
2687 		}
2688 		adev->ip_blocks[i].status.late_initialized = true;
2689 	}
2690 
2691 	amdgpu_ras_set_error_query_ready(adev, true);
2692 
2693 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2694 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2695 
2696 	amdgpu_device_fill_reset_magic(adev);
2697 
2698 	r = amdgpu_device_enable_mgpu_fan_boost();
2699 	if (r)
2700 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2701 
2702 	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
2703 	if (adev->asic_type == CHIP_ARCTURUS &&
2704 	    amdgpu_passthrough(adev) &&
2705 	    adev->gmc.xgmi.num_physical_nodes > 1)
2706 		smu_set_light_sbr(&adev->smu, true);
2707 
2708 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2709 		mutex_lock(&mgpu_info.mutex);
2710 
2711 		/*
2712 		 * Reset device p-state to low as this was booted with high.
2713 		 *
2714 		 * This should be performed only after all devices from the same
2715 		 * hive get initialized.
2716 		 *
2717 		 * However, the number of devices in a hive is not known in
2718 		 * advance; it is counted one by one as devices initialize.
2719 		 *
2720 		 * So, we wait for all XGMI interlinked devices to be initialized.
2721 		 * This may bring some delays as those devices may come from
2722 		 * different hives. But that should be OK.
2723 		 */
2724 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2725 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2726 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2727 				if (gpu_instance->adev->flags & AMD_IS_APU)
2728 					continue;
2729 
2730 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2731 						AMDGPU_XGMI_PSTATE_MIN);
2732 				if (r) {
2733 					DRM_ERROR("pstate setting failed (%d).\n", r);
2734 					break;
2735 				}
2736 			}
2737 		}
2738 
2739 		mutex_unlock(&mgpu_info.mutex);
2740 	}
2741 
2742 	return 0;
2743 }
2744 
2745 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2746 {
2747 	int i, r;
2748 
2749 	for (i = 0; i < adev->num_ip_blocks; i++) {
2750 		if (!adev->ip_blocks[i].version->funcs->early_fini)
2751 			continue;
2752 
2753 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2754 		if (r) {
2755 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2756 				  adev->ip_blocks[i].version->funcs->name, r);
2757 		}
2758 	}
2759 
2760 	amdgpu_amdkfd_suspend(adev, false);
2761 
2762 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2763 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2764 
2765 	/* need to disable SMC first */
2766 	for (i = 0; i < adev->num_ip_blocks; i++) {
2767 		if (!adev->ip_blocks[i].status.hw)
2768 			continue;
2769 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2770 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2771 			/* XXX handle errors */
2772 			if (r) {
2773 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2774 					  adev->ip_blocks[i].version->funcs->name, r);
2775 			}
2776 			adev->ip_blocks[i].status.hw = false;
2777 			break;
2778 		}
2779 	}
2780 
2781 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2782 		if (!adev->ip_blocks[i].status.hw)
2783 			continue;
2784 
2785 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2786 		/* XXX handle errors */
2787 		if (r) {
2788 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2789 				  adev->ip_blocks[i].version->funcs->name, r);
2790 		}
2791 
2792 		adev->ip_blocks[i].status.hw = false;
2793 	}
2794 
2795 	if (amdgpu_sriov_vf(adev)) {
2796 		if (amdgpu_virt_release_full_gpu(adev, false))
2797 			DRM_ERROR("failed to release exclusive mode on fini\n");
2798 	}
2799 
2800 	return 0;
2801 }
2802 
2803 /**
2804  * amdgpu_device_ip_fini - run fini for hardware IPs
2805  *
2806  * @adev: amdgpu_device pointer
2807  *
2808  * Main teardown pass for hardware IPs.  The list of all the hardware
2809  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2810  * are run.  hw_fini tears down the hardware associated with each IP
2811  * and sw_fini tears down any software state associated with each IP.
2812  * Returns 0 on success, negative error code on failure.
2813  */
2814 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2815 {
2816 	int i, r;
2817 
2818 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2819 		amdgpu_virt_release_ras_err_handler_data(adev);
2820 
2821 	amdgpu_ras_pre_fini(adev);
2822 
2823 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2824 		amdgpu_xgmi_remove_device(adev);
2825 
2826 	amdgpu_amdkfd_device_fini_sw(adev);
2827 
2828 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2829 		if (!adev->ip_blocks[i].status.sw)
2830 			continue;
2831 
2832 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2833 			amdgpu_ucode_free_bo(adev);
2834 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2835 			amdgpu_device_wb_fini(adev);
2836 			amdgpu_device_vram_scratch_fini(adev);
2837 			amdgpu_ib_pool_fini(adev);
2838 		}
2839 
2840 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2841 		/* XXX handle errors */
2842 		if (r) {
2843 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2844 				  adev->ip_blocks[i].version->funcs->name, r);
2845 		}
2846 		adev->ip_blocks[i].status.sw = false;
2847 		adev->ip_blocks[i].status.valid = false;
2848 	}
2849 
2850 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2851 		if (!adev->ip_blocks[i].status.late_initialized)
2852 			continue;
2853 		if (adev->ip_blocks[i].version->funcs->late_fini)
2854 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2855 		adev->ip_blocks[i].status.late_initialized = false;
2856 	}
2857 
2858 	amdgpu_ras_fini(adev);
2859 
2860 	return 0;
2861 }
2862 
2863 /**
2864  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2865  *
2866  * @work: work_struct.
2867  */
2868 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2869 {
2870 	struct amdgpu_device *adev =
2871 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2872 	int r;
2873 
2874 	r = amdgpu_ib_ring_tests(adev);
2875 	if (r)
2876 		DRM_ERROR("ib ring test failed (%d).\n", r);
2877 }
2878 
2879 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2880 {
2881 	struct amdgpu_device *adev =
2882 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2883 
2884 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2885 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2886 
2887 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2888 		adev->gfx.gfx_off_state = true;
2889 }
2890 
2891 /**
2892  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2893  *
2894  * @adev: amdgpu_device pointer
2895  *
2896  * Main suspend function for hardware IPs.  The list of all the hardware
2897  * IPs that make up the asic is walked, clockgating is disabled and the
2898  * suspend callbacks are run.  suspend puts the hardware and software state
2899  * in each IP into a state suitable for suspend.
2900  * Returns 0 on success, negative error code on failure.
2901  */
2902 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2903 {
2904 	int i, r;
2905 
2906 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2907 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2908 
2909 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2910 		if (!adev->ip_blocks[i].status.valid)
2911 			continue;
2912 
2913 		/* displays are handled separately */
2914 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2915 			continue;
2916 
2918 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2919 		/* XXX handle errors */
2920 		if (r) {
2921 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2922 				  adev->ip_blocks[i].version->funcs->name, r);
2923 			return r;
2924 		}
2925 
2926 		adev->ip_blocks[i].status.hw = false;
2927 	}
2928 
2929 	return 0;
2930 }
2931 
2932 /**
2933  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2934  *
2935  * @adev: amdgpu_device pointer
2936  *
2937  * Main suspend function for hardware IPs.  The list of all the hardware
2938  * IPs that make up the asic is walked, clockgating is disabled and the
2939  * suspend callbacks are run.  suspend puts the hardware and software state
2940  * in each IP into a state suitable for suspend.
2941  * Returns 0 on success, negative error code on failure.
2942  */
2943 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2944 {
2945 	int i, r;
2946 
2947 	if (adev->in_s0ix)
2948 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2949 
2950 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2951 		if (!adev->ip_blocks[i].status.valid)
2952 			continue;
2953 		/* displays are handled in phase1 */
2954 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2955 			continue;
2956 		/* PSP lost connection when err_event_athub occurs */
2957 		if (amdgpu_ras_intr_triggered() &&
2958 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2959 			adev->ip_blocks[i].status.hw = false;
2960 			continue;
2961 		}
2962 
2963 		/* skip unnecessary suspend if we have not initialized them yet */
2964 		if (adev->gmc.xgmi.pending_reset &&
2965 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2966 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2967 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2968 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2969 			adev->ip_blocks[i].status.hw = false;
2970 			continue;
2971 		}
2972 
2973 		/* skip suspend of gfx and psp for S0ix
2974 		 * gfx is in the gfxoff state, so on resume it will exit gfxoff just
2975 		 * like at runtime. PSP is also part of the always-on hardware,
2976 		 * so there is no need to suspend it.
2977 		 */
2978 		if (adev->in_s0ix &&
2979 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2980 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
2981 			continue;
2982 
2983 		/* XXX handle errors */
2984 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2985 		/* XXX handle errors */
2986 		if (r) {
2987 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2988 				  adev->ip_blocks[i].version->funcs->name, r);
2989 		}
2990 		adev->ip_blocks[i].status.hw = false;
2991 		/* handle putting the SMC in the appropriate state */
2992 		if (!amdgpu_sriov_vf(adev)) {
2993 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2994 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2995 				if (r) {
2996 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2997 							adev->mp1_state, r);
2998 					return r;
2999 				}
3000 			}
3001 		}
3002 	}
3003 
3004 	return 0;
3005 }
3006 
3007 /**
3008  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3009  *
3010  * @adev: amdgpu_device pointer
3011  *
3012  * Main suspend function for hardware IPs.  The list of all the hardware
3013  * IPs that make up the asic is walked, clockgating is disabled and the
3014  * suspend callbacks are run.  suspend puts the hardware and software state
3015  * in each IP into a state suitable for suspend.
3016  * Returns 0 on success, negative error code on failure.
3017  */
3018 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3019 {
3020 	int r;
3021 
3022 	if (amdgpu_sriov_vf(adev)) {
3023 		amdgpu_virt_fini_data_exchange(adev);
3024 		amdgpu_virt_request_full_gpu(adev, false);
3025 	}
3026 
3027 	r = amdgpu_device_ip_suspend_phase1(adev);
3028 	if (r)
3029 		return r;
3030 	r = amdgpu_device_ip_suspend_phase2(adev);
3031 
3032 	if (amdgpu_sriov_vf(adev))
3033 		amdgpu_virt_release_full_gpu(adev, false);
3034 
3035 	return r;
3036 }
3037 
3038 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3039 {
3040 	int i, r;
3041 
3042 	static enum amd_ip_block_type ip_order[] = {
3043 		AMD_IP_BLOCK_TYPE_GMC,
3044 		AMD_IP_BLOCK_TYPE_COMMON,
3045 		AMD_IP_BLOCK_TYPE_PSP,
3046 		AMD_IP_BLOCK_TYPE_IH,
3047 	};
3048 
3049 	for (i = 0; i < adev->num_ip_blocks; i++) {
3050 		int j;
3051 		struct amdgpu_ip_block *block;
3052 
3053 		block = &adev->ip_blocks[i];
3054 		block->status.hw = false;
3055 
3056 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3057 
3058 			if (block->version->type != ip_order[j] ||
3059 				!block->status.valid)
3060 				continue;
3061 
3062 			r = block->version->funcs->hw_init(adev);
3063 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3064 			if (r)
3065 				return r;
3066 			block->status.hw = true;
3067 		}
3068 	}
3069 
3070 	return 0;
3071 }
3072 
3073 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3074 {
3075 	int i, r;
3076 
3077 	static enum amd_ip_block_type ip_order[] = {
3078 		AMD_IP_BLOCK_TYPE_SMC,
3079 		AMD_IP_BLOCK_TYPE_DCE,
3080 		AMD_IP_BLOCK_TYPE_GFX,
3081 		AMD_IP_BLOCK_TYPE_SDMA,
3082 		AMD_IP_BLOCK_TYPE_UVD,
3083 		AMD_IP_BLOCK_TYPE_VCE,
3084 		AMD_IP_BLOCK_TYPE_VCN
3085 	};
3086 
3087 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3088 		int j;
3089 		struct amdgpu_ip_block *block;
3090 
3091 		for (j = 0; j < adev->num_ip_blocks; j++) {
3092 			block = &adev->ip_blocks[j];
3093 
3094 			if (block->version->type != ip_order[i] ||
3095 				!block->status.valid ||
3096 				block->status.hw)
3097 				continue;
3098 
3099 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3100 				r = block->version->funcs->resume(adev);
3101 			else
3102 				r = block->version->funcs->hw_init(adev);
3103 
3104 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3105 			if (r)
3106 				return r;
3107 			block->status.hw = true;
3108 		}
3109 	}
3110 
3111 	return 0;
3112 }
3113 
3114 /**
3115  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3116  *
3117  * @adev: amdgpu_device pointer
3118  *
3119  * First resume function for hardware IPs.  The list of all the hardware
3120  * IPs that make up the asic is walked and the resume callbacks are run for
3121  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3122  * after a suspend and updates the software state as necessary.  This
3123  * function is also used for restoring the GPU after a GPU reset.
3124  * Returns 0 on success, negative error code on failure.
3125  */
3126 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3127 {
3128 	int i, r;
3129 
3130 	for (i = 0; i < adev->num_ip_blocks; i++) {
3131 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3132 			continue;
3133 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3134 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3135 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
3136 
3137 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3138 			if (r) {
3139 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3140 					  adev->ip_blocks[i].version->funcs->name, r);
3141 				return r;
3142 			}
3143 			adev->ip_blocks[i].status.hw = true;
3144 		}
3145 	}
3146 
3147 	return 0;
3148 }
3149 
3150 /**
3151  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3152  *
3153  * @adev: amdgpu_device pointer
3154  *
3155  * Second resume function for hardware IPs.  The list of all the hardware
3156  * IPs that make up the asic is walked and the resume callbacks are run for
3157  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3158  * functional state after a suspend and updates the software state as
3159  * necessary.  This function is also used for restoring the GPU after a GPU
3160  * reset.
3161  * Returns 0 on success, negative error code on failure.
3162  */
3163 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3164 {
3165 	int i, r;
3166 
3167 	for (i = 0; i < adev->num_ip_blocks; i++) {
3168 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3169 			continue;
3170 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3171 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3172 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3173 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3174 			continue;
3175 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3176 		if (r) {
3177 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3178 				  adev->ip_blocks[i].version->funcs->name, r);
3179 			return r;
3180 		}
3181 		adev->ip_blocks[i].status.hw = true;
3182 	}
3183 
3184 	return 0;
3185 }
3186 
3187 /**
3188  * amdgpu_device_ip_resume - run resume for hardware IPs
3189  *
3190  * @adev: amdgpu_device pointer
3191  *
3192  * Main resume function for hardware IPs.  The hardware IPs
3193  * are split into two resume functions because they are
3194  * also used in recovering from a GPU reset and some additional
3195  * steps need to be taken between them.  In this case (S3/S4) they are
3196  * run sequentially.
3197  * Returns 0 on success, negative error code on failure.
3198  */
3199 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3200 {
3201 	int r;
3202 
3203 	r = amdgpu_amdkfd_resume_iommu(adev);
3204 	if (r)
3205 		return r;
3206 
3207 	r = amdgpu_device_ip_resume_phase1(adev);
3208 	if (r)
3209 		return r;
3210 
3211 	r = amdgpu_device_fw_loading(adev);
3212 	if (r)
3213 		return r;
3214 
3215 	r = amdgpu_device_ip_resume_phase2(adev);
3216 
3217 	return r;
3218 }
3219 
3220 /**
3221  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3222  *
3223  * @adev: amdgpu_device pointer
3224  *
3225  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3226  */
3227 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3228 {
3229 	if (amdgpu_sriov_vf(adev)) {
3230 		if (adev->is_atom_fw) {
3231 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3232 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3233 		} else {
3234 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3235 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3236 		}
3237 
3238 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3239 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3240 	}
3241 }
3242 
3243 /**
3244  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3245  *
3246  * @asic_type: AMD asic type
3247  *
3248  * Check if there is DC (new modesetting infrastructure) support for an asic.
3249  * Returns true if DC has support, false if not.
3250  */
3251 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3252 {
3253 	switch (asic_type) {
3254 #if defined(CONFIG_DRM_AMD_DC)
3255 #if defined(CONFIG_DRM_AMD_DC_SI)
3256 	case CHIP_TAHITI:
3257 	case CHIP_PITCAIRN:
3258 	case CHIP_VERDE:
3259 	case CHIP_OLAND:
3260 #endif
3261 	case CHIP_BONAIRE:
3262 	case CHIP_KAVERI:
3263 	case CHIP_KABINI:
3264 	case CHIP_MULLINS:
3265 		/*
3266 		 * We have systems in the wild with these ASICs that require
3267 		 * LVDS and VGA support which is not supported with DC.
3268 		 *
3269 		 * Fallback to the non-DC driver here by default so as not to
3270 		 * cause regressions.
3271 		 */
3272 		return amdgpu_dc > 0;
3273 	case CHIP_HAWAII:
3274 	case CHIP_CARRIZO:
3275 	case CHIP_STONEY:
3276 	case CHIP_POLARIS10:
3277 	case CHIP_POLARIS11:
3278 	case CHIP_POLARIS12:
3279 	case CHIP_VEGAM:
3280 	case CHIP_TONGA:
3281 	case CHIP_FIJI:
3282 	case CHIP_VEGA10:
3283 	case CHIP_VEGA12:
3284 	case CHIP_VEGA20:
3285 #if defined(CONFIG_DRM_AMD_DC_DCN)
3286 	case CHIP_RAVEN:
3287 	case CHIP_NAVI10:
3288 	case CHIP_NAVI14:
3289 	case CHIP_NAVI12:
3290 	case CHIP_RENOIR:
3291 	case CHIP_SIENNA_CICHLID:
3292 	case CHIP_NAVY_FLOUNDER:
3293 	case CHIP_DIMGREY_CAVEFISH:
3294 	case CHIP_BEIGE_GOBY:
3295 	case CHIP_VANGOGH:
3296 	case CHIP_YELLOW_CARP:
3297 #endif
3298 		return amdgpu_dc != 0;
3299 #endif
3300 	default:
3301 		if (amdgpu_dc > 0)
3302 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3303 					 "but isn't supported by ASIC, ignoring\n");
3304 		return false;
3305 	}
3306 }
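
/*
 * Illustrative usage sketch (hypothetical caller): choose the display
 * stack for the probed asic. Most callers should prefer the
 * amdgpu_device_has_dc_support() wrapper below, which also accounts for
 * SR-IOV, virtual display and DMU harvesting.
 *
 *	if (amdgpu_device_asic_has_dc_support(adev->asic_type))
 *		DRM_INFO("using DC display stack\n");
 */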
3307 
3308 /**
3309  * amdgpu_device_has_dc_support - check if dc is supported
3310  *
3311  * @adev: amdgpu_device pointer
3312  *
3313  * Returns true for supported, false for not supported
3314  */
3315 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3316 {
3317 	if (amdgpu_sriov_vf(adev) ||
3318 	    adev->enable_virtual_display ||
3319 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3320 		return false;
3321 
3322 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3323 }
3324 
3325 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3326 {
3327 	struct amdgpu_device *adev =
3328 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3329 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3330 
3331 	/* It's a bug to not have a hive within this function */
3332 	if (WARN_ON(!hive))
3333 		return;
3334 
3335 	/*
3336 	 * Use task barrier to synchronize all xgmi reset works across the
3337 	 * hive. task_barrier_enter and task_barrier_exit will block
3338 	 * until all the threads running the xgmi reset works reach
3339 	 * those points. task_barrier_full will do both blocks.
3340 	 */
3341 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3342 
3343 		task_barrier_enter(&hive->tb);
3344 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3345 
3346 		if (adev->asic_reset_res)
3347 			goto fail;
3348 
3349 		task_barrier_exit(&hive->tb);
3350 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3351 
3352 		if (adev->asic_reset_res)
3353 			goto fail;
3354 
3355 		if (adev->mmhub.ras_funcs &&
3356 		    adev->mmhub.ras_funcs->reset_ras_error_count)
3357 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3358 	} else {
3359 
3360 		task_barrier_full(&hive->tb);
3361 		adev->asic_reset_res = amdgpu_asic_reset(adev);
3362 	}
3363 
3364 fail:
3365 	if (adev->asic_reset_res)
3366 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3367 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3368 	amdgpu_put_xgmi_hive(hive);
3369 }
3370 
3371 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3372 {
3373 	char *input = amdgpu_lockup_timeout;
3374 	char *timeout_setting = NULL;
3375 	int index = 0;
3376 	long timeout;
3377 	int ret = 0;
3378 
3379 	/*
3380 	 * By default the timeout for non-compute jobs is 10000 msecs
3381 	 * and 60000 msecs for compute jobs.
3382 	 * In SR-IOV or passthrough mode, the timeout for compute
3383 	 * jobs is 60000 by default.
3384 	 */
3385 	adev->gfx_timeout = msecs_to_jiffies(10000);
3386 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3387 	if (amdgpu_sriov_vf(adev))
3388 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3389 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3390 	else
3391 		adev->compute_timeout = msecs_to_jiffies(60000);
3392 
3393 #ifdef notyet
3394 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3395 		while ((timeout_setting = strsep(&input, ",")) &&
3396 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3397 			ret = kstrtol(timeout_setting, 0, &timeout);
3398 			if (ret)
3399 				return ret;
3400 
3401 			if (timeout == 0) {
3402 				index++;
3403 				continue;
3404 			} else if (timeout < 0) {
3405 				timeout = MAX_SCHEDULE_TIMEOUT;
3406 			} else {
3407 				timeout = msecs_to_jiffies(timeout);
3408 			}
3409 
3410 			switch (index++) {
3411 			case 0:
3412 				adev->gfx_timeout = timeout;
3413 				break;
3414 			case 1:
3415 				adev->compute_timeout = timeout;
3416 				break;
3417 			case 2:
3418 				adev->sdma_timeout = timeout;
3419 				break;
3420 			case 3:
3421 				adev->video_timeout = timeout;
3422 				break;
3423 			default:
3424 				break;
3425 			}
3426 		}
3427 		/*
3428 		 * If there is only one value specified, it
3429 		 * applies to all non-compute jobs.
3430 		 */
3431 		if (index == 1) {
3432 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3433 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3434 				adev->compute_timeout = adev->gfx_timeout;
3435 		}
3436 	}
3437 #endif
3438 
3439 	return ret;
3440 }
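
/*
 * Illustrative parameter format, as parsed by the (currently disabled)
 * block above: up to four comma-separated values in the order
 * gfx,compute,sdma,video. A value of 0 keeps the default and a negative
 * value means an infinite (MAX_SCHEDULE_TIMEOUT) timeout, e.g.
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *	amdgpu.lockup_timeout=10000
 */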
3441 
3442 static const struct attribute *amdgpu_dev_attributes[] = {
3443 	&dev_attr_product_name.attr,
3444 	&dev_attr_product_number.attr,
3445 	&dev_attr_serial_number.attr,
3446 	&dev_attr_pcie_replay_count.attr,
3447 	NULL
3448 };
3449 
3450 /**
3451  * amdgpu_device_init - initialize the driver
3452  *
3453  * @adev: amdgpu_device pointer
3454  * @flags: driver flags
3455  *
3456  * Initializes the driver info and hw (all asics).
3457  * Returns 0 for success or an error on failure.
3458  * Called at driver startup.
3459  */
3460 int amdgpu_device_init(struct amdgpu_device *adev,
3461 		       uint32_t flags)
3462 {
3463 	struct drm_device *ddev = adev_to_drm(adev);
3464 	struct pci_dev *pdev = adev->pdev;
3465 	int r, i;
3466 	bool px = false;
3467 	u32 max_MBps;
3468 
3469 	adev->shutdown = false;
3470 	adev->flags = flags;
3471 
3472 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3473 		adev->asic_type = amdgpu_force_asic_type;
3474 	else
3475 		adev->asic_type = flags & AMD_ASIC_MASK;
3476 
3477 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3478 	if (amdgpu_emu_mode == 1)
3479 		adev->usec_timeout *= 10;
3480 	adev->gmc.gart_size = 512 * 1024 * 1024;
3481 	adev->accel_working = false;
3482 	adev->num_rings = 0;
3483 	adev->mman.buffer_funcs = NULL;
3484 	adev->mman.buffer_funcs_ring = NULL;
3485 	adev->vm_manager.vm_pte_funcs = NULL;
3486 	adev->vm_manager.vm_pte_num_scheds = 0;
3487 	adev->gmc.gmc_funcs = NULL;
3488 	adev->harvest_ip_mask = 0x0;
3489 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3490 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3491 
3492 	adev->smc_rreg = &amdgpu_invalid_rreg;
3493 	adev->smc_wreg = &amdgpu_invalid_wreg;
3494 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3495 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3496 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3497 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3498 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3499 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3500 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3501 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3502 	adev->didt_rreg = &amdgpu_invalid_rreg;
3503 	adev->didt_wreg = &amdgpu_invalid_wreg;
3504 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3505 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3506 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3507 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3508 
3509 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3510 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3511 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3512 
3513 	/* mutex initializations are all done here so we
3514 	 * can recall functions without having locking issues */
3515 	rw_init(&adev->firmware.mutex, "agfw");
3516 	rw_init(&adev->pm.mutex, "agpm");
3517 	rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3518 	rw_init(&adev->srbm_mutex, "srbm");
3519 	rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3520 	rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3521 	rw_init(&adev->grbm_idx_mutex, "grbmidx");
3522 	rw_init(&adev->mn_lock, "agpumn");
3523 	rw_init(&adev->virt.vf_errors.lock, "vferr");
3524 	hash_init(adev->mn_hash);
3525 	atomic_set(&adev->in_gpu_reset, 0);
3526 	rw_init(&adev->reset_sem, "amrs");
3527 	rw_init(&adev->psp.mutex, "agpsp");
3528 	rw_init(&adev->notifier_lock, "agnf");
3529 
3530 	r = amdgpu_device_init_apu_flags(adev);
3531 	if (r)
3532 		return r;
3533 
3534 	r = amdgpu_device_check_arguments(adev);
3535 	if (r)
3536 		return r;
3537 
3538 	mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3539 	mtx_init(&adev->smc_idx_lock, IPL_TTY);
3540 	mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3541 	mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3542 	mtx_init(&adev->didt_idx_lock, IPL_TTY);
3543 	mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3544 	mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3545 	mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3546 	mtx_init(&adev->mm_stats.lock, IPL_NONE);
3547 
3548 	INIT_LIST_HEAD(&adev->shadow_list);
3549 	rw_init(&adev->shadow_list_lock, "sdwlst");
3550 
3551 	INIT_LIST_HEAD(&adev->reset_list);
3552 
3553 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3554 			  amdgpu_device_delayed_init_work_handler);
3555 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3556 			  amdgpu_device_delay_enable_gfx_off);
3557 
3558 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3559 
3560 	adev->gfx.gfx_off_req_count = 1;
3561 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3562 
3563 	atomic_set(&adev->throttling_logging_enabled, 1);
3564 	/*
3565 	 * If throttling continues, logging will be performed every minute
3566 	 * to avoid log flooding. "-1" is subtracted since the thermal
3567 	 * throttling interrupt comes every second. Thus, the total logging
3568 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3569 	 * for throttling interrupt) = 60 seconds.
3570 	 */
3571 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3572 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
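	/*
	 * A minimal sketch (hypothetical, not from this file) of how a
	 * consumer such as the SMU thermal interrupt handler gates its
	 * warning on this state, so at most one throttling message is
	 * printed per 60-second window:
	 *
	 *	if (__ratelimit(&adev->throttling_logging_rs))
	 *		dev_warn(adev->dev, "GPU is throttled by thermal\n");
	 */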
3573 
3574 #ifdef __linux__
3575 	/* Registers mapping */
3576 	/* TODO: block userspace mapping of io register */
3577 	if (adev->asic_type >= CHIP_BONAIRE) {
3578 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3579 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3580 	} else {
3581 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3582 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3583 	}
3584 
3585 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3586 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3587 
3588 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3589 	if (adev->rmmio == NULL) {
3590 		return -ENOMEM;
3591 	}
3592 #endif
3593 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3594 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3595 
3596 	/* enable PCIE atomic ops */
3597 #ifdef notyet
3598 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3599 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3600 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3601 	if (r) {
3602 		adev->have_atomics_support = false;
3603 		DRM_INFO("PCIE atomic ops are not supported\n");
3604 	} else {
3605 		adev->have_atomics_support = true;
3606 	}
3607 #else
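	/*
	 * pci_enable_atomic_ops_to_root() is not implemented in this port
	 * yet, so conservatively report PCIe atomics as unsupported.
	 */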
3608 	adev->have_atomics_support = false;
3609 #endif
3610 
3611 	amdgpu_device_get_pcie_info(adev);
3612 
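	/* MCBP = Mid Command Buffer Preemption */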
3613 	if (amdgpu_mcbp)
3614 		DRM_INFO("MCBP is enabled\n");
3615 
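	/* MES = Micro Engine Scheduler */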
3616 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3617 		adev->enable_mes = true;
3618 
3619 	/* detect hw virtualization here */
3620 	amdgpu_detect_virtualization(adev);
3621 
3622 	r = amdgpu_device_get_job_timeout_settings(adev);
3623 	if (r) {
3624 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3625 		return r;
3626 	}
3627 
3628 	/* early init functions */
3629 	r = amdgpu_device_ip_early_init(adev);
3630 	if (r)
3631 		return r;
3632 
3633 	/* doorbell bar mapping and doorbell index init */
3634 	amdgpu_device_doorbell_init(adev);
3635 
3636 	if (amdgpu_emu_mode == 1) {
3637 		/* post the asic in emulation mode */
3638 		emu_soc_asic_init(adev);
3639 		goto fence_driver_init;
3640 	}
3641 
3642 	amdgpu_reset_init(adev);
3643 
3644 	/* detect if we are running with an SRIOV vbios */
3645 	amdgpu_device_detect_sriov_bios(adev);
3646 
3647 	/* check if we need to reset the asic,
3648 	 * e.g. the driver was not cleanly unloaded previously
3649 	 */
3650 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3651 		if (adev->gmc.xgmi.num_physical_nodes) {
3652 			dev_info(adev->dev, "Pending hive reset.\n");
3653 			adev->gmc.xgmi.pending_reset = true;
3654 			/* Only need to init the necessary blocks for SMU to handle the reset */
3655 			for (i = 0; i < adev->num_ip_blocks; i++) {
3656 				if (!adev->ip_blocks[i].status.valid)
3657 					continue;
3658 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3659 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3660 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3661 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3662 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3663 						adev->ip_blocks[i].version->funcs->name);
3664 					adev->ip_blocks[i].status.hw = true;
3665 				}
3666 			}
3667 		} else {
3668 			r = amdgpu_asic_reset(adev);
3669 			if (r) {
3670 				dev_err(adev->dev, "asic reset on init failed\n");
3671 				goto failed;
3672 			}
3673 		}
3674 	}
3675 
3676 	pci_enable_pcie_error_reporting(adev->pdev);
3677 
3678 	/* Post card if necessary */
3679 	if (amdgpu_device_need_post(adev)) {
3680 		if (!adev->bios) {
3681 			dev_err(adev->dev, "no vBIOS found\n");
3682 			r = -EINVAL;
3683 			goto failed;
3684 		}
3685 		DRM_INFO("GPU posting now...\n");
3686 		r = amdgpu_device_asic_init(adev);
3687 		if (r) {
3688 			dev_err(adev->dev, "gpu post error!\n");
3689 			goto failed;
3690 		}
3691 	}
3692 
3693 	if (adev->is_atom_fw) {
3694 		/* Initialize clocks */
3695 		r = amdgpu_atomfirmware_get_clock_info(adev);
3696 		if (r) {
3697 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3698 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3699 			goto failed;
3700 		}
3701 	} else {
3702 		/* Initialize clocks */
3703 		r = amdgpu_atombios_get_clock_info(adev);
3704 		if (r) {
3705 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3706 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3707 			goto failed;
3708 		}
3709 		/* init i2c buses */
3710 		if (!amdgpu_device_has_dc_support(adev))
3711 			amdgpu_atombios_i2c_init(adev);
3712 	}
3713 
3714 fence_driver_init:
3715 	/* Fence driver */
3716 	r = amdgpu_fence_driver_sw_init(adev);
3717 	if (r) {
3718 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3719 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3720 		goto failed;
3721 	}
3722 
3723 	/* init the mode config */
3724 	drm_mode_config_init(adev_to_drm(adev));
3725 
3726 	r = amdgpu_device_ip_init(adev);
3727 	if (r) {
3728 		/* failed in exclusive mode due to timeout */
3729 		if (amdgpu_sriov_vf(adev) &&
3730 		    !amdgpu_sriov_runtime(adev) &&
3731 		    amdgpu_virt_mmio_blocked(adev) &&
3732 		    !amdgpu_virt_wait_reset(adev)) {
3733 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3734 			/* Don't send request since VF is inactive. */
3735 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3736 			adev->virt.ops = NULL;
3737 			r = -EAGAIN;
3738 			goto release_ras_con;
3739 		}
3740 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3741 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3742 		goto release_ras_con;
3743 	}
3744 
3745 	amdgpu_fence_driver_hw_init(adev);
3746 
3747 	dev_info(adev->dev,
3748 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3749 			adev->gfx.config.max_shader_engines,
3750 			adev->gfx.config.max_sh_per_se,
3751 			adev->gfx.config.max_cu_per_sh,
3752 			adev->gfx.cu_info.number);
3753 
3754 #ifdef __OpenBSD__
3755 {
3756 	const char *chip_name;
3757 
3758 	switch (adev->asic_type) {
3759 	case CHIP_RAVEN:
3760 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3761 			chip_name = "RAVEN2";
3762 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3763 			chip_name = "PICASSO";
3764 		else
3765 			chip_name = "RAVEN";
3766 		break;
3767 	case CHIP_RENOIR:
3768 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
3769 			chip_name = "RENOIR";
3770 		else
3771 			chip_name = "GREEN_SARDINE";
3772 		break;
3773 	default:
3774 		chip_name = amdgpu_asic_name[adev->asic_type];
3775 	}
3776 	printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname,
3777 	    chip_name, adev->gfx.cu_info.number, adev->rev_id);
3778 }
3779 #endif
3780 
3781 	adev->accel_working = true;
3782 
3783 	amdgpu_vm_check_compute_bug(adev);
3784 
3785 	/* Initialize the buffer migration limit. */
3786 	if (amdgpu_moverate >= 0)
3787 		max_MBps = amdgpu_moverate;
3788 	else
3789 		max_MBps = 8; /* Allow 8 MB/s. */
3790 	/* Get a log2 for easy divisions. */
3791 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
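	/*
	 * Storing the limit as a log2 lets the CS throttling code turn
	 * byte/time conversions into shifts; roughly the pattern used in
	 * amdgpu_cs.c (1 MB/s is approximately 1 byte per microsecond):
	 *
	 *	bytes = us << adev->mm_stats.log2_max_MBps;
	 *	us = bytes >> adev->mm_stats.log2_max_MBps;
	 */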
3792 
3793 	amdgpu_fbdev_init(adev);
3794 
3795 	r = amdgpu_pm_sysfs_init(adev);
3796 	if (r) {
3797 		adev->pm_sysfs_en = false;
3798 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3799 	} else
3800 		adev->pm_sysfs_en = true;
3801 
3802 	r = amdgpu_ucode_sysfs_init(adev);
3803 	if (r) {
3804 		adev->ucode_sysfs_en = false;
3805 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3806 	} else
3807 		adev->ucode_sysfs_en = true;
3808 
3809 	if (amdgpu_testing & 1) {
3810 		if (adev->accel_working)
3811 			amdgpu_test_moves(adev);
3812 		else
3813 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3814 	}
3815 	if (amdgpu_benchmarking) {
3816 		if (adev->accel_working)
3817 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3818 		else
3819 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3820 	}
3821 
3822 	/*
3823 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3824 	 * Otherwise the mgpu fan boost feature will be skipped since the
3825 	 * gpu instance count would be too low.
3826 	 */
3827 	amdgpu_register_gpu_instance(adev);
3828 
3829 	/* enable clockgating, etc. after ib tests, since some blocks require
3830 	 * explicit gating rather than handling it automatically.
3831 	 */
3832 	if (!adev->gmc.xgmi.pending_reset) {
3833 		r = amdgpu_device_ip_late_init(adev);
3834 		if (r) {
3835 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3836 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3837 			goto release_ras_con;
3838 		}
3839 		/* must succeed. */
3840 		amdgpu_ras_resume(adev);
3841 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3842 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3843 	}
3844 
3845 	if (amdgpu_sriov_vf(adev))
3846 		flush_delayed_work(&adev->delayed_init_work);
3847 
3848 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3849 	if (r)
3850 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3851 
3852 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3853 		r = amdgpu_pmu_init(adev);
3854 	if (r)
3855 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3856 
3857 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3858 	if (amdgpu_device_cache_pci_state(adev->pdev))
3859 		pci_restore_state(pdev);
3860 
3861 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
3862 	/* this will fail for cards that aren't VGA class devices, just
3863 	 * ignore it */
3864 #ifdef notyet
3865 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3866 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3867 #endif
3868 
3869 	if (amdgpu_device_supports_px(ddev)) {
3870 		px = true;
3871 		vga_switcheroo_register_client(adev->pdev,
3872 					       &amdgpu_switcheroo_ops, px);
3873 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3874 	}
3875 
3876 	if (adev->gmc.xgmi.pending_reset)
3877 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3878 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3879 
3880 	return 0;
3881 
3882 release_ras_con:
3883 	amdgpu_release_ras_context(adev);
3884 
3885 failed:
3886 	amdgpu_vf_error_trans_all(adev);
3887 
3888 	return r;
3889 }
3890 
3891 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3892 {
3893 	STUB();
3894 #ifdef notyet
3895 	/* Clear all CPU mappings pointing to this device */
3896 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3897 #endif
3898 
3899 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
3900 	amdgpu_device_doorbell_fini(adev);
3901 
3902 #ifdef __linux__
3903 	iounmap(adev->rmmio);
3904 	adev->rmmio = NULL;
3905 	if (adev->mman.aper_base_kaddr)
3906 		iounmap(adev->mman.aper_base_kaddr);
3907 	adev->mman.aper_base_kaddr = NULL;
3908 #else
3909 	if (adev->rmmio_size > 0)
3910 		bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
3911 		    adev->rmmio_size);
3912 	adev->rmmio_size = 0;
3913 	adev->rmmio = NULL;
3914 	if (adev->mman.aper_base_kaddr)
3915 		bus_space_unmap(adev->memt, adev->mman.aper_bsh,
3916 		    adev->gmc.visible_vram_size);
3917 	adev->mman.aper_base_kaddr = NULL;
3918 #endif
3919 
3920 	/* Memory manager related */
3921 	if (!adev->gmc.xgmi.connected_to_cpu) {
3922 #ifdef __linux__
3923 		arch_phys_wc_del(adev->gmc.vram_mtrr);
3924 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3925 #else
3926 		drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
3927 #endif
3928 	}
3929 }
3930 
3931 /**
3932  * amdgpu_device_fini_hw - tear down the driver
3933  *
3934  * @adev: amdgpu_device pointer
3935  *
3936  * Tear down the driver info (all asics).
3937  * Called at driver shutdown.
3938  */
3939 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3940 {
3941 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3942 	flush_delayed_work(&adev->delayed_init_work);
3943 	if (adev->mman.initialized) {
3944 		flush_delayed_work(&adev->mman.bdev.wq);
3945 		ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3946 	}
3947 	adev->shutdown = true;
3948 
3949 	/* make sure IB tests have finished before entering exclusive mode
3950 	 * to avoid preemption during IB tests
3951 	 */
3952 	if (amdgpu_sriov_vf(adev)) {
3953 		amdgpu_virt_request_full_gpu(adev, false);
3954 		amdgpu_virt_fini_data_exchange(adev);
3955 	}
3956 
3957 	/* disable all interrupts */
3958 	amdgpu_irq_disable_all(adev);
3959 	if (adev->mode_info.mode_config_initialized) {
3960 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3961 			drm_helper_force_disable_all(adev_to_drm(adev));
3962 		else
3963 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3964 	}
3965 	amdgpu_fence_driver_hw_fini(adev);
3966 
3967 	if (adev->pm_sysfs_en)
3968 		amdgpu_pm_sysfs_fini(adev);
3969 	if (adev->ucode_sysfs_en)
3970 		amdgpu_ucode_sysfs_fini(adev);
3971 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3972 
3973 	amdgpu_fbdev_fini(adev);
3974 
3975 	amdgpu_irq_fini_hw(adev);
3976 
3977 	amdgpu_device_ip_fini_early(adev);
3978 
3979 	amdgpu_gart_dummy_page_fini(adev);
3980 
3981 	amdgpu_device_unmap_mmio(adev);
3982 }
3983 
3984 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3985 {
3986 	amdgpu_fence_driver_sw_fini(adev);
3987 	amdgpu_device_ip_fini(adev);
3988 	release_firmware(adev->firmware.gpu_info_fw);
3989 	adev->firmware.gpu_info_fw = NULL;
3990 	adev->accel_working = false;
3991 
3992 	amdgpu_reset_fini(adev);
3993 
3994 	/* free i2c buses */
3995 	if (!amdgpu_device_has_dc_support(adev))
3996 		amdgpu_i2c_fini(adev);
3997 
3998 	if (amdgpu_emu_mode != 1)
3999 		amdgpu_atombios_fini(adev);
4000 
4001 	kfree(adev->bios);
4002 	adev->bios = NULL;
4003 	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4004 		vga_switcheroo_unregister_client(adev->pdev);
4005 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4006 	}
4007 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4008 		vga_client_unregister(adev->pdev);
4009 
4010 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4011 		amdgpu_pmu_fini(adev);
4012 	if (adev->mman.discovery_bin)
4013 		amdgpu_discovery_fini(adev);
4014 
4015 	kfree(adev->pci_state);
4016 
4017 }
4018 
4019 /**
4020  * amdgpu_device_evict_resources - evict device resources
4021  * @adev: amdgpu device object
4022  *
4023  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4024  * of the vram memory type. Mainly used for evicting device resources
4025  * at suspend time.
4026  *
4027  */
4028 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4029 {
4030 	/* No need to evict vram on APUs for suspend to ram or s2idle */
4031 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4032 		return;
4033 
4034 	if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4035 		DRM_WARN("evicting device resources failed\n");
4036 
4037 }
4038 
4039 /*
4040  * Suspend & resume.
4041  */
4042 /**
4043  * amdgpu_device_suspend - initiate device suspend
4044  *
4045  * @dev: drm dev pointer
4046  * @fbcon: notify the fbdev of suspend
4047  *
4048  * Puts the hw in the suspend state (all asics).
4049  * Returns 0 for success or an error on failure.
4050  * Called at driver suspend.
4051  */
4052 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4053 {
4054 	struct amdgpu_device *adev = drm_to_adev(dev);
4055 
4056 	if (adev->shutdown)
4057 		return 0;
4058 
4059 #ifdef notyet
4060 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4061 		return 0;
4062 #endif
4063 
4064 	adev->in_suspend = true;
4065 
4066 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4067 		DRM_WARN("smart shift update failed\n");
4068 
4069 	drm_kms_helper_poll_disable(dev);
4070 
4071 	if (fbcon)
4072 		amdgpu_fbdev_set_suspend(adev, 1);
4073 
4074 	cancel_delayed_work_sync(&adev->delayed_init_work);
4075 
4076 	amdgpu_ras_suspend(adev);
4077 
4078 	amdgpu_device_ip_suspend_phase1(adev);
4079 
4080 	if (!adev->in_s0ix)
4081 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4082 
4083 	/* First evict vram memory */
4084 	amdgpu_device_evict_resources(adev);
4085 
4086 	amdgpu_fence_driver_hw_fini(adev);
4087 
4088 	amdgpu_device_ip_suspend_phase2(adev);
4089 	/* This second call to evict device resources is to evict
4090 	 * the gart page table using the CPU.
4091 	 */
4092 	amdgpu_device_evict_resources(adev);
4093 
4094 	return 0;
4095 }
4096 
4097 /**
4098  * amdgpu_device_resume - initiate device resume
4099  *
4100  * @dev: drm dev pointer
4101  * @fbcon: notify the fbdev of resume
4102  *
4103  * Bring the hw back to operating state (all asics).
4104  * Returns 0 for success or an error on failure.
4105  * Called at driver resume.
4106  */
4107 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4108 {
4109 	struct amdgpu_device *adev = drm_to_adev(dev);
4110 	int r = 0;
4111 
4112 #ifdef notyet
4113 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4114 		return 0;
4115 #endif
4116 
4117 	if (adev->in_s0ix)
4118 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
4119 
4120 	/* post card */
4121 	if (amdgpu_device_need_post(adev)) {
4122 		r = amdgpu_device_asic_init(adev);
4123 		if (r)
4124 			dev_err(adev->dev, "amdgpu asic init failed\n");
4125 	}
4126 
4127 	r = amdgpu_device_ip_resume(adev);
4128 	if (r) {
4129 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4130 		return r;
4131 	}
4132 	amdgpu_fence_driver_hw_init(adev);
4133 
4134 	r = amdgpu_device_ip_late_init(adev);
4135 	if (r)
4136 		return r;
4137 
4138 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4139 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4140 
4141 	if (!adev->in_s0ix) {
4142 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4143 		if (r)
4144 			return r;
4145 	}
4146 
4147 	/* Make sure IB tests flushed */
4148 	flush_delayed_work(&adev->delayed_init_work);
4149 
4150 	if (fbcon)
4151 		amdgpu_fbdev_set_suspend(adev, 0);
4152 
4153 	drm_kms_helper_poll_enable(dev);
4154 
4155 	amdgpu_ras_resume(adev);
4156 
4157 	/*
4158 	 * Most of the connector probing functions try to acquire runtime pm
4159 	 * refs to ensure that the GPU is powered on when connector polling is
4160 	 * performed. Since we're calling this from a runtime PM callback,
4161 	 * trying to acquire rpm refs will cause us to deadlock.
4162 	 *
4163 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
4164 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
4165 	 */
4166 #if defined(CONFIG_PM) && defined(__linux__)
4167 	dev->dev->power.disable_depth++;
4168 #endif
4169 	if (!amdgpu_device_has_dc_support(adev))
4170 		drm_helper_hpd_irq_event(dev);
4171 	else
4172 		drm_kms_helper_hotplug_event(dev);
4173 #if defined(CONFIG_PM) && defined(__linux__)
4174 	dev->dev->power.disable_depth--;
4175 #endif
4176 	adev->in_suspend = false;
4177 
4178 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4179 		DRM_WARN("smart shift update failed\n");
4180 
4181 	return 0;
4182 }
4183 
4184 /**
4185  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4186  *
4187  * @adev: amdgpu_device pointer
4188  *
4189  * The list of all the hardware IPs that make up the asic is walked and
4190  * the check_soft_reset callbacks are run.  check_soft_reset determines
4191  * if the asic is still hung or not.
4192  * Returns true if any of the IPs are still in a hung state, false if not.
4193  */
4194 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4195 {
4196 	int i;
4197 	bool asic_hang = false;
4198 
4199 	if (amdgpu_sriov_vf(adev))
4200 		return true;
4201 
4202 	if (amdgpu_asic_need_full_reset(adev))
4203 		return true;
4204 
4205 	for (i = 0; i < adev->num_ip_blocks; i++) {
4206 		if (!adev->ip_blocks[i].status.valid)
4207 			continue;
4208 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4209 			adev->ip_blocks[i].status.hang =
4210 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4211 		if (adev->ip_blocks[i].status.hang) {
4212 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4213 			asic_hang = true;
4214 		}
4215 	}
4216 	return asic_hang;
4217 }
4218 
4219 /**
4220  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4221  *
4222  * @adev: amdgpu_device pointer
4223  *
4224  * The list of all the hardware IPs that make up the asic is walked and the
4225  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4226  * handles any IP specific hardware or software state changes that are
4227  * necessary for a soft reset to succeed.
4228  * Returns 0 on success, negative error code on failure.
4229  */
4230 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4231 {
4232 	int i, r = 0;
4233 
4234 	for (i = 0; i < adev->num_ip_blocks; i++) {
4235 		if (!adev->ip_blocks[i].status.valid)
4236 			continue;
4237 		if (adev->ip_blocks[i].status.hang &&
4238 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4239 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4240 			if (r)
4241 				return r;
4242 		}
4243 	}
4244 
4245 	return 0;
4246 }
4247 
4248 /**
4249  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4250  *
4251  * @adev: amdgpu_device pointer
4252  *
4253  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4254  * reset is necessary to recover.
4255  * Returns true if a full asic reset is required, false if not.
4256  */
4257 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4258 {
4259 	int i;
4260 
4261 	if (amdgpu_asic_need_full_reset(adev))
4262 		return true;
4263 
4264 	for (i = 0; i < adev->num_ip_blocks; i++) {
4265 		if (!adev->ip_blocks[i].status.valid)
4266 			continue;
4267 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4268 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4269 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4270 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4271 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4272 			if (adev->ip_blocks[i].status.hang) {
4273 				dev_info(adev->dev, "Some blocks need a full reset!\n");
4274 				return true;
4275 			}
4276 		}
4277 	}
4278 	return false;
4279 }
4280 
4281 /**
4282  * amdgpu_device_ip_soft_reset - do a soft reset
4283  *
4284  * @adev: amdgpu_device pointer
4285  *
4286  * The list of all the hardware IPs that make up the asic is walked and the
4287  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4288  * IP specific hardware or software state changes that are necessary to soft
4289  * reset the IP.
4290  * Returns 0 on success, negative error code on failure.
4291  */
4292 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4293 {
4294 	int i, r = 0;
4295 
4296 	for (i = 0; i < adev->num_ip_blocks; i++) {
4297 		if (!adev->ip_blocks[i].status.valid)
4298 			continue;
4299 		if (adev->ip_blocks[i].status.hang &&
4300 		    adev->ip_blocks[i].version->funcs->soft_reset) {
4301 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4302 			if (r)
4303 				return r;
4304 		}
4305 	}
4306 
4307 	return 0;
4308 }
4309 
4310 /**
4311  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4312  *
4313  * @adev: amdgpu_device pointer
4314  *
4315  * The list of all the hardware IPs that make up the asic is walked and the
4316  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4317  * handles any IP specific hardware or software state changes that are
4318  * necessary after the IP has been soft reset.
4319  * Returns 0 on success, negative error code on failure.
4320  */
4321 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4322 {
4323 	int i, r = 0;
4324 
4325 	for (i = 0; i < adev->num_ip_blocks; i++) {
4326 		if (!adev->ip_blocks[i].status.valid)
4327 			continue;
4328 		if (adev->ip_blocks[i].status.hang &&
4329 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4330 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4331 		if (r)
4332 			return r;
4333 	}
4334 
4335 	return 0;
4336 }
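
/*
 * A minimal sketch of how the soft reset helpers above are driven in
 * sequence; this mirrors the calls made in amdgpu_device_pre_asic_reset():
 *
 *	amdgpu_device_ip_pre_soft_reset(adev);
 *	r = amdgpu_device_ip_soft_reset(adev);
 *	amdgpu_device_ip_post_soft_reset(adev);
 *	if (r || amdgpu_device_ip_check_soft_reset(adev))
 *		... fall back to a full ASIC reset ...
 */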
4337 
4338 /**
4339  * amdgpu_device_recover_vram - Recover some VRAM contents
4340  *
4341  * @adev: amdgpu_device pointer
4342  *
4343  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4344  * restore things like GPUVM page tables after a GPU reset where
4345  * the contents of VRAM might be lost.
4346  *
4347  * Returns:
4348  * 0 on success, negative error code on failure.
4349  */
4350 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4351 {
4352 	struct dma_fence *fence = NULL, *next = NULL;
4353 	struct amdgpu_bo *shadow;
4354 	struct amdgpu_bo_vm *vmbo;
4355 	long r = 1, tmo;
4356 
4357 	if (amdgpu_sriov_runtime(adev))
4358 		tmo = msecs_to_jiffies(8000);
4359 	else
4360 		tmo = msecs_to_jiffies(100);
4361 
4362 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4363 	mutex_lock(&adev->shadow_list_lock);
4364 	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4365 		shadow = &vmbo->bo;
4366 		/* No need to recover an evicted BO */
4367 		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4368 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4369 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4370 			continue;
4371 
4372 		r = amdgpu_bo_restore_shadow(shadow, &next);
4373 		if (r)
4374 			break;
4375 
4376 		if (fence) {
4377 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4378 			dma_fence_put(fence);
4379 			fence = next;
4380 			if (tmo == 0) {
4381 				r = -ETIMEDOUT;
4382 				break;
4383 			} else if (tmo < 0) {
4384 				r = tmo;
4385 				break;
4386 			}
4387 		} else {
4388 			fence = next;
4389 		}
4390 	}
4391 	mutex_unlock(&adev->shadow_list_lock);
4392 
4393 	if (fence)
4394 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4395 	dma_fence_put(fence);
4396 
4397 	if (r < 0 || tmo <= 0) {
4398 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4399 		return -EIO;
4400 	}
4401 
4402 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4403 	return 0;
4404 }
4405 
4406 
4407 /**
4408  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4409  *
4410  * @adev: amdgpu_device pointer
4411  * @from_hypervisor: request from hypervisor
4412  *
4413  * Do a VF FLR and reinitialize the ASIC.
4414  * Returns 0 on success, an error code otherwise.
4415  */
4416 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4417 				     bool from_hypervisor)
4418 {
4419 	int r;
4420 
4421 	if (from_hypervisor)
4422 		r = amdgpu_virt_request_full_gpu(adev, true);
4423 	else
4424 		r = amdgpu_virt_reset_gpu(adev);
4425 	if (r)
4426 		return r;
4427 
4428 	amdgpu_amdkfd_pre_reset(adev);
4429 
4430 	/* Resume IP prior to SMC */
4431 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4432 	if (r)
4433 		goto error;
4434 
4435 	amdgpu_virt_init_data_exchange(adev);
4436 	/* we need to recover the gart prior to resuming SMC/CP/SDMA */
4437 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4438 
4439 	r = amdgpu_device_fw_loading(adev);
4440 	if (r)
4441 		return r;
4442 
4443 	/* now we are okay to resume SMC/CP/SDMA */
4444 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4445 	if (r)
4446 		goto error;
4447 
4448 	amdgpu_irq_gpu_reset_resume_helper(adev);
4449 	r = amdgpu_ib_ring_tests(adev);
4450 	amdgpu_amdkfd_post_reset(adev);
4451 
4452 error:
4453 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4454 		amdgpu_inc_vram_lost(adev);
4455 		r = amdgpu_device_recover_vram(adev);
4456 	}
4457 	amdgpu_virt_release_full_gpu(adev, true);
4458 
4459 	return r;
4460 }
4461 
4462 /**
4463  * amdgpu_device_has_job_running - check if there is any job in the pending list
4464  *
4465  * @adev: amdgpu_device pointer
4466  *
4467  * Check if there is any job in the pending list.
4468  */
4469 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4470 {
4471 	int i;
4472 	struct drm_sched_job *job;
4473 
4474 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4475 		struct amdgpu_ring *ring = adev->rings[i];
4476 
4477 		if (!ring || !ring->sched.thread)
4478 			continue;
4479 
4480 		spin_lock(&ring->sched.job_list_lock);
4481 		job = list_first_entry_or_null(&ring->sched.pending_list,
4482 					       struct drm_sched_job, list);
4483 		spin_unlock(&ring->sched.job_list_lock);
4484 		if (job)
4485 			return true;
4486 	}
4487 	return false;
4488 }
4489 
4490 /**
4491  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4492  *
4493  * @adev: amdgpu_device pointer
4494  *
4495  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4496  * a hung GPU.
4497  */
4498 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4499 {
4500 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4501 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4502 		return false;
4503 	}
4504 
4505 	if (amdgpu_gpu_recovery == 0)
4506 		goto disabled;
4507 
4508 	if (amdgpu_sriov_vf(adev))
4509 		return true;
4510 
4511 	if (amdgpu_gpu_recovery == -1) {
4512 		switch (adev->asic_type) {
4513 		case CHIP_BONAIRE:
4514 		case CHIP_HAWAII:
4515 		case CHIP_TOPAZ:
4516 		case CHIP_TONGA:
4517 		case CHIP_FIJI:
4518 		case CHIP_POLARIS10:
4519 		case CHIP_POLARIS11:
4520 		case CHIP_POLARIS12:
4521 		case CHIP_VEGAM:
4522 		case CHIP_VEGA20:
4523 		case CHIP_VEGA10:
4524 		case CHIP_VEGA12:
4525 		case CHIP_RAVEN:
4526 		case CHIP_ARCTURUS:
4527 		case CHIP_RENOIR:
4528 		case CHIP_NAVI10:
4529 		case CHIP_NAVI14:
4530 		case CHIP_NAVI12:
4531 		case CHIP_SIENNA_CICHLID:
4532 		case CHIP_NAVY_FLOUNDER:
4533 		case CHIP_DIMGREY_CAVEFISH:
4534 		case CHIP_BEIGE_GOBY:
4535 		case CHIP_VANGOGH:
4536 		case CHIP_ALDEBARAN:
4537 			break;
4538 		default:
4539 			goto disabled;
4540 		}
4541 	}
4542 
4543 	return true;
4544 
4545 disabled:
4546 	dev_info(adev->dev, "GPU recovery disabled.\n");
4547 	return false;
4548 }
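
/*
 * Values of the amdgpu_gpu_recovery module parameter as consulted above:
 * 0 disables GPU recovery, 1 enables it unconditionally, -1 enables it
 * only for the ASICs listed in the switch above, and 2 additionally runs
 * the guilty-job recheck (see amdgpu_device_recheck_guilty_jobs()).
 */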
4549 
4550 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4551 {
4552 	u32 i;
4553 	int ret = 0;
4554 
4555 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4556 
4557 	dev_info(adev->dev, "GPU mode1 reset\n");
4558 
4559 	/* disable BM */
4560 	pci_clear_master(adev->pdev);
4561 
4562 	amdgpu_device_cache_pci_state(adev->pdev);
4563 
4564 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4565 		dev_info(adev->dev, "GPU smu mode1 reset\n");
4566 		ret = amdgpu_dpm_mode1_reset(adev);
4567 	} else {
4568 		dev_info(adev->dev, "GPU psp mode1 reset\n");
4569 		ret = psp_gpu_reset(adev);
4570 	}
4571 
4572 	if (ret)
4573 		dev_err(adev->dev, "GPU mode1 reset failed\n");
4574 
4575 	amdgpu_device_load_pci_state(adev->pdev);
4576 
4577 	/* wait for asic to come out of reset */
4578 	for (i = 0; i < adev->usec_timeout; i++) {
4579 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
4580 
4581 		if (memsize != 0xffffffff)
4582 			break;
4583 		udelay(1);
4584 	}
4585 
4586 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4587 	return ret;
4588 }
4589 
4590 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4591 				 struct amdgpu_reset_context *reset_context)
4592 {
4593 	int i, j, r = 0;
4594 	struct amdgpu_job *job = NULL;
4595 	bool need_full_reset =
4596 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4597 
4598 	if (reset_context->reset_req_dev == adev)
4599 		job = reset_context->job;
4600 
4601 	if (amdgpu_sriov_vf(adev)) {
4602 		/* stop the data exchange thread */
4603 		amdgpu_virt_fini_data_exchange(adev);
4604 	}
4605 
4606 	/* block all schedulers and reset given job's ring */
4607 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4608 		struct amdgpu_ring *ring = adev->rings[i];
4609 
4610 		if (!ring || !ring->sched.thread)
4611 			continue;
4612 
4613 		/* clear job fences from fence drv to avoid force_completion;
4614 		 * leave NULL and vm flush fences in fence drv */
4615 		for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
4616 			struct dma_fence *old, **ptr;
4617 
4618 			ptr = &ring->fence_drv.fences[j];
4619 			old = rcu_dereference_protected(*ptr, 1);
4620 			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
4621 				RCU_INIT_POINTER(*ptr, NULL);
4622 			}
4623 		}
4624 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4625 		amdgpu_fence_driver_force_completion(ring);
4626 	}
4627 
4628 	if (job && job->vm)
4629 		drm_sched_increase_karma(&job->base);
4630 
4631 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4632 	/* If reset handler not implemented, continue; otherwise return */
4633 	if (r == -ENOSYS)
4634 		r = 0;
4635 	else
4636 		return r;
4637 
4638 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4639 	if (!amdgpu_sriov_vf(adev)) {
4640 
4641 		if (!need_full_reset)
4642 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4643 
4644 		if (!need_full_reset) {
4645 			amdgpu_device_ip_pre_soft_reset(adev);
4646 			r = amdgpu_device_ip_soft_reset(adev);
4647 			amdgpu_device_ip_post_soft_reset(adev);
4648 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4649 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4650 				need_full_reset = true;
4651 			}
4652 		}
4653 
4654 		if (need_full_reset)
4655 			r = amdgpu_device_ip_suspend(adev);
4656 		if (need_full_reset)
4657 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4658 		else
4659 			clear_bit(AMDGPU_NEED_FULL_RESET,
4660 				  &reset_context->flags);
4661 	}
4662 
4663 	return r;
4664 }
4665 
4666 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4667 			 struct amdgpu_reset_context *reset_context)
4668 {
4669 	struct amdgpu_device *tmp_adev = NULL;
4670 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4671 	int r = 0;
4672 
4673 	/* Try reset handler method first */
4674 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4675 				    reset_list);
4676 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4677 	/* If reset handler not implemented, continue; otherwise return */
4678 	if (r == -ENOSYS)
4679 		r = 0;
4680 	else
4681 		return r;
4682 
4683 	/* Reset handler not implemented, use the default method */
4684 	need_full_reset =
4685 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4686 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4687 
4688 	/*
4689 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4690 	 * to allow proper link negotiation in the FW (within 1 sec)
4691 	 */
4692 	if (!skip_hw_reset && need_full_reset) {
4693 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4694 			/* For XGMI run all resets in parallel to speed up the process */
4695 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4696 				tmp_adev->gmc.xgmi.pending_reset = false;
4697 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4698 					r = -EALREADY;
4699 			} else
4700 				r = amdgpu_asic_reset(tmp_adev);
4701 
4702 			if (r) {
4703 				dev_err(tmp_adev->dev, "ASIC reset failed with error %d for drm dev %s",
4704 					 r, adev_to_drm(tmp_adev)->unique);
4705 				break;
4706 			}
4707 		}
4708 
4709 		/* For XGMI wait for all resets to complete before proceeding */
4710 		if (!r) {
4711 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4712 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4713 					flush_work(&tmp_adev->xgmi_reset_work);
4714 					r = tmp_adev->asic_reset_res;
4715 					if (r)
4716 						break;
4717 				}
4718 			}
4719 		}
4720 	}
4721 
4722 	if (!r && amdgpu_ras_intr_triggered()) {
4723 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4724 			if (tmp_adev->mmhub.ras_funcs &&
4725 			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4726 				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4727 		}
4728 
4729 		amdgpu_ras_intr_cleared();
4730 	}
4731 
4732 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4733 		if (need_full_reset) {
4734 			/* post card */
4735 			r = amdgpu_device_asic_init(tmp_adev);
4736 			if (r) {
4737 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4738 			} else {
4739 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4740 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4741 				if (r)
4742 					goto out;
4743 
4744 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4745 				if (r)
4746 					goto out;
4747 
4748 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4749 				if (vram_lost) {
4750 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4751 					amdgpu_inc_vram_lost(tmp_adev);
4752 				}
4753 
4754 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4755 				if (r)
4756 					goto out;
4757 
4758 				r = amdgpu_device_fw_loading(tmp_adev);
4759 				if (r)
4760 					return r;
4761 
4762 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4763 				if (r)
4764 					goto out;
4765 
4766 				if (vram_lost)
4767 					amdgpu_device_fill_reset_magic(tmp_adev);
4768 
4769 				/*
4770 				 * Add this ASIC back as tracked since the reset
4771 				 * already completed successfully.
4772 				 */
4773 				amdgpu_register_gpu_instance(tmp_adev);
4774 
4775 				if (!reset_context->hive &&
4776 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4777 					amdgpu_xgmi_add_device(tmp_adev);
4778 
4779 				r = amdgpu_device_ip_late_init(tmp_adev);
4780 				if (r)
4781 					goto out;
4782 
4783 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4784 
4785 				/*
4786 				 * The GPU enters a bad state once the number
4787 				 * of faulty pages flagged by ECC reaches the
4788 				 * threshold, and ras recovery is scheduled
4789 				 * next. So add a check here to break out of
4790 				 * recovery if the bad page threshold is indeed
4791 				 * exceeded, reminding the user to retire this
4792 				 * GPU or set a bigger bad_page_threshold value
4793 				 * the next time the driver is probed.
4794 				 */
4795 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4796 					/* must succeed. */
4797 					amdgpu_ras_resume(tmp_adev);
4798 				} else {
4799 					r = -EINVAL;
4800 					goto out;
4801 				}
4802 
4803 				/* Update PSP FW topology after reset */
4804 				if (reset_context->hive &&
4805 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4806 					r = amdgpu_xgmi_update_topology(
4807 						reset_context->hive, tmp_adev);
4808 			}
4809 		}
4810 
4811 out:
4812 		if (!r) {
4813 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4814 			r = amdgpu_ib_ring_tests(tmp_adev);
4815 			if (r) {
4816 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4817 				need_full_reset = true;
4818 				r = -EAGAIN;
4819 				goto end;
4820 			}
4821 		}
4822 
4823 		if (!r)
4824 			r = amdgpu_device_recover_vram(tmp_adev);
4825 		else
4826 			tmp_adev->asic_reset_res = r;
4827 	}
4828 
4829 end:
4830 	if (need_full_reset)
4831 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4832 	else
4833 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4834 	return r;
4835 }
4836 
4837 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4838 				struct amdgpu_hive_info *hive)
4839 {
4840 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4841 		return false;
4842 
4843 	if (hive) {
4844 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4845 	} else {
4846 		down_write(&adev->reset_sem);
4847 	}
4848 
4849 	switch (amdgpu_asic_reset_method(adev)) {
4850 	case AMD_RESET_METHOD_MODE1:
4851 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4852 		break;
4853 	case AMD_RESET_METHOD_MODE2:
4854 		adev->mp1_state = PP_MP1_STATE_RESET;
4855 		break;
4856 	default:
4857 		adev->mp1_state = PP_MP1_STATE_NONE;
4858 		break;
4859 	}
4860 
4861 	return true;
4862 }
4863 
4864 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4865 {
4866 	amdgpu_vf_error_trans_all(adev);
4867 	adev->mp1_state = PP_MP1_STATE_NONE;
4868 	atomic_set(&adev->in_gpu_reset, 0);
4869 	up_write(&adev->reset_sem);
4870 }
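
/*
 * A minimal sketch of how the lock/unlock pair above is used; this mirrors
 * the locking performed in amdgpu_device_gpu_recover() below:
 *
 *	if (amdgpu_device_lock_adev(adev, hive)) {
 *		... reset the ASIC ...
 *		amdgpu_device_unlock_adev(adev);
 *	}
 */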
4871 
4872 /*
4873  * Lock a list of amdgpu devices in a hive safely. If this is not a hive
4874  * with multiple nodes, it behaves just like amdgpu_device_lock_adev.
4875  *
4876  * Unlock does not require rollback.
4877  */
4878 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4879 {
4880 	struct amdgpu_device *tmp_adev = NULL;
4881 
4882 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4883 		if (!hive) {
4884 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4885 			return -ENODEV;
4886 		}
4887 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4888 			if (!amdgpu_device_lock_adev(tmp_adev, hive))
4889 				goto roll_back;
4890 		}
4891 	} else if (!amdgpu_device_lock_adev(adev, hive))
4892 		return -EAGAIN;
4893 
4894 	return 0;
4895 roll_back:
4896 	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4897 		/*
4898 		 * If the locking iteration broke off in the middle of a hive,
4899 		 * there may be a race issue, or a hive device may have been
4900 		 * locked up independently. We may or may not be in trouble,
4901 		 * so try to roll back the locks taken so far and emit a
4902 		 * warning.
4903 		 */
4904 		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4905 		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4906 			amdgpu_device_unlock_adev(tmp_adev);
4907 		}
4908 	}
4909 	return -EAGAIN;
4910 }
4911 
4912 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4913 {
4914 	STUB();
4915 #ifdef notyet
4916 	struct pci_dev *p = NULL;
4917 
4918 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4919 			adev->pdev->bus->number, 1);
4920 	if (p) {
4921 		pm_runtime_enable(&(p->dev));
4922 		pm_runtime_resume(&(p->dev));
4923 	}
4924 #endif
4925 }
4926 
4927 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4928 {
4929 	enum amd_reset_method reset_method;
4930 	struct pci_dev *p = NULL;
4931 	u64 expires;
4932 
4933 	/*
4934 	 * For now, only BACO and mode1 reset are confirmed
4935 	 * to suffer from the audio issue if not properly suspended.
4936 	 */
4937 	reset_method = amdgpu_asic_reset_method(adev);
4938 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4939 	     (reset_method != AMD_RESET_METHOD_MODE1))
4940 		return -EINVAL;
4941 
4942 	STUB();
4943 	return -ENOSYS;
4944 #ifdef notyet
4945 
4946 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4947 			adev->pdev->bus->number, 1);
4948 	if (!p)
4949 		return -ENODEV;
4950 
4951 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4952 	if (!expires)
4953 		/*
4954 		 * If we cannot get the audio device autosuspend delay,
4955 		 * a fixed 4S interval will be used. Since 3S is the
4956 		 * audio controller's default autosuspend delay setting,
4957 		 * the 4S used here is guaranteed to cover it.
4958 		 */
4959 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4960 
4961 	while (!pm_runtime_status_suspended(&(p->dev))) {
4962 		if (!pm_runtime_suspend(&(p->dev)))
4963 			break;
4964 
4965 		if (expires < ktime_get_mono_fast_ns()) {
4966 			dev_warn(adev->dev, "failed to suspend display audio\n");
4967 			/* TODO: abort the succeeding gpu reset? */
4968 			return -ETIMEDOUT;
4969 		}
4970 	}
4971 
4972 	pm_runtime_disable(&(p->dev));
4973 
4974 	return 0;
4975 #endif
4976 }
4977 
4978 static void amdgpu_device_recheck_guilty_jobs(
4979 	struct amdgpu_device *adev, struct list_head *device_list_handle,
4980 	struct amdgpu_reset_context *reset_context)
4981 {
4982 	int i, r = 0;
4983 
4984 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4985 		struct amdgpu_ring *ring = adev->rings[i];
4986 		int ret = 0;
4987 		struct drm_sched_job *s_job;
4988 
4989 		if (!ring || !ring->sched.thread)
4990 			continue;
4991 
4992 		s_job = list_first_entry_or_null(&ring->sched.pending_list,
4993 				struct drm_sched_job, list);
4994 		if (s_job == NULL)
4995 			continue;
4996 
4997 		/* clear the job's guilty flag and rely on the following step to decide the real one */
4998 		drm_sched_reset_karma(s_job);
4999 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5000 
5001 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5002 		if (ret == 0) { /* timeout */
5003 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5004 						ring->sched.name, s_job->id);
5005 
5006 			/* set guilty */
5007 			drm_sched_increase_karma(s_job);
5008 retry:
5009 			/* do hw reset */
5010 			if (amdgpu_sriov_vf(adev)) {
5011 				amdgpu_virt_fini_data_exchange(adev);
5012 				r = amdgpu_device_reset_sriov(adev, false);
5013 				if (r)
5014 					adev->asic_reset_res = r;
5015 			} else {
5016 				clear_bit(AMDGPU_SKIP_HW_RESET,
5017 					  &reset_context->flags);
5018 				r = amdgpu_do_asic_reset(device_list_handle,
5019 							 reset_context);
5020 				if (r && r == -EAGAIN)
5021 					goto retry;
5022 			}
5023 
5024 			/*
5025 			 * add reset counter so that the following
5026 			 * resubmitted job can flush the vmid
5027 			 */
5028 			atomic_inc(&adev->gpu_reset_counter);
5029 			continue;
5030 		}
5031 
5032 		/* got the hw fence, signal finished fence */
5033 		atomic_dec(ring->sched.score);
5034 		dma_fence_get(&s_job->s_fence->finished);
5035 		dma_fence_signal(&s_job->s_fence->finished);
5036 		dma_fence_put(&s_job->s_fence->finished);
5037 
5038 		/* remove node from list and free the job */
5039 		spin_lock(&ring->sched.job_list_lock);
5040 		list_del_init(&s_job->list);
5041 		spin_unlock(&ring->sched.job_list_lock);
5042 		ring->sched.ops->free_job(s_job);
5043 	}
5044 }
5045 
5046 /**
5047  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5048  *
5049  * @adev: amdgpu_device pointer
5050  * @job: which job triggered the hang
5051  *
5052  * Attempt to reset the GPU if it has hung (all asics).
5053  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5054  * Returns 0 for success or an error on failure.
5055  */
5056 
5057 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5058 			      struct amdgpu_job *job)
5059 {
5060 	struct list_head device_list, *device_list_handle =  NULL;
5061 	bool job_signaled = false;
5062 	struct amdgpu_hive_info *hive = NULL;
5063 	struct amdgpu_device *tmp_adev = NULL;
5064 	int i, r = 0;
5065 	bool need_emergency_restart = false;
5066 	bool audio_suspended = false;
5067 	int tmp_vram_lost_counter;
5068 	struct amdgpu_reset_context reset_context;
5069 
5070 	memset(&reset_context, 0, sizeof(reset_context));
5071 
5072 	/*
5073 	 * Special case: RAS triggered and full reset isn't supported
5074 	 */
5075 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5076 
5077 	/*
5078 	 * Flush RAM to disk so that after reboot
5079 	 * the user can read the log and see why the system rebooted.
5080 	 */
5081 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5082 		DRM_WARN("Emergency reboot.");
5083 
5084 #ifdef notyet
5085 		ksys_sync_helper();
5086 		emergency_restart();
5087 #else
5088 		panic("emergency_restart");
5089 #endif
5090 	}
5091 
5092 	dev_info(adev->dev, "GPU %s begin!\n",
5093 		need_emergency_restart ? "jobs stop" : "reset");
5094 
5095 	/*
5096 	 * Here we trylock to avoid a chain of resets executing, triggered
5097 	 * either by jobs on different adevs in an XGMI hive or by jobs on
5098 	 * different schedulers for the same device while this TO handler runs.
5099 	 * We always reset all schedulers for a device and all devices in an
5100 	 * XGMI hive, so that should take care of them too.
5101 	 */
5102 	hive = amdgpu_get_xgmi_hive(adev);
5103 	if (hive) {
5104 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
5105 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
5106 				job ? job->base.id : -1, hive->hive_id);
5107 			amdgpu_put_xgmi_hive(hive);
5108 			if (job && job->vm)
5109 				drm_sched_increase_karma(&job->base);
5110 			return 0;
5111 		}
5112 		mutex_lock(&hive->hive_lock);
5113 	}
5114 
5115 	reset_context.method = AMD_RESET_METHOD_NONE;
5116 	reset_context.reset_req_dev = adev;
5117 	reset_context.job = job;
5118 	reset_context.hive = hive;
5119 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5120 
5121 	/*
5122 	 * Lock the device before we try to operate on the linked list.
5123 	 * If we didn't get the device lock, don't touch the linked list since
5124 	 * others may be iterating over it.
5125 	 */
5126 	r = amdgpu_device_lock_hive_adev(adev, hive);
5127 	if (r) {
5128 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
5129 					job ? job->base.id : -1);
5130 
5131 		/* even though we skipped this reset, we still need to mark the job guilty */
5132 		if (job && job->vm)
5133 			drm_sched_increase_karma(&job->base);
5134 		goto skip_recovery;
5135 	}
5136 
5137 	/*
5138 	 * Build list of devices to reset.
5139 	 * In case we are in XGMI hive mode, resort the device list
5140 	 * to put adev in the 1st position.
5141 	 */
5142 	INIT_LIST_HEAD(&device_list);
5143 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5144 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5145 			list_add_tail(&tmp_adev->reset_list, &device_list);
5146 		if (!list_is_first(&adev->reset_list, &device_list))
5147 			list_rotate_to_front(&adev->reset_list, &device_list);
5148 		device_list_handle = &device_list;
5149 	} else {
5150 		list_add_tail(&adev->reset_list, &device_list);
5151 		device_list_handle = &device_list;
5152 	}
5153 
5154 	/* block all schedulers and reset given job's ring */
5155 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5156 		/*
5157 		 * Try to put the audio codec into a suspended state
5158 		 * before the gpu reset starts.
5159 		 *
5160 		 * The power domain of the graphics device is shared
5161 		 * with the AZ power domain. Without this, we may
5162 		 * change the audio hardware from behind the audio
5163 		 * driver's back, which will trigger some audio
5164 		 * codec errors.
5165 		 */
5166 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5167 			audio_suspended = true;
5168 
5169 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5170 
5171 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5172 
5173 		if (!amdgpu_sriov_vf(tmp_adev))
5174 			amdgpu_amdkfd_pre_reset(tmp_adev);
5175 
5176 		/*
5177 		 * Mark these ASICs to be reset as untracked first,
5178 		 * and add them back after the reset has completed.
5179 		 */
5180 		amdgpu_unregister_gpu_instance(tmp_adev);
5181 
5182 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
5183 
5184 		/* disable ras on ALL IPs */
5185 		if (!need_emergency_restart &&
5186 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5187 			amdgpu_ras_suspend(tmp_adev);
5188 
5189 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5190 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5191 
5192 			if (!ring || !ring->sched.thread)
5193 				continue;
5194 
5195 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5196 
5197 			if (need_emergency_restart)
5198 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5199 		}
5200 		atomic_inc(&tmp_adev->gpu_reset_counter);
5201 	}
5202 
5203 	if (need_emergency_restart)
5204 		goto skip_sched_resume;
5205 
5206 	/*
5207 	 * Must check guilty signal here since after this point all old
5208 	 * HW fences are force signaled.
5209 	 *
5210 	 * job->base holds a reference to parent fence
5211 	 */
5212 	if (job && job->base.s_fence->parent &&
5213 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
5214 		job_signaled = true;
5215 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5216 		goto skip_hw_reset;
5217 	}
5218 
5219 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
5220 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5221 		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5222 		/* TODO: should we stop? */
5223 		if (r) {
5224 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err %d for drm dev %s",
5225 				  r, adev_to_drm(tmp_adev)->unique);
5226 			tmp_adev->asic_reset_res = r;
5227 		}
5228 	}
5229 
5230 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5231 	/* Actual ASIC resets if needed. */
5232 	/* TODO Implement XGMI hive reset logic for SRIOV */
5233 	if (amdgpu_sriov_vf(adev)) {
5234 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
5235 		if (r)
5236 			adev->asic_reset_res = r;
5237 	} else {
5238 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5239 		if (r && r == -EAGAIN)
5240 			goto retry;
5241 	}
5242 
5243 skip_hw_reset:
5244 
5245 	/* Post ASIC reset for all devs. */
5246 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5247 
5248 		/*
5249 		 * Sometimes a later bad compute job can block a good gfx job, as the
5250 		 * gfx and compute rings share internal GC hardware. We add an additional
5251 		 * guilty-job recheck step to find the real guilty job: it synchronously
5252 		 * resubmits and waits for the first job to be signaled. If that times
5253 		 * out, we identify it as the real guilty job.
5254 		 */
5255 		if (amdgpu_gpu_recovery == 2 &&
5256 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5257 			amdgpu_device_recheck_guilty_jobs(
5258 				tmp_adev, device_list_handle, &reset_context);
5259 
5260 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5261 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5262 
5263 			if (!ring || !ring->sched.thread)
5264 				continue;
5265 
5266 			/* No point in resubmitting jobs if we didn't HW reset */
5267 			if (!tmp_adev->asic_reset_res && !job_signaled)
5268 				drm_sched_resubmit_jobs(&ring->sched);
5269 
5270 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5271 		}
5272 
5273 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5274 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5275 		}
5276 
5277 		tmp_adev->asic_reset_res = 0;
5278 
5279 		if (r) {
5280 			/* bad news, how do we tell userspace? */
5281 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5282 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5283 		} else {
5284 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5285 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5286 				DRM_WARN("smart shift update failed\n");
5287 		}
5288 	}
5289 
5290 skip_sched_resume:
5291 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5292 		/* unlock kfd: SRIOV would do it separately */
5293 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5294 			amdgpu_amdkfd_post_reset(tmp_adev);
5295 
5296 		/* kfd_post_reset will do nothing if the kfd device is not initialized;
5297 		 * bring up kfd here if it wasn't initialized before
5298 		 */
5299 		if (!adev->kfd.init_complete)
5300 			amdgpu_amdkfd_device_init(adev);
5301 
5302 		if (audio_suspended)
5303 			amdgpu_device_resume_display_audio(tmp_adev);
5304 		amdgpu_device_unlock_adev(tmp_adev);
5305 	}
5306 
5307 skip_recovery:
5308 	if (hive) {
5309 		atomic_set(&hive->in_reset, 0);
5310 		mutex_unlock(&hive->hive_lock);
5311 		amdgpu_put_xgmi_hive(hive);
5312 	}
5313 
5314 	if (r && r != -EAGAIN)
5315 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5316 	return r;
5317 }
5318 
5319 /**
5320  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5321  *
5322  * @adev: amdgpu_device pointer
5323  *
5324  * Fetches and stores in the driver the PCIE capabilities (gen speed
5325  * and lanes) of the slot the device is in. Handles APUs and
5326  * virtualized environments where PCIE config space may not be available.
5327  */
5328 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5329 {
5330 	struct pci_dev *pdev;
5331 	enum pci_bus_speed speed_cap, platform_speed_cap;
5332 	enum pcie_link_width platform_link_width;
5333 
5334 	if (amdgpu_pcie_gen_cap)
5335 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5336 
5337 	if (amdgpu_pcie_lane_cap)
5338 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5339 
5340 	/* covers APUs as well */
5341 	if (pci_is_root_bus(adev->pdev->bus)) {
5342 		if (adev->pm.pcie_gen_mask == 0)
5343 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5344 		if (adev->pm.pcie_mlw_mask == 0)
5345 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5346 		return;
5347 	}
5348 
5349 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5350 		return;
5351 
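	/*
	 * Ask the PCI core for the speed/width of the slowest link between
	 * the device and the root port; that is what the platform can
	 * actually sustain, regardless of what the ASIC itself supports.
	 */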
5352 	pcie_bandwidth_available(adev->pdev, NULL,
5353 				 &platform_speed_cap, &platform_link_width);
5354 
5355 	if (adev->pm.pcie_gen_mask == 0) {
5356 		/* asic caps */
5357 		pdev = adev->pdev;
5358 		speed_cap = pcie_get_speed_cap(pdev);
5359 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5360 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5361 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5362 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5363 		} else {
5364 			if (speed_cap == PCIE_SPEED_32_0GT)
5365 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5366 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5367 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5368 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5369 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5370 			else if (speed_cap == PCIE_SPEED_16_0GT)
5371 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5372 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5373 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5374 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5375 			else if (speed_cap == PCIE_SPEED_8_0GT)
5376 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5377 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5378 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5379 			else if (speed_cap == PCIE_SPEED_5_0GT)
5380 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5381 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5382 			else
5383 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5384 		}
5385 		/* platform caps */
5386 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5387 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5388 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5389 		} else {
5390 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5391 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5392 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5393 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5394 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5395 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5396 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5397 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5398 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5399 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5400 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5401 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5402 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5403 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5404 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5405 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5406 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5407 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5408 			else
5409 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5410 
5411 		}
5412 	}
5413 	if (adev->pm.pcie_mlw_mask == 0) {
5414 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5415 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5416 		} else {
5417 			switch (platform_link_width) {
5418 			case PCIE_LNK_X32:
5419 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5420 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5421 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5422 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5423 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5424 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5425 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5426 				break;
5427 			case PCIE_LNK_X16:
5428 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5429 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5430 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5431 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5432 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5433 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5434 				break;
5435 			case PCIE_LNK_X12:
5436 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5437 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5438 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5439 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5440 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5441 				break;
5442 			case PCIE_LNK_X8:
5443 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5444 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5445 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5446 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5447 				break;
5448 			case PCIE_LNK_X4:
5449 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5450 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5451 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5452 				break;
5453 			case PCIE_LNK_X2:
5454 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5455 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5456 				break;
5457 			case PCIE_LNK_X1:
5458 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5459 				break;
5460 			default:
5461 				break;
5462 			}
5463 		}
5464 	}
5465 }
5466 
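/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 * @dev: drm_device pointer
 *
 * Puts the ASIC into the BACO low power state, first masking the RAS
 * doorbell interrupt so stray doorbells can't wake the chip.
 * Returns 0 on success, -ENOTSUPP if the device doesn't support BACO,
 * or a negative error code from the DPM code.
 */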
5467 int amdgpu_device_baco_enter(struct drm_device *dev)
5468 {
5469 	struct amdgpu_device *adev = drm_to_adev(dev);
5470 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5471 
5472 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5473 		return -ENOTSUPP;
5474 
5475 	if (ras && adev->ras_enabled &&
5476 	    adev->nbio.funcs->enable_doorbell_interrupt)
5477 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5478 
5479 	return amdgpu_dpm_baco_enter(adev);
5480 }
5481 
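/**
 * amdgpu_device_baco_exit - leave BACO (Bus Active, Chip Off)
 * @dev: drm_device pointer
 *
 * Brings the ASIC back out of BACO, re-enables the RAS doorbell
 * interrupt, and clears any stale doorbell interrupt when running
 * in passthrough. Returns 0 on success or a negative error code.
 */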
5482 int amdgpu_device_baco_exit(struct drm_device *dev)
5483 {
5484 	struct amdgpu_device *adev = drm_to_adev(dev);
5485 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5486 	int ret = 0;
5487 
5488 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5489 		return -ENOTSUPP;
5490 
5491 	ret = amdgpu_dpm_baco_exit(adev);
5492 	if (ret)
5493 		return ret;
5494 
5495 	if (ras && adev->ras_enabled &&
5496 	    adev->nbio.funcs->enable_doorbell_interrupt)
5497 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5498 
5499 	if (amdgpu_passthrough(adev) &&
5500 	    adev->nbio.funcs->clear_doorbell_interrupt)
5501 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
5502 
5503 	return 0;
5504 }
5505 
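/**
 * amdgpu_cancel_all_tdr - cancel pending timeout handlers on all rings
 * @adev: amdgpu_device pointer
 *
 * Cancels and synchronously waits for the timeout (TDR) work of every
 * initialized ring's scheduler, so no timeout handler can race with
 * the PCI error recovery path below.
 */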
5506 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5507 {
5508 	int i;
5509 
5510 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5511 		struct amdgpu_ring *ring = adev->rings[i];
5512 
5513 		if (!ring || !ring->sched.thread)
5514 			continue;
5515 
5516 		cancel_delayed_work_sync(&ring->sched.work_tdr);
5517 	}
5518 }
5519 
5520 /**
5521  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5522  * @pdev: PCI device struct
5523  * @state: PCI channel state
5524  *
5525  * Description: Called when a PCI error is detected.
5526  *
5527  * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or
5528  * PCI_ERS_RESULT_DISCONNECT.
5528  */
5529 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5530 {
5531 	STUB();
5532 	return 0;
5533 #ifdef notyet
5534 	struct drm_device *dev = pci_get_drvdata(pdev);
5535 	struct amdgpu_device *adev = drm_to_adev(dev);
5536 	int i;
5537 
5538 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5539 
5540 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5541 		DRM_WARN("No support for XGMI hive yet...");
5542 		return PCI_ERS_RESULT_DISCONNECT;
5543 	}
5544 
5545 	adev->pci_channel_state = state;
5546 
5547 	switch (state) {
5548 	case pci_channel_io_normal:
5549 		return PCI_ERS_RESULT_CAN_RECOVER;
5550 	/* Fatal error, prepare for slot reset */
5551 	case pci_channel_io_frozen:
5552 		/*
5553 		 * Cancel and wait for all TDRs in progress if failing to
5554 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
5555 		 *
5556 		 * Locking adev->reset_sem will prevent any external access
5557 		 * to GPU during PCI error recovery
5558 		 */
5559 		while (!amdgpu_device_lock_adev(adev, NULL))
5560 			amdgpu_cancel_all_tdr(adev);
5561 
5562 		/*
5563 		 * Block any work scheduling as we do for regular GPU reset
5564 		 * for the duration of the recovery
5565 		 */
5566 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5567 			struct amdgpu_ring *ring = adev->rings[i];
5568 
5569 			if (!ring || !ring->sched.thread)
5570 				continue;
5571 
5572 			drm_sched_stop(&ring->sched, NULL);
5573 		}
5574 		atomic_inc(&adev->gpu_reset_counter);
5575 		return PCI_ERS_RESULT_NEED_RESET;
5576 	case pci_channel_io_perm_failure:
5577 		/* Permanent error, prepare for device removal */
5578 		return PCI_ERS_RESULT_DISCONNECT;
5579 	}
5580 
5581 	return PCI_ERS_RESULT_NEED_RESET;
5582 #endif
5583 }
5584 
5585 /**
5586  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5587  * @pdev: pointer to PCI device
5588  */
5589 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5590 {
5592 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5593 
5594 	/* TODO - dump whatever for debugging purposes */
5595 
5596 	/* This is called only if amdgpu_pci_error_detected returns
5597 	 * PCI_ERS_RESULT_CAN_RECOVER. Reads and writes to the device
5598 	 * still work, so there is no need to reset the slot.
5599 	 */
5600 
5601 	return PCI_ERS_RESULT_RECOVERED;
5602 }
5603 
5604 /**
5605  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5606  * @pdev: PCI device struct
5607  *
5608  * Description: This routine is called by the PCI error recovery
5609  * code after the PCI slot has been reset, just before we
5610  * should resume normal operations.
5611  */
5612 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5613 {
5614 	STUB();
5615 	return PCI_ERS_RESULT_RECOVERED;
5616 #ifdef notyet
5617 	struct drm_device *dev = pci_get_drvdata(pdev);
5618 	struct amdgpu_device *adev = drm_to_adev(dev);
5619 	int r, i;
5620 	struct amdgpu_reset_context reset_context;
5621 	u32 memsize;
5622 	struct list_head device_list;
5623 
5624 	DRM_INFO("PCI error: slot reset callback!!\n");
5625 
5626 	memset(&reset_context, 0, sizeof(reset_context));
5627 
5628 	INIT_LIST_HEAD(&device_list);
5629 	list_add_tail(&adev->reset_list, &device_list);
5630 
5631 	/* wait for asic to come out of reset */
5632 	drm_msleep(500);
5633 
5634 	/* Restore PCI confspace */
5635 	amdgpu_device_load_pci_state(pdev);
5636 
5637 	/* confirm the ASIC came out of reset */
5638 	for (i = 0; i < adev->usec_timeout; i++) {
5639 		memsize = amdgpu_asic_get_config_memsize(adev);
5640 
5641 		if (memsize != 0xffffffff)
5642 			break;
5643 		udelay(1);
5644 	}
5645 	if (memsize == 0xffffffff) {
5646 		r = -ETIME;
5647 		goto out;
5648 	}
5649 
5650 	reset_context.method = AMD_RESET_METHOD_NONE;
5651 	reset_context.reset_req_dev = adev;
5652 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5653 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5654 
5655 	adev->no_hw_access = true;
5656 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5657 	adev->no_hw_access = false;
5658 	if (r)
5659 		goto out;
5660 
5661 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5662 
5663 out:
5664 	if (!r) {
5665 		if (amdgpu_device_cache_pci_state(adev->pdev))
5666 			pci_restore_state(adev->pdev);
5667 
5668 		DRM_INFO("PCIe error recovery succeeded\n");
5669 	} else {
5670 		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
5671 		amdgpu_device_unlock_adev(adev);
5672 	}
5673 
5674 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5675 #endif
5676 }
5677 
5678 /**
5679  * amdgpu_pci_resume() - resume normal ops after PCI reset
5680  * @pdev: pointer to PCI device
5681  *
5682  * Called when the error recovery driver tells us that it's
5683  * OK to resume normal operation.
5684  */
5685 void amdgpu_pci_resume(struct pci_dev *pdev)
5686 {
5687 	STUB();
5688 #ifdef notyet
5689 	struct drm_device *dev = pci_get_drvdata(pdev);
5690 	struct amdgpu_device *adev = drm_to_adev(dev);
5691 	int i;
5692 
5693 
5695 
5696 	/* Only continue execution for the case of pci_channel_io_frozen */
5697 	if (adev->pci_channel_state != pci_channel_io_frozen)
5698 		return;
5699 
5700 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5701 		struct amdgpu_ring *ring = adev->rings[i];
5702 
5703 		if (!ring || !ring->sched.thread)
5704 			continue;
5705 
5707 		drm_sched_resubmit_jobs(&ring->sched);
5708 		drm_sched_start(&ring->sched, true);
5709 	}
5710 
5711 	amdgpu_device_unlock_adev(adev);
5712 #endif
5713 }
5714 
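/*
 * PCI config space save/restore helpers. Both are stubbed out on this
 * port and report failure, so callers simply skip the saved-state
 * handling.
 */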
5715 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5716 {
5717 	return false;
5718 #ifdef notyet
5719 	struct drm_device *dev = pci_get_drvdata(pdev);
5720 	struct amdgpu_device *adev = drm_to_adev(dev);
5721 	int r;
5722 
5723 	r = pci_save_state(pdev);
5724 	if (!r) {
5725 		kfree(adev->pci_state);
5726 
5727 		adev->pci_state = pci_store_saved_state(pdev);
5728 
5729 		if (!adev->pci_state) {
5730 			DRM_ERROR("Failed to store PCI saved state");
5731 			return false;
5732 		}
5733 	} else {
5734 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5735 		return false;
5736 	}
5737 
5738 	return true;
5739 #endif
5740 }
5741 
5742 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5743 {
5744 	STUB();
5745 	return false;
5746 #ifdef notyet
5747 	struct drm_device *dev = pci_get_drvdata(pdev);
5748 	struct amdgpu_device *adev = drm_to_adev(dev);
5749 	int r;
5750 
5751 	if (!adev->pci_state)
5752 		return false;
5753 
5754 	r = pci_load_saved_state(pdev, adev->pci_state);
5755 
5756 	if (!r) {
5757 		pci_restore_state(pdev);
5758 	} else {
5759 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5760 		return false;
5761 	}
5762 
5763 	return true;
5764 #endif
5765 }
5766 
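/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 * @adev: amdgpu_device pointer
 * @ring: ring to emit the flush on, or NULL to flush via MMIO
 *
 * No-op on bare-metal APUs and on ASICs whose XGMI link connects them
 * to the CPU, since CPU traffic doesn't pass through the HDP cache
 * there. Otherwise prefer a ring packet when the ring supports one,
 * falling back to the ASIC-specific register flush.
 */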
5767 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5768 		struct amdgpu_ring *ring)
5769 {
5770 #ifdef CONFIG_X86_64
5771 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5772 		return;
5773 #endif
5774 	if (adev->gmc.xgmi.connected_to_cpu)
5775 		return;
5776 
5777 	if (ring && ring->funcs->emit_hdp_flush)
5778 		amdgpu_ring_emit_hdp_flush(ring);
5779 	else
5780 		amdgpu_asic_flush_hdp(adev, ring);
5781 }
5782 
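/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 * @adev: amdgpu_device pointer
 * @ring: ring the invalidate is performed on behalf of (may be NULL)
 *
 * Skipped in the same CPU-coherent cases as amdgpu_device_flush_hdp();
 * otherwise performs the ASIC-specific HDP invalidate.
 */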
5783 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5784 		struct amdgpu_ring *ring)
5785 {
5786 #ifdef CONFIG_X86_64
5787 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5788 		return;
5789 #endif
5790 	if (adev->gmc.xgmi.connected_to_cpu)
5791 		return;
5792 
5793 	amdgpu_asic_invalidate_hdp(adev, ring);
5794 }
5795