xref: /openbsd-src/sys/dev/pci/drm/amd/amdgpu/amdgpu_device.c (revision c1a45aed656e7d5627c30c92421893a76f370ccb)
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/pci.h>
34 
35 #include <drm/drm_atomic_helper.h>
36 #include <drm/drm_probe_helper.h>
37 #include <drm/amdgpu_drm.h>
38 #include <linux/vgaarb.h>
39 #include <linux/vga_switcheroo.h>
40 #include <linux/efi.h>
41 #include "amdgpu.h"
42 #include "amdgpu_trace.h"
43 #include "amdgpu_i2c.h"
44 #include "atom.h"
45 #include "amdgpu_atombios.h"
46 #include "amdgpu_atomfirmware.h"
47 #include "amd_pcie.h"
48 #ifdef CONFIG_DRM_AMDGPU_SI
49 #include "si.h"
50 #endif
51 #ifdef CONFIG_DRM_AMDGPU_CIK
52 #include "cik.h"
53 #endif
54 #include "vi.h"
55 #include "soc15.h"
56 #include "nv.h"
57 #include "bif/bif_4_1_d.h"
58 #include <linux/pci.h>
59 #include <linux/firmware.h>
60 #include "amdgpu_vf_error.h"
61 
62 #include "amdgpu_amdkfd.h"
63 #include "amdgpu_pm.h"
64 
65 #include "amdgpu_xgmi.h"
66 #include "amdgpu_ras.h"
67 #include "amdgpu_pmu.h"
68 #include "amdgpu_fru_eeprom.h"
69 #include "amdgpu_reset.h"
70 
71 #include <linux/suspend.h>
72 #include <drm/task_barrier.h>
73 #include <linux/pm_runtime.h>
74 
75 #include <drm/drm_drv.h>
76 
77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
88 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
89 
90 #define AMDGPU_RESUME_MS		2000
91 
92 const char *amdgpu_asic_name[] = {
93 	"TAHITI",
94 	"PITCAIRN",
95 	"VERDE",
96 	"OLAND",
97 	"HAINAN",
98 	"BONAIRE",
99 	"KAVERI",
100 	"KABINI",
101 	"HAWAII",
102 	"MULLINS",
103 	"TOPAZ",
104 	"TONGA",
105 	"FIJI",
106 	"CARRIZO",
107 	"STONEY",
108 	"POLARIS10",
109 	"POLARIS11",
110 	"POLARIS12",
111 	"VEGAM",
112 	"VEGA10",
113 	"VEGA12",
114 	"VEGA20",
115 	"RAVEN",
116 	"ARCTURUS",
117 	"RENOIR",
118 	"ALDEBARAN",
119 	"NAVI10",
120 	"CYAN_SKILLFISH",
121 	"NAVI14",
122 	"NAVI12",
123 	"SIENNA_CICHLID",
124 	"NAVY_FLOUNDER",
125 	"VANGOGH",
126 	"DIMGREY_CAVEFISH",
127 	"BEIGE_GOBY",
128 	"YELLOW_CARP",
129 	"LAST",
130 };
131 
132 /**
133  * DOC: pcie_replay_count
134  *
135  * The amdgpu driver provides a sysfs API for reporting the total number
136  * of PCIe replays (NAKs).
137  * The file pcie_replay_count is used for this and returns the total
138  * number of replays as a sum of the NAKs generated and NAKs received.
139  */
140 
141 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
142 		struct device_attribute *attr, char *buf)
143 {
144 	struct drm_device *ddev = dev_get_drvdata(dev);
145 	struct amdgpu_device *adev = drm_to_adev(ddev);
146 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
147 
148 	return sysfs_emit(buf, "%llu\n", cnt);
149 }
150 
151 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
152 		amdgpu_device_get_pcie_replay_count, NULL);
153 
154 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
155 
156 /**
157  * DOC: product_name
158  *
159  * The amdgpu driver provides a sysfs API for reporting the product name
160  * for the device.
161  * The file product_name is used for this and returns the product name
162  * as returned from the FRU.
163  * NOTE: This is only available for certain server cards
164  */
165 
166 static ssize_t amdgpu_device_get_product_name(struct device *dev,
167 		struct device_attribute *attr, char *buf)
168 {
169 	struct drm_device *ddev = dev_get_drvdata(dev);
170 	struct amdgpu_device *adev = drm_to_adev(ddev);
171 
172 	return sysfs_emit(buf, "%s\n", adev->product_name);
173 }
174 
175 static DEVICE_ATTR(product_name, S_IRUGO,
176 		amdgpu_device_get_product_name, NULL);
177 
178 /**
179  * DOC: product_number
180  *
181  * The amdgpu driver provides a sysfs API for reporting the part number
182  * for the device.
183  * The file product_number is used for this and returns the part number
184  * as returned from the FRU.
185  * NOTE: This is only available for certain server cards
186  */
187 
188 static ssize_t amdgpu_device_get_product_number(struct device *dev,
189 		struct device_attribute *attr, char *buf)
190 {
191 	struct drm_device *ddev = dev_get_drvdata(dev);
192 	struct amdgpu_device *adev = drm_to_adev(ddev);
193 
194 	return sysfs_emit(buf, "%s\n", adev->product_number);
195 }
196 
197 static DEVICE_ATTR(product_number, S_IRUGO,
198 		amdgpu_device_get_product_number, NULL);
199 
200 /**
201  * DOC: serial_number
202  *
203  * The amdgpu driver provides a sysfs API for reporting the serial number
204  * for the device.
205  * The file serial_number is used for this and returns the serial number
206  * as returned from the FRU.
207  * NOTE: This is only available for certain server cards
208  */
209 
210 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
211 		struct device_attribute *attr, char *buf)
212 {
213 	struct drm_device *ddev = dev_get_drvdata(dev);
214 	struct amdgpu_device *adev = drm_to_adev(ddev);
215 
216 	return sysfs_emit(buf, "%s\n", adev->serial);
217 }
218 
219 static DEVICE_ATTR(serial_number, S_IRUGO,
220 		amdgpu_device_get_serial_number, NULL);
221 
222 /**
223  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
224  *
225  * @dev: drm_device pointer
226  *
227  * Returns true if the device is a dGPU with ATPX power control,
228  * otherwise return false.
229  */
230 bool amdgpu_device_supports_px(struct drm_device *dev)
231 {
232 	struct amdgpu_device *adev = drm_to_adev(dev);
233 
234 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
235 		return true;
236 	return false;
237 }
238 
239 /**
240  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
241  *
242  * @dev: drm_device pointer
243  *
244  * Returns true if the device is a dGPU with ACPI power control,
245  * otherwise return false.
246  */
247 bool amdgpu_device_supports_boco(struct drm_device *dev)
248 {
249 	struct amdgpu_device *adev = drm_to_adev(dev);
250 
251 	if (adev->has_pr3 ||
252 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
253 		return true;
254 	return false;
255 }
256 
257 /**
258  * amdgpu_device_supports_baco - Does the device support BACO
259  *
260  * @dev: drm_device pointer
261  *
262  * Returns true if the device supports BACO,
263  * otherwise return false.
264  */
265 bool amdgpu_device_supports_baco(struct drm_device *dev)
266 {
267 	struct amdgpu_device *adev = drm_to_adev(dev);
268 
269 	return amdgpu_asic_supports_baco(adev);
270 }
271 
272 /**
273  * amdgpu_device_supports_smart_shift - Is the device a dGPU with
274  * Smart Shift support
275  *
276  * @dev: drm_device pointer
277  *
278  * Returns true if the device is a dGPU with Smart Shift support,
279  * otherwise returns false.
280  */
281 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
282 {
283 	return (amdgpu_device_supports_boco(dev) &&
284 		amdgpu_acpi_is_power_shift_control_supported());
285 }
286 
287 /*
288  * VRAM access helper functions
289  */
290 
291 /**
292  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
293  *
294  * @adev: amdgpu_device pointer
295  * @pos: offset of the buffer in vram
296  * @buf: virtual address of the buffer in system memory
297  * @size: read/write size; the buffer at @buf must be at least @size bytes
298  * @write: true - write to vram, otherwise - read from vram
299  */
300 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
301 			     void *buf, size_t size, bool write)
302 {
303 	unsigned long flags;
304 	uint32_t hi = ~0, tmp = 0;
305 	uint32_t *data = buf;
306 	uint64_t last;
307 	int idx;
308 
309 	if (!drm_dev_enter(&adev->ddev, &idx))
310 		return;
311 
312 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
313 
314 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
315 	for (last = pos + size; pos < last; pos += 4) {
316 		tmp = pos >> 31;
317 
318 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
319 		if (tmp != hi) {
320 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
321 			hi = tmp;
322 		}
323 		if (write)
324 			WREG32_NO_KIQ(mmMM_DATA, *data++);
325 		else
326 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
327 	}
328 
329 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
330 	drm_dev_exit(idx);
331 }
332 
333 /**
334  * amdgpu_device_aper_access - access vram via the vram aperture
335  *
336  * @adev: amdgpu_device pointer
337  * @pos: offset of the buffer in vram
338  * @buf: virtual address of the buffer in system memory
339  * @size: read/write size; the buffer at @buf must be at least @size bytes
340  * @write: true - write to vram, otherwise - read from vram
341  *
342  * The return value means how many bytes have been transferred.
343  */
344 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
345 				 void *buf, size_t size, bool write)
346 {
347 #ifdef CONFIG_64BIT
348 	void __iomem *addr;
349 	size_t count = 0;
350 	uint64_t last;
351 
352 	if (!adev->mman.aper_base_kaddr)
353 		return 0;
354 
355 	last = min(pos + size, adev->gmc.visible_vram_size);
356 	if (last > pos) {
357 		addr = adev->mman.aper_base_kaddr + pos;
358 		count = last - pos;
359 
360 		if (write) {
361 			memcpy_toio(addr, buf, count);
362 			mb();
363 			amdgpu_device_flush_hdp(adev, NULL);
364 		} else {
365 			amdgpu_device_invalidate_hdp(adev, NULL);
366 			mb();
367 			memcpy_fromio(buf, addr, count);
368 		}
369 
370 	}
371 
372 	return count;
373 #else
374 	return 0;
375 #endif
376 }
377 
378 /**
379  * amdgpu_device_vram_access - read/write a buffer in vram
380  *
381  * @adev: amdgpu_device pointer
382  * @pos: offset of the buffer in vram
383  * @buf: virtual address of the buffer in system memory
384  * @size: read/write size; the buffer at @buf must be at least @size bytes
385  * @write: true - write to vram, otherwise - read from vram
386  */
387 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
388 			       void *buf, size_t size, bool write)
389 {
390 	size_t count;
391 
392 	/* try to use the vram aperture to access vram first */
393 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
394 	size -= count;
395 	if (size) {
396 		/* use MM to access the rest of vram */
397 		pos += count;
398 		buf += count;
399 		amdgpu_device_mm_access(adev, pos, buf, size, write);
400 	}
401 }
402 
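/*
 * Illustrative usage sketch (not part of the driver): read 4 KiB of VRAM at
 * offset 0 into a local buffer.  amdgpu_device_vram_access() prefers the
 * CPU-visible aperture and transparently falls back to MM_INDEX/MM_DATA for
 * any remainder, so callers do not need to pick a path themselves:
 *
 *	uint32_t data[1024];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */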
403 /*
404  * register access helper functions.
405  */
406 
407 /* Check if hw access should be skipped because of hotplug or device error */
408 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
409 {
410 	if (adev->no_hw_access)
411 		return true;
412 
413 #ifdef CONFIG_LOCKDEP
414 	/*
415 	 * This is a bit complicated to understand, so worth a comment. What we assert
416 	 * here is that the GPU reset is not running on another thread in parallel.
417 	 *
418 	 * For this we trylock the read side of the reset semaphore; if that succeeds,
419 	 * we know that the reset is not running in parallel.
420 	 *
421 	 * If the trylock fails we assert that we are either already holding the read
422 	 * side of the lock or are the reset thread itself and hold the write side of
423 	 * the lock.
424 	 */
425 	if (in_task()) {
426 		if (down_read_trylock(&adev->reset_sem))
427 			up_read(&adev->reset_sem);
428 		else
429 			lockdep_assert_held(&adev->reset_sem);
430 	}
431 #endif
432 	return false;
433 }
434 
435 /**
436  * amdgpu_device_rreg - read a memory mapped IO or indirect register
437  *
438  * @adev: amdgpu_device pointer
439  * @reg: dword aligned register offset
440  * @acc_flags: access flags which require special behavior
441  *
442  * Returns the 32 bit value from the offset specified.
443  */
444 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
445 			    uint32_t reg, uint32_t acc_flags)
446 {
447 	uint32_t ret;
448 
449 	if (amdgpu_device_skip_hw_access(adev))
450 		return 0;
451 
452 	if ((reg * 4) < adev->rmmio_size) {
453 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
454 		    amdgpu_sriov_runtime(adev) &&
455 		    down_read_trylock(&adev->reset_sem)) {
456 			ret = amdgpu_kiq_rreg(adev, reg);
457 			up_read(&adev->reset_sem);
458 		} else {
459 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
460 		}
461 	} else {
462 		ret = adev->pcie_rreg(adev, reg * 4);
463 	}
464 
465 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
466 
467 	return ret;
468 }
469 
470 /*
471  * MMIO register byte read helper function
472  * @offset: byte offset from MMIO start
473  *
474  */
475 
476 /**
477  * amdgpu_mm_rreg8 - read a memory mapped IO register
478  *
479  * @adev: amdgpu_device pointer
480  * @offset: byte aligned register offset
481  *
482  * Returns the 8 bit value from the offset specified.
483  */
484 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
485 {
486 	if (amdgpu_device_skip_hw_access(adev))
487 		return 0;
488 
489 	if (offset < adev->rmmio_size)
490 		return (readb(adev->rmmio + offset));
491 	BUG();
492 }
493 
494 /*
495  * MMIO register byte write helper function
496  * @offset: byte offset from MMIO start
497  * @value: the value to be written to the register
498  *
499  */
500 /**
501  * amdgpu_mm_wreg8 - read a memory mapped IO register
502  *
503  * @adev: amdgpu_device pointer
504  * @offset: byte aligned register offset
505  * @value: 8 bit value to write
506  *
507  * Writes the value specified to the offset specified.
508  */
509 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
510 {
511 	if (amdgpu_device_skip_hw_access(adev))
512 		return;
513 
514 	if (offset < adev->rmmio_size)
515 		writeb(value, adev->rmmio + offset);
516 	else
517 		BUG();
518 }
519 
520 /**
521  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
522  *
523  * @adev: amdgpu_device pointer
524  * @reg: dword aligned register offset
525  * @v: 32 bit value to write to the register
526  * @acc_flags: access flags which require special behavior
527  *
528  * Writes the value specified to the offset specified.
529  */
530 void amdgpu_device_wreg(struct amdgpu_device *adev,
531 			uint32_t reg, uint32_t v,
532 			uint32_t acc_flags)
533 {
534 	if (amdgpu_device_skip_hw_access(adev))
535 		return;
536 
537 	if ((reg * 4) < adev->rmmio_size) {
538 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
539 		    amdgpu_sriov_runtime(adev) &&
540 		    down_read_trylock(&adev->reset_sem)) {
541 			amdgpu_kiq_wreg(adev, reg, v);
542 			up_read(&adev->reset_sem);
543 		} else {
544 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
545 		}
546 	} else {
547 		adev->pcie_wreg(adev, reg * 4, v);
548 	}
549 
550 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
551 }
552 
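/*
 * Note: most code does not call amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly but goes through the RREG32()/WREG32() family of macros, which
 * expand to these helpers with the appropriate acc_flags.  A direct call
 * would look like this (reg_offset is a placeholder, shown only as an
 * illustration):
 *
 *	uint32_t val = amdgpu_device_rreg(adev, reg_offset, AMDGPU_REGS_NO_KIQ);
 *
 *	amdgpu_device_wreg(adev, reg_offset, val | 0x1, AMDGPU_REGS_NO_KIQ);
 */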
553 /*
554  * amdgpu_mm_wreg_mmio_rlc - write a register either with mmio or with the RLC path if in range
555  *
556  * this function is invoked only for debugfs register access
557  */
558 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
559 			     uint32_t reg, uint32_t v)
560 {
561 	if (amdgpu_device_skip_hw_access(adev))
562 		return;
563 
564 	if (amdgpu_sriov_fullaccess(adev) &&
565 	    adev->gfx.rlc.funcs &&
566 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
567 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
568 			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
569 	} else {
570 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
571 	}
572 }
573 
574 /**
575  * amdgpu_mm_rdoorbell - read a doorbell dword
576  *
577  * @adev: amdgpu_device pointer
578  * @index: doorbell index
579  *
580  * Returns the value in the doorbell aperture at the
581  * requested doorbell index (CIK).
582  */
583 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
584 {
585 	if (amdgpu_device_skip_hw_access(adev))
586 		return 0;
587 
588 	if (index < adev->doorbell.num_doorbells) {
589 		return readl(adev->doorbell.ptr + index);
590 	} else {
591 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
592 		return 0;
593 	}
594 }
595 
596 /**
597  * amdgpu_mm_wdoorbell - write a doorbell dword
598  *
599  * @adev: amdgpu_device pointer
600  * @index: doorbell index
601  * @v: value to write
602  *
603  * Writes @v to the doorbell aperture at the
604  * requested doorbell index (CIK).
605  */
606 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
607 {
608 	if (amdgpu_device_skip_hw_access(adev))
609 		return;
610 
611 	if (index < adev->doorbell.num_doorbells) {
612 		writel(v, adev->doorbell.ptr + index);
613 	} else {
614 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
615 	}
616 }
617 
618 /**
619  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
620  *
621  * @adev: amdgpu_device pointer
622  * @index: doorbell index
623  *
624  * Returns the value in the doorbell aperture at the
625  * requested doorbell index (VEGA10+).
626  */
627 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
628 {
629 	if (amdgpu_device_skip_hw_access(adev))
630 		return 0;
631 
632 	if (index < adev->doorbell.num_doorbells) {
633 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
634 	} else {
635 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
636 		return 0;
637 	}
638 }
639 
640 /**
641  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
642  *
643  * @adev: amdgpu_device pointer
644  * @index: doorbell index
645  * @v: value to write
646  *
647  * Writes @v to the doorbell aperture at the
648  * requested doorbell index (VEGA10+).
649  */
650 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
651 {
652 	if (amdgpu_device_skip_hw_access(adev))
653 		return;
654 
655 	if (index < adev->doorbell.num_doorbells) {
656 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
657 	} else {
658 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
659 	}
660 }
661 
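/*
 * Doorbell usage sketch (illustrative only; ring_doorbell_index and ring_wptr
 * are placeholders, real rings compute their doorbell index during ring
 * init): ringing a 64-bit doorbell to tell the engine that the write pointer
 * moved:
 *
 *	amdgpu_mm_wdoorbell64(adev, ring_doorbell_index, ring_wptr);
 */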
662 /**
663  * amdgpu_device_indirect_rreg - read an indirect register
664  *
665  * @adev: amdgpu_device pointer
666  * @pcie_index: mmio register offset
667  * @pcie_data: mmio register offset
668  * @reg_addr: indirect register address to read from
669  *
670  * Returns the value of indirect register @reg_addr
671  */
672 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
673 				u32 pcie_index, u32 pcie_data,
674 				u32 reg_addr)
675 {
676 	unsigned long flags;
677 	u32 r;
678 	void __iomem *pcie_index_offset;
679 	void __iomem *pcie_data_offset;
680 
681 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
682 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
683 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
684 
685 	writel(reg_addr, pcie_index_offset);
686 	readl(pcie_index_offset);
687 	r = readl(pcie_data_offset);
688 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
689 
690 	return r;
691 }
692 
693 /**
694  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
695  *
696  * @adev: amdgpu_device pointer
697  * @pcie_index: mmio register offset
698  * @pcie_data: mmio register offset
699  * @reg_addr: indirect register address to read from
700  *
701  * Returns the value of indirect register @reg_addr
702  */
703 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
704 				  u32 pcie_index, u32 pcie_data,
705 				  u32 reg_addr)
706 {
707 	unsigned long flags;
708 	u64 r;
709 	void __iomem *pcie_index_offset;
710 	void __iomem *pcie_data_offset;
711 
712 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
713 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
714 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
715 
716 	/* read low 32 bits */
717 	writel(reg_addr, pcie_index_offset);
718 	readl(pcie_index_offset);
719 	r = readl(pcie_data_offset);
720 	/* read high 32 bits */
721 	writel(reg_addr + 4, pcie_index_offset);
722 	readl(pcie_index_offset);
723 	r |= ((u64)readl(pcie_data_offset) << 32);
724 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
725 
726 	return r;
727 }
728 
729 /**
730  * amdgpu_device_indirect_wreg - write an indirect register address
731  *
732  * @adev: amdgpu_device pointer
733  * @pcie_index: mmio register offset
734  * @pcie_data: mmio register offset
735  * @reg_addr: indirect register offset
736  * @reg_data: indirect register data
737  *
738  */
739 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
740 				 u32 pcie_index, u32 pcie_data,
741 				 u32 reg_addr, u32 reg_data)
742 {
743 	unsigned long flags;
744 	void __iomem *pcie_index_offset;
745 	void __iomem *pcie_data_offset;
746 
747 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
748 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
749 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
750 
751 	writel(reg_addr, pcie_index_offset);
752 	readl(pcie_index_offset);
753 	writel(reg_data, pcie_data_offset);
754 	readl(pcie_data_offset);
755 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
756 }
757 
758 /**
759  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
760  *
761  * @adev: amdgpu_device pointer
762  * @pcie_index: mmio register offset
763  * @pcie_data: mmio register offset
764  * @reg_addr: indirect register offset
765  * @reg_data: indirect register data
766  *
767  */
768 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
769 				   u32 pcie_index, u32 pcie_data,
770 				   u32 reg_addr, u64 reg_data)
771 {
772 	unsigned long flags;
773 	void __iomem *pcie_index_offset;
774 	void __iomem *pcie_data_offset;
775 
776 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
777 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
778 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
779 
780 	/* write low 32 bits */
781 	writel(reg_addr, pcie_index_offset);
782 	readl(pcie_index_offset);
783 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
784 	readl(pcie_data_offset);
785 	/* write high 32 bits */
786 	writel(reg_addr + 4, pcie_index_offset);
787 	readl(pcie_index_offset);
788 	writel((u32)(reg_data >> 32), pcie_data_offset);
789 	readl(pcie_data_offset);
790 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
791 }
792 
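/*
 * Sketch of how an asic specific pcie_rreg callback might wrap the indirect
 * helpers above.  The register names are placeholders, not a real NBIO
 * layout; actual implementations fetch the index/data offsets from the nbio
 * callbacks:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmEXAMPLE_PCIE_INDEX,
 *						   mmEXAMPLE_PCIE_DATA, reg);
 *	}
 */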
793 /**
794  * amdgpu_invalid_rreg - dummy reg read function
795  *
796  * @adev: amdgpu_device pointer
797  * @reg: offset of register
798  *
799  * Dummy register read function.  Used for register blocks
800  * that certain asics don't have (all asics).
801  * Returns the value in the register.
802  */
803 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
804 {
805 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
806 	BUG();
807 	return 0;
808 }
809 
810 /**
811  * amdgpu_invalid_wreg - dummy reg write function
812  *
813  * @adev: amdgpu_device pointer
814  * @reg: offset of register
815  * @v: value to write to the register
816  *
817  * Dummy register write function.  Used for register blocks
818  * that certain asics don't have (all asics).
819  */
820 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
821 {
822 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
823 		  reg, v);
824 	BUG();
825 }
826 
827 /**
828  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
829  *
830  * @adev: amdgpu_device pointer
831  * @reg: offset of register
832  *
833  * Dummy register read function.  Used for register blocks
834  * that certain asics don't have (all asics).
835  * Returns the value in the register.
836  */
837 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
838 {
839 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
840 	BUG();
841 	return 0;
842 }
843 
844 /**
845  * amdgpu_invalid_wreg64 - dummy reg write function
846  *
847  * @adev: amdgpu_device pointer
848  * @reg: offset of register
849  * @v: value to write to the register
850  *
851  * Dummy register write function.  Used for register blocks
852  * that certain asics don't have (all asics).
853  */
854 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
855 {
856 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
857 		  reg, v);
858 	BUG();
859 }
860 
861 /**
862  * amdgpu_block_invalid_rreg - dummy reg read function
863  *
864  * @adev: amdgpu_device pointer
865  * @block: offset of instance
866  * @reg: offset of register
867  *
868  * Dummy register read function.  Used for register blocks
869  * that certain asics don't have (all asics).
870  * Returns the value in the register.
871  */
872 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
873 					  uint32_t block, uint32_t reg)
874 {
875 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
876 		  reg, block);
877 	BUG();
878 	return 0;
879 }
880 
881 /**
882  * amdgpu_block_invalid_wreg - dummy reg write function
883  *
884  * @adev: amdgpu_device pointer
885  * @block: offset of instance
886  * @reg: offset of register
887  * @v: value to write to the register
888  *
889  * Dummy register write function.  Used for register blocks
890  * that certain asics don't have (all asics).
891  */
892 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
893 				      uint32_t block,
894 				      uint32_t reg, uint32_t v)
895 {
896 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
897 		  reg, block, v);
898 	BUG();
899 }
900 
901 /**
902  * amdgpu_device_asic_init - Wrapper for atom asic_init
903  *
904  * @adev: amdgpu_device pointer
905  *
906  * Does any asic specific work and then calls atom asic init.
907  */
908 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
909 {
910 	amdgpu_asic_pre_asic_init(adev);
911 
912 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
913 }
914 
915 /**
916  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
917  *
918  * @adev: amdgpu_device pointer
919  *
920  * Allocates a scratch page of VRAM for use by various things in the
921  * driver.
922  */
923 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
924 {
925 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
926 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
927 				       &adev->vram_scratch.robj,
928 				       &adev->vram_scratch.gpu_addr,
929 				       (void **)&adev->vram_scratch.ptr);
930 }
931 
932 /**
933  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
934  *
935  * @adev: amdgpu_device pointer
936  *
937  * Frees the VRAM scratch page.
938  */
939 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
940 {
941 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
942 }
943 
944 /**
945  * amdgpu_device_program_register_sequence - program an array of registers.
946  *
947  * @adev: amdgpu_device pointer
948  * @registers: pointer to the register array
949  * @array_size: size of the register array
950  *
951  * Programs an array of registers with AND and OR masks.
952  * This is a helper for setting golden registers.
953  */
954 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
955 					     const u32 *registers,
956 					     const u32 array_size)
957 {
958 	u32 tmp, reg, and_mask, or_mask;
959 	int i;
960 
961 	if (array_size % 3)
962 		return;
963 
964 	for (i = 0; i < array_size; i +=3) {
965 		reg = registers[i + 0];
966 		and_mask = registers[i + 1];
967 		or_mask = registers[i + 2];
968 
969 		if (and_mask == 0xffffffff) {
970 			tmp = or_mask;
971 		} else {
972 			tmp = RREG32(reg);
973 			tmp &= ~and_mask;
974 			if (adev->family >= AMDGPU_FAMILY_AI)
975 				tmp |= (or_mask & and_mask);
976 			else
977 				tmp |= or_mask;
978 		}
979 		WREG32(reg, tmp);
980 	}
981 }
982 
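/*
 * Illustrative golden register table (the register names are placeholders,
 * not taken from a real asic): entries are {offset, and_mask, or_mask}
 * triples, so array_size must be a multiple of three, and an and_mask of
 * 0xffffffff writes or_mask verbatim:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000001,
 *		mmEXAMPLE_REG_B, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */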
983 /**
984  * amdgpu_device_pci_config_reset - reset the GPU
985  *
986  * @adev: amdgpu_device pointer
987  *
988  * Resets the GPU using the pci config reset sequence.
989  * Only applicable to asics prior to vega10.
990  */
991 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
992 {
993 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
994 }
995 
996 /**
997  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
998  *
999  * @adev: amdgpu_device pointer
1000  *
1001  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1002  */
1003 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1004 {
1005 	STUB();
1006 	return -ENOSYS;
1007 #ifdef notyet
1008 	return pci_reset_function(adev->pdev);
1009 #endif
1010 }
1011 
1012 /*
1013  * GPU doorbell aperture helpers function.
1014  */
1015 /**
1016  * amdgpu_device_doorbell_init - Init doorbell driver information.
1017  *
1018  * @adev: amdgpu_device pointer
1019  *
1020  * Init doorbell driver information (CIK)
1021  * Returns 0 on success, error on failure.
1022  */
1023 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1024 {
1025 
1026 	/* No doorbell on SI hardware generation */
1027 	if (adev->asic_type < CHIP_BONAIRE) {
1028 		adev->doorbell.base = 0;
1029 		adev->doorbell.size = 0;
1030 		adev->doorbell.num_doorbells = 0;
1031 		adev->doorbell.ptr = NULL;
1032 		return 0;
1033 	}
1034 
1035 #ifdef __linux__
1036 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1037 		return -EINVAL;
1038 #endif
1039 
1040 	amdgpu_asic_init_doorbell_index(adev);
1041 
1042 	/* doorbell bar mapping */
1043 #ifdef __linux__
1044 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1045 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1046 #endif
1047 
1048 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
1049 					     adev->doorbell_index.max_assignment+1);
1050 	if (adev->doorbell.num_doorbells == 0)
1051 		return -EINVAL;
1052 
1053 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
1054 	 * paging queue doorbells use the second page. The
1055 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1056 	 * doorbells are in the first page. So with paging queue enabled,
1057 	 * the max num_doorbells should be increased by 1 page (0x400 in dwords)
1058 	 */
1059 	if (adev->asic_type >= CHIP_VEGA10)
1060 		adev->doorbell.num_doorbells += 0x400;
1061 
1062 #ifdef __linux__
1063 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
1064 				     adev->doorbell.num_doorbells *
1065 				     sizeof(u32));
1066 	if (adev->doorbell.ptr == NULL)
1067 		return -ENOMEM;
1068 #endif
1069 
1070 	return 0;
1071 }
1072 
1073 /**
1074  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1075  *
1076  * @adev: amdgpu_device pointer
1077  *
1078  * Tear down doorbell driver information (CIK)
1079  */
1080 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1081 {
1082 #ifdef __linux__
1083 	iounmap(adev->doorbell.ptr);
1084 #else
1085 	if (adev->doorbell.size > 0)
1086 		bus_space_unmap(adev->doorbell.bst, adev->doorbell.bsh,
1087 		    adev->doorbell.size);
1088 #endif
1089 	adev->doorbell.ptr = NULL;
1090 }
1091 
1092 
1093 
1094 /*
1095  * amdgpu_device_wb_*()
1096  * Writeback is the method by which the GPU updates special pages in memory
1097  * with the status of certain GPU events (fences, ring pointers, etc.).
1098  */
1099 
1100 /**
1101  * amdgpu_device_wb_fini - Disable Writeback and free memory
1102  *
1103  * @adev: amdgpu_device pointer
1104  *
1105  * Disables Writeback and frees the Writeback memory (all asics).
1106  * Used at driver shutdown.
1107  */
1108 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1109 {
1110 	if (adev->wb.wb_obj) {
1111 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1112 				      &adev->wb.gpu_addr,
1113 				      (void **)&adev->wb.wb);
1114 		adev->wb.wb_obj = NULL;
1115 	}
1116 }
1117 
1118 /**
1119  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1120  *
1121  * @adev: amdgpu_device pointer
1122  *
1123  * Initializes writeback and allocates writeback memory (all asics).
1124  * Used at driver startup.
1125  * Returns 0 on success or an -error on failure.
1126  */
1127 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1128 {
1129 	int r;
1130 
1131 	if (adev->wb.wb_obj == NULL) {
1132 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1133 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1134 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1135 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1136 					    (void **)&adev->wb.wb);
1137 		if (r) {
1138 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1139 			return r;
1140 		}
1141 
1142 		adev->wb.num_wb = AMDGPU_MAX_WB;
1143 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1144 
1145 		/* clear wb memory */
1146 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1147 	}
1148 
1149 	return 0;
1150 }
1151 
1152 /**
1153  * amdgpu_device_wb_get - Allocate a wb entry
1154  *
1155  * @adev: amdgpu_device pointer
1156  * @wb: wb index
1157  *
1158  * Allocate a wb slot for use by the driver (all asics).
1159  * Returns 0 on success or -EINVAL on failure.
1160  */
1161 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1162 {
1163 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1164 
1165 	if (offset < adev->wb.num_wb) {
1166 		__set_bit(offset, adev->wb.used);
1167 		*wb = offset << 3; /* convert to dw offset */
1168 		return 0;
1169 	} else {
1170 		return -EINVAL;
1171 	}
1172 }
1173 
1174 /**
1175  * amdgpu_device_wb_free - Free a wb entry
1176  *
1177  * @adev: amdgpu_device pointer
1178  * @wb: wb index
1179  *
1180  * Free a wb slot allocated for use by the driver (all asics)
1181  */
1182 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1183 {
1184 	wb >>= 3;
1185 	if (wb < adev->wb.num_wb)
1186 		__clear_bit(wb, adev->wb.used);
1187 }
1188 
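/*
 * Typical writeback usage (a minimal sketch, not lifted from a particular
 * ring implementation): allocate a slot, let the GPU write a status dword
 * into adev->wb.wb[wb], then release the slot again:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u32 status = adev->wb.wb[wb];
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */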
1189 /**
1190  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1191  *
1192  * @adev: amdgpu_device pointer
1193  *
1194  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1195  * to fail, but if any of the BARs is not accessible after the resize we abort
1196  * driver loading by returning -ENODEV.
1197  */
1198 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1199 {
1200 #ifdef __linux__
1201 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1202 	struct pci_bus *root;
1203 	struct resource *res;
1204 	unsigned i;
1205 	u16 cmd;
1206 	int r;
1207 
1208 	/* Bypass for VF */
1209 	if (amdgpu_sriov_vf(adev))
1210 		return 0;
1211 
1212 	/* skip if the bios has already enabled large BAR */
1213 	if (adev->gmc.real_vram_size &&
1214 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1215 		return 0;
1216 
1217 	/* Check if the root BUS has 64bit memory resources */
1218 	root = adev->pdev->bus;
1219 	while (root->parent)
1220 		root = root->parent;
1221 
1222 	pci_bus_for_each_resource(root, res, i) {
1223 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1224 		    res->start > 0x100000000ull)
1225 			break;
1226 	}
1227 
1228 	/* Trying to resize is pointless without a root hub window above 4GB */
1229 	if (!res)
1230 		return 0;
1231 
1232 	/* Limit the BAR size to what is available */
1233 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1234 			rbar_size);
1235 
1236 	/* Disable memory decoding while we change the BAR addresses and size */
1237 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1238 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1239 			      cmd & ~PCI_COMMAND_MEMORY);
1240 
1241 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1242 	amdgpu_device_doorbell_fini(adev);
1243 	if (adev->asic_type >= CHIP_BONAIRE)
1244 		pci_release_resource(adev->pdev, 2);
1245 
1246 	pci_release_resource(adev->pdev, 0);
1247 
1248 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1249 	if (r == -ENOSPC)
1250 		DRM_INFO("Not enough PCI address space for a large BAR.");
1251 	else if (r && r != -ENOTSUPP)
1252 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1253 
1254 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1255 
1256 	/* When the doorbell or fb BAR isn't available we have no chance of
1257 	 * using the device.
1258 	 */
1259 	r = amdgpu_device_doorbell_init(adev);
1260 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1261 		return -ENODEV;
1262 
1263 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1264 #endif /* __linux__ */
1265 
1266 	return 0;
1267 }
1268 
1269 /*
1270  * GPU helpers function.
1271  */
1272 /**
1273  * amdgpu_device_need_post - check if the hw need post or not
1274  *
1275  * @adev: amdgpu_device pointer
1276  *
1277  * Check if the asic has been initialized (all asics) at driver startup,
1278  * or if posting is needed because a hw reset was performed.
1279  * Returns true if post is needed or false if not.
1280  */
1281 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1282 {
1283 	uint32_t reg;
1284 
1285 	if (amdgpu_sriov_vf(adev))
1286 		return false;
1287 
1288 	if (amdgpu_passthrough(adev)) {
1289 		/* for FIJI: in the whole GPU pass-through virtualization case, after a VM
1290 		 * reboot some old smc fw still needs the driver to do a vPost, otherwise the
1291 		 * gpu hangs. smc fw versions above 22.15 don't have this flaw, so we force
1292 		 * vPost to be executed for smc versions below 22.15
1293 		 */
1294 		if (adev->asic_type == CHIP_FIJI) {
1295 			int err;
1296 			uint32_t fw_ver;
1297 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1298 			/* force vPost if error occurred */
1299 			if (err)
1300 				return true;
1301 
1302 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1303 			if (fw_ver < 0x00160e00)
1304 				return true;
1305 		}
1306 	}
1307 
1308 	/* Don't post if we need to reset whole hive on init */
1309 	if (adev->gmc.xgmi.pending_reset)
1310 		return false;
1311 
1312 	if (adev->has_hw_reset) {
1313 		adev->has_hw_reset = false;
1314 		return true;
1315 	}
1316 
1317 	/* bios scratch used on CIK+ */
1318 	if (adev->asic_type >= CHIP_BONAIRE)
1319 		return amdgpu_atombios_scratch_need_asic_init(adev);
1320 
1321 	/* check MEM_SIZE for older asics */
1322 	reg = amdgpu_asic_get_config_memsize(adev);
1323 
1324 	if ((reg != 0) && (reg != 0xffffffff))
1325 		return false;
1326 
1327 	return true;
1328 }
1329 
1330 /* if we get transitioned to only one device, take VGA back */
1331 /**
1332  * amdgpu_device_vga_set_decode - enable/disable vga decode
1333  *
1334  * @pdev: PCI device pointer
1335  * @state: enable/disable vga decode
1336  *
1337  * Enable/disable vga decode (all asics).
1338  * Returns VGA resource flags.
1339  */
1340 #ifdef notyet
1341 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1342 		bool state)
1343 {
1344 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1345 	amdgpu_asic_set_vga_state(adev, state);
1346 	if (state)
1347 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1348 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1349 	else
1350 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1351 }
1352 #endif
1353 
1354 /**
1355  * amdgpu_device_check_block_size - validate the vm block size
1356  *
1357  * @adev: amdgpu_device pointer
1358  *
1359  * Validates the vm block size specified via module parameter.
1360  * The vm block size defines number of bits in page table versus page directory,
1361  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1362  * page table and the remaining bits are in the page directory.
1363  */
1364 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1365 {
1366 	/* defines number of bits in page table versus page directory,
1367 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1368 	 * page table and the remaining bits are in the page directory */
1369 	if (amdgpu_vm_block_size == -1)
1370 		return;
1371 
1372 	if (amdgpu_vm_block_size < 9) {
1373 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1374 			 amdgpu_vm_block_size);
1375 		amdgpu_vm_block_size = -1;
1376 	}
1377 }
1378 
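/*
 * Worked example of the block size math above (informational only): with the
 * minimum amdgpu_vm_block_size of 9, a single page table maps
 * 2^(9 + 12) bytes = 2 MiB of GPU virtual address space, and the remaining
 * address bits are resolved through the page directory levels.
 */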
1379 /**
1380  * amdgpu_device_check_vm_size - validate the vm size
1381  *
1382  * @adev: amdgpu_device pointer
1383  *
1384  * Validates the vm size in GB specified via module parameter.
1385  * The VM size is the size of the GPU virtual memory space in GB.
1386  */
1387 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1388 {
1389 	/* no need to check the default value */
1390 	if (amdgpu_vm_size == -1)
1391 		return;
1392 
1393 	if (amdgpu_vm_size < 1) {
1394 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1395 			 amdgpu_vm_size);
1396 		amdgpu_vm_size = -1;
1397 	}
1398 }
1399 
1400 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1401 {
1402 #ifdef __linux__
1403 	struct sysinfo si;
1404 #endif
1405 	bool is_os_64 = (sizeof(void *) == 8);
1406 	uint64_t total_memory;
1407 	uint64_t dram_size_seven_GB = 0x1B8000000;
1408 	uint64_t dram_size_three_GB = 0xB8000000;
1409 
1410 	if (amdgpu_smu_memory_pool_size == 0)
1411 		return;
1412 
1413 	if (!is_os_64) {
1414 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1415 		goto def_value;
1416 	}
1417 #ifdef __linux__
1418 	si_meminfo(&si);
1419 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1420 #else
1421 	total_memory = ptoa(physmem);
1422 #endif
1423 
1424 	if ((amdgpu_smu_memory_pool_size == 1) ||
1425 		(amdgpu_smu_memory_pool_size == 2)) {
1426 		if (total_memory < dram_size_three_GB)
1427 			goto def_value1;
1428 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1429 		(amdgpu_smu_memory_pool_size == 8)) {
1430 		if (total_memory < dram_size_seven_GB)
1431 			goto def_value1;
1432 	} else {
1433 		DRM_WARN("Smu memory pool size not supported\n");
1434 		goto def_value;
1435 	}
1436 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1437 
1438 	return;
1439 
1440 def_value1:
1441 	DRM_WARN("Not enough system memory\n");
1442 def_value:
1443 	adev->pm.smu_prv_buffer_size = 0;
1444 }
1445 
1446 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1447 {
1448 	if (!(adev->flags & AMD_IS_APU) ||
1449 	    adev->asic_type < CHIP_RAVEN)
1450 		return 0;
1451 
1452 	switch (adev->asic_type) {
1453 	case CHIP_RAVEN:
1454 		if (adev->pdev->device == 0x15dd)
1455 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1456 		if (adev->pdev->device == 0x15d8)
1457 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1458 		break;
1459 	case CHIP_RENOIR:
1460 		if ((adev->pdev->device == 0x1636) ||
1461 		    (adev->pdev->device == 0x164c))
1462 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1463 		else
1464 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1465 		break;
1466 	case CHIP_VANGOGH:
1467 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1468 		break;
1469 	case CHIP_YELLOW_CARP:
1470 		break;
1471 	case CHIP_CYAN_SKILLFISH:
1472 		if (adev->pdev->device == 0x13FE)
1473 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1474 		break;
1475 	default:
1476 		return -EINVAL;
1477 	}
1478 
1479 	return 0;
1480 }
1481 
1482 /**
1483  * amdgpu_device_check_arguments - validate module params
1484  *
1485  * @adev: amdgpu_device pointer
1486  *
1487  * Validates certain module parameters and updates
1488  * the associated values used by the driver (all asics).
1489  */
1490 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1491 {
1492 	if (amdgpu_sched_jobs < 4) {
1493 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1494 			 amdgpu_sched_jobs);
1495 		amdgpu_sched_jobs = 4;
1496 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1497 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1498 			 amdgpu_sched_jobs);
1499 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1500 	}
1501 
1502 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1503 		/* gart size must be greater or equal to 32M */
1504 		dev_warn(adev->dev, "gart size (%d) too small\n",
1505 			 amdgpu_gart_size);
1506 		amdgpu_gart_size = -1;
1507 	}
1508 
1509 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1510 		/* gtt size must be greater or equal to 32M */
1511 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1512 				 amdgpu_gtt_size);
1513 		amdgpu_gtt_size = -1;
1514 	}
1515 
1516 	/* valid range is between 4 and 9 inclusive */
1517 	if (amdgpu_vm_fragment_size != -1 &&
1518 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1519 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1520 		amdgpu_vm_fragment_size = -1;
1521 	}
1522 
1523 	if (amdgpu_sched_hw_submission < 2) {
1524 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1525 			 amdgpu_sched_hw_submission);
1526 		amdgpu_sched_hw_submission = 2;
1527 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1528 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1529 			 amdgpu_sched_hw_submission);
1530 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1531 	}
1532 
1533 	amdgpu_device_check_smu_prv_buffer_size(adev);
1534 
1535 	amdgpu_device_check_vm_size(adev);
1536 
1537 	amdgpu_device_check_block_size(adev);
1538 
1539 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1540 
1541 	amdgpu_gmc_tmz_set(adev);
1542 
1543 	amdgpu_gmc_noretry_set(adev);
1544 
1545 	return 0;
1546 }
1547 
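/*
 * Example of the clamping above (informational only): booting with
 * amdgpu.sched_jobs=6 results in amdgpu_sched_jobs being rounded up to 8,
 * since the scheduler job count must be a power of two and at least 4.
 */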
1548 #ifdef __linux__
1549 /**
1550  * amdgpu_switcheroo_set_state - set switcheroo state
1551  *
1552  * @pdev: pci dev pointer
1553  * @state: vga_switcheroo state
1554  *
1555  * Callback for the switcheroo driver.  Suspends or resumes
1556  * the asic before or after it is powered up using ACPI methods.
1557  */
1558 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1559 					enum vga_switcheroo_state state)
1560 {
1561 	struct drm_device *dev = pci_get_drvdata(pdev);
1562 	int r;
1563 
1564 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1565 		return;
1566 
1567 	if (state == VGA_SWITCHEROO_ON) {
1568 		pr_info("switched on\n");
1569 		/* don't suspend or resume card normally */
1570 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1571 
1572 		pci_set_power_state(pdev, PCI_D0);
1573 		amdgpu_device_load_pci_state(pdev);
1574 		r = pci_enable_device(pdev);
1575 		if (r)
1576 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1577 		amdgpu_device_resume(dev, true);
1578 
1579 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1580 	} else {
1581 		pr_info("switched off\n");
1582 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1583 		amdgpu_device_suspend(dev, true);
1584 		amdgpu_device_cache_pci_state(pdev);
1585 		/* Shut down the device */
1586 		pci_disable_device(pdev);
1587 		pci_set_power_state(pdev, PCI_D3cold);
1588 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1589 	}
1590 }
1591 
1592 /**
1593  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1594  *
1595  * @pdev: pci dev pointer
1596  *
1597  * Callback for the switcheroo driver.  Check if the switcheroo
1598  * state can be changed.
1599  * Returns true if the state can be changed, false if not.
1600  */
1601 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1602 {
1603 	struct drm_device *dev = pci_get_drvdata(pdev);
1604 
1605 	/*
1606 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1607 	* locking inversion with the driver load path. And the access here is
1608 	* completely racy anyway. So don't bother with locking for now.
1609 	*/
1610 	return atomic_read(&dev->open_count) == 0;
1611 }
1612 #endif /* __linux__ */
1613 
1614 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1615 #ifdef notyet
1616 	.set_gpu_state = amdgpu_switcheroo_set_state,
1617 	.reprobe = NULL,
1618 	.can_switch = amdgpu_switcheroo_can_switch,
1619 #endif
1620 };
1621 
1622 /**
1623  * amdgpu_device_ip_set_clockgating_state - set the CG state
1624  *
1625  * @dev: amdgpu_device pointer
1626  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1627  * @state: clockgating state (gate or ungate)
1628  *
1629  * Sets the requested clockgating state for all instances of
1630  * the hardware IP specified.
1631  * Returns the error code from the last instance.
1632  */
1633 int amdgpu_device_ip_set_clockgating_state(void *dev,
1634 					   enum amd_ip_block_type block_type,
1635 					   enum amd_clockgating_state state)
1636 {
1637 	struct amdgpu_device *adev = dev;
1638 	int i, r = 0;
1639 
1640 	for (i = 0; i < adev->num_ip_blocks; i++) {
1641 		if (!adev->ip_blocks[i].status.valid)
1642 			continue;
1643 		if (adev->ip_blocks[i].version->type != block_type)
1644 			continue;
1645 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1646 			continue;
1647 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1648 			(void *)adev, state);
1649 		if (r)
1650 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1651 				  adev->ip_blocks[i].version->funcs->name, r);
1652 	}
1653 	return r;
1654 }
1655 
1656 /**
1657  * amdgpu_device_ip_set_powergating_state - set the PG state
1658  *
1659  * @dev: amdgpu_device pointer
1660  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1661  * @state: powergating state (gate or ungate)
1662  *
1663  * Sets the requested powergating state for all instances of
1664  * the hardware IP specified.
1665  * Returns the error code from the last instance.
1666  */
1667 int amdgpu_device_ip_set_powergating_state(void *dev,
1668 					   enum amd_ip_block_type block_type,
1669 					   enum amd_powergating_state state)
1670 {
1671 	struct amdgpu_device *adev = dev;
1672 	int i, r = 0;
1673 
1674 	for (i = 0; i < adev->num_ip_blocks; i++) {
1675 		if (!adev->ip_blocks[i].status.valid)
1676 			continue;
1677 		if (adev->ip_blocks[i].version->type != block_type)
1678 			continue;
1679 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1680 			continue;
1681 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1682 			(void *)adev, state);
1683 		if (r)
1684 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1685 				  adev->ip_blocks[i].version->funcs->name, r);
1686 	}
1687 	return r;
1688 }
1689 
1690 /**
1691  * amdgpu_device_ip_get_clockgating_state - get the CG state
1692  *
1693  * @adev: amdgpu_device pointer
1694  * @flags: clockgating feature flags
1695  *
1696  * Walks the list of IPs on the device and updates the clockgating
1697  * flags for each IP.
1698  * Updates @flags with the feature flags for each hardware IP where
1699  * clockgating is enabled.
1700  */
1701 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1702 					    u32 *flags)
1703 {
1704 	int i;
1705 
1706 	for (i = 0; i < adev->num_ip_blocks; i++) {
1707 		if (!adev->ip_blocks[i].status.valid)
1708 			continue;
1709 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1710 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1711 	}
1712 }
1713 
1714 /**
1715  * amdgpu_device_ip_wait_for_idle - wait for idle
1716  *
1717  * @adev: amdgpu_device pointer
1718  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1719  *
1720  * Waits for the requested hardware IP to be idle.
1721  * Returns 0 for success or a negative error code on failure.
1722  */
1723 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1724 				   enum amd_ip_block_type block_type)
1725 {
1726 	int i, r;
1727 
1728 	for (i = 0; i < adev->num_ip_blocks; i++) {
1729 		if (!adev->ip_blocks[i].status.valid)
1730 			continue;
1731 		if (adev->ip_blocks[i].version->type == block_type) {
1732 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1733 			if (r)
1734 				return r;
1735 			break;
1736 		}
1737 	}
1738 	return 0;
1739 
1740 }
1741 
1742 /**
1743  * amdgpu_device_ip_is_idle - is the hardware IP idle
1744  *
1745  * @adev: amdgpu_device pointer
1746  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1747  *
1748  * Check if the hardware IP is idle or not.
1749  * Returns true if the IP is idle, false if not.
1750  */
1751 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1752 			      enum amd_ip_block_type block_type)
1753 {
1754 	int i;
1755 
1756 	for (i = 0; i < adev->num_ip_blocks; i++) {
1757 		if (!adev->ip_blocks[i].status.valid)
1758 			continue;
1759 		if (adev->ip_blocks[i].version->type == block_type)
1760 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1761 	}
1762 	return true;
1763 
1764 }
1765 
1766 /**
1767  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1768  *
1769  * @adev: amdgpu_device pointer
1770  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1771  *
1772  * Returns a pointer to the hardware IP block structure
1773  * if it exists for the asic, otherwise NULL.
1774  */
1775 struct amdgpu_ip_block *
1776 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1777 			      enum amd_ip_block_type type)
1778 {
1779 	int i;
1780 
1781 	for (i = 0; i < adev->num_ip_blocks; i++)
1782 		if (adev->ip_blocks[i].version->type == type)
1783 			return &adev->ip_blocks[i];
1784 
1785 	return NULL;
1786 }
1787 
1788 /**
1789  * amdgpu_device_ip_block_version_cmp
1790  *
1791  * @adev: amdgpu_device pointer
1792  * @type: enum amd_ip_block_type
1793  * @major: major version
1794  * @minor: minor version
1795  *
1796  * return 0 if equal or greater
1797  * return 1 if smaller or the ip_block doesn't exist
1798  */
1799 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1800 				       enum amd_ip_block_type type,
1801 				       u32 major, u32 minor)
1802 {
1803 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1804 
1805 	if (ip_block && ((ip_block->version->major > major) ||
1806 			((ip_block->version->major == major) &&
1807 			(ip_block->version->minor >= minor))))
1808 		return 0;
1809 
1810 	return 1;
1811 }
1812 
1813 /**
1814  * amdgpu_device_ip_block_add
1815  *
1816  * @adev: amdgpu_device pointer
1817  * @ip_block_version: pointer to the IP to add
1818  *
1819  * Adds the IP block driver information to the collection of IPs
1820  * on the asic.
1821  */
1822 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1823 			       const struct amdgpu_ip_block_version *ip_block_version)
1824 {
1825 	if (!ip_block_version)
1826 		return -EINVAL;
1827 
1828 	switch (ip_block_version->type) {
1829 	case AMD_IP_BLOCK_TYPE_VCN:
1830 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1831 			return 0;
1832 		break;
1833 	case AMD_IP_BLOCK_TYPE_JPEG:
1834 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1835 			return 0;
1836 		break;
1837 	default:
1838 		break;
1839 	}
1840 
1841 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1842 		  ip_block_version->funcs->name);
1843 
1844 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1845 
1846 	return 0;
1847 }
1848 
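/*
 * Illustrative sketch (not part of the driver build): SoC-specific code
 * (e.g. the *_set_ip_blocks() functions called from early init below)
 * registers its IP blocks in initialization order with the helper above.
 * example_ip_block is a placeholder for a real amdgpu_ip_block_version.
 */
#if 0
static int example_register_blocks(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_block_add(adev, &example_ip_block);
	if (r)
		return r;
	/* ... more blocks, in the order they must be initialized ... */
	return 0;
}
#endif
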
1849 /**
1850  * amdgpu_device_enable_virtual_display - enable virtual display feature
1851  *
1852  * @adev: amdgpu_device pointer
1853  *
1854  * Enables the virtual display feature if the user has enabled it via
1855  * the module parameter virtual_display.  This feature provides virtual
1856  * display hardware on headless boards or in virtualized environments.
1857  * This function parses and validates the configuration string specified by
1858  * the user and configures the virtual display settings (number of
1859  * virtual connectors, crtcs, etc.) specified.
1860  */
1861 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1862 {
1863 	adev->enable_virtual_display = false;
1864 
1865 #ifdef notyet
1866 	if (amdgpu_virtual_display) {
1867 		const char *pci_address_name = pci_name(adev->pdev);
1868 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1869 
1870 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1871 		pciaddstr_tmp = pciaddstr;
1872 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1873 			pciaddname = strsep(&pciaddname_tmp, ",");
1874 			if (!strcmp("all", pciaddname)
1875 			    || !strcmp(pci_address_name, pciaddname)) {
1876 				long num_crtc;
1877 				int res = -1;
1878 
1879 				adev->enable_virtual_display = true;
1880 
1881 				if (pciaddname_tmp)
1882 					res = kstrtol(pciaddname_tmp, 10,
1883 						      &num_crtc);
1884 
1885 				if (!res) {
1886 					if (num_crtc < 1)
1887 						num_crtc = 1;
1888 					if (num_crtc > 6)
1889 						num_crtc = 6;
1890 					adev->mode_info.num_crtc = num_crtc;
1891 				} else {
1892 					adev->mode_info.num_crtc = 1;
1893 				}
1894 				break;
1895 			}
1896 		}
1897 
1898 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1899 			 amdgpu_virtual_display, pci_address_name,
1900 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1901 
1902 		kfree(pciaddstr);
1903 	}
1904 #endif
1905 }
1906 
1907 /**
1908  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1909  *
1910  * @adev: amdgpu_device pointer
1911  *
1912  * Parses the asic configuration parameters specified in the gpu info
1913  * firmware and makes them available to the driver for use in configuring
1914  * the asic.
1915  * Returns 0 on success, -EINVAL on failure.
1916  */
1917 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1918 {
1919 	const char *chip_name;
1920 	char fw_name[40];
1921 	int err;
1922 	const struct gpu_info_firmware_header_v1_0 *hdr;
1923 
1924 	adev->firmware.gpu_info_fw = NULL;
1925 
1926 	if (adev->mman.discovery_bin) {
1927 		amdgpu_discovery_get_gfx_info(adev);
1928 
1929 		/*
1930 		 * FIXME: The bounding box is still needed by Navi12, so
1931 		 * temporarily read it from gpu_info firmware. Should be dropped
1932 		 * when DAL no longer needs it.
1933 		 */
1934 		if (adev->asic_type != CHIP_NAVI12)
1935 			return 0;
1936 	}
1937 
1938 	switch (adev->asic_type) {
1939 #ifdef CONFIG_DRM_AMDGPU_SI
1940 	case CHIP_VERDE:
1941 	case CHIP_TAHITI:
1942 	case CHIP_PITCAIRN:
1943 	case CHIP_OLAND:
1944 	case CHIP_HAINAN:
1945 #endif
1946 #ifdef CONFIG_DRM_AMDGPU_CIK
1947 	case CHIP_BONAIRE:
1948 	case CHIP_HAWAII:
1949 	case CHIP_KAVERI:
1950 	case CHIP_KABINI:
1951 	case CHIP_MULLINS:
1952 #endif
1953 	case CHIP_TOPAZ:
1954 	case CHIP_TONGA:
1955 	case CHIP_FIJI:
1956 	case CHIP_POLARIS10:
1957 	case CHIP_POLARIS11:
1958 	case CHIP_POLARIS12:
1959 	case CHIP_VEGAM:
1960 	case CHIP_CARRIZO:
1961 	case CHIP_STONEY:
1962 	case CHIP_VEGA20:
1963 	case CHIP_ALDEBARAN:
1964 	case CHIP_SIENNA_CICHLID:
1965 	case CHIP_NAVY_FLOUNDER:
1966 	case CHIP_DIMGREY_CAVEFISH:
1967 	case CHIP_BEIGE_GOBY:
1968 	default:
1969 		return 0;
1970 	case CHIP_VEGA10:
1971 		chip_name = "vega10";
1972 		break;
1973 	case CHIP_VEGA12:
1974 		chip_name = "vega12";
1975 		break;
1976 	case CHIP_RAVEN:
1977 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1978 			chip_name = "raven2";
1979 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1980 			chip_name = "picasso";
1981 		else
1982 			chip_name = "raven";
1983 		break;
1984 	case CHIP_ARCTURUS:
1985 		chip_name = "arcturus";
1986 		break;
1987 	case CHIP_RENOIR:
1988 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1989 			chip_name = "renoir";
1990 		else
1991 			chip_name = "green_sardine";
1992 		break;
1993 	case CHIP_NAVI10:
1994 		chip_name = "navi10";
1995 		break;
1996 	case CHIP_NAVI14:
1997 		chip_name = "navi14";
1998 		break;
1999 	case CHIP_NAVI12:
2000 		chip_name = "navi12";
2001 		break;
2002 	case CHIP_VANGOGH:
2003 		chip_name = "vangogh";
2004 		break;
2005 	case CHIP_YELLOW_CARP:
2006 		chip_name = "yellow_carp";
2007 		break;
2008 	}
2009 
2010 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
2011 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
2012 	if (err) {
2013 		dev_err(adev->dev,
2014 			"Failed to load gpu_info firmware \"%s\"\n",
2015 			fw_name);
2016 		goto out;
2017 	}
2018 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
2019 	if (err) {
2020 		dev_err(adev->dev,
2021 			"Failed to validate gpu_info firmware \"%s\"\n",
2022 			fw_name);
2023 		goto out;
2024 	}
2025 
2026 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2027 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2028 
2029 	switch (hdr->version_major) {
2030 	case 1:
2031 	{
2032 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2033 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2034 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2035 
2036 		/*
2037 		 * Should be dropped when DAL no longer needs it.
2038 		 */
2039 		if (adev->asic_type == CHIP_NAVI12)
2040 			goto parse_soc_bounding_box;
2041 
2042 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2043 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2044 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2045 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2046 		adev->gfx.config.max_texture_channel_caches =
2047 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
2048 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2049 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2050 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2051 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2052 		adev->gfx.config.double_offchip_lds_buf =
2053 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2054 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2055 		adev->gfx.cu_info.max_waves_per_simd =
2056 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2057 		adev->gfx.cu_info.max_scratch_slots_per_cu =
2058 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2059 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2060 		if (hdr->version_minor >= 1) {
2061 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2062 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2063 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2064 			adev->gfx.config.num_sc_per_sh =
2065 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2066 			adev->gfx.config.num_packer_per_sc =
2067 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2068 		}
2069 
2070 parse_soc_bounding_box:
2071 		/*
2072 		 * soc bounding box info is not integrated in the discovery table,
2073 		 * so it always needs to be parsed from the gpu_info firmware when needed.
2074 		 */
2075 		if (hdr->version_minor == 2) {
2076 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2077 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2078 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2079 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2080 		}
2081 		break;
2082 	}
2083 	default:
2084 		dev_err(adev->dev,
2085 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2086 		err = -EINVAL;
2087 		goto out;
2088 	}
2089 out:
2090 	return err;
2091 }
2092 
2093 /**
2094  * amdgpu_device_ip_early_init - run early init for hardware IPs
2095  *
2096  * @adev: amdgpu_device pointer
2097  *
2098  * Early initialization pass for hardware IPs.  The hardware IPs that make
2099  * up each asic are discovered and each IP's early_init callback is run.  This
2100  * is the first stage in initializing the asic.
2101  * Returns 0 on success, negative error code on failure.
2102  */
2103 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2104 {
2105 	struct drm_device *dev = adev_to_drm(adev);
2106 	struct pci_dev *parent;
2107 	int i, r;
2108 
2109 	amdgpu_device_enable_virtual_display(adev);
2110 
2111 	if (amdgpu_sriov_vf(adev)) {
2112 		r = amdgpu_virt_request_full_gpu(adev, true);
2113 		if (r)
2114 			return r;
2115 	}
2116 
2117 	switch (adev->asic_type) {
2118 #ifdef CONFIG_DRM_AMDGPU_SI
2119 	case CHIP_VERDE:
2120 	case CHIP_TAHITI:
2121 	case CHIP_PITCAIRN:
2122 	case CHIP_OLAND:
2123 	case CHIP_HAINAN:
2124 		adev->family = AMDGPU_FAMILY_SI;
2125 		r = si_set_ip_blocks(adev);
2126 		if (r)
2127 			return r;
2128 		break;
2129 #endif
2130 #ifdef CONFIG_DRM_AMDGPU_CIK
2131 	case CHIP_BONAIRE:
2132 	case CHIP_HAWAII:
2133 	case CHIP_KAVERI:
2134 	case CHIP_KABINI:
2135 	case CHIP_MULLINS:
2136 		if (adev->flags & AMD_IS_APU)
2137 			adev->family = AMDGPU_FAMILY_KV;
2138 		else
2139 			adev->family = AMDGPU_FAMILY_CI;
2140 
2141 		r = cik_set_ip_blocks(adev);
2142 		if (r)
2143 			return r;
2144 		break;
2145 #endif
2146 	case CHIP_TOPAZ:
2147 	case CHIP_TONGA:
2148 	case CHIP_FIJI:
2149 	case CHIP_POLARIS10:
2150 	case CHIP_POLARIS11:
2151 	case CHIP_POLARIS12:
2152 	case CHIP_VEGAM:
2153 	case CHIP_CARRIZO:
2154 	case CHIP_STONEY:
2155 		if (adev->flags & AMD_IS_APU)
2156 			adev->family = AMDGPU_FAMILY_CZ;
2157 		else
2158 			adev->family = AMDGPU_FAMILY_VI;
2159 
2160 		r = vi_set_ip_blocks(adev);
2161 		if (r)
2162 			return r;
2163 		break;
2164 	case CHIP_VEGA10:
2165 	case CHIP_VEGA12:
2166 	case CHIP_VEGA20:
2167 	case CHIP_RAVEN:
2168 	case CHIP_ARCTURUS:
2169 	case CHIP_RENOIR:
2170 	case CHIP_ALDEBARAN:
2171 		if (adev->flags & AMD_IS_APU)
2172 			adev->family = AMDGPU_FAMILY_RV;
2173 		else
2174 			adev->family = AMDGPU_FAMILY_AI;
2175 
2176 		r = soc15_set_ip_blocks(adev);
2177 		if (r)
2178 			return r;
2179 		break;
2180 	case  CHIP_NAVI10:
2181 	case  CHIP_NAVI14:
2182 	case  CHIP_NAVI12:
2183 	case  CHIP_SIENNA_CICHLID:
2184 	case  CHIP_NAVY_FLOUNDER:
2185 	case  CHIP_DIMGREY_CAVEFISH:
2186 	case  CHIP_BEIGE_GOBY:
2187 	case CHIP_VANGOGH:
2188 	case CHIP_YELLOW_CARP:
2189 	case CHIP_CYAN_SKILLFISH:
2190 		if (adev->asic_type == CHIP_VANGOGH)
2191 			adev->family = AMDGPU_FAMILY_VGH;
2192 		else if (adev->asic_type == CHIP_YELLOW_CARP)
2193 			adev->family = AMDGPU_FAMILY_YC;
2194 		else
2195 			adev->family = AMDGPU_FAMILY_NV;
2196 
2197 		r = nv_set_ip_blocks(adev);
2198 		if (r)
2199 			return r;
2200 		break;
2201 	default:
2202 		/* FIXME: not supported yet */
2203 		return -EINVAL;
2204 	}
2205 
2206 	if (amdgpu_has_atpx() &&
2207 	    (amdgpu_is_atpx_hybrid() ||
2208 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2209 	    ((adev->flags & AMD_IS_APU) == 0) &&
2210 	    !pci_is_thunderbolt_attached(dev->pdev))
2211 		adev->flags |= AMD_IS_PX;
2212 
2213 	if (!(adev->flags & AMD_IS_APU)) {
2214 		parent = pci_upstream_bridge(adev->pdev);
2215 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2216 	}
2217 
2218 	amdgpu_amdkfd_device_probe(adev);
2219 
2220 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2221 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2222 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2223 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2224 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2225 
2226 	for (i = 0; i < adev->num_ip_blocks; i++) {
2227 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2228 			DRM_ERROR("disabled ip block: %d <%s>\n",
2229 				  i, adev->ip_blocks[i].version->funcs->name);
2230 			adev->ip_blocks[i].status.valid = false;
2231 		} else {
2232 			if (adev->ip_blocks[i].version->funcs->early_init) {
2233 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2234 				if (r == -ENOENT) {
2235 					adev->ip_blocks[i].status.valid = false;
2236 				} else if (r) {
2237 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2238 						  adev->ip_blocks[i].version->funcs->name, r);
2239 					return r;
2240 				} else {
2241 					adev->ip_blocks[i].status.valid = true;
2242 				}
2243 			} else {
2244 				adev->ip_blocks[i].status.valid = true;
2245 			}
2246 		}
2247 		/* get the vbios after the asic_funcs are set up */
2248 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2249 			r = amdgpu_device_parse_gpu_info_fw(adev);
2250 			if (r)
2251 				return r;
2252 
2253 			/* Read BIOS */
2254 			if (!amdgpu_get_bios(adev))
2255 				return -EINVAL;
2256 
2257 			r = amdgpu_atombios_init(adev);
2258 			if (r) {
2259 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2260 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2261 				return r;
2262 			}
2263 
2264 			/* get pf2vf msg info at its earliest time */
2265 			if (amdgpu_sriov_vf(adev))
2266 				amdgpu_virt_init_data_exchange(adev);
2267 
2268 		}
2269 	}
2270 
2271 	adev->cg_flags &= amdgpu_cg_mask;
2272 	adev->pg_flags &= amdgpu_pg_mask;
2273 
2274 	return 0;
2275 }
2276 
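/*
 * Hardware init is split in two phases: phase 1 brings up only the blocks
 * that must be running before firmware can be loaded (COMMON, IH, and PSP
 * when running as an SR-IOV VF), phase 2 then initializes everything else.
 * amdgpu_device_ip_init() below runs phase 1, loads firmware, then phase 2;
 * the GMC block is brought up even earlier there so GPU memory can be
 * allocated.
 */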
2277 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2278 {
2279 	int i, r;
2280 
2281 	for (i = 0; i < adev->num_ip_blocks; i++) {
2282 		if (!adev->ip_blocks[i].status.sw)
2283 			continue;
2284 		if (adev->ip_blocks[i].status.hw)
2285 			continue;
2286 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2287 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2288 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2289 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2290 			if (r) {
2291 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2292 					  adev->ip_blocks[i].version->funcs->name, r);
2293 				return r;
2294 			}
2295 			adev->ip_blocks[i].status.hw = true;
2296 		}
2297 	}
2298 
2299 	return 0;
2300 }
2301 
2302 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2303 {
2304 	int i, r;
2305 
2306 	for (i = 0; i < adev->num_ip_blocks; i++) {
2307 		if (!adev->ip_blocks[i].status.sw)
2308 			continue;
2309 		if (adev->ip_blocks[i].status.hw)
2310 			continue;
2311 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2312 		if (r) {
2313 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2314 				  adev->ip_blocks[i].version->funcs->name, r);
2315 			return r;
2316 		}
2317 		adev->ip_blocks[i].status.hw = true;
2318 	}
2319 
2320 	return 0;
2321 }
2322 
2323 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2324 {
2325 	int r = 0;
2326 	int i;
2327 	uint32_t smu_version;
2328 
2329 	if (adev->asic_type >= CHIP_VEGA10) {
2330 		for (i = 0; i < adev->num_ip_blocks; i++) {
2331 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2332 				continue;
2333 
2334 			if (!adev->ip_blocks[i].status.sw)
2335 				continue;
2336 
2337 			/* no need to do the fw loading again if already done */
2338 			if (adev->ip_blocks[i].status.hw)
2339 				break;
2340 
2341 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2342 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2343 				if (r) {
2344 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2345 							  adev->ip_blocks[i].version->funcs->name, r);
2346 					return r;
2347 				}
2348 			} else {
2349 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2350 				if (r) {
2351 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2352 							  adev->ip_blocks[i].version->funcs->name, r);
2353 					return r;
2354 				}
2355 			}
2356 
2357 			adev->ip_blocks[i].status.hw = true;
2358 			break;
2359 		}
2360 	}
2361 
2362 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2363 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2364 
2365 	return r;
2366 }
2367 
2368 /**
2369  * amdgpu_device_ip_init - run init for hardware IPs
2370  *
2371  * @adev: amdgpu_device pointer
2372  *
2373  * Main initialization pass for hardware IPs.  The list of all the hardware
2374  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2375  * are run.  sw_init initializes the software state associated with each IP
2376  * and hw_init initializes the hardware associated with each IP.
2377  * Returns 0 on success, negative error code on failure.
2378  */
2379 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2380 {
2381 	int i, r;
2382 
2383 	r = amdgpu_ras_init(adev);
2384 	if (r)
2385 		return r;
2386 
2387 	for (i = 0; i < adev->num_ip_blocks; i++) {
2388 		if (!adev->ip_blocks[i].status.valid)
2389 			continue;
2390 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2391 		if (r) {
2392 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2393 				  adev->ip_blocks[i].version->funcs->name, r);
2394 			goto init_failed;
2395 		}
2396 		adev->ip_blocks[i].status.sw = true;
2397 
2398 		/* need to do gmc hw init early so we can allocate gpu mem */
2399 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2400 			r = amdgpu_device_vram_scratch_init(adev);
2401 			if (r) {
2402 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2403 				goto init_failed;
2404 			}
2405 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2406 			if (r) {
2407 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2408 				goto init_failed;
2409 			}
2410 			r = amdgpu_device_wb_init(adev);
2411 			if (r) {
2412 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2413 				goto init_failed;
2414 			}
2415 			adev->ip_blocks[i].status.hw = true;
2416 
2417 			/* right after GMC hw init, we create CSA */
2418 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2419 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2420 								AMDGPU_GEM_DOMAIN_VRAM,
2421 								AMDGPU_CSA_SIZE);
2422 				if (r) {
2423 					DRM_ERROR("allocate CSA failed %d\n", r);
2424 					goto init_failed;
2425 				}
2426 			}
2427 		}
2428 	}
2429 
2430 	if (amdgpu_sriov_vf(adev))
2431 		amdgpu_virt_init_data_exchange(adev);
2432 
2433 	r = amdgpu_ib_pool_init(adev);
2434 	if (r) {
2435 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2436 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2437 		goto init_failed;
2438 	}
2439 
2440 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2441 	if (r)
2442 		goto init_failed;
2443 
2444 	r = amdgpu_amdkfd_resume_iommu(adev);
2445 	if (r)
2446 		goto init_failed;
2447 
2448 	r = amdgpu_device_ip_hw_init_phase1(adev);
2449 	if (r)
2450 		goto init_failed;
2451 
2452 	r = amdgpu_device_fw_loading(adev);
2453 	if (r)
2454 		goto init_failed;
2455 
2456 	r = amdgpu_device_ip_hw_init_phase2(adev);
2457 	if (r)
2458 		goto init_failed;
2459 
2460 	/*
2461 	 * Retired pages will be loaded from eeprom and reserved here;
2462 	 * this should be called after amdgpu_device_ip_hw_init_phase2 since
2463 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2464 	 * functional for I2C communication, which is only true at this point.
2465 	 *
2466 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2467 	 * about failures caused by a bad gpu state and stops the amdgpu init
2468 	 * process accordingly.  For other failures, it still releases all the
2469 	 * resources and prints an error message rather than returning a
2470 	 * negative value to the upper level.
2471 	 *
2472 	 * Note: theoretically, this should be called before all vram allocations
2473 	 * to protect retired pages from being abused.
2474 	 */
2475 	r = amdgpu_ras_recovery_init(adev);
2476 	if (r)
2477 		goto init_failed;
2478 
2479 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2480 		amdgpu_xgmi_add_device(adev);
2481 
2482 	/* Don't init kfd if the whole hive needs to be reset during init */
2483 	if (!adev->gmc.xgmi.pending_reset)
2484 		amdgpu_amdkfd_device_init(adev);
2485 
2486 	amdgpu_fru_get_product_info(adev);
2487 
2488 init_failed:
2489 	if (amdgpu_sriov_vf(adev))
2490 		amdgpu_virt_release_full_gpu(adev, true);
2491 
2492 	return r;
2493 }
2494 
2495 /**
2496  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2497  *
2498  * @adev: amdgpu_device pointer
2499  *
2500  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2501  * this function before a GPU reset.  If the value is retained after a
2502  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2503  */
2504 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2505 {
2506 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2507 }
2508 
2509 /**
2510  * amdgpu_device_check_vram_lost - check if vram is valid
2511  *
2512  * @adev: amdgpu_device pointer
2513  *
2514  * Checks the reset magic value written to the gart pointer in VRAM.
2515  * The driver calls this after a GPU reset to see if the contents of
2516  * VRAM have been lost or not.
2517  * returns true if vram is lost, false if not.
2518  */
2519 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2520 {
2521 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2522 			AMDGPU_RESET_MAGIC_NUM))
2523 		return true;
2524 
2525 	if (!amdgpu_in_reset(adev))
2526 		return false;
2527 
2528 	/*
2529 	 * For all ASICs with baco/mode1 reset, the VRAM is
2530 	 * always assumed to be lost.
2531 	 */
2532 	switch (amdgpu_asic_reset_method(adev)) {
2533 	case AMD_RESET_METHOD_BACO:
2534 	case AMD_RESET_METHOD_MODE1:
2535 		return true;
2536 	default:
2537 		return false;
2538 	}
2539 }
2540 
2541 /**
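/*
 * Illustrative sketch (not part of the driver build) of how the two
 * reset-magic helpers above pair up around a GPU reset.  The function
 * name and the do_reset()/reupload_vram_buffers() callbacks are
 * placeholders, not real driver entry points.
 */
#if 0
static void example_reset_with_vram_check(struct amdgpu_device *adev)
{
	/* snapshot the magic value before resetting */
	amdgpu_device_fill_reset_magic(adev);

	do_reset(adev);		/* placeholder for the actual reset */

	/* if the magic no longer matches, VRAM contents were lost */
	if (amdgpu_device_check_vram_lost(adev))
		reupload_vram_buffers(adev);	/* placeholder recovery */
}
#endif
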
2542  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2543  *
2544  * @adev: amdgpu_device pointer
2545  * @state: clockgating state (gate or ungate)
2546  *
2547  * The list of all the hardware IPs that make up the asic is walked and the
2548  * set_clockgating_state callbacks are run.
2549  * Late initialization pass enabling clockgating for hardware IPs.
2550  * Fini or suspend, pass disabling clockgating for hardware IPs.
2551  * Returns 0 on success, negative error code on failure.
2552  */
2553 
2554 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2555 			       enum amd_clockgating_state state)
2556 {
2557 	int i, j, r;
2558 
2559 	if (amdgpu_emu_mode == 1)
2560 		return 0;
2561 
2562 	for (j = 0; j < adev->num_ip_blocks; j++) {
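		/* gate in forward IP order, ungate in reverse IP order */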
2563 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2564 		if (!adev->ip_blocks[i].status.late_initialized)
2565 			continue;
2566 		/* skip CG for GFX on S0ix */
2567 		if (adev->in_s0ix &&
2568 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2569 			continue;
2570 		/* skip CG for VCE/UVD, it's handled specially */
2571 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2572 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2573 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2574 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2575 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2576 			/* enable clockgating to save power */
2577 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2578 										     state);
2579 			if (r) {
2580 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2581 					  adev->ip_blocks[i].version->funcs->name, r);
2582 				return r;
2583 			}
2584 		}
2585 	}
2586 
2587 	return 0;
2588 }
2589 
2590 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2591 			       enum amd_powergating_state state)
2592 {
2593 	int i, j, r;
2594 
2595 	if (amdgpu_emu_mode == 1)
2596 		return 0;
2597 
2598 	for (j = 0; j < adev->num_ip_blocks; j++) {
2599 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2600 		if (!adev->ip_blocks[i].status.late_initialized)
2601 			continue;
2602 		/* skip PG for GFX on S0ix */
2603 		if (adev->in_s0ix &&
2604 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2605 			continue;
2606 		/* skip PG for VCE/UVD, it's handled specially */
2607 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2608 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2609 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2610 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2611 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2612 			/* enable powergating to save power */
2613 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2614 											state);
2615 			if (r) {
2616 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2617 					  adev->ip_blocks[i].version->funcs->name, r);
2618 				return r;
2619 			}
2620 		}
2621 	}
2622 	return 0;
2623 }
2624 
2625 static int amdgpu_device_enable_mgpu_fan_boost(void)
2626 {
2627 	struct amdgpu_gpu_instance *gpu_ins;
2628 	struct amdgpu_device *adev;
2629 	int i, ret = 0;
2630 
2631 	mutex_lock(&mgpu_info.mutex);
2632 
2633 	/*
2634 	 * MGPU fan boost feature should be enabled
2635 	 * only when there are two or more dGPUs in
2636 	 * the system
2637 	 */
2638 	if (mgpu_info.num_dgpu < 2)
2639 		goto out;
2640 
2641 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2642 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2643 		adev = gpu_ins->adev;
2644 		if (!(adev->flags & AMD_IS_APU) &&
2645 		    !gpu_ins->mgpu_fan_enabled) {
2646 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2647 			if (ret)
2648 				break;
2649 
2650 			gpu_ins->mgpu_fan_enabled = 1;
2651 		}
2652 	}
2653 
2654 out:
2655 	mutex_unlock(&mgpu_info.mutex);
2656 
2657 	return ret;
2658 }
2659 
2660 /**
2661  * amdgpu_device_ip_late_init - run late init for hardware IPs
2662  *
2663  * @adev: amdgpu_device pointer
2664  *
2665  * Late initialization pass for hardware IPs.  The list of all the hardware
2666  * IPs that make up the asic is walked and the late_init callbacks are run.
2667  * late_init covers any special initialization that an IP requires
2668  * after all of them have been initialized or something that needs to happen
2669  * late in the init process.
2670  * Returns 0 on success, negative error code on failure.
2671  */
2672 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2673 {
2674 	struct amdgpu_gpu_instance *gpu_instance;
2675 	int i = 0, r;
2676 
2677 	for (i = 0; i < adev->num_ip_blocks; i++) {
2678 		if (!adev->ip_blocks[i].status.hw)
2679 			continue;
2680 		if (adev->ip_blocks[i].version->funcs->late_init) {
2681 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2682 			if (r) {
2683 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2684 					  adev->ip_blocks[i].version->funcs->name, r);
2685 				return r;
2686 			}
2687 		}
2688 		adev->ip_blocks[i].status.late_initialized = true;
2689 	}
2690 
2691 	amdgpu_ras_set_error_query_ready(adev, true);
2692 
2693 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2694 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2695 
2696 	amdgpu_device_fill_reset_magic(adev);
2697 
2698 	r = amdgpu_device_enable_mgpu_fan_boost();
2699 	if (r)
2700 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2701 
2702 	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
2703 	if (adev->asic_type == CHIP_ARCTURUS &&
2704 	    amdgpu_passthrough(adev) &&
2705 	    adev->gmc.xgmi.num_physical_nodes > 1)
2706 		smu_set_light_sbr(&adev->smu, true);
2707 
2708 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2709 		mutex_lock(&mgpu_info.mutex);
2710 
2711 		/*
2712 		 * Reset device p-state to low as this was booted with high.
2713 		 *
2714 		 * This should be performed only after all devices from the same
2715 		 * hive get initialized.
2716 		 *
2717 		 * However, the number of devices in the hive is not known in
2718 		 * advance; it is counted one by one as the devices initialize.
2719 		 *
2720 		 * So, we wait until all XGMI interlinked devices are initialized.
2721 		 * This may bring some delays as those devices may come from
2722 		 * different hives. But that should be OK.
2723 		 */
2724 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2725 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2726 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2727 				if (gpu_instance->adev->flags & AMD_IS_APU)
2728 					continue;
2729 
2730 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2731 						AMDGPU_XGMI_PSTATE_MIN);
2732 				if (r) {
2733 					DRM_ERROR("pstate setting failed (%d).\n", r);
2734 					break;
2735 				}
2736 			}
2737 		}
2738 
2739 		mutex_unlock(&mgpu_info.mutex);
2740 	}
2741 
2742 	return 0;
2743 }
2744 
2745 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2746 {
2747 	int i, r;
2748 
2749 	for (i = 0; i < adev->num_ip_blocks; i++) {
2750 		if (!adev->ip_blocks[i].version->funcs->early_fini)
2751 			continue;
2752 
2753 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2754 		if (r) {
2755 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2756 				  adev->ip_blocks[i].version->funcs->name, r);
2757 		}
2758 	}
2759 
2760 	amdgpu_amdkfd_suspend(adev, false);
2761 
2762 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2763 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2764 
2765 	/* need to disable SMC first */
2766 	for (i = 0; i < adev->num_ip_blocks; i++) {
2767 		if (!adev->ip_blocks[i].status.hw)
2768 			continue;
2769 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2770 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2771 			/* XXX handle errors */
2772 			if (r) {
2773 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2774 					  adev->ip_blocks[i].version->funcs->name, r);
2775 			}
2776 			adev->ip_blocks[i].status.hw = false;
2777 			break;
2778 		}
2779 	}
2780 
2781 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2782 		if (!adev->ip_blocks[i].status.hw)
2783 			continue;
2784 
2785 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2786 		/* XXX handle errors */
2787 		if (r) {
2788 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2789 				  adev->ip_blocks[i].version->funcs->name, r);
2790 		}
2791 
2792 		adev->ip_blocks[i].status.hw = false;
2793 	}
2794 
2795 	if (amdgpu_sriov_vf(adev)) {
2796 		if (amdgpu_virt_release_full_gpu(adev, false))
2797 			DRM_ERROR("failed to release exclusive mode on fini\n");
2798 	}
2799 
2800 	return 0;
2801 }
2802 
2803 /**
2804  * amdgpu_device_ip_fini - run fini for hardware IPs
2805  *
2806  * @adev: amdgpu_device pointer
2807  *
2808  * Main teardown pass for hardware IPs.  The list of all the hardware
2809  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2810  * are run.  hw_fini tears down the hardware associated with each IP
2811  * and sw_fini tears down any software state associated with each IP.
2812  * Returns 0 on success, negative error code on failure.
2813  */
2814 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2815 {
2816 	int i, r;
2817 
2818 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2819 		amdgpu_virt_release_ras_err_handler_data(adev);
2820 
2821 	amdgpu_ras_pre_fini(adev);
2822 
2823 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2824 		amdgpu_xgmi_remove_device(adev);
2825 
2826 	amdgpu_amdkfd_device_fini_sw(adev);
2827 
2828 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2829 		if (!adev->ip_blocks[i].status.sw)
2830 			continue;
2831 
2832 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2833 			amdgpu_ucode_free_bo(adev);
2834 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2835 			amdgpu_device_wb_fini(adev);
2836 			amdgpu_device_vram_scratch_fini(adev);
2837 			amdgpu_ib_pool_fini(adev);
2838 		}
2839 
2840 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2841 		/* XXX handle errors */
2842 		if (r) {
2843 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2844 				  adev->ip_blocks[i].version->funcs->name, r);
2845 		}
2846 		adev->ip_blocks[i].status.sw = false;
2847 		adev->ip_blocks[i].status.valid = false;
2848 	}
2849 
2850 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2851 		if (!adev->ip_blocks[i].status.late_initialized)
2852 			continue;
2853 		if (adev->ip_blocks[i].version->funcs->late_fini)
2854 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2855 		adev->ip_blocks[i].status.late_initialized = false;
2856 	}
2857 
2858 	amdgpu_ras_fini(adev);
2859 
2860 	return 0;
2861 }
2862 
2863 /**
2864  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2865  *
2866  * @work: work_struct.
2867  */
2868 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2869 {
2870 	struct amdgpu_device *adev =
2871 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2872 	int r;
2873 
2874 	r = amdgpu_ib_ring_tests(adev);
2875 	if (r)
2876 		DRM_ERROR("ib ring test failed (%d).\n", r);
2877 }
2878 
2879 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2880 {
2881 	struct amdgpu_device *adev =
2882 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2883 
2884 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2885 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2886 
2887 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2888 		adev->gfx.gfx_off_state = true;
2889 }
2890 
2891 /**
2892  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2893  *
2894  * @adev: amdgpu_device pointer
2895  *
2896  * First suspend pass for hardware IPs.  Clockgating and powergating are
2897  * disabled for all IPs, then the suspend callbacks are run for the display
2898  * (DCE) IPs only; the remaining IPs are suspended in phase 2.  suspend puts
2899  * each IP's hardware and software state into a state suitable for suspend.
2900  * Returns 0 on success, negative error code on failure.
2901  */
2902 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2903 {
2904 	int i, r;
2905 
2906 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2907 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2908 
2909 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2910 		if (!adev->ip_blocks[i].status.valid)
2911 			continue;
2912 
2913 		/* displays are handled separately */
2914 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2915 			continue;
2916 
2917 		/* XXX handle errors */
2918 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2919 		/* XXX handle errors */
2920 		if (r) {
2921 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2922 				  adev->ip_blocks[i].version->funcs->name, r);
2923 			return r;
2924 		}
2925 
2926 		adev->ip_blocks[i].status.hw = false;
2927 	}
2928 
2929 	return 0;
2930 }
2931 
2932 /**
2933  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2934  *
2935  * @adev: amdgpu_device pointer
2936  *
2937  * Main suspend function for hardware IPs.  The list of all the hardware
2938  * IPs that make up the asic is walked, clockgating is disabled and the
2939  * suspend callbacks are run.  suspend puts the hardware and software state
2940  * in each IP into a state suitable for suspend.
2941  * Returns 0 on success, negative error code on failure.
2942  */
2943 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2944 {
2945 	int i, r;
2946 
2947 	if (adev->in_s0ix)
2948 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2949 
2950 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2951 		if (!adev->ip_blocks[i].status.valid)
2952 			continue;
2953 		/* displays are handled in phase1 */
2954 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2955 			continue;
2956 		/* PSP lost connection when err_event_athub occurs */
2957 		if (amdgpu_ras_intr_triggered() &&
2958 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2959 			adev->ip_blocks[i].status.hw = false;
2960 			continue;
2961 		}
2962 
2963 		/* skip unnecessary suspend if we have not initialized them yet */
2964 		if (adev->gmc.xgmi.pending_reset &&
2965 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2966 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2967 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2968 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2969 			adev->ip_blocks[i].status.hw = false;
2970 			continue;
2971 		}
2972 
2973 		/* skip suspend of gfx and psp for S0ix
2974 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2975 		 * like at runtime. PSP is also part of the always on hardware
2976 		 * so no need to suspend it.
2977 		 */
2978 		if (adev->in_s0ix &&
2979 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2980 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
2981 			continue;
2982 
2983 		/* XXX handle errors */
2984 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2985 		/* XXX handle errors */
2986 		if (r) {
2987 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2988 				  adev->ip_blocks[i].version->funcs->name, r);
2989 		}
2990 		adev->ip_blocks[i].status.hw = false;
2991 		/* handle putting the SMC in the appropriate state */
2992 		if (!amdgpu_sriov_vf(adev)) {
2993 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2994 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2995 				if (r) {
2996 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2997 							adev->mp1_state, r);
2998 					return r;
2999 				}
3000 			}
3001 		}
3002 	}
3003 
3004 	return 0;
3005 }
3006 
3007 /**
3008  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3009  *
3010  * @adev: amdgpu_device pointer
3011  *
3012  * Main suspend function for hardware IPs.  The list of all the hardware
3013  * IPs that make up the asic is walked, clockgating is disabled and the
3014  * suspend callbacks are run.  suspend puts the hardware and software state
3015  * in each IP into a state suitable for suspend.
3016  * Returns 0 on success, negative error code on failure.
3017  */
3018 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3019 {
3020 	int r;
3021 
3022 	if (amdgpu_sriov_vf(adev)) {
3023 		amdgpu_virt_fini_data_exchange(adev);
3024 		amdgpu_virt_request_full_gpu(adev, false);
3025 	}
3026 
3027 	r = amdgpu_device_ip_suspend_phase1(adev);
3028 	if (r)
3029 		return r;
3030 	r = amdgpu_device_ip_suspend_phase2(adev);
3031 
3032 	if (amdgpu_sriov_vf(adev))
3033 		amdgpu_virt_release_full_gpu(adev, false);
3034 
3035 	return r;
3036 }
3037 
3038 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3039 {
3040 	int i, r;
3041 
3042 	static enum amd_ip_block_type ip_order[] = {
3043 		AMD_IP_BLOCK_TYPE_GMC,
3044 		AMD_IP_BLOCK_TYPE_COMMON,
3045 		AMD_IP_BLOCK_TYPE_PSP,
3046 		AMD_IP_BLOCK_TYPE_IH,
3047 	};
3048 
3049 	for (i = 0; i < adev->num_ip_blocks; i++) {
3050 		int j;
3051 		struct amdgpu_ip_block *block;
3052 
3053 		block = &adev->ip_blocks[i];
3054 		block->status.hw = false;
3055 
3056 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3057 
3058 			if (block->version->type != ip_order[j] ||
3059 				!block->status.valid)
3060 				continue;
3061 
3062 			r = block->version->funcs->hw_init(adev);
3063 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3064 			if (r)
3065 				return r;
3066 			block->status.hw = true;
3067 		}
3068 	}
3069 
3070 	return 0;
3071 }
3072 
3073 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3074 {
3075 	int i, r;
3076 
3077 	static enum amd_ip_block_type ip_order[] = {
3078 		AMD_IP_BLOCK_TYPE_SMC,
3079 		AMD_IP_BLOCK_TYPE_DCE,
3080 		AMD_IP_BLOCK_TYPE_GFX,
3081 		AMD_IP_BLOCK_TYPE_SDMA,
3082 		AMD_IP_BLOCK_TYPE_UVD,
3083 		AMD_IP_BLOCK_TYPE_VCE,
3084 		AMD_IP_BLOCK_TYPE_VCN
3085 	};
3086 
3087 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3088 		int j;
3089 		struct amdgpu_ip_block *block;
3090 
3091 		for (j = 0; j < adev->num_ip_blocks; j++) {
3092 			block = &adev->ip_blocks[j];
3093 
3094 			if (block->version->type != ip_order[i] ||
3095 				!block->status.valid ||
3096 				block->status.hw)
3097 				continue;
3098 
3099 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3100 				r = block->version->funcs->resume(adev);
3101 			else
3102 				r = block->version->funcs->hw_init(adev);
3103 
3104 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3105 			if (r)
3106 				return r;
3107 			block->status.hw = true;
3108 		}
3109 	}
3110 
3111 	return 0;
3112 }
3113 
3114 /**
3115  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3116  *
3117  * @adev: amdgpu_device pointer
3118  *
3119  * First resume function for hardware IPs.  The list of all the hardware
3120  * IPs that make up the asic is walked and the resume callbacks are run for
3121  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3122  * after a suspend and updates the software state as necessary.  This
3123  * function is also used for restoring the GPU after a GPU reset.
3124  * Returns 0 on success, negative error code on failure.
3125  */
3126 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3127 {
3128 	int i, r;
3129 
3130 	for (i = 0; i < adev->num_ip_blocks; i++) {
3131 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3132 			continue;
3133 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3134 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3135 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
3136 
3137 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3138 			if (r) {
3139 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3140 					  adev->ip_blocks[i].version->funcs->name, r);
3141 				return r;
3142 			}
3143 			adev->ip_blocks[i].status.hw = true;
3144 		}
3145 	}
3146 
3147 	return 0;
3148 }
3149 
3150 /**
3151  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3152  *
3153  * @adev: amdgpu_device pointer
3154  *
3155  * Second resume function for hardware IPs.  The list of all the hardware
3156  * IPs that make up the asic is walked and the resume callbacks are run for
3157  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3158  * functional state after a suspend and updates the software state as
3159  * necessary.  This function is also used for restoring the GPU after a GPU
3160  * reset.
3161  * Returns 0 on success, negative error code on failure.
3162  */
3163 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3164 {
3165 	int i, r;
3166 
3167 	for (i = 0; i < adev->num_ip_blocks; i++) {
3168 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3169 			continue;
3170 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3171 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3172 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3173 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3174 			continue;
3175 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3176 		if (r) {
3177 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3178 				  adev->ip_blocks[i].version->funcs->name, r);
3179 			return r;
3180 		}
3181 		adev->ip_blocks[i].status.hw = true;
3182 	}
3183 
3184 	return 0;
3185 }
3186 
3187 /**
3188  * amdgpu_device_ip_resume - run resume for hardware IPs
3189  *
3190  * @adev: amdgpu_device pointer
3191  *
3192  * Main resume function for hardware IPs.  The hardware IPs
3193  * are split into two resume functions because they are
3194  * also used in recovering from a GPU reset and some additional
3195  * steps need to be taken between them.  In this case (S3/S4) they are
3196  * run sequentially.
3197  * Returns 0 on success, negative error code on failure.
3198  */
3199 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3200 {
3201 	int r;
3202 
3203 	r = amdgpu_amdkfd_resume_iommu(adev);
3204 	if (r)
3205 		return r;
3206 
3207 	r = amdgpu_device_ip_resume_phase1(adev);
3208 	if (r)
3209 		return r;
3210 
3211 	r = amdgpu_device_fw_loading(adev);
3212 	if (r)
3213 		return r;
3214 
3215 	r = amdgpu_device_ip_resume_phase2(adev);
3216 
3217 	return r;
3218 }
3219 
3220 /**
3221  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3222  *
3223  * @adev: amdgpu_device pointer
3224  *
3225  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3226  */
3227 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3228 {
3229 	if (amdgpu_sriov_vf(adev)) {
3230 		if (adev->is_atom_fw) {
3231 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3232 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3233 		} else {
3234 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3235 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3236 		}
3237 
3238 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3239 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3240 	}
3241 }
3242 
3243 /**
3244  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3245  *
3246  * @asic_type: AMD asic type
3247  *
3248  * Check if there is DC (new modesetting infrastructure) support for an asic.
3249  * returns true if DC has support, false if not.
3250  */
3251 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3252 {
3253 	switch (asic_type) {
3254 #if defined(CONFIG_DRM_AMD_DC)
3255 #if defined(CONFIG_DRM_AMD_DC_SI)
3256 	case CHIP_TAHITI:
3257 	case CHIP_PITCAIRN:
3258 	case CHIP_VERDE:
3259 	case CHIP_OLAND:
3260 #endif
3261 	case CHIP_BONAIRE:
3262 	case CHIP_KAVERI:
3263 	case CHIP_KABINI:
3264 	case CHIP_MULLINS:
3265 		/*
3266 		 * We have systems in the wild with these ASICs that require
3267 		 * LVDS and VGA support which is not supported with DC.
3268 		 *
3269 		 * Fallback to the non-DC driver here by default so as not to
3270 		 * cause regressions.
3271 		 */
3272 		return amdgpu_dc > 0;
3273 	case CHIP_HAWAII:
3274 	case CHIP_CARRIZO:
3275 	case CHIP_STONEY:
3276 	case CHIP_POLARIS10:
3277 	case CHIP_POLARIS11:
3278 	case CHIP_POLARIS12:
3279 	case CHIP_VEGAM:
3280 	case CHIP_TONGA:
3281 	case CHIP_FIJI:
3282 	case CHIP_VEGA10:
3283 	case CHIP_VEGA12:
3284 	case CHIP_VEGA20:
3285 #if defined(CONFIG_DRM_AMD_DC_DCN)
3286 	case CHIP_RAVEN:
3287 	case CHIP_NAVI10:
3288 	case CHIP_NAVI14:
3289 	case CHIP_NAVI12:
3290 	case CHIP_RENOIR:
3291 	case CHIP_SIENNA_CICHLID:
3292 	case CHIP_NAVY_FLOUNDER:
3293 	case CHIP_DIMGREY_CAVEFISH:
3294 	case CHIP_BEIGE_GOBY:
3295 	case CHIP_VANGOGH:
3296 	case CHIP_YELLOW_CARP:
3297 #endif
3298 		return amdgpu_dc != 0;
3299 #endif
3300 	default:
3301 		if (amdgpu_dc > 0)
3302 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3303 					 "but isn't supported by ASIC, ignoring\n");
3304 		return false;
3305 	}
3306 }
3307 
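/*
 * For reference (inferred from the checks above): the amdgpu.dc module
 * parameter steers this.  With the usual auto default (-1) DC is used on
 * the ASICs that take the "amdgpu_dc != 0" branch above, amdgpu.dc=0
 * forces the legacy display path, and amdgpu.dc=1 also opts the older
 * LVDS/VGA-era ASICs into DC.
 */
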
3308 /**
3309  * amdgpu_device_has_dc_support - check if dc is supported
3310  *
3311  * @adev: amdgpu_device pointer
3312  *
3313  * Returns true for supported, false for not supported
3314  */
3315 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3316 {
3317 	if (amdgpu_sriov_vf(adev) ||
3318 	    adev->enable_virtual_display ||
3319 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3320 		return false;
3321 
3322 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3323 }
3324 
3325 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3326 {
3327 	struct amdgpu_device *adev =
3328 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3329 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3330 
3331 	/* It's a bug to not have a hive within this function */
3332 	if (WARN_ON(!hive))
3333 		return;
3334 
3335 	/*
3336 	 * Use task barrier to synchronize all xgmi reset works across the
3337 	 * hive. task_barrier_enter and task_barrier_exit will block
3338 	 * until all the threads running the xgmi reset works reach
3339 	 * those points. task_barrier_full will do both blocks.
3340 	 */
3341 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3342 
3343 		task_barrier_enter(&hive->tb);
3344 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3345 
3346 		if (adev->asic_reset_res)
3347 			goto fail;
3348 
3349 		task_barrier_exit(&hive->tb);
3350 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3351 
3352 		if (adev->asic_reset_res)
3353 			goto fail;
3354 
3355 		if (adev->mmhub.ras_funcs &&
3356 		    adev->mmhub.ras_funcs->reset_ras_error_count)
3357 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3358 	} else {
3359 
3360 		task_barrier_full(&hive->tb);
3361 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3362 	}
3363 
3364 fail:
3365 	if (adev->asic_reset_res)
3366 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3367 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3368 	amdgpu_put_xgmi_hive(hive);
3369 }
3370 
3371 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3372 {
3373 	char *input = amdgpu_lockup_timeout;
3374 	char *timeout_setting = NULL;
3375 	int index = 0;
3376 	long timeout;
3377 	int ret = 0;
3378 
3379 	/*
3380 	 * By default the timeout for non-compute jobs is 10000 ms
3381 	 * and 60000 ms for compute jobs.
3382 	 * In SR-IOV, the compute timeout defaults to 60000 ms only in
3383 	 * one-VF mode and 10000 ms otherwise.
3384 	 */
3385 	adev->gfx_timeout = msecs_to_jiffies(10000);
3386 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3387 	if (amdgpu_sriov_vf(adev))
3388 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3389 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3390 	else
3391 		adev->compute_timeout =  msecs_to_jiffies(60000);
3392 
3393 #ifdef notyet
3394 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3395 		while ((timeout_setting = strsep(&input, ",")) &&
3396 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3397 			ret = kstrtol(timeout_setting, 0, &timeout);
3398 			if (ret)
3399 				return ret;
3400 
3401 			if (timeout == 0) {
3402 				index++;
3403 				continue;
3404 			} else if (timeout < 0) {
3405 				timeout = MAX_SCHEDULE_TIMEOUT;
3406 			} else {
3407 				timeout = msecs_to_jiffies(timeout);
3408 			}
3409 
3410 			switch (index++) {
3411 			case 0:
3412 				adev->gfx_timeout = timeout;
3413 				break;
3414 			case 1:
3415 				adev->compute_timeout = timeout;
3416 				break;
3417 			case 2:
3418 				adev->sdma_timeout = timeout;
3419 				break;
3420 			case 3:
3421 				adev->video_timeout = timeout;
3422 				break;
3423 			default:
3424 				break;
3425 			}
3426 		}
3427 		/*
3428 		 * There is only one value specified and
3429 		 * it should apply to all non-compute jobs.
3430 		 */
3431 		if (index == 1) {
3432 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3433 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3434 				adev->compute_timeout = adev->gfx_timeout;
3435 		}
3436 	}
3437 #endif
3438 
3439 	return ret;
3440 }
3441 
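/*
 * For reference, the positional values parsed above map onto the
 * amdgpu.lockup_timeout module parameter as
 *   lockup_timeout=<gfx>,<compute>,<sdma>,<video>
 * in milliseconds, where 0 keeps the default and a negative value means
 * no timeout; e.g. lockup_timeout=10000,60000 adjusts gfx and compute
 * only.  On this port the parsing is compiled out ("notyet"), so the
 * defaults above always apply.
 */
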
3442 static const struct attribute *amdgpu_dev_attributes[] = {
3443 	&dev_attr_product_name.attr,
3444 	&dev_attr_product_number.attr,
3445 	&dev_attr_serial_number.attr,
3446 	&dev_attr_pcie_replay_count.attr,
3447 	NULL
3448 };
3449 
3450 /**
3451  * amdgpu_device_init - initialize the driver
3452  *
3453  * @adev: amdgpu_device pointer
3454  * @flags: driver flags
3455  *
3456  * Initializes the driver info and hw (all asics).
3457  * Returns 0 for success or an error on failure.
3458  * Called at driver startup.
3459  */
3460 int amdgpu_device_init(struct amdgpu_device *adev,
3461 		       uint32_t flags)
3462 {
3463 	struct drm_device *ddev = adev_to_drm(adev);
3464 	struct pci_dev *pdev = adev->pdev;
3465 	int r, i;
3466 	bool px = false;
3467 	u32 max_MBps;
3468 
3469 	adev->shutdown = false;
3470 	adev->flags = flags;
3471 
3472 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3473 		adev->asic_type = amdgpu_force_asic_type;
3474 	else
3475 		adev->asic_type = flags & AMD_ASIC_MASK;
3476 
3477 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3478 	if (amdgpu_emu_mode == 1)
3479 		adev->usec_timeout *= 10;
3480 	adev->gmc.gart_size = 512 * 1024 * 1024;
3481 	adev->accel_working = false;
3482 	adev->num_rings = 0;
3483 	adev->mman.buffer_funcs = NULL;
3484 	adev->mman.buffer_funcs_ring = NULL;
3485 	adev->vm_manager.vm_pte_funcs = NULL;
3486 	adev->vm_manager.vm_pte_num_scheds = 0;
3487 	adev->gmc.gmc_funcs = NULL;
3488 	adev->harvest_ip_mask = 0x0;
3489 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3490 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3491 
3492 	adev->smc_rreg = &amdgpu_invalid_rreg;
3493 	adev->smc_wreg = &amdgpu_invalid_wreg;
3494 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3495 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3496 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3497 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3498 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3499 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3500 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3501 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3502 	adev->didt_rreg = &amdgpu_invalid_rreg;
3503 	adev->didt_wreg = &amdgpu_invalid_wreg;
3504 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3505 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3506 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3507 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3508 
3509 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3510 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3511 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3512 
3513 	/* mutex initializations are all done here so we
3514 	 * can recall functions without having locking issues */
3515 	rw_init(&adev->firmware.mutex, "agfw");
3516 	rw_init(&adev->pm.mutex, "agpm");
3517 	rw_init(&adev->gfx.gpu_clock_mutex, "gfxclk");
3518 	rw_init(&adev->srbm_mutex, "srbm");
3519 	rw_init(&adev->gfx.pipe_reserve_mutex, "pipers");
3520 	rw_init(&adev->gfx.gfx_off_mutex, "gfxoff");
3521 	rw_init(&adev->grbm_idx_mutex, "grbmidx");
3522 	rw_init(&adev->mn_lock, "agpumn");
3523 	rw_init(&adev->virt.vf_errors.lock, "vferr");
3524 	hash_init(adev->mn_hash);
3525 	atomic_set(&adev->in_gpu_reset, 0);
3526 	rw_init(&adev->reset_sem, "amrs");
3527 	rw_init(&adev->psp.mutex, "agpsp");
3528 	rw_init(&adev->notifier_lock, "agnf");
3529 
3530 	r = amdgpu_device_init_apu_flags(adev);
3531 	if (r)
3532 		return r;
3533 
3534 	r = amdgpu_device_check_arguments(adev);
3535 	if (r)
3536 		return r;
3537 
3538 	mtx_init(&adev->mmio_idx_lock, IPL_TTY);
3539 	mtx_init(&adev->smc_idx_lock, IPL_TTY);
3540 	mtx_init(&adev->pcie_idx_lock, IPL_TTY);
3541 	mtx_init(&adev->uvd_ctx_idx_lock, IPL_TTY);
3542 	mtx_init(&adev->didt_idx_lock, IPL_TTY);
3543 	mtx_init(&adev->gc_cac_idx_lock, IPL_TTY);
3544 	mtx_init(&adev->se_cac_idx_lock, IPL_TTY);
3545 	mtx_init(&adev->audio_endpt_idx_lock, IPL_TTY);
3546 	mtx_init(&adev->mm_stats.lock, IPL_NONE);
3547 
3548 	INIT_LIST_HEAD(&adev->shadow_list);
3549 	rw_init(&adev->shadow_list_lock, "sdwlst");
3550 
3551 	INIT_LIST_HEAD(&adev->reset_list);
3552 
3553 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3554 			  amdgpu_device_delayed_init_work_handler);
3555 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3556 			  amdgpu_device_delay_enable_gfx_off);
3557 
3558 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3559 
3560 	adev->gfx.gfx_off_req_count = 1;
3561 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3562 
3563 	atomic_set(&adev->throttling_logging_enabled, 1);
3564 	/*
3565 	 * If throttling continues, logging will be performed every minute
3566 	 * to avoid log flooding. "-1" is subtracted since the thermal
3567 	 * throttling interrupt comes every second. Thus, the total logging
3568 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3569 	 * for throttling interrupt) = 60 seconds.
3570 	 */
3571 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3572 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3573 
3574 #ifdef __linux__
3575 	/* Registers mapping */
3576 	/* TODO: block userspace mapping of io register */
3577 	if (adev->asic_type >= CHIP_BONAIRE) {
3578 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3579 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3580 	} else {
3581 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3582 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3583 	}
3584 
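	/* the power state of every IP block is unknown until it is explicitly changed */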
3585 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3586 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3587 
3588 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3589 	if (adev->rmmio == NULL) {
3590 		return -ENOMEM;
3591 	}
3592 #endif
3593 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3594 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3595 
3596 	/* enable PCIE atomic ops */
3597 #ifdef notyet
3598 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3599 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3600 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3601 	if (r) {
3602 		adev->have_atomics_support = false;
3603 		DRM_INFO("PCIE atomic ops is not supported\n");
3604 	} else {
3605 		adev->have_atomics_support = true;
3606 	}
3607 #else
3608 	adev->have_atomics_support = false;
3609 #endif
3610 
3611 	amdgpu_device_get_pcie_info(adev);
3612 
3613 	if (amdgpu_mcbp)
3614 		DRM_INFO("MCBP is enabled\n");
3615 
3616 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3617 		adev->enable_mes = true;
3618 
3619 	/* detect hw virtualization here */
3620 	amdgpu_detect_virtualization(adev);
3621 
3622 	r = amdgpu_device_get_job_timeout_settings(adev);
3623 	if (r) {
3624 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3625 		return r;
3626 	}
3627 
3628 	/* early init functions */
3629 	r = amdgpu_device_ip_early_init(adev);
3630 	if (r)
3631 		return r;
3632 
3633 	/* doorbell bar mapping and doorbell index init */
3634 	amdgpu_device_doorbell_init(adev);
3635 
3636 	if (amdgpu_emu_mode == 1) {
3637 		/* post the asic on emulation mode */
3638 		emu_soc_asic_init(adev);
3639 		goto fence_driver_init;
3640 	}
3641 
3642 	amdgpu_reset_init(adev);
3643 
3644 	/* detect whether we are running with an SR-IOV vBIOS */
3645 	amdgpu_device_detect_sriov_bios(adev);
3646 
3647 	/* check if we need to reset the asic
3648 	 *  E.g., driver was not cleanly unloaded previously, etc.
3649 	 */
3650 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3651 		if (adev->gmc.xgmi.num_physical_nodes) {
3652 			dev_info(adev->dev, "Pending hive reset.\n");
3653 			adev->gmc.xgmi.pending_reset = true;
3654 			/* Only need to init necessary block for SMU to handle the reset */
3655 			for (i = 0; i < adev->num_ip_blocks; i++) {
3656 				if (!adev->ip_blocks[i].status.valid)
3657 					continue;
3658 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3659 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3660 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3661 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3662 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3663 						adev->ip_blocks[i].version->funcs->name);
3664 					adev->ip_blocks[i].status.hw = true;
3665 				}
3666 			}
3667 		} else {
3668 			r = amdgpu_asic_reset(adev);
3669 			if (r) {
3670 				dev_err(adev->dev, "asic reset on init failed\n");
3671 				goto failed;
3672 			}
3673 		}
3674 	}
3675 
3676 	pci_enable_pcie_error_reporting(adev->pdev);
3677 
3678 	/* Post card if necessary */
3679 	if (amdgpu_device_need_post(adev)) {
3680 		if (!adev->bios) {
3681 			dev_err(adev->dev, "no vBIOS found\n");
3682 			r = -EINVAL;
3683 			goto failed;
3684 		}
3685 		DRM_INFO("GPU posting now...\n");
3686 		r = amdgpu_device_asic_init(adev);
3687 		if (r) {
3688 			dev_err(adev->dev, "gpu post error!\n");
3689 			goto failed;
3690 		}
3691 	}
3692 
3693 	if (adev->is_atom_fw) {
3694 		/* Initialize clocks */
3695 		r = amdgpu_atomfirmware_get_clock_info(adev);
3696 		if (r) {
3697 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3698 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3699 			goto failed;
3700 		}
3701 	} else {
3702 		/* Initialize clocks */
3703 		r = amdgpu_atombios_get_clock_info(adev);
3704 		if (r) {
3705 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3706 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3707 			goto failed;
3708 		}
3709 		/* init i2c buses */
3710 		if (!amdgpu_device_has_dc_support(adev))
3711 			amdgpu_atombios_i2c_init(adev);
3712 	}
3713 
3714 fence_driver_init:
3715 	/* Fence driver */
3716 	r = amdgpu_fence_driver_sw_init(adev);
3717 	if (r) {
3718 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3719 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3720 		goto failed;
3721 	}
3722 
3723 	/* init the mode config */
3724 	drm_mode_config_init(adev_to_drm(adev));
3725 
3726 	r = amdgpu_device_ip_init(adev);
3727 	if (r) {
3728 		/* failed in exclusive mode due to timeout */
3729 		if (amdgpu_sriov_vf(adev) &&
3730 		    !amdgpu_sriov_runtime(adev) &&
3731 		    amdgpu_virt_mmio_blocked(adev) &&
3732 		    !amdgpu_virt_wait_reset(adev)) {
3733 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3734 			/* Don't send request since VF is inactive. */
3735 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3736 			adev->virt.ops = NULL;
3737 			r = -EAGAIN;
3738 			goto release_ras_con;
3739 		}
3740 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3741 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3742 		goto release_ras_con;
3743 	}
3744 
3745 	amdgpu_fence_driver_hw_init(adev);
3746 
3747 	dev_info(adev->dev,
3748 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3749 			adev->gfx.config.max_shader_engines,
3750 			adev->gfx.config.max_sh_per_se,
3751 			adev->gfx.config.max_cu_per_sh,
3752 			adev->gfx.cu_info.number);
3753 
3754 #ifdef __OpenBSD__
3755 {
3756 	const char *chip_name;
3757 
3758 	switch (adev->asic_type) {
3759 	case CHIP_RAVEN:
3760 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
3761 			chip_name = "RAVEN2";
3762 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
3763 			chip_name = "PICASSO";
3764 		else
3765 			chip_name = "RAVEN";
3766 		break;
3767 	case CHIP_RENOIR:
3768 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
3769 			chip_name = "RENOIR";
3770 		else
3771 			chip_name = "GREEN_SARDINE";
3772 		break;
3773 	default:
3774 		chip_name = amdgpu_asic_name[adev->asic_type];
3775 	}
3776 	printf("%s: %s %d CU rev 0x%02x\n", adev->self.dv_xname,
3777 	    chip_name, adev->gfx.cu_info.number, adev->rev_id);
3778 }
3779 #endif
3780 
3781 	adev->accel_working = true;
3782 
3783 	amdgpu_vm_check_compute_bug(adev);
3784 
3785 	/* Initialize the buffer migration limit. */
3786 	if (amdgpu_moverate >= 0)
3787 		max_MBps = amdgpu_moverate;
3788 	else
3789 		max_MBps = 8; /* Allow 8 MB/s. */
3790 	/* Get a log2 for easy divisions. */
3791 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3792 
3793 	amdgpu_fbdev_init(adev);
3794 
3795 	r = amdgpu_pm_sysfs_init(adev);
3796 	if (r) {
3797 		adev->pm_sysfs_en = false;
3798 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3799 	} else
3800 		adev->pm_sysfs_en = true;
3801 
3802 	r = amdgpu_ucode_sysfs_init(adev);
3803 	if (r) {
3804 		adev->ucode_sysfs_en = false;
3805 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3806 	} else
3807 		adev->ucode_sysfs_en = true;
3808 
3809 	if ((amdgpu_testing & 1)) {
3810 		if (adev->accel_working)
3811 			amdgpu_test_moves(adev);
3812 		else
3813 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3814 	}
3815 	if (amdgpu_benchmarking) {
3816 		if (adev->accel_working)
3817 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3818 		else
3819 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3820 	}
3821 
3822 	/*
3823 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3824 	 * Otherwise the mgpu fan boost feature will be skipped because the
3825 	 * gpu instance count would be too low.
3826 	 */
3827 	amdgpu_register_gpu_instance(adev);
3828 
3829 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3830 	 * explicit gating rather than handling it automatically.
3831 	 */
3832 	if (!adev->gmc.xgmi.pending_reset) {
3833 		r = amdgpu_device_ip_late_init(adev);
3834 		if (r) {
3835 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3836 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3837 			goto release_ras_con;
3838 		}
3839 		/* must succeed. */
3840 		amdgpu_ras_resume(adev);
3841 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3842 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3843 	}
3844 
3845 	if (amdgpu_sriov_vf(adev))
3846 		flush_delayed_work(&adev->delayed_init_work);
3847 
3848 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3849 	if (r)
3850 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3851 
3852 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3853 		r = amdgpu_pmu_init(adev);
3854 		if (r)
3855 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
3856 
3857 	/* Keep the stored PCI config space at hand so it can be restored after a sudden PCI error */
3858 	if (amdgpu_device_cache_pci_state(adev->pdev))
3859 		pci_restore_state(pdev);
3860 
3861 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3862 	/* this will fail for cards that aren't VGA class devices, just
3863 	 * ignore it */
3864 #ifdef notyet
3865 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3866 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3867 #endif
3868 
3869 	if (amdgpu_device_supports_px(ddev)) {
3870 		px = true;
3871 		vga_switcheroo_register_client(adev->pdev,
3872 					       &amdgpu_switcheroo_ops, px);
3873 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3874 	}
3875 
3876 	if (adev->gmc.xgmi.pending_reset)
3877 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3878 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3879 
3880 	return 0;
3881 
3882 release_ras_con:
3883 	amdgpu_release_ras_context(adev);
3884 
3885 failed:
3886 	amdgpu_vf_error_trans_all(adev);
3887 
3888 	return r;
3889 }
3890 
3891 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3892 {
3893 	STUB();
3894 #ifdef notyet
3895 	/* Clear all CPU mappings pointing to this device */
3896 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3897 #endif
3898 
3899 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
3900 	amdgpu_device_doorbell_fini(adev);
3901 
3902 #ifdef __linux__
3903 	iounmap(adev->rmmio);
3904 	adev->rmmio = NULL;
3905 	if (adev->mman.aper_base_kaddr)
3906 		iounmap(adev->mman.aper_base_kaddr);
3907 	adev->mman.aper_base_kaddr = NULL;
3908 #else
3909 	if (adev->rmmio_size > 0)
3910 		bus_space_unmap(adev->rmmio_bst, adev->rmmio_bsh,
3911 		    adev->rmmio_size);
3912 	adev->rmmio_size = 0;
3913 	adev->rmmio = NULL;
3914 	if (adev->mman.aper_base_kaddr)
3915 		bus_space_unmap(adev->memt, adev->mman.aper_bsh,
3916 		    adev->gmc.visible_vram_size);
3917 	adev->mman.aper_base_kaddr = NULL;
3918 #endif
3919 
3920 	/* Memory manager related */
3921 	if (!adev->gmc.xgmi.connected_to_cpu) {
3922 #ifdef __linux__
3923 		arch_phys_wc_del(adev->gmc.vram_mtrr);
3924 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3925 #else
3926 		drm_mtrr_del(0, adev->gmc.aper_base, adev->gmc.aper_size, DRM_MTRR_WC);
3927 #endif
3928 	}
3929 }
3930 
3931 /**
3932  * amdgpu_device_fini_hw - tear down the driver
3933  *
3934  * @adev: amdgpu_device pointer
3935  *
3936  * Tear down the driver info (all asics).
3937  * Called at driver shutdown.
3938  */
3939 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3940 {
3941 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3942 	flush_delayed_work(&adev->delayed_init_work);
3943 	if (adev->mman.initialized) {
3944 		flush_delayed_work(&adev->mman.bdev.wq);
3945 		ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3946 	}
3947 	adev->shutdown = true;
3948 
3949 	/* make sure the IB test has finished before entering exclusive mode
3950 	 * to avoid preemption during the IB test
3951 	 */
3952 	if (amdgpu_sriov_vf(adev)) {
3953 		amdgpu_virt_request_full_gpu(adev, false);
3954 		amdgpu_virt_fini_data_exchange(adev);
3955 	}
3956 
3957 	/* disable all interrupts */
3958 	amdgpu_irq_disable_all(adev);
3959 	if (adev->mode_info.mode_config_initialized){
3960 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3961 			drm_helper_force_disable_all(adev_to_drm(adev));
3962 		else
3963 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3964 	}
3965 	amdgpu_fence_driver_hw_fini(adev);
3966 
3967 	if (adev->pm_sysfs_en)
3968 		amdgpu_pm_sysfs_fini(adev);
3969 	if (adev->ucode_sysfs_en)
3970 		amdgpu_ucode_sysfs_fini(adev);
3971 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3972 
3973 	amdgpu_fbdev_fini(adev);
3974 
3975 	amdgpu_irq_fini_hw(adev);
3976 
3977 	amdgpu_device_ip_fini_early(adev);
3978 
3979 	amdgpu_gart_dummy_page_fini(adev);
3980 
3981 	amdgpu_device_unmap_mmio(adev);
3982 }
3983 
3984 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3985 {
3986 	amdgpu_fence_driver_sw_fini(adev);
3987 	amdgpu_device_ip_fini(adev);
3988 	release_firmware(adev->firmware.gpu_info_fw);
3989 	adev->firmware.gpu_info_fw = NULL;
3990 	adev->accel_working = false;
3991 
3992 	amdgpu_reset_fini(adev);
3993 
3994 	/* free i2c buses */
3995 	if (!amdgpu_device_has_dc_support(adev))
3996 		amdgpu_i2c_fini(adev);
3997 
3998 	if (amdgpu_emu_mode != 1)
3999 		amdgpu_atombios_fini(adev);
4000 
4001 	kfree(adev->bios);
4002 	adev->bios = NULL;
4003 	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4004 		vga_switcheroo_unregister_client(adev->pdev);
4005 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4006 	}
4007 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4008 		vga_client_unregister(adev->pdev);
4009 
4010 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4011 		amdgpu_pmu_fini(adev);
4012 	if (adev->mman.discovery_bin)
4013 		amdgpu_discovery_fini(adev);
4014 
4015 	kfree(adev->pci_state);
4016 
4017 }
4018 
4019 
4020 /*
4021  * Suspend & resume.
4022  */
4023 /**
4024  * amdgpu_device_suspend - initiate device suspend
4025  *
4026  * @dev: drm dev pointer
4027  * @fbcon: notify the fbdev of suspend
4028  *
4029  * Puts the hw in the suspend state (all asics).
4030  * Returns 0 for success or an error on failure.
4031  * Called at driver suspend.
4032  */
4033 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4034 {
4035 	struct amdgpu_device *adev = drm_to_adev(dev);
4036 
4037 	if (adev->shutdown)
4038 		return 0;
4039 
4040 #ifdef notyet
4041 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4042 		return 0;
4043 #endif
4044 
4045 	adev->in_suspend = true;
4046 
4047 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4048 		DRM_WARN("smart shift update failed\n");
4049 
4050 	drm_kms_helper_poll_disable(dev);
4051 
4052 	if (fbcon)
4053 		amdgpu_fbdev_set_suspend(adev, 1);
4054 
4055 	cancel_delayed_work_sync(&adev->delayed_init_work);
4056 
4057 	amdgpu_ras_suspend(adev);
4058 
4059 	amdgpu_device_ip_suspend_phase1(adev);
4060 
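	/* KFD is left running across s0ix; only suspend it for a full suspend */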
4061 	if (!adev->in_s0ix)
4062 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4063 
4064 	/* evict vram memory */
4065 	amdgpu_bo_evict_vram(adev);
4066 
4067 	amdgpu_fence_driver_hw_fini(adev);
4068 
4069 	amdgpu_device_ip_suspend_phase2(adev);
4070 	/* evict remaining vram memory
4071 	 * This second call to evict vram is to evict the gart page table
4072 	 * using the CPU.
4073 	 */
4074 	amdgpu_bo_evict_vram(adev);
4075 
4076 	return 0;
4077 }
4078 
4079 /**
4080  * amdgpu_device_resume - initiate device resume
4081  *
4082  * @dev: drm dev pointer
4083  * @fbcon: notify the fbdev of resume
4084  *
4085  * Bring the hw back to operating state (all asics).
4086  * Returns 0 for success or an error on failure.
4087  * Called at driver resume.
4088  */
4089 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4090 {
4091 	struct amdgpu_device *adev = drm_to_adev(dev);
4092 	int r = 0;
4093 
4094 #ifdef notyet
4095 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4096 		return 0;
4097 #endif
4098 
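	/* coming out of s0ix: report the D0-entry GFX state change */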
4099 	if (adev->in_s0ix)
4100 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
4101 
4102 	/* post card */
4103 	if (amdgpu_device_need_post(adev)) {
4104 		r = amdgpu_device_asic_init(adev);
4105 		if (r)
4106 			dev_err(adev->dev, "amdgpu asic init failed\n");
4107 	}
4108 
4109 	r = amdgpu_device_ip_resume(adev);
4110 	if (r) {
4111 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4112 		return r;
4113 	}
4114 	amdgpu_fence_driver_hw_init(adev);
4115 
4116 	r = amdgpu_device_ip_late_init(adev);
4117 	if (r)
4118 		return r;
4119 
4120 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4121 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4122 
4123 	if (!adev->in_s0ix) {
4124 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4125 		if (r)
4126 			return r;
4127 	}
4128 
4129 	/* Make sure IB tests flushed */
4130 	flush_delayed_work(&adev->delayed_init_work);
4131 
4132 	if (fbcon)
4133 		amdgpu_fbdev_set_suspend(adev, 0);
4134 
4135 	drm_kms_helper_poll_enable(dev);
4136 
4137 	amdgpu_ras_resume(adev);
4138 
4139 	/*
4140 	 * Most of the connector probing functions try to acquire runtime pm
4141 	 * refs to ensure that the GPU is powered on when connector polling is
4142 	 * performed. Since we're calling this from a runtime PM callback,
4143 	 * trying to acquire rpm refs will cause us to deadlock.
4144 	 *
4145 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
4146 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
4147 	 */
4148 #if defined(CONFIG_PM) && defined(__linux__)
4149 	dev->dev->power.disable_depth++;
4150 #endif
4151 	if (!amdgpu_device_has_dc_support(adev))
4152 		drm_helper_hpd_irq_event(dev);
4153 	else
4154 		drm_kms_helper_hotplug_event(dev);
4155 #if defined(CONFIG_PM) && defined(__linux__)
4156 	dev->dev->power.disable_depth--;
4157 #endif
4158 	adev->in_suspend = false;
4159 
4160 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4161 		DRM_WARN("smart shift update failed\n");
4162 
4163 	return 0;
4164 }
4165 
4166 /**
4167  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4168  *
4169  * @adev: amdgpu_device pointer
4170  *
4171  * The list of all the hardware IPs that make up the asic is walked and
4172  * the check_soft_reset callbacks are run.  check_soft_reset determines
4173  * if the asic is still hung or not.
4174  * Returns true if any of the IPs are still in a hung state, false if not.
4175  */
4176 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4177 {
4178 	int i;
4179 	bool asic_hang = false;
4180 
4181 	if (amdgpu_sriov_vf(adev))
4182 		return true;
4183 
4184 	if (amdgpu_asic_need_full_reset(adev))
4185 		return true;
4186 
4187 	for (i = 0; i < adev->num_ip_blocks; i++) {
4188 		if (!adev->ip_blocks[i].status.valid)
4189 			continue;
4190 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4191 			adev->ip_blocks[i].status.hang =
4192 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4193 		if (adev->ip_blocks[i].status.hang) {
4194 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4195 			asic_hang = true;
4196 		}
4197 	}
4198 	return asic_hang;
4199 }
4200 
4201 /**
4202  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4203  *
4204  * @adev: amdgpu_device pointer
4205  *
4206  * The list of all the hardware IPs that make up the asic is walked and the
4207  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4208  * handles any IP specific hardware or software state changes that are
4209  * necessary for a soft reset to succeed.
4210  * Returns 0 on success, negative error code on failure.
4211  */
4212 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4213 {
4214 	int i, r = 0;
4215 
4216 	for (i = 0; i < adev->num_ip_blocks; i++) {
4217 		if (!adev->ip_blocks[i].status.valid)
4218 			continue;
4219 		if (adev->ip_blocks[i].status.hang &&
4220 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4221 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4222 			if (r)
4223 				return r;
4224 		}
4225 	}
4226 
4227 	return 0;
4228 }
4229 
4230 /**
4231  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4232  *
4233  * @adev: amdgpu_device pointer
4234  *
4235  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4236  * reset is necessary to recover.
4237  * Returns true if a full asic reset is required, false if not.
4238  */
4239 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4240 {
4241 	int i;
4242 
4243 	if (amdgpu_asic_need_full_reset(adev))
4244 		return true;
4245 
4246 	for (i = 0; i < adev->num_ip_blocks; i++) {
4247 		if (!adev->ip_blocks[i].status.valid)
4248 			continue;
4249 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4250 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4251 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4252 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4253 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4254 			if (adev->ip_blocks[i].status.hang) {
4255 				dev_info(adev->dev, "Some blocks need a full reset!\n");
4256 				return true;
4257 			}
4258 		}
4259 	}
4260 	return false;
4261 }
4262 
4263 /**
4264  * amdgpu_device_ip_soft_reset - do a soft reset
4265  *
4266  * @adev: amdgpu_device pointer
4267  *
4268  * The list of all the hardware IPs that make up the asic is walked and the
4269  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4270  * IP specific hardware or software state changes that are necessary to soft
4271  * reset the IP.
4272  * Returns 0 on success, negative error code on failure.
4273  */
4274 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4275 {
4276 	int i, r = 0;
4277 
4278 	for (i = 0; i < adev->num_ip_blocks; i++) {
4279 		if (!adev->ip_blocks[i].status.valid)
4280 			continue;
4281 		if (adev->ip_blocks[i].status.hang &&
4282 		    adev->ip_blocks[i].version->funcs->soft_reset) {
4283 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4284 			if (r)
4285 				return r;
4286 		}
4287 	}
4288 
4289 	return 0;
4290 }
4291 
4292 /**
4293  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4294  *
4295  * @adev: amdgpu_device pointer
4296  *
4297  * The list of all the hardware IPs that make up the asic is walked and the
4298  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4299  * handles any IP specific hardware or software state changes that are
4300  * necessary after the IP has been soft reset.
4301  * Returns 0 on success, negative error code on failure.
4302  */
4303 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4304 {
4305 	int i, r = 0;
4306 
4307 	for (i = 0; i < adev->num_ip_blocks; i++) {
4308 		if (!adev->ip_blocks[i].status.valid)
4309 			continue;
4310 		if (adev->ip_blocks[i].status.hang &&
4311 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4312 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4313 		if (r)
4314 			return r;
4315 	}
4316 
4317 	return 0;
4318 }
4319 
4320 /**
4321  * amdgpu_device_recover_vram - Recover some VRAM contents
4322  *
4323  * @adev: amdgpu_device pointer
4324  *
4325  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4326  * restore things like GPUVM page tables after a GPU reset where
4327  * the contents of VRAM might be lost.
4328  *
4329  * Returns:
4330  * 0 on success, negative error code on failure.
4331  */
4332 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4333 {
4334 	struct dma_fence *fence = NULL, *next = NULL;
4335 	struct amdgpu_bo *shadow;
4336 	struct amdgpu_bo_vm *vmbo;
4337 	long r = 1, tmo;
4338 
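	/* allow a longer wait for the restore fences when running under SR-IOV */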
4339 	if (amdgpu_sriov_runtime(adev))
4340 		tmo = msecs_to_jiffies(8000);
4341 	else
4342 		tmo = msecs_to_jiffies(100);
4343 
4344 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4345 	mutex_lock(&adev->shadow_list_lock);
4346 	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4347 		shadow = &vmbo->bo;
4348 		/* No need to recover an evicted BO */
4349 		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4350 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4351 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4352 			continue;
4353 
4354 		r = amdgpu_bo_restore_shadow(shadow, &next);
4355 		if (r)
4356 			break;
4357 
4358 		if (fence) {
4359 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4360 			dma_fence_put(fence);
4361 			fence = next;
4362 			if (tmo == 0) {
4363 				r = -ETIMEDOUT;
4364 				break;
4365 			} else if (tmo < 0) {
4366 				r = tmo;
4367 				break;
4368 			}
4369 		} else {
4370 			fence = next;
4371 		}
4372 	}
4373 	mutex_unlock(&adev->shadow_list_lock);
4374 
4375 	if (fence)
4376 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4377 	dma_fence_put(fence);
4378 
4379 	if (r < 0 || tmo <= 0) {
4380 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4381 		return -EIO;
4382 	}
4383 
4384 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4385 	return 0;
4386 }
4387 
4388 
4389 /**
4390  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4391  *
4392  * @adev: amdgpu_device pointer
4393  * @from_hypervisor: request from hypervisor
4394  *
4395  * do a VF FLR and reinitialize the ASIC.
4396  * Returns 0 on success, a negative error code on failure.
4397  */
4398 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4399 				     bool from_hypervisor)
4400 {
4401 	int r;
4402 
4403 	if (from_hypervisor)
4404 		r = amdgpu_virt_request_full_gpu(adev, true);
4405 	else
4406 		r = amdgpu_virt_reset_gpu(adev);
4407 	if (r)
4408 		return r;
4409 
4410 	amdgpu_amdkfd_pre_reset(adev);
4411 
4412 	/* Resume IP prior to SMC */
4413 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4414 	if (r)
4415 		goto error;
4416 
4417 	amdgpu_virt_init_data_exchange(adev);
4418 	/* we need to recover the GART before running SMC/CP/SDMA resume */
4419 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4420 
4421 	r = amdgpu_device_fw_loading(adev);
4422 	if (r)
4423 		return r;
4424 
4425 	/* now we are okay to resume SMC/CP/SDMA */
4426 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4427 	if (r)
4428 		goto error;
4429 
4430 	amdgpu_irq_gpu_reset_resume_helper(adev);
4431 	r = amdgpu_ib_ring_tests(adev);
4432 	amdgpu_amdkfd_post_reset(adev);
4433 
4434 error:
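	/* if the host indicates VRAM was lost during the FLR, count it and restore buffers from their GTT shadows */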
4435 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4436 		amdgpu_inc_vram_lost(adev);
4437 		r = amdgpu_device_recover_vram(adev);
4438 	}
4439 	amdgpu_virt_release_full_gpu(adev, true);
4440 
4441 	return r;
4442 }
4443 
4444 /**
4445  * amdgpu_device_has_job_running - check if there is any job in the pending list
4446  *
4447  * @adev: amdgpu_device pointer
4448  *
4449  * check if there is any job in the pending list
4450  */
4451 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4452 {
4453 	int i;
4454 	struct drm_sched_job *job;
4455 
4456 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4457 		struct amdgpu_ring *ring = adev->rings[i];
4458 
4459 		if (!ring || !ring->sched.thread)
4460 			continue;
4461 
4462 		spin_lock(&ring->sched.job_list_lock);
4463 		job = list_first_entry_or_null(&ring->sched.pending_list,
4464 					       struct drm_sched_job, list);
4465 		spin_unlock(&ring->sched.job_list_lock);
4466 		if (job)
4467 			return true;
4468 	}
4469 	return false;
4470 }
4471 
4472 /**
4473  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4474  *
4475  * @adev: amdgpu_device pointer
4476  *
4477  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4478  * a hung GPU.
4479  */
4480 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4481 {
4482 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4483 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4484 		return false;
4485 	}
4486 
4487 	if (amdgpu_gpu_recovery == 0)
4488 		goto disabled;
4489 
4490 	if (amdgpu_sriov_vf(adev))
4491 		return true;
4492 
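	/* -1 (auto) only enables recovery for the ASICs listed below */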
4493 	if (amdgpu_gpu_recovery == -1) {
4494 		switch (adev->asic_type) {
4495 		case CHIP_BONAIRE:
4496 		case CHIP_HAWAII:
4497 		case CHIP_TOPAZ:
4498 		case CHIP_TONGA:
4499 		case CHIP_FIJI:
4500 		case CHIP_POLARIS10:
4501 		case CHIP_POLARIS11:
4502 		case CHIP_POLARIS12:
4503 		case CHIP_VEGAM:
4504 		case CHIP_VEGA20:
4505 		case CHIP_VEGA10:
4506 		case CHIP_VEGA12:
4507 		case CHIP_RAVEN:
4508 		case CHIP_ARCTURUS:
4509 		case CHIP_RENOIR:
4510 		case CHIP_NAVI10:
4511 		case CHIP_NAVI14:
4512 		case CHIP_NAVI12:
4513 		case CHIP_SIENNA_CICHLID:
4514 		case CHIP_NAVY_FLOUNDER:
4515 		case CHIP_DIMGREY_CAVEFISH:
4516 		case CHIP_BEIGE_GOBY:
4517 		case CHIP_VANGOGH:
4518 		case CHIP_ALDEBARAN:
4519 			break;
4520 		default:
4521 			goto disabled;
4522 		}
4523 	}
4524 
4525 	return true;
4526 
4527 disabled:
4528 		dev_info(adev->dev, "GPU recovery disabled.\n");
4529 		return false;
4530 }
4531 
4532 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4533 {
4534 	u32 i;
4535 	int ret = 0;
4536 
4537 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4538 
4539 	dev_info(adev->dev, "GPU mode1 reset\n");
4540 
4541 	/* disable BM */
4542 	pci_clear_master(adev->pdev);
4543 
4544 	amdgpu_device_cache_pci_state(adev->pdev);
4545 
4546 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4547 		dev_info(adev->dev, "GPU smu mode1 reset\n");
4548 		ret = amdgpu_dpm_mode1_reset(adev);
4549 	} else {
4550 		dev_info(adev->dev, "GPU psp mode1 reset\n");
4551 		ret = psp_gpu_reset(adev);
4552 	}
4553 
4554 	if (ret)
4555 		dev_err(adev->dev, "GPU mode1 reset failed\n");
4556 
4557 	amdgpu_device_load_pci_state(adev->pdev);
4558 
4559 	/* wait for asic to come out of reset */
4560 	for (i = 0; i < adev->usec_timeout; i++) {
4561 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
4562 
4563 		if (memsize != 0xffffffff)
4564 			break;
4565 		udelay(1);
4566 	}
4567 
4568 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4569 	return ret;
4570 }
4571 
4572 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4573 				 struct amdgpu_reset_context *reset_context)
4574 {
4575 	int i, j, r = 0;
4576 	struct amdgpu_job *job = NULL;
4577 	bool need_full_reset =
4578 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4579 
4580 	if (reset_context->reset_req_dev == adev)
4581 		job = reset_context->job;
4582 
4583 	if (amdgpu_sriov_vf(adev)) {
4584 		/* stop the data exchange thread */
4585 		amdgpu_virt_fini_data_exchange(adev);
4586 	}
4587 
4588 	/* block all schedulers and reset given job's ring */
4589 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4590 		struct amdgpu_ring *ring = adev->rings[i];
4591 
4592 		if (!ring || !ring->sched.thread)
4593 			continue;
4594 
4595 		/*clear job fence from fence drv to avoid force_completion
4596 		/* clear the job fences from the fence driver to avoid force_completion;
4597 		 * leave NULL and VM flush fences in the fence driver */
4598 			struct dma_fence *old, **ptr;
4599 
4600 			ptr = &ring->fence_drv.fences[j];
4601 			old = rcu_dereference_protected(*ptr, 1);
4602 			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
4603 				RCU_INIT_POINTER(*ptr, NULL);
4604 			}
4605 		}
4606 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4607 		amdgpu_fence_driver_force_completion(ring);
4608 	}
4609 
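	/* increase the karma of the job that triggered this reset */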
4610 	if (job && job->vm)
4611 		drm_sched_increase_karma(&job->base);
4612 
4613 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4614 	/* If reset handler not implemented, continue; otherwise return */
4615 	if (r == -ENOSYS)
4616 		r = 0;
4617 	else
4618 		return r;
4619 
4620 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4621 	if (!amdgpu_sriov_vf(adev)) {
4622 
4623 		if (!need_full_reset)
4624 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4625 
4626 		if (!need_full_reset) {
4627 			amdgpu_device_ip_pre_soft_reset(adev);
4628 			r = amdgpu_device_ip_soft_reset(adev);
4629 			amdgpu_device_ip_post_soft_reset(adev);
4630 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4631 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4632 				need_full_reset = true;
4633 			}
4634 		}
4635 
4636 		if (need_full_reset)
4637 			r = amdgpu_device_ip_suspend(adev);
4638 		if (need_full_reset)
4639 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4640 		else
4641 			clear_bit(AMDGPU_NEED_FULL_RESET,
4642 				  &reset_context->flags);
4643 	}
4644 
4645 	return r;
4646 }
4647 
4648 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4649 			 struct amdgpu_reset_context *reset_context)
4650 {
4651 	struct amdgpu_device *tmp_adev = NULL;
4652 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4653 	int r = 0;
4654 
4655 	/* Try reset handler method first */
4656 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4657 				    reset_list);
4658 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4659 	/* If reset handler not implemented, continue; otherwise return */
4660 	if (r == -ENOSYS)
4661 		r = 0;
4662 	else
4663 		return r;
4664 
4665 	/* Reset handler not implemented, use the default method */
4666 	need_full_reset =
4667 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4668 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4669 
4670 	/*
4671 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4672 	 * to allow proper links negotiation in FW (within 1 sec)
4673 	 */
4674 	if (!skip_hw_reset && need_full_reset) {
4675 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4676 			/* For XGMI run all resets in parallel to speed up the process */
4677 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4678 				tmp_adev->gmc.xgmi.pending_reset = false;
4679 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4680 					r = -EALREADY;
4681 			} else
4682 				r = amdgpu_asic_reset(tmp_adev);
4683 
4684 			if (r) {
4685 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4686 					 r, adev_to_drm(tmp_adev)->unique);
4687 				break;
4688 			}
4689 		}
4690 
4691 		/* For XGMI wait for all resets to complete before proceed */
4692 		if (!r) {
4693 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4694 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4695 					flush_work(&tmp_adev->xgmi_reset_work);
4696 					r = tmp_adev->asic_reset_res;
4697 					if (r)
4698 						break;
4699 				}
4700 			}
4701 		}
4702 	}
4703 
4704 	if (!r && amdgpu_ras_intr_triggered()) {
4705 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4706 			if (tmp_adev->mmhub.ras_funcs &&
4707 			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4708 				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4709 		}
4710 
4711 		amdgpu_ras_intr_cleared();
4712 	}
4713 
4714 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4715 		if (need_full_reset) {
4716 			/* post card */
4717 			r = amdgpu_device_asic_init(tmp_adev);
4718 			if (r) {
4719 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4720 			} else {
4721 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4722 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4723 				if (r)
4724 					goto out;
4725 
4726 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4727 				if (r)
4728 					goto out;
4729 
4730 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4731 				if (vram_lost) {
4732 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4733 					amdgpu_inc_vram_lost(tmp_adev);
4734 				}
4735 
4736 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4737 				if (r)
4738 					goto out;
4739 
4740 				r = amdgpu_device_fw_loading(tmp_adev);
4741 				if (r)
4742 					return r;
4743 
4744 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4745 				if (r)
4746 					goto out;
4747 
4748 				if (vram_lost)
4749 					amdgpu_device_fill_reset_magic(tmp_adev);
4750 
4751 				/*
4752 				 * Add this ASIC back as tracked since the reset
4753 				 * has already completed successfully.
4754 				 */
4755 				amdgpu_register_gpu_instance(tmp_adev);
4756 
4757 				if (!reset_context->hive &&
4758 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4759 					amdgpu_xgmi_add_device(tmp_adev);
4760 
4761 				r = amdgpu_device_ip_late_init(tmp_adev);
4762 				if (r)
4763 					goto out;
4764 
4765 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4766 
4767 				/*
4768 				 * The GPU enters a bad state once the number of faulty
4769 				 * pages caught by ECC reaches the threshold, and RAS
4770 				 * recovery is scheduled next. So add a check here to
4771 				 * break out of recovery if the bad page threshold has
4772 				 * indeed been exceeded, and remind the user to either
4773 				 * retire this GPU or set a bigger bad_page_threshold
4774 				 * value so this is fixed the next time the driver is
4775 				 * probed.
4776 				 */
4777 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4778 					/* must succeed. */
4779 					amdgpu_ras_resume(tmp_adev);
4780 				} else {
4781 					r = -EINVAL;
4782 					goto out;
4783 				}
4784 
4785 				/* Update PSP FW topology after reset */
4786 				if (reset_context->hive &&
4787 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4788 					r = amdgpu_xgmi_update_topology(
4789 						reset_context->hive, tmp_adev);
4790 			}
4791 		}
4792 
4793 out:
4794 		if (!r) {
4795 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4796 			r = amdgpu_ib_ring_tests(tmp_adev);
4797 			if (r) {
4798 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4799 				need_full_reset = true;
4800 				r = -EAGAIN;
4801 				goto end;
4802 			}
4803 		}
4804 
4805 		if (!r)
4806 			r = amdgpu_device_recover_vram(tmp_adev);
4807 		else
4808 			tmp_adev->asic_reset_res = r;
4809 	}
4810 
4811 end:
4812 	if (need_full_reset)
4813 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4814 	else
4815 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4816 	return r;
4817 }
4818 
4819 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4820 				struct amdgpu_hive_info *hive)
4821 {
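	/* a non-zero value means another reset already owns this device */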
4822 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4823 		return false;
4824 
4825 	if (hive) {
4826 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4827 	} else {
4828 		down_write(&adev->reset_sem);
4829 	}
4830 
4831 	switch (amdgpu_asic_reset_method(adev)) {
4832 	case AMD_RESET_METHOD_MODE1:
4833 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4834 		break;
4835 	case AMD_RESET_METHOD_MODE2:
4836 		adev->mp1_state = PP_MP1_STATE_RESET;
4837 		break;
4838 	default:
4839 		adev->mp1_state = PP_MP1_STATE_NONE;
4840 		break;
4841 	}
4842 
4843 	return true;
4844 }
4845 
4846 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4847 {
4848 	amdgpu_vf_error_trans_all(adev);
4849 	adev->mp1_state = PP_MP1_STATE_NONE;
4850 	atomic_set(&adev->in_gpu_reset, 0);
4851 	up_write(&adev->reset_sem);
4852 }
4853 
4854 /*
4855  * Lock a list of amdgpu devices in a hive safely.  If this is not a hive
4856  * with multiple nodes, it behaves the same as amdgpu_device_lock_adev.
4857  *
4858  * Unlocking never requires a roll back.
4859  */
4860 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4861 {
4862 	struct amdgpu_device *tmp_adev = NULL;
4863 
4864 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4865 		if (!hive) {
4866 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4867 			return -ENODEV;
4868 		}
4869 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4870 			if (!amdgpu_device_lock_adev(tmp_adev, hive))
4871 				goto roll_back;
4872 		}
4873 	} else if (!amdgpu_device_lock_adev(adev, hive))
4874 		return -EAGAIN;
4875 
4876 	return 0;
4877 roll_back:
4878 	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4879 		/*
4880 		 * If the lock iteration broke off in the middle of a hive,
4881 		 * there may be a race issue, or a hive device may have
4882 		 * locked up independently.  We may or may not be in
4883 		 * trouble, so try to roll back the locks taken so far and
4884 		 * give out a warning.
4885 		 */
4886 		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4887 		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4888 			amdgpu_device_unlock_adev(tmp_adev);
4889 		}
4890 	}
4891 	return -EAGAIN;
4892 }
4893 
4894 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4895 {
4896 	STUB();
4897 #ifdef notyet
4898 	struct pci_dev *p = NULL;
4899 
4900 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4901 			adev->pdev->bus->number, 1);
4902 	if (p) {
4903 		pm_runtime_enable(&(p->dev));
4904 		pm_runtime_resume(&(p->dev));
4905 	}
4906 #endif
4907 }
4908 
4909 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4910 {
4911 	enum amd_reset_method reset_method;
4912 	struct pci_dev *p = NULL;
4913 	u64 expires;
4914 
4915 	/*
4916 	 * For now, only BACO and mode1 reset are confirmed
4917 	 * to suffer from the audio issue if the codec is not properly suspended.
4918 	 */
4919 	reset_method = amdgpu_asic_reset_method(adev);
4920 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4921 	     (reset_method != AMD_RESET_METHOD_MODE1))
4922 		return -EINVAL;
4923 
4924 	STUB();
4925 	return -ENOSYS;
4926 #ifdef notyet
4927 
4928 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4929 			adev->pdev->bus->number, 1);
4930 	if (!p)
4931 		return -ENODEV;
4932 
4933 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4934 	if (!expires)
4935 		/*
4936 		 * If we cannot get the audio device autosuspend delay,
4937 		 * a fixed 4s interval is used instead.  Since 3s is the
4938 		 * audio controller's default autosuspend delay setting,
4939 		 * the 4s used here is guaranteed to cover it.
4940 		 */
4941 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4942 
4943 	while (!pm_runtime_status_suspended(&(p->dev))) {
4944 		if (!pm_runtime_suspend(&(p->dev)))
4945 			break;
4946 
4947 		if (expires < ktime_get_mono_fast_ns()) {
4948 			dev_warn(adev->dev, "failed to suspend display audio\n");
4949 			/* TODO: abort the succeeding gpu reset? */
4950 			return -ETIMEDOUT;
4951 		}
4952 	}
4953 
4954 	pm_runtime_disable(&(p->dev));
4955 
4956 	return 0;
4957 #endif
4958 }
4959 
4960 static void amdgpu_device_recheck_guilty_jobs(
4961 	struct amdgpu_device *adev, struct list_head *device_list_handle,
4962 	struct amdgpu_reset_context *reset_context)
4963 {
4964 	int i, r = 0;
4965 
4966 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4967 		struct amdgpu_ring *ring = adev->rings[i];
4968 		int ret = 0;
4969 		struct drm_sched_job *s_job;
4970 
4971 		if (!ring || !ring->sched.thread)
4972 			continue;
4973 
4974 		s_job = list_first_entry_or_null(&ring->sched.pending_list,
4975 				struct drm_sched_job, list);
4976 		if (s_job == NULL)
4977 			continue;
4978 
4979 		/* clear the job's guilty status and rely on the following steps to decide the real one */
4980 		drm_sched_reset_karma(s_job);
4981 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4982 
4983 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4984 		if (ret == 0) { /* timeout */
4985 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4986 						ring->sched.name, s_job->id);
4987 
4988 			/* set guilty */
4989 			drm_sched_increase_karma(s_job);
4990 retry:
4991 			/* do hw reset */
4992 			if (amdgpu_sriov_vf(adev)) {
4993 				amdgpu_virt_fini_data_exchange(adev);
4994 				r = amdgpu_device_reset_sriov(adev, false);
4995 				if (r)
4996 					adev->asic_reset_res = r;
4997 			} else {
4998 				clear_bit(AMDGPU_SKIP_HW_RESET,
4999 					  &reset_context->flags);
5000 				r = amdgpu_do_asic_reset(device_list_handle,
5001 							 reset_context);
5002 				if (r && r == -EAGAIN)
5003 					goto retry;
5004 			}
5005 
5006 			/*
5007 			 * add reset counter so that the following
5008 			 * resubmitted job could flush vmid
5009 			 */
5010 			atomic_inc(&adev->gpu_reset_counter);
5011 			continue;
5012 		}
5013 
5014 		/* got the hw fence, signal finished fence */
5015 		atomic_dec(ring->sched.score);
5016 		dma_fence_get(&s_job->s_fence->finished);
5017 		dma_fence_signal(&s_job->s_fence->finished);
5018 		dma_fence_put(&s_job->s_fence->finished);
5019 
5020 		/* remove node from list and free the job */
5021 		spin_lock(&ring->sched.job_list_lock);
5022 		list_del_init(&s_job->list);
5023 		spin_unlock(&ring->sched.job_list_lock);
5024 		ring->sched.ops->free_job(s_job);
5025 	}
5026 }
5027 
5028 /**
5029  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5030  *
5031  * @adev: amdgpu_device pointer
5032  * @job: which job trigger hang
5033  *
5034  * Attempt to reset the GPU if it has hung (all asics).
5035  * Attempt to do soft-reset or full-reset and reinitialize Asic
5036  * Returns 0 for success or an error on failure.
5037  */
5038 
5039 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5040 			      struct amdgpu_job *job)
5041 {
5042 	struct list_head device_list, *device_list_handle =  NULL;
5043 	bool job_signaled = false;
5044 	struct amdgpu_hive_info *hive = NULL;
5045 	struct amdgpu_device *tmp_adev = NULL;
5046 	int i, r = 0;
5047 	bool need_emergency_restart = false;
5048 	bool audio_suspended = false;
5049 	int tmp_vram_lost_counter;
5050 	struct amdgpu_reset_context reset_context;
5051 
5052 	memset(&reset_context, 0, sizeof(reset_context));
5053 
5054 	/*
5055 	 * Special case: RAS triggered and full reset isn't supported
5056 	 */
5057 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5058 
5059 	/*
5060 	 * Flush RAM to disk so that after reboot
5061 	 * the user can read the log and see why the system rebooted.
5062 	 */
5063 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5064 		DRM_WARN("Emergency reboot.");
5065 
5066 #ifdef notyet
5067 		ksys_sync_helper();
5068 		emergency_restart();
5069 #else
5070 		panic("emergency_restart");
5071 #endif
5072 	}
5073 
5074 	dev_info(adev->dev, "GPU %s begin!\n",
5075 		need_emergency_restart ? "jobs stop":"reset");
5076 
5077 	/*
5078 	 * Here we trylock to avoid a chain of resets executing, triggered
5079 	 * either by jobs on different adevs in an XGMI hive or by jobs on
5080 	 * different schedulers for the same device, while this TO handler is
5081 	 * running.  We always reset all schedulers for a device and all
5082 	 * devices for an XGMI hive, so that should take care of them too.
5083 	 */
5084 	hive = amdgpu_get_xgmi_hive(adev);
5085 	if (hive) {
5086 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
5087 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
5088 				job ? job->base.id : -1, hive->hive_id);
5089 			amdgpu_put_xgmi_hive(hive);
5090 			if (job && job->vm)
5091 				drm_sched_increase_karma(&job->base);
5092 			return 0;
5093 		}
5094 		mutex_lock(&hive->hive_lock);
5095 	}
5096 
5097 	reset_context.method = AMD_RESET_METHOD_NONE;
5098 	reset_context.reset_req_dev = adev;
5099 	reset_context.job = job;
5100 	reset_context.hive = hive;
5101 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5102 
5103 	/*
5104 	 * lock the device before we try to operate on the linked list;
5105 	 * if we didn't get the device lock, don't touch the linked list
5106 	 * since others may be iterating over it.
5107 	 */
5108 	r = amdgpu_device_lock_hive_adev(adev, hive);
5109 	if (r) {
5110 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
5111 					job ? job->base.id : -1);
5112 
5113 		/* even though we skipped this reset, we still need to mark the job as guilty */
5114 		if (job && job->vm)
5115 			drm_sched_increase_karma(&job->base);
5116 		goto skip_recovery;
5117 	}
5118 
5119 	/*
5120 	 * Build list of devices to reset.
5121 	 * In case we are in XGMI hive mode, resort the device list
5122 	 * to put adev in the 1st position.
5123 	 */
5124 	INIT_LIST_HEAD(&device_list);
5125 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5126 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5127 			list_add_tail(&tmp_adev->reset_list, &device_list);
5128 		if (!list_is_first(&adev->reset_list, &device_list))
5129 			list_rotate_to_front(&adev->reset_list, &device_list);
5130 		device_list_handle = &device_list;
5131 	} else {
5132 		list_add_tail(&adev->reset_list, &device_list);
5133 		device_list_handle = &device_list;
5134 	}
5135 
5136 	/* block all schedulers and reset given job's ring */
5137 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5138 		/*
5139 		 * Try to put the audio codec into suspend state
5140 		 * before the gpu reset starts.
5141 		 *
5142 		 * The power domain of the graphics device is shared
5143 		 * with the AZ power domain.  Without this, we may
5144 		 * change the audio hardware from behind the audio
5145 		 * driver's back, which will trigger some audio codec
5146 		 * errors.
5147 		 */
5148 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5149 			audio_suspended = true;
5150 
5151 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5152 
5153 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5154 
5155 		if (!amdgpu_sriov_vf(tmp_adev))
5156 			amdgpu_amdkfd_pre_reset(tmp_adev);
5157 
5158 		/*
5159 		 * Mark these ASICs to be reset as untracked first,
5160 		 * and add them back after the reset has completed.
5161 		 */
5162 		amdgpu_unregister_gpu_instance(tmp_adev);
5163 
5164 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
5165 
5166 		/* disable ras on ALL IPs */
5167 		if (!need_emergency_restart &&
5168 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5169 			amdgpu_ras_suspend(tmp_adev);
5170 
5171 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5172 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5173 
5174 			if (!ring || !ring->sched.thread)
5175 				continue;
5176 
5177 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5178 
5179 			if (need_emergency_restart)
5180 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5181 		}
5182 		atomic_inc(&tmp_adev->gpu_reset_counter);
5183 	}
5184 
5185 	if (need_emergency_restart)
5186 		goto skip_sched_resume;
5187 
5188 	/*
5189 	 * Must check guilty signal here since after this point all old
5190 	 * HW fences are force signaled.
5191 	 *
5192 	 * job->base holds a reference to parent fence
5193 	 */
5194 	if (job && job->base.s_fence->parent &&
5195 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
5196 		job_signaled = true;
5197 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5198 		goto skip_hw_reset;
5199 	}
5200 
5201 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
5202 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5203 		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5204 		/* TODO: Should we stop? */
5205 		if (r) {
5206 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5207 				  r, adev_to_drm(tmp_adev)->unique);
5208 			tmp_adev->asic_reset_res = r;
5209 		}
5210 	}
5211 
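	/* snapshot vram_lost_counter so we can tell below whether this reset actually lost VRAM */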
5212 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5213 	/* Actual ASIC resets if needed.*/
5214 	/* TODO Implement XGMI hive reset logic for SRIOV */
5215 	if (amdgpu_sriov_vf(adev)) {
5216 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
5217 		if (r)
5218 			adev->asic_reset_res = r;
5219 	} else {
5220 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5221 		if (r && r == -EAGAIN)
5222 			goto retry;
5223 	}
5224 
5225 skip_hw_reset:
5226 
5227 	/* Post ASIC reset for all devs. */
5228 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5229 
5230 		/*
5231 		 * Sometimes a later bad compute job can block a good gfx job because the
5232 		 * gfx and compute rings share internal GC hardware.  We add an additional
5233 		 * guilty-job recheck step to find the real guilty job: it synchronously
5234 		 * resubmits and waits for the first job to be signaled.  If that times out,
5235 		 * we identify it as the real guilty job.
5236 		 */
5237 		if (amdgpu_gpu_recovery == 2 &&
5238 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5239 			amdgpu_device_recheck_guilty_jobs(
5240 				tmp_adev, device_list_handle, &reset_context);
5241 
5242 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5243 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5244 
5245 			if (!ring || !ring->sched.thread)
5246 				continue;
5247 
5248 			/* No point in resubmitting jobs if we didn't do a HW reset */
5249 			if (!tmp_adev->asic_reset_res && !job_signaled)
5250 				drm_sched_resubmit_jobs(&ring->sched);
5251 
5252 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5253 		}
5254 
5255 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5256 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5257 		}
5258 
5259 		tmp_adev->asic_reset_res = 0;
5260 
5261 		if (r) {
5262 			/* bad news, how to tell it to userspace ? */
5263 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5264 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5265 		} else {
5266 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5267 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5268 				DRM_WARN("smart shift update failed\n");
5269 		}
5270 	}
5271 
5272 skip_sched_resume:
5273 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5274 		/* unlock kfd: SRIOV would do it separately */
5275 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5276 			amdgpu_amdkfd_post_reset(tmp_adev);
5277 
5278 		/* kfd_post_reset will do nothing if the kfd device is not initialized,
5279 		 * so bring up kfd here if it was not initialized before
5280 		 */
5281 		if (!adev->kfd.init_complete)
5282 			amdgpu_amdkfd_device_init(adev);
5283 
5284 		if (audio_suspended)
5285 			amdgpu_device_resume_display_audio(tmp_adev);
5286 		amdgpu_device_unlock_adev(tmp_adev);
5287 	}
5288 
5289 skip_recovery:
5290 	if (hive) {
5291 		atomic_set(&hive->in_reset, 0);
5292 		mutex_unlock(&hive->hive_lock);
5293 		amdgpu_put_xgmi_hive(hive);
5294 	}
5295 
5296 	if (r && r != -EAGAIN)
5297 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5298 	return r;
5299 }
5300 
5301 /**
5302  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5303  *
5304  * @adev: amdgpu_device pointer
5305  *
5306  * Fetches and stores in the driver the PCIE capabilities (gen speed
5307  * and lanes) of the slot the device is in. Handles APUs and
5308  * virtualized environments where PCIE config space may not be available.
5309  */
5310 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5311 {
5312 	struct pci_dev *pdev;
5313 	enum pci_bus_speed speed_cap, platform_speed_cap;
5314 	enum pcie_link_width platform_link_width;
5315 
5316 	if (amdgpu_pcie_gen_cap)
5317 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5318 
5319 	if (amdgpu_pcie_lane_cap)
5320 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5321 
5322 	/* covers APUs as well */
5323 	if (pci_is_root_bus(adev->pdev->bus)) {
5324 		if (adev->pm.pcie_gen_mask == 0)
5325 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5326 		if (adev->pm.pcie_mlw_mask == 0)
5327 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5328 		return;
5329 	}
5330 
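	/* both masks already known (e.g. from module parameters), nothing left to detect */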
5331 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5332 		return;
5333 
5334 	pcie_bandwidth_available(adev->pdev, NULL,
5335 				 &platform_speed_cap, &platform_link_width);
5336 
5337 	if (adev->pm.pcie_gen_mask == 0) {
5338 		/* asic caps */
5339 		pdev = adev->pdev;
5340 		speed_cap = pcie_get_speed_cap(pdev);
5341 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5342 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5343 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5344 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5345 		} else {
5346 			if (speed_cap == PCIE_SPEED_32_0GT)
5347 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5348 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5349 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5350 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5351 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5352 			else if (speed_cap == PCIE_SPEED_16_0GT)
5353 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5354 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5355 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5356 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5357 			else if (speed_cap == PCIE_SPEED_8_0GT)
5358 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5359 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5360 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5361 			else if (speed_cap == PCIE_SPEED_5_0GT)
5362 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5363 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5364 			else
5365 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5366 		}
5367 		/* platform caps */
5368 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5369 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5370 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5371 		} else {
5372 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5373 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5374 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5375 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5376 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5377 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5378 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5379 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5380 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5381 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5382 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5383 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5384 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5385 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5386 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5387 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5388 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5389 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5390 			else
5391 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5392 
5393 		}
5394 	}
5395 	if (adev->pm.pcie_mlw_mask == 0) {
5396 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5397 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5398 		} else {
5399 			switch (platform_link_width) {
5400 			case PCIE_LNK_X32:
5401 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5402 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5403 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5404 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5405 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5406 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5407 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5408 				break;
5409 			case PCIE_LNK_X16:
5410 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5411 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5412 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5413 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5414 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5415 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5416 				break;
5417 			case PCIE_LNK_X12:
5418 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5419 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5420 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5421 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5422 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5423 				break;
5424 			case PCIE_LNK_X8:
5425 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5426 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5427 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5428 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5429 				break;
5430 			case PCIE_LNK_X4:
5431 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5432 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5433 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5434 				break;
5435 			case PCIE_LNK_X2:
5436 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5437 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5438 				break;
5439 			case PCIE_LNK_X1:
5440 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5441 				break;
5442 			default:
5443 				break;
5444 			}
5445 		}
5446 	}
5447 }
5448 
5449 int amdgpu_device_baco_enter(struct drm_device *dev)
5450 {
5451 	struct amdgpu_device *adev = drm_to_adev(dev);
5452 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5453 
5454 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5455 		return -ENOTSUPP;
5456 
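	/* With RAS enabled, disable doorbell interrupts while the device sits
	 * in BACO; amdgpu_device_baco_exit() re-enables them.
	 */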
5457 	if (ras && adev->ras_enabled &&
5458 	    adev->nbio.funcs->enable_doorbell_interrupt)
5459 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5460 
5461 	return amdgpu_dpm_baco_enter(adev);
5462 }
5463 
5464 int amdgpu_device_baco_exit(struct drm_device *dev)
5465 {
5466 	struct amdgpu_device *adev = drm_to_adev(dev);
5467 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5468 	int ret = 0;
5469 
5470 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5471 		return -ENOTSUPP;
5472 
5473 	ret = amdgpu_dpm_baco_exit(adev);
5474 	if (ret)
5475 		return ret;
5476 
5477 	if (ras && adev->ras_enabled &&
5478 	    adev->nbio.funcs->enable_doorbell_interrupt)
5479 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5480 
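	/* In passthrough configurations, clear any doorbell interrupt state
	 * left over from the BACO cycle.
	 */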
5481 	if (amdgpu_passthrough(adev) &&
5482 	    adev->nbio.funcs->clear_doorbell_interrupt)
5483 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
5484 
5485 	return 0;
5486 }
5487 
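/* Cancel the pending timeout (TDR) work on every ring's scheduler and wait
 * for any handler that is already running to complete.
 */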
5488 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5489 {
5490 	int i;
5491 
5492 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5493 		struct amdgpu_ring *ring = adev->rings[i];
5494 
5495 		if (!ring || !ring->sched.thread)
5496 			continue;
5497 
5498 		cancel_delayed_work_sync(&ring->sched.work_tdr);
5499 	}
5500 }
5501 
5502 /**
5503  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5504  * @pdev: PCI device struct
5505  * @state: PCI channel state
5506  *
5507  * Description: Called when a PCI error is detected.
5508  *
5509  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5510  */
5511 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5512 {
5513 	STUB();
5514 	return 0;
5515 #ifdef notyet
5516 	struct drm_device *dev = pci_get_drvdata(pdev);
5517 	struct amdgpu_device *adev = drm_to_adev(dev);
5518 	int i;
5519 
5520 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5521 
5522 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5523 		DRM_WARN("No support for XGMI hive yet...");
5524 		return PCI_ERS_RESULT_DISCONNECT;
5525 	}
5526 
5527 	adev->pci_channel_state = state;
5528 
5529 	switch (state) {
5530 	case pci_channel_io_normal:
5531 		return PCI_ERS_RESULT_CAN_RECOVER;
5532 	/* Fatal error, prepare for slot reset */
5533 	case pci_channel_io_frozen:
5534 		/*
5535 		 * Cancel and wait for all TDRs in progress if we fail to
5536 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
5537 		 *
5538 		 * Locking adev->reset_sem will prevent any external access
5539 		 * to GPU during PCI error recovery
5540 		 */
5541 		while (!amdgpu_device_lock_adev(adev, NULL))
5542 			amdgpu_cancel_all_tdr(adev);
5543 
5544 		/*
5545 		 * Block any work scheduling as we do for regular GPU reset
5546 		 * for the duration of the recovery
5547 		 */
5548 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5549 			struct amdgpu_ring *ring = adev->rings[i];
5550 
5551 			if (!ring || !ring->sched.thread)
5552 				continue;
5553 
5554 			drm_sched_stop(&ring->sched, NULL);
5555 		}
5556 		atomic_inc(&adev->gpu_reset_counter);
5557 		return PCI_ERS_RESULT_NEED_RESET;
5558 	case pci_channel_io_perm_failure:
5559 		/* Permanent error, prepare for device removal */
5560 		return PCI_ERS_RESULT_DISCONNECT;
5561 	}
5562 
5563 	return PCI_ERS_RESULT_NEED_RESET;
5564 #endif
5565 }
5566 
5567 /**
5568  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5569  * @pdev: pointer to PCI device
5570  */
5571 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5572 {
5573 
5574 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5575 
5576 	/* TODO - dump whatever for debugging purposes */
5577 
5578 	/* This is called only if amdgpu_pci_error_detected returns
5579 	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5580 	 * work, so there is no need to reset the slot.
5581 	 */
5582 
5583 	return PCI_ERS_RESULT_RECOVERED;
5584 }
5585 
5586 /**
5587  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5588  * @pdev: PCI device struct
5589  *
5590  * Description: This routine is called by the PCI error recovery
5591  * code after the PCI slot has been reset, just before we
5592  * should resume normal operations.
5593  */
5594 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5595 {
5596 	STUB();
5597 	return PCI_ERS_RESULT_RECOVERED;
5598 #ifdef notyet
5599 	struct drm_device *dev = pci_get_drvdata(pdev);
5600 	struct amdgpu_device *adev = drm_to_adev(dev);
5601 	int r, i;
5602 	struct amdgpu_reset_context reset_context;
5603 	u32 memsize;
5604 	struct list_head device_list;
5605 
5606 	DRM_INFO("PCI error: slot reset callback!!\n");
5607 
5608 	memset(&reset_context, 0, sizeof(reset_context));
5609 
5610 	INIT_LIST_HEAD(&device_list);
5611 	list_add_tail(&adev->reset_list, &device_list);
5612 
5613 	/* wait for asic to come out of reset */
5614 	drm_msleep(500);
5615 
5616 	/* Restore PCI config space */
5617 	amdgpu_device_load_pci_state(pdev);
5618 
5619 	/* confirm ASIC came out of reset */
5620 	for (i = 0; i < adev->usec_timeout; i++) {
5621 		memsize = amdgpu_asic_get_config_memsize(adev);
5622 
5623 		if (memsize != 0xffffffff)
5624 			break;
5625 		udelay(1);
5626 	}
5627 	if (memsize == 0xffffffff) {
5628 		r = -ETIME;
5629 		goto out;
5630 	}
5631 
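	/* The slot reset already reset the ASIC, so skip the HW reset and only
	 * redo the software side of the recovery.
	 */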
5632 	reset_context.method = AMD_RESET_METHOD_NONE;
5633 	reset_context.reset_req_dev = adev;
5634 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5635 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5636 
5637 	adev->no_hw_access = true;
5638 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5639 	adev->no_hw_access = false;
5640 	if (r)
5641 		goto out;
5642 
5643 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5644 
5645 out:
5646 	if (!r) {
5647 		if (amdgpu_device_cache_pci_state(adev->pdev))
5648 			pci_restore_state(adev->pdev);
5649 
5650 		DRM_INFO("PCIe error recovery succeeded\n");
5651 	} else {
5652 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5653 		amdgpu_device_unlock_adev(adev);
5654 	}
5655 
5656 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5657 #endif
5658 }
5659 
5660 /**
5661  * amdgpu_pci_resume() - resume normal ops after PCI reset
5662  * @pdev: pointer to PCI device
5663  *
5664  * Called when the error recovery driver tells us that it's
5665  * OK to resume normal operation.
5666  */
5667 void amdgpu_pci_resume(struct pci_dev *pdev)
5668 {
5669 	STUB();
5670 #ifdef notyet
5671 	struct drm_device *dev = pci_get_drvdata(pdev);
5672 	struct amdgpu_device *adev = drm_to_adev(dev);
5673 	int i;
5674 
5676 	DRM_INFO("PCI error: resume callback!!\n");
5677 
5678 	/* Only continue execution for the case of pci_channel_io_frozen */
5679 	if (adev->pci_channel_state != pci_channel_io_frozen)
5680 		return;
5681 
5682 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5683 		struct amdgpu_ring *ring = adev->rings[i];
5684 
5685 		if (!ring || !ring->sched.thread)
5686 			continue;
5687 
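		/* Re-queue the jobs that were pending when the error was taken
		 * and let the scheduler run them again.
		 */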
5689 		drm_sched_resubmit_jobs(&ring->sched);
5690 		drm_sched_start(&ring->sched, true);
5691 	}
5692 
5693 	amdgpu_device_unlock_adev(adev);
5694 #endif
5695 }
5696 
5697 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5698 {
5699 	return false;
5700 #ifdef notyet
5701 	struct drm_device *dev = pci_get_drvdata(pdev);
5702 	struct amdgpu_device *adev = drm_to_adev(dev);
5703 	int r;
5704 
5705 	r = pci_save_state(pdev);
5706 	if (!r) {
5707 		kfree(adev->pci_state);
5708 
5709 		adev->pci_state = pci_store_saved_state(pdev);
5710 
5711 		if (!adev->pci_state) {
5712 			DRM_ERROR("Failed to store PCI saved state");
5713 			return false;
5714 		}
5715 	} else {
5716 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5717 		return false;
5718 	}
5719 
5720 	return true;
5721 #endif
5722 }
5723 
5724 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5725 {
5726 	STUB();
5727 	return false;
5728 #ifdef notyet
5729 	struct drm_device *dev = pci_get_drvdata(pdev);
5730 	struct amdgpu_device *adev = drm_to_adev(dev);
5731 	int r;
5732 
5733 	if (!adev->pci_state)
5734 		return false;
5735 
5736 	r = pci_load_saved_state(pdev, adev->pci_state);
5737 
5738 	if (!r) {
5739 		pci_restore_state(pdev);
5740 	} else {
5741 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5742 		return false;
5743 	}
5744 
5745 	return true;
5746 #endif
5747 }
5748 
5749 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5750 		struct amdgpu_ring *ring)
5751 {
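	/* No HDP flush is needed for APUs (unless running as a passthrough
	 * guest) or when the GPU is coherently connected to the CPU.
	 */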
5752 #ifdef CONFIG_X86_64
5753 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5754 		return;
5755 #endif
5756 	if (adev->gmc.xgmi.connected_to_cpu)
5757 		return;
5758 
5759 	if (ring && ring->funcs->emit_hdp_flush)
5760 		amdgpu_ring_emit_hdp_flush(ring);
5761 	else
5762 		amdgpu_asic_flush_hdp(adev, ring);
5763 }
5764 
5765 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5766 		struct amdgpu_ring *ring)
5767 {
5768 #ifdef CONFIG_X86_64
5769 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5770 		return;
5771 #endif
5772 	if (adev->gmc.xgmi.connected_to_cpu)
5773 		return;
5774 
5775 	amdgpu_asic_invalidate_hdp(adev, ring);
5776 }
5777