xref: /openbsd-src/sys/dev/pci/drm/amd/amdgpu/amdgpu_ras.c (revision 8602cf8bff3de046a319b2e7feed6f8e80a7ae8c)
1c349dbc7Sjsg /*
2c349dbc7Sjsg  * Copyright 2018 Advanced Micro Devices, Inc.
3c349dbc7Sjsg  *
4c349dbc7Sjsg  * Permission is hereby granted, free of charge, to any person obtaining a
5c349dbc7Sjsg  * copy of this software and associated documentation files (the "Software"),
6c349dbc7Sjsg  * to deal in the Software without restriction, including without limitation
7c349dbc7Sjsg  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c349dbc7Sjsg  * and/or sell copies of the Software, and to permit persons to whom the
9c349dbc7Sjsg  * Software is furnished to do so, subject to the following conditions:
10c349dbc7Sjsg  *
11c349dbc7Sjsg  * The above copyright notice and this permission notice shall be included in
12c349dbc7Sjsg  * all copies or substantial portions of the Software.
13c349dbc7Sjsg  *
14c349dbc7Sjsg  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15c349dbc7Sjsg  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16c349dbc7Sjsg  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17c349dbc7Sjsg  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18c349dbc7Sjsg  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19c349dbc7Sjsg  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20c349dbc7Sjsg  * OTHER DEALINGS IN THE SOFTWARE.
21c349dbc7Sjsg  *
22c349dbc7Sjsg  *
23c349dbc7Sjsg  */
24c349dbc7Sjsg #include <linux/debugfs.h>
25c349dbc7Sjsg #include <linux/list.h>
26c349dbc7Sjsg #include <linux/module.h>
27c349dbc7Sjsg #include <linux/uaccess.h>
28c349dbc7Sjsg #include <linux/reboot.h>
29c349dbc7Sjsg #include <linux/syscalls.h>
305ca02815Sjsg #include <linux/pm_runtime.h>
31c349dbc7Sjsg 
32c349dbc7Sjsg #include "amdgpu.h"
33c349dbc7Sjsg #include "amdgpu_ras.h"
34c349dbc7Sjsg #include "amdgpu_atomfirmware.h"
35c349dbc7Sjsg #include "amdgpu_xgmi.h"
36c349dbc7Sjsg #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
37f005ef32Sjsg #include "nbio_v4_3.h"
38f005ef32Sjsg #include "nbio_v7_9.h"
395ca02815Sjsg #include "atom.h"
401bb76ff1Sjsg #include "amdgpu_reset.h"
41c349dbc7Sjsg 
421bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD
431bb76ff1Sjsg #include <asm/mce.h>
441bb76ff1Sjsg 
451bb76ff1Sjsg static bool notifier_registered;
461bb76ff1Sjsg #endif
47ad8b1aafSjsg static const char *RAS_FS_NAME = "ras";
48ad8b1aafSjsg 
49c349dbc7Sjsg const char *ras_error_string[] = {
50c349dbc7Sjsg 	"none",
51c349dbc7Sjsg 	"parity",
52c349dbc7Sjsg 	"single_correctable",
53c349dbc7Sjsg 	"multi_uncorrectable",
54c349dbc7Sjsg 	"poison",
55c349dbc7Sjsg };
56c349dbc7Sjsg 
57c349dbc7Sjsg const char *ras_block_string[] = {
58c349dbc7Sjsg 	"umc",
59c349dbc7Sjsg 	"sdma",
60c349dbc7Sjsg 	"gfx",
61c349dbc7Sjsg 	"mmhub",
62c349dbc7Sjsg 	"athub",
63c349dbc7Sjsg 	"pcie_bif",
64c349dbc7Sjsg 	"hdp",
65c349dbc7Sjsg 	"xgmi_wafl",
66c349dbc7Sjsg 	"df",
67c349dbc7Sjsg 	"smn",
68c349dbc7Sjsg 	"sem",
69c349dbc7Sjsg 	"mp0",
70c349dbc7Sjsg 	"mp1",
71c349dbc7Sjsg 	"fuse",
721bb76ff1Sjsg 	"mca",
731bb76ff1Sjsg 	"vcn",
741bb76ff1Sjsg 	"jpeg",
75c349dbc7Sjsg };
76c349dbc7Sjsg 
771bb76ff1Sjsg const char *ras_mca_block_string[] = {
781bb76ff1Sjsg 	"mca_mp0",
791bb76ff1Sjsg 	"mca_mp1",
801bb76ff1Sjsg 	"mca_mpio",
811bb76ff1Sjsg 	"mca_iohc",
821bb76ff1Sjsg };
831bb76ff1Sjsg 
841bb76ff1Sjsg struct amdgpu_ras_block_list {
851bb76ff1Sjsg 	/* ras block link */
861bb76ff1Sjsg 	struct list_head node;
871bb76ff1Sjsg 
881bb76ff1Sjsg 	struct amdgpu_ras_block_object *ras_obj;
891bb76ff1Sjsg };
901bb76ff1Sjsg 
911bb76ff1Sjsg const char *get_ras_block_str(struct ras_common_if *ras_block)
921bb76ff1Sjsg {
931bb76ff1Sjsg 	if (!ras_block)
941bb76ff1Sjsg 		return "NULL";
951bb76ff1Sjsg 
961bb76ff1Sjsg 	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
971bb76ff1Sjsg 		return "OUT OF RANGE";
981bb76ff1Sjsg 
991bb76ff1Sjsg 	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
1001bb76ff1Sjsg 		return ras_mca_block_string[ras_block->sub_block_index];
1011bb76ff1Sjsg 
1021bb76ff1Sjsg 	return ras_block_string[ras_block->block];
1031bb76ff1Sjsg }
1041bb76ff1Sjsg 
1051bb76ff1Sjsg #define ras_block_str(_BLOCK_) \
1061bb76ff1Sjsg 	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
1071bb76ff1Sjsg 
108c349dbc7Sjsg #define ras_err_str(i) (ras_error_string[ffs(i)])
109c349dbc7Sjsg 
110c349dbc7Sjsg #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
111c349dbc7Sjsg 
112c349dbc7Sjsg /* inject address is 52 bits */
113c349dbc7Sjsg #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
114c349dbc7Sjsg 
1155ca02815Sjsg /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
1165ca02815Sjsg #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)
117ad8b1aafSjsg 
118c349dbc7Sjsg enum amdgpu_ras_retire_page_reservation {
119c349dbc7Sjsg 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
120c349dbc7Sjsg 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
121c349dbc7Sjsg 	AMDGPU_RAS_RETIRE_PAGE_FAULT,
122c349dbc7Sjsg };
123c349dbc7Sjsg 
124c349dbc7Sjsg atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
125c349dbc7Sjsg 
1265ca02815Sjsg static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
1275ca02815Sjsg 				uint64_t addr);
128c349dbc7Sjsg static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
129c349dbc7Sjsg 				uint64_t addr);
1301bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD
1311bb76ff1Sjsg static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
1321bb76ff1Sjsg struct mce_notifier_adev_list {
1331bb76ff1Sjsg 	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
1341bb76ff1Sjsg 	int num_gpu;
1351bb76ff1Sjsg };
1361bb76ff1Sjsg static struct mce_notifier_adev_list mce_adev_list;
1371bb76ff1Sjsg #endif
138c349dbc7Sjsg 
139af8ed3f7Sjsg void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
140af8ed3f7Sjsg {
14157896dd2Sjsg 	if (adev && amdgpu_ras_get_context(adev))
142af8ed3f7Sjsg 		amdgpu_ras_get_context(adev)->error_query_ready = ready;
143af8ed3f7Sjsg }
144af8ed3f7Sjsg 
145ad8b1aafSjsg static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
146af8ed3f7Sjsg {
14757896dd2Sjsg 	if (adev && amdgpu_ras_get_context(adev))
148af8ed3f7Sjsg 		return amdgpu_ras_get_context(adev)->error_query_ready;
149af8ed3f7Sjsg 
150af8ed3f7Sjsg 	return false;
151af8ed3f7Sjsg }
152af8ed3f7Sjsg 
1535ca02815Sjsg static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
1545ca02815Sjsg {
1555ca02815Sjsg 	struct ras_err_data err_data = {0, 0, 0, NULL};
1565ca02815Sjsg 	struct eeprom_table_record err_rec;
1575ca02815Sjsg 
1585ca02815Sjsg 	if ((address >= adev->gmc.mc_vram_size) ||
1595ca02815Sjsg 	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
1605ca02815Sjsg 		dev_warn(adev->dev,
1615ca02815Sjsg 		         "RAS WARN: input address 0x%llx is invalid.\n",
1625ca02815Sjsg 		         address);
1635ca02815Sjsg 		return -EINVAL;
1645ca02815Sjsg 	}
1655ca02815Sjsg 
1665ca02815Sjsg 	if (amdgpu_ras_check_bad_page(adev, address)) {
1675ca02815Sjsg 		dev_warn(adev->dev,
1685ca02815Sjsg 			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
1695ca02815Sjsg 			 address);
1705ca02815Sjsg 		return 0;
1715ca02815Sjsg 	}
1725ca02815Sjsg 
1735ca02815Sjsg 	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
1745ca02815Sjsg 	err_data.err_addr = &err_rec;
1758ad1b843Sjsg 	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
1765ca02815Sjsg 
1775ca02815Sjsg 	if (amdgpu_bad_page_threshold != 0) {
1785ca02815Sjsg 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
1795ca02815Sjsg 					 err_data.err_addr_cnt);
180f005ef32Sjsg 		amdgpu_ras_save_bad_pages(adev, NULL);
1815ca02815Sjsg 	}
1825ca02815Sjsg 
1835ca02815Sjsg 	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
1845ca02815Sjsg 	dev_warn(adev->dev, "Clear EEPROM:\n");
1855ca02815Sjsg 	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
1865ca02815Sjsg 
1875ca02815Sjsg 	return 0;
1885ca02815Sjsg }
1895ca02815Sjsg 
190c349dbc7Sjsg #ifdef __linux__
191c349dbc7Sjsg 
192c349dbc7Sjsg static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
193c349dbc7Sjsg 					size_t size, loff_t *pos)
194c349dbc7Sjsg {
195c349dbc7Sjsg 	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
196c349dbc7Sjsg 	struct ras_query_if info = {
197c349dbc7Sjsg 		.head = obj->head,
198c349dbc7Sjsg 	};
199c349dbc7Sjsg 	ssize_t s;
200c349dbc7Sjsg 	char val[128];
201c349dbc7Sjsg 
2025ca02815Sjsg 	if (amdgpu_ras_query_error_status(obj->adev, &info))
203c349dbc7Sjsg 		return -EINVAL;
204c349dbc7Sjsg 
2051bb76ff1Sjsg 	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
2061bb76ff1Sjsg 	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
2071bb76ff1Sjsg 	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
2081bb76ff1Sjsg 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
2091bb76ff1Sjsg 			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
2101bb76ff1Sjsg 	}
2111bb76ff1Sjsg 
212c349dbc7Sjsg 	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
213c349dbc7Sjsg 			"ue", info.ue_count,
214c349dbc7Sjsg 			"ce", info.ce_count);
215c349dbc7Sjsg 	if (*pos >= s)
216c349dbc7Sjsg 		return 0;
217c349dbc7Sjsg 
218c349dbc7Sjsg 	s -= *pos;
219c349dbc7Sjsg 	s = min_t(u64, s, size);
220c349dbc7Sjsg 
221c349dbc7Sjsg 
222c349dbc7Sjsg 	if (copy_to_user(buf, &val[*pos], s))
223c349dbc7Sjsg 		return -EINVAL;
224c349dbc7Sjsg 
225c349dbc7Sjsg 	*pos += s;
226c349dbc7Sjsg 
227c349dbc7Sjsg 	return s;
228c349dbc7Sjsg }
229c349dbc7Sjsg 
230c349dbc7Sjsg static const struct file_operations amdgpu_ras_debugfs_ops = {
231c349dbc7Sjsg 	.owner = THIS_MODULE,
232c349dbc7Sjsg 	.read = amdgpu_ras_debugfs_read,
233c349dbc7Sjsg 	.write = NULL,
234c349dbc7Sjsg 	.llseek = default_llseek
235c349dbc7Sjsg };
236c349dbc7Sjsg 
237c349dbc7Sjsg static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
238c349dbc7Sjsg {
239c349dbc7Sjsg 	int i;
240c349dbc7Sjsg 
241c349dbc7Sjsg 	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
242c349dbc7Sjsg 		*block_id = i;
2431bb76ff1Sjsg 		if (strcmp(name, ras_block_string[i]) == 0)
244c349dbc7Sjsg 			return 0;
245c349dbc7Sjsg 	}
246c349dbc7Sjsg 	return -EINVAL;
247c349dbc7Sjsg }
248c349dbc7Sjsg 
249c349dbc7Sjsg static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
250c349dbc7Sjsg 		const char __user *buf, size_t size,
251c349dbc7Sjsg 		loff_t *pos, struct ras_debug_if *data)
252c349dbc7Sjsg {
253c349dbc7Sjsg 	ssize_t s = min_t(u64, 64, size);
254c349dbc7Sjsg 	char str[65];
255c349dbc7Sjsg 	char block_name[33];
256c349dbc7Sjsg 	char err[9] = "ue";
257c349dbc7Sjsg 	int op = -1;
258c349dbc7Sjsg 	int block_id;
259c349dbc7Sjsg 	uint32_t sub_block;
260c349dbc7Sjsg 	u64 address, value;
261f005ef32Sjsg 	/* default value is 0 if the mask is not set by user */
262f005ef32Sjsg 	u32 instance_mask = 0;
263c349dbc7Sjsg 
264c349dbc7Sjsg 	if (*pos)
265c349dbc7Sjsg 		return -EINVAL;
266c349dbc7Sjsg 	*pos = size;
267c349dbc7Sjsg 
268c349dbc7Sjsg 	memset(str, 0, sizeof(str));
269c349dbc7Sjsg 	memset(data, 0, sizeof(*data));
270c349dbc7Sjsg 
271c349dbc7Sjsg 	if (copy_from_user(str, buf, s))
272c349dbc7Sjsg 		return -EINVAL;
273c349dbc7Sjsg 
274c349dbc7Sjsg 	if (sscanf(str, "disable %32s", block_name) == 1)
275c349dbc7Sjsg 		op = 0;
276c349dbc7Sjsg 	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
277c349dbc7Sjsg 		op = 1;
278c349dbc7Sjsg 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
279c349dbc7Sjsg 		op = 2;
2805ca02815Sjsg 	else if (strstr(str, "retire_page") != NULL)
2815ca02815Sjsg 		op = 3;
282c349dbc7Sjsg 	else if (str[0] && str[1] && str[2] && str[3])
283c349dbc7Sjsg 		/* ascii string, but commands are not matched. */
284c349dbc7Sjsg 		return -EINVAL;
285c349dbc7Sjsg 
286c349dbc7Sjsg 	if (op != -1) {
2875ca02815Sjsg 		if (op == 3) {
2885ca02815Sjsg 			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
2895ca02815Sjsg 			    sscanf(str, "%*s %llu", &address) != 1)
2905ca02815Sjsg 				return -EINVAL;
2915ca02815Sjsg 
2925ca02815Sjsg 			data->op = op;
2935ca02815Sjsg 			data->inject.address = address;
2945ca02815Sjsg 
2955ca02815Sjsg 			return 0;
2965ca02815Sjsg 		}
2975ca02815Sjsg 
298c349dbc7Sjsg 		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
299c349dbc7Sjsg 			return -EINVAL;
300c349dbc7Sjsg 
301c349dbc7Sjsg 		data->head.block = block_id;
302c349dbc7Sjsg 		/* only ue and ce errors are supported */
303c349dbc7Sjsg 		if (!memcmp("ue", err, 2))
304c349dbc7Sjsg 			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
305c349dbc7Sjsg 		else if (!memcmp("ce", err, 2))
306c349dbc7Sjsg 			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
307c349dbc7Sjsg 		else
308c349dbc7Sjsg 			return -EINVAL;
309c349dbc7Sjsg 
310c349dbc7Sjsg 		data->op = op;
311c349dbc7Sjsg 
312c349dbc7Sjsg 		if (op == 2) {
313f005ef32Sjsg 			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
314f005ef32Sjsg 				   &sub_block, &address, &value, &instance_mask) != 4 &&
315f005ef32Sjsg 			    sscanf(str, "%*s %*s %*s %u %llu %llu %u",
316f005ef32Sjsg 				   &sub_block, &address, &value, &instance_mask) != 4 &&
317f005ef32Sjsg 				sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
3185ca02815Sjsg 				   &sub_block, &address, &value) != 3 &&
3195ca02815Sjsg 			    sscanf(str, "%*s %*s %*s %u %llu %llu",
320c349dbc7Sjsg 				   &sub_block, &address, &value) != 3)
321c349dbc7Sjsg 				return -EINVAL;
322c349dbc7Sjsg 			data->head.sub_block_index = sub_block;
323c349dbc7Sjsg 			data->inject.address = address;
324c349dbc7Sjsg 			data->inject.value = value;
325f005ef32Sjsg 			data->inject.instance_mask = instance_mask;
326c349dbc7Sjsg 		}
327c349dbc7Sjsg 	} else {
328c349dbc7Sjsg 		if (size < sizeof(*data))
329c349dbc7Sjsg 			return -EINVAL;
330c349dbc7Sjsg 
331c349dbc7Sjsg 		if (copy_from_user(data, buf, sizeof(*data)))
332c349dbc7Sjsg 			return -EINVAL;
333c349dbc7Sjsg 	}
334c349dbc7Sjsg 
335c349dbc7Sjsg 	return 0;
336c349dbc7Sjsg }
337c349dbc7Sjsg 
338f005ef32Sjsg static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
339f005ef32Sjsg 				struct ras_debug_if *data)
340f005ef32Sjsg {
341f005ef32Sjsg 	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
342f005ef32Sjsg 	uint32_t mask, inst_mask = data->inject.instance_mask;
343f005ef32Sjsg 
344f005ef32Sjsg 	/* no need to set instance mask if there is only one instance */
345f005ef32Sjsg 	if (num_xcc <= 1 && inst_mask) {
346f005ef32Sjsg 		data->inject.instance_mask = 0;
347f005ef32Sjsg 		dev_dbg(adev->dev,
348f005ef32Sjsg 			"RAS inject mask(0x%x) isn't supported and force it to 0.\n",
349f005ef32Sjsg 			inst_mask);
350f005ef32Sjsg 
351f005ef32Sjsg 		return;
352f005ef32Sjsg 	}
353f005ef32Sjsg 
354f005ef32Sjsg 	switch (data->head.block) {
355f005ef32Sjsg 	case AMDGPU_RAS_BLOCK__GFX:
356f005ef32Sjsg 		mask = GENMASK(num_xcc - 1, 0);
357f005ef32Sjsg 		break;
358f005ef32Sjsg 	case AMDGPU_RAS_BLOCK__SDMA:
359f005ef32Sjsg 		mask = GENMASK(adev->sdma.num_instances - 1, 0);
360f005ef32Sjsg 		break;
361f005ef32Sjsg 	case AMDGPU_RAS_BLOCK__VCN:
362f005ef32Sjsg 	case AMDGPU_RAS_BLOCK__JPEG:
363f005ef32Sjsg 		mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
364f005ef32Sjsg 		break;
365f005ef32Sjsg 	default:
366f005ef32Sjsg 		mask = inst_mask;
367f005ef32Sjsg 		break;
368f005ef32Sjsg 	}
369f005ef32Sjsg 
370f005ef32Sjsg 	/* remove invalid bits in instance mask */
371f005ef32Sjsg 	data->inject.instance_mask &= mask;
372f005ef32Sjsg 	if (inst_mask != data->inject.instance_mask)
373f005ef32Sjsg 		dev_dbg(adev->dev,
374f005ef32Sjsg 			"Adjust RAS inject mask 0x%x to 0x%x\n",
375f005ef32Sjsg 			inst_mask, data->inject.instance_mask);
376f005ef32Sjsg }
377f005ef32Sjsg 
378c349dbc7Sjsg /**
379c349dbc7Sjsg  * DOC: AMDGPU RAS debugfs control interface
380c349dbc7Sjsg  *
3815ca02815Sjsg  * The control interface accepts struct ras_debug_if which has two members.
382c349dbc7Sjsg  *
383c349dbc7Sjsg  * First member: ras_debug_if::head or ras_debug_if::inject.
384c349dbc7Sjsg  *
385c349dbc7Sjsg  * head is used to indicate which IP block will be under control.
386c349dbc7Sjsg  *
387c349dbc7Sjsg  * head has four members: block, type, sub_block_index, and name.
388c349dbc7Sjsg  * block: which IP will be under control.
389c349dbc7Sjsg  * type: what kind of error will be enabled/disabled/injected.
390c349dbc7Sjsg  * sub_block_index: some IPs have subcomponents, e.g., GFX, SDMA.
391c349dbc7Sjsg  * name: the name of the IP.
392c349dbc7Sjsg  *
393f005ef32Sjsg  * inject has three more members than head: address, value, and mask.
394c349dbc7Sjsg  * As their names indicate, inject operation will write the
395c349dbc7Sjsg  * value to the address.
396c349dbc7Sjsg  *
397c349dbc7Sjsg  * The second member: struct ras_debug_if::op.
398c349dbc7Sjsg  * It has three kinds of operations.
399c349dbc7Sjsg  *
400c349dbc7Sjsg  * - 0: disable RAS on the block. Take ::head as its data.
401c349dbc7Sjsg  * - 1: enable RAS on the block. Take ::head as its data.
402c349dbc7Sjsg  * - 2: inject errors on the block. Take ::inject as its data.
403c349dbc7Sjsg  *
404c349dbc7Sjsg  * How to use the interface?
405c349dbc7Sjsg  *
4065ca02815Sjsg  * In a program
407c349dbc7Sjsg  *
4085ca02815Sjsg  * Copy the struct ras_debug_if in your code and initialize it.
4095ca02815Sjsg  * Write the struct to the control interface.
410c349dbc7Sjsg  *
4115ca02815Sjsg  * From shell
412c349dbc7Sjsg  *
413c349dbc7Sjsg  * .. code-block:: bash
414c349dbc7Sjsg  *
4155ca02815Sjsg  *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
4165ca02815Sjsg  *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
417f005ef32Sjsg  *	echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
418c349dbc7Sjsg  *
4195ca02815Sjsg  * Where N is the card which you want to affect.
420c349dbc7Sjsg  *
4215ca02815Sjsg  * "disable" requires only the block.
4225ca02815Sjsg  * "enable" requires the block and error type.
4235ca02815Sjsg  * "inject" requires the block, error type, sub-block, address, and value (the instance mask is optional).
4245ca02815Sjsg  *
4255ca02815Sjsg  * The block is one of: umc, sdma, gfx, etc.
426c349dbc7Sjsg  *	see ras_block_string[] for details
427c349dbc7Sjsg  *
4285ca02815Sjsg  * The error type is one of: ue, ce, where,
4295ca02815Sjsg  *	ue is multi-uncorrectable
4305ca02815Sjsg  *	ce is single-correctable
4315ca02815Sjsg  *
4325ca02815Sjsg  * The sub-block is the sub-block index; pass 0 if there is no sub-block.
4335ca02815Sjsg  * The address and value are hexadecimal numbers, leading 0x is optional.
434f005ef32Sjsg  * The mask is the instance mask; it is optional, and its default value is 0x1.
4355ca02815Sjsg  *
4365ca02815Sjsg  * For instance,
437c349dbc7Sjsg  *
438c349dbc7Sjsg  * .. code-block:: bash
439c349dbc7Sjsg  *
440c349dbc7Sjsg  *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
441f005ef32Sjsg  *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
442c349dbc7Sjsg  *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
443c349dbc7Sjsg  *
4445ca02815Sjsg  * How to check the result of the operation?
445c349dbc7Sjsg  *
4465ca02815Sjsg  * To check disable/enable, see "ras" features at,
447c349dbc7Sjsg  * /sys/class/drm/card[0/1/2...]/device/ras/features
448c349dbc7Sjsg  *
4495ca02815Sjsg  * To check inject, see the corresponding error count at,
4505ca02815Sjsg  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
451c349dbc7Sjsg  *
452c349dbc7Sjsg  * .. note::
453c349dbc7Sjsg  *	Operations are only allowed on blocks which are supported.
4545ca02815Sjsg  *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
455c349dbc7Sjsg  *	to see which blocks support RAS on a particular asic.
456c349dbc7Sjsg  *
457c349dbc7Sjsg  */
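/*
 * Example (illustrative sketch, not part of the driver): the "In a program"
 * path above can also be exercised by writing one of the text commands that
 * amdgpu_ras_debugfs_ctrl_parse_data() accepts.  A minimal user-space writer
 * might look like the following; the debugfs path assumes card 0 and a
 * mounted debugfs, and root privileges are normally required.
 *
 * .. code-block:: c
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char cmd[] = "inject umc ue 0x0 0x0 0x0";
 *		int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror("open ras_ctrl");
 *			return 1;
 *		}
 *		if (write(fd, cmd, strlen(cmd)) < 0)
 *			perror("write ras_ctrl");
 *		close(fd);
 *		return 0;
 *	}
 */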
4585ca02815Sjsg static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
4595ca02815Sjsg 					     const char __user *buf,
460c349dbc7Sjsg 					     size_t size, loff_t *pos)
461c349dbc7Sjsg {
462c349dbc7Sjsg 	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
463c349dbc7Sjsg 	struct ras_debug_if data;
464c349dbc7Sjsg 	int ret = 0;
465c349dbc7Sjsg 
466af8ed3f7Sjsg 	if (!amdgpu_ras_get_error_query_ready(adev)) {
467ad8b1aafSjsg 		dev_warn(adev->dev, "RAS WARN: error injection "
468ad8b1aafSjsg 				"currently inaccessible\n");
469c349dbc7Sjsg 		return size;
470c349dbc7Sjsg 	}
471c349dbc7Sjsg 
472c349dbc7Sjsg 	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
473c349dbc7Sjsg 	if (ret)
4745ca02815Sjsg 		return ret;
4755ca02815Sjsg 
4765ca02815Sjsg 	if (data.op == 3) {
4775ca02815Sjsg 		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
4785ca02815Sjsg 		if (!ret)
4795ca02815Sjsg 			return size;
4805ca02815Sjsg 		else
4815ca02815Sjsg 			return ret;
4825ca02815Sjsg 	}
483c349dbc7Sjsg 
484c349dbc7Sjsg 	if (!amdgpu_ras_is_supported(adev, data.head.block))
485c349dbc7Sjsg 		return -EINVAL;
486c349dbc7Sjsg 
487c349dbc7Sjsg 	switch (data.op) {
488c349dbc7Sjsg 	case 0:
489c349dbc7Sjsg 		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
490c349dbc7Sjsg 		break;
491c349dbc7Sjsg 	case 1:
492c349dbc7Sjsg 		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
493c349dbc7Sjsg 		break;
494c349dbc7Sjsg 	case 2:
495f005ef32Sjsg 		if ((data.inject.address >= adev->gmc.mc_vram_size &&
496f005ef32Sjsg 		    adev->gmc.mc_vram_size) ||
497c349dbc7Sjsg 		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
498ad8b1aafSjsg 			dev_warn(adev->dev, "RAS WARN: input address "
499ad8b1aafSjsg 					"0x%llx is invalid.",
500ad8b1aafSjsg 					data.inject.address);
501c349dbc7Sjsg 			ret = -EINVAL;
502c349dbc7Sjsg 			break;
503c349dbc7Sjsg 		}
504c349dbc7Sjsg 
505c349dbc7Sjsg 		/* umc ce/ue error injection for a bad page is not allowed */
506c349dbc7Sjsg 		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
507c349dbc7Sjsg 		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
5085ca02815Sjsg 			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
5095ca02815Sjsg 				 "already been marked as bad!\n",
510c349dbc7Sjsg 				 data.inject.address);
511c349dbc7Sjsg 			break;
512c349dbc7Sjsg 		}
513c349dbc7Sjsg 
514f005ef32Sjsg 		amdgpu_ras_instance_mask_check(adev, &data);
515f005ef32Sjsg 
516c349dbc7Sjsg 		/* data.inject.address is an offset instead of an absolute gpu address */
517c349dbc7Sjsg 		ret = amdgpu_ras_error_inject(adev, &data.inject);
518c349dbc7Sjsg 		break;
519c349dbc7Sjsg 	default:
520c349dbc7Sjsg 		ret = -EINVAL;
521c349dbc7Sjsg 		break;
522c349dbc7Sjsg 	}
523c349dbc7Sjsg 
524c349dbc7Sjsg 	if (ret)
5251bb76ff1Sjsg 		return ret;
526c349dbc7Sjsg 
527c349dbc7Sjsg 	return size;
528c349dbc7Sjsg }
529c349dbc7Sjsg 
530c349dbc7Sjsg /**
531c349dbc7Sjsg  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
532c349dbc7Sjsg  *
533c349dbc7Sjsg  * Some boards contain an EEPROM which is used to persistently store a list of
534c349dbc7Sjsg  * bad pages which experience ECC errors in vram.  This interface provides
535c349dbc7Sjsg  * a way to reset the EEPROM, e.g., after testing error injection.
536c349dbc7Sjsg  *
537c349dbc7Sjsg  * Usage:
538c349dbc7Sjsg  *
539c349dbc7Sjsg  * .. code-block:: bash
540c349dbc7Sjsg  *
541c349dbc7Sjsg  *	echo 1 > ../ras/ras_eeprom_reset
542c349dbc7Sjsg  *
543c349dbc7Sjsg  * will reset EEPROM table to 0 entries.
544c349dbc7Sjsg  *
545c349dbc7Sjsg  */
5465ca02815Sjsg static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
5475ca02815Sjsg 					       const char __user *buf,
548c349dbc7Sjsg 					       size_t size, loff_t *pos)
549c349dbc7Sjsg {
550ad8b1aafSjsg 	struct amdgpu_device *adev =
551ad8b1aafSjsg 		(struct amdgpu_device *)file_inode(f)->i_private;
552c349dbc7Sjsg 	int ret;
553c349dbc7Sjsg 
554ad8b1aafSjsg 	ret = amdgpu_ras_eeprom_reset_table(
555ad8b1aafSjsg 		&(amdgpu_ras_get_context(adev)->eeprom_control));
556c349dbc7Sjsg 
5575ca02815Sjsg 	if (!ret) {
5585ca02815Sjsg 		/* Something was written to EEPROM.
5595ca02815Sjsg 		 */
560ad8b1aafSjsg 		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
561ad8b1aafSjsg 		return size;
562ad8b1aafSjsg 	} else {
5635ca02815Sjsg 		return ret;
564ad8b1aafSjsg 	}
565c349dbc7Sjsg }
566c349dbc7Sjsg 
567c349dbc7Sjsg static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
568c349dbc7Sjsg 	.owner = THIS_MODULE,
569c349dbc7Sjsg 	.read = NULL,
570c349dbc7Sjsg 	.write = amdgpu_ras_debugfs_ctrl_write,
571c349dbc7Sjsg 	.llseek = default_llseek
572c349dbc7Sjsg };
573c349dbc7Sjsg 
574c349dbc7Sjsg static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
575c349dbc7Sjsg 	.owner = THIS_MODULE,
576c349dbc7Sjsg 	.read = NULL,
577c349dbc7Sjsg 	.write = amdgpu_ras_debugfs_eeprom_write,
578c349dbc7Sjsg 	.llseek = default_llseek
579c349dbc7Sjsg };
580c349dbc7Sjsg 
581c349dbc7Sjsg /**
582c349dbc7Sjsg  * DOC: AMDGPU RAS sysfs Error Count Interface
583c349dbc7Sjsg  *
584c349dbc7Sjsg  * It allows the user to read the error count for each IP block on the gpu through
585c349dbc7Sjsg  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
586c349dbc7Sjsg  *
587c349dbc7Sjsg  * It outputs multiple lines which report the uncorrected (ue) and corrected
588c349dbc7Sjsg  * (ce) error counts.
589c349dbc7Sjsg  *
590c349dbc7Sjsg  * The format of one line is below,
591c349dbc7Sjsg  *
592c349dbc7Sjsg  * [ce|ue]: count
593c349dbc7Sjsg  *
594c349dbc7Sjsg  * Example:
595c349dbc7Sjsg  *
596c349dbc7Sjsg  * .. code-block:: bash
597c349dbc7Sjsg  *
598c349dbc7Sjsg  *	ue: 0
599c349dbc7Sjsg  *	ce: 1
600c349dbc7Sjsg  *
601c349dbc7Sjsg  */
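/*
 * Example (illustrative sketch): a small user-space reader for the per-block
 * error-count files described above.  The path assumes card0 and the gfx
 * block; substitute the card and block of interest.
 *
 * .. code-block:: c
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long ue = 0, ce = 0;
 *		FILE *f = fopen("/sys/class/drm/card0/device/ras/gfx_err_count", "r");
 *
 *		if (!f) {
 *			perror("fopen gfx_err_count");
 *			return 1;
 *		}
 *		if (fscanf(f, "ue: %lu ce: %lu", &ue, &ce) == 2)
 *			printf("gfx: %lu uncorrected, %lu corrected\n", ue, ce);
 *		fclose(f);
 *		return 0;
 *	}
 */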
602c349dbc7Sjsg static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
603c349dbc7Sjsg 		struct device_attribute *attr, char *buf)
604c349dbc7Sjsg {
605c349dbc7Sjsg 	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
606c349dbc7Sjsg 	struct ras_query_if info = {
607c349dbc7Sjsg 		.head = obj->head,
608c349dbc7Sjsg 	};
609c349dbc7Sjsg 
610af8ed3f7Sjsg 	if (!amdgpu_ras_get_error_query_ready(obj->adev))
6115ca02815Sjsg 		return sysfs_emit(buf, "Query currently inaccessible\n");
612c349dbc7Sjsg 
6135ca02815Sjsg 	if (amdgpu_ras_query_error_status(obj->adev, &info))
614c349dbc7Sjsg 		return -EINVAL;
615c349dbc7Sjsg 
6161bb76ff1Sjsg 	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
6171bb76ff1Sjsg 	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
6185ca02815Sjsg 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
6191bb76ff1Sjsg 			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
6205ca02815Sjsg 	}
6215ca02815Sjsg 
6225ca02815Sjsg 	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
623c349dbc7Sjsg 			  "ce", info.ce_count);
624c349dbc7Sjsg }
625c349dbc7Sjsg 
626c349dbc7Sjsg #endif /* __linux__ */
627c349dbc7Sjsg 
628c349dbc7Sjsg /* obj begin */
629c349dbc7Sjsg 
630c349dbc7Sjsg #define get_obj(obj) do { (obj)->use++; } while (0)
631c349dbc7Sjsg #define alive_obj(obj) ((obj)->use)
632c349dbc7Sjsg 
633c349dbc7Sjsg static inline void put_obj(struct ras_manager *obj)
634c349dbc7Sjsg {
6355ca02815Sjsg 	if (obj && (--obj->use == 0))
636c349dbc7Sjsg 		list_del(&obj->node);
6375ca02815Sjsg 	if (obj && (obj->use < 0))
6381bb76ff1Sjsg 		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", get_ras_block_str(&obj->head));
639c349dbc7Sjsg }
640c349dbc7Sjsg 
641c349dbc7Sjsg /* make one obj and return it. */
642c349dbc7Sjsg static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
643c349dbc7Sjsg 		struct ras_common_if *head)
644c349dbc7Sjsg {
645c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
646c349dbc7Sjsg 	struct ras_manager *obj;
647c349dbc7Sjsg 
6485ca02815Sjsg 	if (!adev->ras_enabled || !con)
649c349dbc7Sjsg 		return NULL;
650c349dbc7Sjsg 
651c349dbc7Sjsg 	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
652c349dbc7Sjsg 		return NULL;
653c349dbc7Sjsg 
6541bb76ff1Sjsg 	if (head->block == AMDGPU_RAS_BLOCK__MCA) {
6551bb76ff1Sjsg 		if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
6561bb76ff1Sjsg 			return NULL;
6571bb76ff1Sjsg 
6581bb76ff1Sjsg 		obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
6591bb76ff1Sjsg 	} else
660c349dbc7Sjsg 		obj = &con->objs[head->block];
6611bb76ff1Sjsg 
662c349dbc7Sjsg 	/* already exists. return obj? */
663c349dbc7Sjsg 	if (alive_obj(obj))
664c349dbc7Sjsg 		return NULL;
665c349dbc7Sjsg 
666c349dbc7Sjsg 	obj->head = *head;
667c349dbc7Sjsg 	obj->adev = adev;
668c349dbc7Sjsg 	list_add(&obj->node, &con->head);
669c349dbc7Sjsg 	get_obj(obj);
670c349dbc7Sjsg 
671c349dbc7Sjsg 	return obj;
672c349dbc7Sjsg }
673c349dbc7Sjsg 
674c349dbc7Sjsg /* return an obj equal to head, or the first when head is NULL */
675c349dbc7Sjsg struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
676c349dbc7Sjsg 		struct ras_common_if *head)
677c349dbc7Sjsg {
678c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
679c349dbc7Sjsg 	struct ras_manager *obj;
680c349dbc7Sjsg 	int i;
681c349dbc7Sjsg 
6825ca02815Sjsg 	if (!adev->ras_enabled || !con)
683c349dbc7Sjsg 		return NULL;
684c349dbc7Sjsg 
685c349dbc7Sjsg 	if (head) {
686c349dbc7Sjsg 		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
687c349dbc7Sjsg 			return NULL;
688c349dbc7Sjsg 
6891bb76ff1Sjsg 		if (head->block == AMDGPU_RAS_BLOCK__MCA) {
6901bb76ff1Sjsg 			if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
6911bb76ff1Sjsg 				return NULL;
6921bb76ff1Sjsg 
6931bb76ff1Sjsg 			obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
6941bb76ff1Sjsg 		} else
695c349dbc7Sjsg 			obj = &con->objs[head->block];
696c349dbc7Sjsg 
6971bb76ff1Sjsg 		if (alive_obj(obj))
698c349dbc7Sjsg 			return obj;
699c349dbc7Sjsg 	} else {
7001bb76ff1Sjsg 		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
701c349dbc7Sjsg 			obj = &con->objs[i];
7021bb76ff1Sjsg 			if (alive_obj(obj))
703c349dbc7Sjsg 				return obj;
704c349dbc7Sjsg 		}
705c349dbc7Sjsg 	}
706c349dbc7Sjsg 
707c349dbc7Sjsg 	return NULL;
708c349dbc7Sjsg }
709c349dbc7Sjsg /* obj end */
710c349dbc7Sjsg 
711c349dbc7Sjsg /* feature ctl begin */
712c349dbc7Sjsg static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
713c349dbc7Sjsg 					 struct ras_common_if *head)
714c349dbc7Sjsg {
7155ca02815Sjsg 	return adev->ras_hw_enabled & BIT(head->block);
716c349dbc7Sjsg }
717c349dbc7Sjsg 
718c349dbc7Sjsg static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
719c349dbc7Sjsg 		struct ras_common_if *head)
720c349dbc7Sjsg {
721c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
722c349dbc7Sjsg 
723c349dbc7Sjsg 	return con->features & BIT(head->block);
724c349dbc7Sjsg }
725c349dbc7Sjsg 
726c349dbc7Sjsg /*
727c349dbc7Sjsg  * if obj is not created, then create one.
728c349dbc7Sjsg  * set feature enable flag.
729c349dbc7Sjsg  */
730c349dbc7Sjsg static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
731c349dbc7Sjsg 		struct ras_common_if *head, int enable)
732c349dbc7Sjsg {
733c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
734c349dbc7Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
735c349dbc7Sjsg 
736c349dbc7Sjsg 	/* If hardware does not support ras, then do not create obj.
737c349dbc7Sjsg 	 * But if hardware supports ras, we can create the obj.
738c349dbc7Sjsg 	 * Ras framework checks con->hw_supported to see if it needs to do
739c349dbc7Sjsg 	 * the corresponding initialization.
740c349dbc7Sjsg 	 * IP checks con->support to see if it needs to disable ras.
741c349dbc7Sjsg 	 */
742c349dbc7Sjsg 	if (!amdgpu_ras_is_feature_allowed(adev, head))
743c349dbc7Sjsg 		return 0;
744c349dbc7Sjsg 
745c349dbc7Sjsg 	if (enable) {
746c349dbc7Sjsg 		if (!obj) {
747c349dbc7Sjsg 			obj = amdgpu_ras_create_obj(adev, head);
748c349dbc7Sjsg 			if (!obj)
749c349dbc7Sjsg 				return -EINVAL;
750c349dbc7Sjsg 		} else {
751c349dbc7Sjsg 			/* In case we create obj somewhere else */
752c349dbc7Sjsg 			get_obj(obj);
753c349dbc7Sjsg 		}
754c349dbc7Sjsg 		con->features |= BIT(head->block);
755c349dbc7Sjsg 	} else {
756c349dbc7Sjsg 		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
757c349dbc7Sjsg 			con->features &= ~BIT(head->block);
758c349dbc7Sjsg 			put_obj(obj);
759c349dbc7Sjsg 		}
760c349dbc7Sjsg 	}
761c349dbc7Sjsg 
762c349dbc7Sjsg 	return 0;
763c349dbc7Sjsg }
764c349dbc7Sjsg 
765c349dbc7Sjsg /* wrapper of psp_ras_enable_features */
766c349dbc7Sjsg int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
767c349dbc7Sjsg 		struct ras_common_if *head, bool enable)
768c349dbc7Sjsg {
769c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
770ad8b1aafSjsg 	union ta_ras_cmd_input *info;
771c349dbc7Sjsg 	int ret;
772c349dbc7Sjsg 
773c349dbc7Sjsg 	if (!con)
774c349dbc7Sjsg 		return -EINVAL;
775c349dbc7Sjsg 
776f005ef32Sjsg 	/* Do not enable ras feature if it is not allowed */
777f005ef32Sjsg 	if (enable &&
778f005ef32Sjsg 	    head->block != AMDGPU_RAS_BLOCK__GFX &&
779f005ef32Sjsg 	    !amdgpu_ras_is_feature_allowed(adev, head))
780f005ef32Sjsg 		return 0;
781f005ef32Sjsg 
782f005ef32Sjsg 	/* Only enable gfx ras feature from host side */
783f005ef32Sjsg 	if (head->block == AMDGPU_RAS_BLOCK__GFX &&
784f005ef32Sjsg 	    !amdgpu_sriov_vf(adev) &&
785f005ef32Sjsg 	    !amdgpu_ras_intr_triggered()) {
786ad8b1aafSjsg 		info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
787ad8b1aafSjsg 		if (!info)
788ad8b1aafSjsg 			return -ENOMEM;
789ad8b1aafSjsg 
790c349dbc7Sjsg 		if (!enable) {
791ad8b1aafSjsg 			info->disable_features = (struct ta_ras_disable_features_input) {
792c349dbc7Sjsg 				.block_id =  amdgpu_ras_block_to_ta(head->block),
793c349dbc7Sjsg 				.error_type = amdgpu_ras_error_to_ta(head->type),
794c349dbc7Sjsg 			};
795c349dbc7Sjsg 		} else {
796ad8b1aafSjsg 			info->enable_features = (struct ta_ras_enable_features_input) {
797c349dbc7Sjsg 				.block_id =  amdgpu_ras_block_to_ta(head->block),
798c349dbc7Sjsg 				.error_type = amdgpu_ras_error_to_ta(head->type),
799c349dbc7Sjsg 			};
800c349dbc7Sjsg 		}
801c349dbc7Sjsg 
802ad8b1aafSjsg 		ret = psp_ras_enable_features(&adev->psp, info, enable);
803c349dbc7Sjsg 		if (ret) {
8041bb76ff1Sjsg 			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
805c349dbc7Sjsg 				enable ? "enable":"disable",
8061bb76ff1Sjsg 				get_ras_block_str(head),
8071bb76ff1Sjsg 				amdgpu_ras_is_poison_mode_supported(adev), ret);
808f005ef32Sjsg 			kfree(info);
809f005ef32Sjsg 			return ret;
810c349dbc7Sjsg 		}
811f005ef32Sjsg 
812f005ef32Sjsg 		kfree(info);
813c349dbc7Sjsg 	}
814c349dbc7Sjsg 
815c349dbc7Sjsg 	/* setup the obj */
816c349dbc7Sjsg 	__amdgpu_ras_feature_enable(adev, head, enable);
817f005ef32Sjsg 
818f005ef32Sjsg 	return 0;
819c349dbc7Sjsg }
820c349dbc7Sjsg 
821c349dbc7Sjsg /* Only used in device probe stage and called only once. */
822c349dbc7Sjsg int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
823c349dbc7Sjsg 		struct ras_common_if *head, bool enable)
824c349dbc7Sjsg {
825c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
826c349dbc7Sjsg 	int ret;
827c349dbc7Sjsg 
828c349dbc7Sjsg 	if (!con)
829c349dbc7Sjsg 		return -EINVAL;
830c349dbc7Sjsg 
831c349dbc7Sjsg 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
832c349dbc7Sjsg 		if (enable) {
833c349dbc7Sjsg 			/* There is no harm in issuing a ras TA cmd regardless of
834c349dbc7Sjsg 			 * the current ras state.
835c349dbc7Sjsg 			 * If current state == target state, it will do nothing.
836c349dbc7Sjsg 			 * But sometimes it requests the driver to reset and repost
837c349dbc7Sjsg 			 * with error code -EAGAIN.
838c349dbc7Sjsg 			 */
839c349dbc7Sjsg 			ret = amdgpu_ras_feature_enable(adev, head, 1);
840c349dbc7Sjsg 			/* With old ras TA, we might fail to enable ras.
841c349dbc7Sjsg 			 * Log it and just set up the object.
842c349dbc7Sjsg 			 * TODO: remove this WA in the future.
843c349dbc7Sjsg 			 */
844c349dbc7Sjsg 			if (ret == -EINVAL) {
845c349dbc7Sjsg 				ret = __amdgpu_ras_feature_enable(adev, head, 1);
846c349dbc7Sjsg 				if (!ret)
847ad8b1aafSjsg 					dev_info(adev->dev,
848ad8b1aafSjsg 						"RAS INFO: %s setup object\n",
8491bb76ff1Sjsg 						get_ras_block_str(head));
850c349dbc7Sjsg 			}
851c349dbc7Sjsg 		} else {
852c349dbc7Sjsg 			/* setup the object then issue a ras TA disable cmd.*/
853c349dbc7Sjsg 			ret = __amdgpu_ras_feature_enable(adev, head, 1);
854c349dbc7Sjsg 			if (ret)
855c349dbc7Sjsg 				return ret;
856c349dbc7Sjsg 
8575ca02815Sjsg 			/* gfx block ras disable cmd must be sent to ras-ta */
8585ca02815Sjsg 			if (head->block == AMDGPU_RAS_BLOCK__GFX)
8595ca02815Sjsg 				con->features |= BIT(head->block);
8605ca02815Sjsg 
861c349dbc7Sjsg 			ret = amdgpu_ras_feature_enable(adev, head, 0);
8625ca02815Sjsg 
8635ca02815Sjsg 			/* clean gfx block ras features flag */
8645ca02815Sjsg 			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
8655ca02815Sjsg 				con->features &= ~BIT(head->block);
866c349dbc7Sjsg 		}
867c349dbc7Sjsg 	} else
868c349dbc7Sjsg 		ret = amdgpu_ras_feature_enable(adev, head, enable);
869c349dbc7Sjsg 
870c349dbc7Sjsg 	return ret;
871c349dbc7Sjsg }
872c349dbc7Sjsg 
873c349dbc7Sjsg static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
874c349dbc7Sjsg 		bool bypass)
875c349dbc7Sjsg {
876c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
877c349dbc7Sjsg 	struct ras_manager *obj, *tmp;
878c349dbc7Sjsg 
879c349dbc7Sjsg 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
880c349dbc7Sjsg 		/* bypass psp.
881c349dbc7Sjsg 		 * aka just release the obj and corresponding flags
882c349dbc7Sjsg 		 */
883c349dbc7Sjsg 		if (bypass) {
884c349dbc7Sjsg 			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
885c349dbc7Sjsg 				break;
886c349dbc7Sjsg 		} else {
887c349dbc7Sjsg 			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
888c349dbc7Sjsg 				break;
889c349dbc7Sjsg 		}
890c349dbc7Sjsg 	}
891c349dbc7Sjsg 
892c349dbc7Sjsg 	return con->features;
893c349dbc7Sjsg }
894c349dbc7Sjsg 
895c349dbc7Sjsg static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
896c349dbc7Sjsg 		bool bypass)
897c349dbc7Sjsg {
898c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
899c349dbc7Sjsg 	int i;
9001bb76ff1Sjsg 	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
901c349dbc7Sjsg 
9021bb76ff1Sjsg 	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
903c349dbc7Sjsg 		struct ras_common_if head = {
904c349dbc7Sjsg 			.block = i,
905c349dbc7Sjsg 			.type = default_ras_type,
906c349dbc7Sjsg 			.sub_block_index = 0,
907c349dbc7Sjsg 		};
9081bb76ff1Sjsg 
9091bb76ff1Sjsg 		if (i == AMDGPU_RAS_BLOCK__MCA)
9101bb76ff1Sjsg 			continue;
9111bb76ff1Sjsg 
9121bb76ff1Sjsg 		if (bypass) {
9131bb76ff1Sjsg 			/*
9141bb76ff1Sjsg 			 * bypass psp. vbios enable ras for us.
9151bb76ff1Sjsg 			 * so just create the obj
9161bb76ff1Sjsg 			 */
9171bb76ff1Sjsg 			if (__amdgpu_ras_feature_enable(adev, &head, 1))
9181bb76ff1Sjsg 				break;
9191bb76ff1Sjsg 		} else {
9201bb76ff1Sjsg 			if (amdgpu_ras_feature_enable(adev, &head, 1))
9211bb76ff1Sjsg 				break;
9221bb76ff1Sjsg 		}
9231bb76ff1Sjsg 	}
9241bb76ff1Sjsg 
9251bb76ff1Sjsg 	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
9261bb76ff1Sjsg 		struct ras_common_if head = {
9271bb76ff1Sjsg 			.block = AMDGPU_RAS_BLOCK__MCA,
9281bb76ff1Sjsg 			.type = default_ras_type,
9291bb76ff1Sjsg 			.sub_block_index = i,
9301bb76ff1Sjsg 		};
9311bb76ff1Sjsg 
932c349dbc7Sjsg 		if (bypass) {
933c349dbc7Sjsg 			/*
934c349dbc7Sjsg 			 * bypass psp. vbios enable ras for us.
935c349dbc7Sjsg 			 * so just create the obj
936c349dbc7Sjsg 			 */
937c349dbc7Sjsg 			if (__amdgpu_ras_feature_enable(adev, &head, 1))
938c349dbc7Sjsg 				break;
939c349dbc7Sjsg 		} else {
940c349dbc7Sjsg 			if (amdgpu_ras_feature_enable(adev, &head, 1))
941c349dbc7Sjsg 				break;
942c349dbc7Sjsg 		}
943c349dbc7Sjsg 	}
944c349dbc7Sjsg 
945c349dbc7Sjsg 	return con->features;
946c349dbc7Sjsg }
947c349dbc7Sjsg /* feature ctl end */
948c349dbc7Sjsg 
9491bb76ff1Sjsg static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
9501bb76ff1Sjsg 		enum amdgpu_ras_block block)
9511bb76ff1Sjsg {
9521bb76ff1Sjsg 	if (!block_obj)
9531bb76ff1Sjsg 		return -EINVAL;
9541bb76ff1Sjsg 
9551bb76ff1Sjsg 	if (block_obj->ras_comm.block == block)
9561bb76ff1Sjsg 		return 0;
9571bb76ff1Sjsg 
9581bb76ff1Sjsg 	return -EINVAL;
9591bb76ff1Sjsg }
9601bb76ff1Sjsg 
9611bb76ff1Sjsg static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
9621bb76ff1Sjsg 					enum amdgpu_ras_block block, uint32_t sub_block_index)
9631bb76ff1Sjsg {
9641bb76ff1Sjsg 	struct amdgpu_ras_block_list *node, *tmp;
9651bb76ff1Sjsg 	struct amdgpu_ras_block_object *obj;
9661bb76ff1Sjsg 
9671bb76ff1Sjsg 	if (block >= AMDGPU_RAS_BLOCK__LAST)
9681bb76ff1Sjsg 		return NULL;
9691bb76ff1Sjsg 
9701bb76ff1Sjsg 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
9711bb76ff1Sjsg 		if (!node->ras_obj) {
9721bb76ff1Sjsg 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
9731bb76ff1Sjsg 			continue;
9741bb76ff1Sjsg 		}
9751bb76ff1Sjsg 
9761bb76ff1Sjsg 		obj = node->ras_obj;
9771bb76ff1Sjsg 		if (obj->ras_block_match) {
9781bb76ff1Sjsg 			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
9791bb76ff1Sjsg 				return obj;
9801bb76ff1Sjsg 		} else {
9811bb76ff1Sjsg 			if (amdgpu_ras_block_match_default(obj, block) == 0)
9821bb76ff1Sjsg 				return obj;
9831bb76ff1Sjsg 		}
9841bb76ff1Sjsg 	}
9851bb76ff1Sjsg 
9861bb76ff1Sjsg 	return NULL;
9871bb76ff1Sjsg }
9881bb76ff1Sjsg 
9891bb76ff1Sjsg static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
9901bb76ff1Sjsg {
9911bb76ff1Sjsg 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9921bb76ff1Sjsg 	int ret = 0;
9931bb76ff1Sjsg 
9941bb76ff1Sjsg 	/*
9951bb76ff1Sjsg 	 * choose the right query method according to
9961bb76ff1Sjsg 	 * whether the smu supports querying error information
9971bb76ff1Sjsg 	 */
9981bb76ff1Sjsg 	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
9991bb76ff1Sjsg 	if (ret == -EOPNOTSUPP) {
10001bb76ff1Sjsg 		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
10011bb76ff1Sjsg 			adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
10021bb76ff1Sjsg 			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
10031bb76ff1Sjsg 
10041bb76ff1Sjsg 		/* umc query_ras_error_address is also responsible for clearing
10051bb76ff1Sjsg 		 * error status
10061bb76ff1Sjsg 		 */
10071bb76ff1Sjsg 		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
10081bb76ff1Sjsg 		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
10091bb76ff1Sjsg 			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
10101bb76ff1Sjsg 	} else if (!ret) {
10111bb76ff1Sjsg 		if (adev->umc.ras &&
10121bb76ff1Sjsg 			adev->umc.ras->ecc_info_query_ras_error_count)
10131bb76ff1Sjsg 			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
10141bb76ff1Sjsg 
10151bb76ff1Sjsg 		if (adev->umc.ras &&
10161bb76ff1Sjsg 			adev->umc.ras->ecc_info_query_ras_error_address)
10171bb76ff1Sjsg 			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
10181bb76ff1Sjsg 	}
10191bb76ff1Sjsg }
10201bb76ff1Sjsg 
1021c349dbc7Sjsg /* query/inject/cure begin */
10225ca02815Sjsg int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
1023c349dbc7Sjsg 				  struct ras_query_if *info)
1024c349dbc7Sjsg {
10251bb76ff1Sjsg 	struct amdgpu_ras_block_object *block_obj = NULL;
1026c349dbc7Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1027c349dbc7Sjsg 	struct ras_err_data err_data = {0, 0, 0, NULL};
1028c349dbc7Sjsg 
1029c349dbc7Sjsg 	if (!obj)
1030c349dbc7Sjsg 		return -EINVAL;
1031c349dbc7Sjsg 
1032b2bc41bbSjsg 	if (!info || info->head.block == AMDGPU_RAS_BLOCK_COUNT)
1033b2bc41bbSjsg 		return -EINVAL;
1034b2bc41bbSjsg 
10351bb76ff1Sjsg 	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
10361bb76ff1Sjsg 		amdgpu_ras_get_ecc_info(adev, &err_data);
10371bb76ff1Sjsg 	} else {
10381bb76ff1Sjsg 		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
10391bb76ff1Sjsg 		if (!block_obj || !block_obj->hw_ops)   {
10401bb76ff1Sjsg 			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
10411bb76ff1Sjsg 				     get_ras_block_str(&info->head));
10421bb76ff1Sjsg 			return -EINVAL;
1043c349dbc7Sjsg 		}
10445ca02815Sjsg 
10451bb76ff1Sjsg 		if (block_obj->hw_ops->query_ras_error_count)
10461bb76ff1Sjsg 			block_obj->hw_ops->query_ras_error_count(adev, &err_data);
10475ca02815Sjsg 
10481bb76ff1Sjsg 		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
10491bb76ff1Sjsg 		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
10501bb76ff1Sjsg 		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
10511bb76ff1Sjsg 				if (block_obj->hw_ops->query_ras_error_status)
10521bb76ff1Sjsg 					block_obj->hw_ops->query_ras_error_status(adev);
10531bb76ff1Sjsg 			}
1054c349dbc7Sjsg 	}
1055c349dbc7Sjsg 
1056c349dbc7Sjsg 	obj->err_data.ue_count += err_data.ue_count;
1057c349dbc7Sjsg 	obj->err_data.ce_count += err_data.ce_count;
1058c349dbc7Sjsg 
1059c349dbc7Sjsg 	info->ue_count = obj->err_data.ue_count;
1060c349dbc7Sjsg 	info->ce_count = obj->err_data.ce_count;
1061c349dbc7Sjsg 
1062c349dbc7Sjsg 	if (err_data.ce_count) {
1063f005ef32Sjsg 		if (!adev->aid_mask &&
1064f005ef32Sjsg 		    adev->smuio.funcs &&
10655ca02815Sjsg 		    adev->smuio.funcs->get_socket_id &&
10665ca02815Sjsg 		    adev->smuio.funcs->get_die_id) {
10675ca02815Sjsg 			dev_info(adev->dev, "socket: %d, die: %d "
10685ca02815Sjsg 					"%ld correctable hardware errors "
10695ca02815Sjsg 					"detected in %s block, no user "
10705ca02815Sjsg 					"action is needed.\n",
10715ca02815Sjsg 					adev->smuio.funcs->get_socket_id(adev),
10725ca02815Sjsg 					adev->smuio.funcs->get_die_id(adev),
10735ca02815Sjsg 					obj->err_data.ce_count,
10741bb76ff1Sjsg 					get_ras_block_str(&info->head));
10755ca02815Sjsg 		} else {
1076ad8b1aafSjsg 			dev_info(adev->dev, "%ld correctable hardware errors "
1077ad8b1aafSjsg 					"detected in %s block, no user "
1078ad8b1aafSjsg 					"action is needed.\n",
1079ad8b1aafSjsg 					obj->err_data.ce_count,
10801bb76ff1Sjsg 					get_ras_block_str(&info->head));
1081c349dbc7Sjsg 		}
10825ca02815Sjsg 	}
1083c349dbc7Sjsg 	if (err_data.ue_count) {
1084f005ef32Sjsg 		if (!adev->aid_mask &&
1085f005ef32Sjsg 		    adev->smuio.funcs &&
10865ca02815Sjsg 		    adev->smuio.funcs->get_socket_id &&
10875ca02815Sjsg 		    adev->smuio.funcs->get_die_id) {
10885ca02815Sjsg 			dev_info(adev->dev, "socket: %d, die: %d "
10895ca02815Sjsg 					"%ld uncorrectable hardware errors "
10905ca02815Sjsg 					"detected in %s block\n",
10915ca02815Sjsg 					adev->smuio.funcs->get_socket_id(adev),
10925ca02815Sjsg 					adev->smuio.funcs->get_die_id(adev),
10935ca02815Sjsg 					obj->err_data.ue_count,
10941bb76ff1Sjsg 					get_ras_block_str(&info->head));
10955ca02815Sjsg 		} else {
1096ad8b1aafSjsg 			dev_info(adev->dev, "%ld uncorrectable hardware errors "
1097ad8b1aafSjsg 					"detected in %s block\n",
1098ad8b1aafSjsg 					obj->err_data.ue_count,
10991bb76ff1Sjsg 					get_ras_block_str(&info->head));
1100c349dbc7Sjsg 		}
11015ca02815Sjsg 	}
11025ca02815Sjsg 
11035ca02815Sjsg 	return 0;
11045ca02815Sjsg }
11055ca02815Sjsg 
11065ca02815Sjsg int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
11075ca02815Sjsg 		enum amdgpu_ras_block block)
11085ca02815Sjsg {
11091bb76ff1Sjsg 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
11101bb76ff1Sjsg 
11115ca02815Sjsg 	if (!amdgpu_ras_is_supported(adev, block))
11125ca02815Sjsg 		return -EINVAL;
11135ca02815Sjsg 
11141bb76ff1Sjsg 	if (!block_obj || !block_obj->hw_ops)   {
11151bb76ff1Sjsg 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
11161bb76ff1Sjsg 			     ras_block_str(block));
11171bb76ff1Sjsg 		return -EINVAL;
11181bb76ff1Sjsg 	}
11195ca02815Sjsg 
11201bb76ff1Sjsg 	if (block_obj->hw_ops->reset_ras_error_count)
11211bb76ff1Sjsg 		block_obj->hw_ops->reset_ras_error_count(adev);
11225ca02815Sjsg 
11231bb76ff1Sjsg 	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
11241bb76ff1Sjsg 	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
11251bb76ff1Sjsg 		if (block_obj->hw_ops->reset_ras_error_status)
11261bb76ff1Sjsg 			block_obj->hw_ops->reset_ras_error_status(adev);
11275ca02815Sjsg 	}
1128c349dbc7Sjsg 
1129c349dbc7Sjsg 	return 0;
1130c349dbc7Sjsg }
1131c349dbc7Sjsg 
1132c349dbc7Sjsg /* wrapper of psp_ras_trigger_error */
1133c349dbc7Sjsg int amdgpu_ras_error_inject(struct amdgpu_device *adev,
1134c349dbc7Sjsg 		struct ras_inject_if *info)
1135c349dbc7Sjsg {
1136c349dbc7Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1137c349dbc7Sjsg 	struct ta_ras_trigger_error_input block_info = {
1138c349dbc7Sjsg 		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
1139c349dbc7Sjsg 		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
1140c349dbc7Sjsg 		.sub_block_index = info->head.sub_block_index,
1141c349dbc7Sjsg 		.address = info->address,
1142c349dbc7Sjsg 		.value = info->value,
1143c349dbc7Sjsg 	};
11441bb76ff1Sjsg 	int ret = -EINVAL;
11451bb76ff1Sjsg 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
11461bb76ff1Sjsg 							info->head.block,
11471bb76ff1Sjsg 							info->head.sub_block_index);
1148c349dbc7Sjsg 
1149f005ef32Sjsg 	/* inject on guest isn't allowed, return success directly */
1150f005ef32Sjsg 	if (amdgpu_sriov_vf(adev))
1151f005ef32Sjsg 		return 0;
1152f005ef32Sjsg 
1153c349dbc7Sjsg 	if (!obj)
1154c349dbc7Sjsg 		return -EINVAL;
1155c349dbc7Sjsg 
11561bb76ff1Sjsg 	if (!block_obj || !block_obj->hw_ops)	{
11571bb76ff1Sjsg 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
11581bb76ff1Sjsg 			     get_ras_block_str(&info->head));
11591bb76ff1Sjsg 		return -EINVAL;
11601bb76ff1Sjsg 	}
11611bb76ff1Sjsg 
1162c349dbc7Sjsg 	/* Calculate XGMI relative offset */
1163f005ef32Sjsg 	if (adev->gmc.xgmi.num_physical_nodes > 1 &&
1164f005ef32Sjsg 	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {
1165c349dbc7Sjsg 		block_info.address =
1166c349dbc7Sjsg 			amdgpu_xgmi_get_relative_phy_addr(adev,
1167c349dbc7Sjsg 							  block_info.address);
1168c349dbc7Sjsg 	}
1169c349dbc7Sjsg 
1170f005ef32Sjsg 	if (block_obj->hw_ops->ras_error_inject) {
1171f005ef32Sjsg 		if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
1172f005ef32Sjsg 			ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
1173f005ef32Sjsg 		else /* Special ras_error_inject is defined (e.g: xgmi) */
1174f005ef32Sjsg 			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
1175f005ef32Sjsg 						info->instance_mask);
11761bb76ff1Sjsg 	} else {
1177f005ef32Sjsg 		/* default path */
1178f005ef32Sjsg 		ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
1179c349dbc7Sjsg 	}
1180c349dbc7Sjsg 
11815ca02815Sjsg 	if (ret)
11825ca02815Sjsg 		dev_err(adev->dev, "ras inject %s failed %d\n",
11831bb76ff1Sjsg 			get_ras_block_str(&info->head), ret);
1184c349dbc7Sjsg 
1185c349dbc7Sjsg 	return ret;
1186c349dbc7Sjsg }
1187c349dbc7Sjsg 
11885ca02815Sjsg /**
1189f005ef32Sjsg  * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
1190f005ef32Sjsg  * @adev: pointer to AMD GPU device
1191f005ef32Sjsg  * @ce_count: pointer to an integer to be set to the count of correctable errors.
1192f005ef32Sjsg  * @ue_count: pointer to an integer to be set to the count of uncorrectable errors.
1193f005ef32Sjsg  * @query_info: pointer to ras_query_if
1194f005ef32Sjsg  *
1195f005ef32Sjsg  * Return 0 if the query succeeds or there is nothing to do; otherwise
1196f005ef32Sjsg  * return an error on failure.
1197f005ef32Sjsg  */
1198f005ef32Sjsg static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
1199f005ef32Sjsg 					       unsigned long *ce_count,
1200f005ef32Sjsg 					       unsigned long *ue_count,
1201f005ef32Sjsg 					       struct ras_query_if *query_info)
1202f005ef32Sjsg {
1203f005ef32Sjsg 	int ret;
1204f005ef32Sjsg 
1205f005ef32Sjsg 	if (!query_info)
1206f005ef32Sjsg 		/* do nothing if query_info is not specified */
1207f005ef32Sjsg 		return 0;
1208f005ef32Sjsg 
1209f005ef32Sjsg 	ret = amdgpu_ras_query_error_status(adev, query_info);
1210f005ef32Sjsg 	if (ret)
1211f005ef32Sjsg 		return ret;
1212f005ef32Sjsg 
1213f005ef32Sjsg 	*ce_count += query_info->ce_count;
1214f005ef32Sjsg 	*ue_count += query_info->ue_count;
1215f005ef32Sjsg 
1216f005ef32Sjsg 	/* some hardware/IP supports read-to-clear, so there is no need to
1217f005ef32Sjsg 	 * explicitly reset the err status after the query call */
1218f005ef32Sjsg 	if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
1219f005ef32Sjsg 	    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
1220f005ef32Sjsg 		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
1221f005ef32Sjsg 			dev_warn(adev->dev,
1222f005ef32Sjsg 				 "Failed to reset error counter and error status\n");
1223f005ef32Sjsg 	}
1224f005ef32Sjsg 
1225f005ef32Sjsg 	return 0;
1226f005ef32Sjsg }
1227f005ef32Sjsg 
1228f005ef32Sjsg /**
1229f005ef32Sjsg  * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
12301bb76ff1Sjsg  * @adev: pointer to AMD GPU device
12311bb76ff1Sjsg  * @ce_count: pointer to an integer to be set to the count of correctable errors.
12321bb76ff1Sjsg  * @ue_count: pointer to an integer to be set to the count of uncorrectable
12335ca02815Sjsg  * errors.
1234f005ef32Sjsg  * @query_info: pointer to ras_query_if if the query request is only for
1235f005ef32Sjsg  * a specific ip block; if info is NULL, then the query request is for
1236f005ef32Sjsg  * all the ip blocks that support query ras error counters/status
12375ca02815Sjsg  *
12385ca02815Sjsg  * If set, @ce_count or @ue_count, count and return the corresponding
12395ca02815Sjsg  * error counts in those integer pointers. Return 0 if the device
12405ca02815Sjsg  * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
12415ca02815Sjsg  */
12425ca02815Sjsg int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
12435ca02815Sjsg 				 unsigned long *ce_count,
1244f005ef32Sjsg 				 unsigned long *ue_count,
1245f005ef32Sjsg 				 struct ras_query_if *query_info)
1246c349dbc7Sjsg {
1247c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1248c349dbc7Sjsg 	struct ras_manager *obj;
12495ca02815Sjsg 	unsigned long ce, ue;
1250f005ef32Sjsg 	int ret;
1251c349dbc7Sjsg 
12525ca02815Sjsg 	if (!adev->ras_enabled || !con)
12535ca02815Sjsg 		return -EOPNOTSUPP;
12545ca02815Sjsg 
12555ca02815Sjsg 	/* Don't count since no reporting.
12565ca02815Sjsg 	 */
12575ca02815Sjsg 	if (!ce_count && !ue_count)
1258c349dbc7Sjsg 		return 0;
1259c349dbc7Sjsg 
12605ca02815Sjsg 	ce = 0;
12615ca02815Sjsg 	ue = 0;
1262f005ef32Sjsg 	if (!query_info) {
1263f005ef32Sjsg 		/* query all the ip blocks that support ras query interface */
1264c349dbc7Sjsg 		list_for_each_entry(obj, &con->head, node) {
1265c349dbc7Sjsg 			struct ras_query_if info = {
1266c349dbc7Sjsg 				.head = obj->head,
1267c349dbc7Sjsg 			};
1268c349dbc7Sjsg 
1269f005ef32Sjsg 			ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
1270f005ef32Sjsg 		}
1271f005ef32Sjsg 	} else {
1272f005ef32Sjsg 		/* query specific ip block */
1273f005ef32Sjsg 		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
12741bb76ff1Sjsg 	}
12751bb76ff1Sjsg 
1276f005ef32Sjsg 	if (ret)
1277f005ef32Sjsg 		return ret;
1278c349dbc7Sjsg 
12795ca02815Sjsg 	if (ce_count)
12805ca02815Sjsg 		*ce_count = ce;
12815ca02815Sjsg 
12825ca02815Sjsg 	if (ue_count)
12835ca02815Sjsg 		*ue_count = ue;
12845ca02815Sjsg 
12855ca02815Sjsg 	return 0;
1286c349dbc7Sjsg }
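
/*
 * Usage sketch (illustrative only): a caller that wants the accumulated
 * counts across every RAS-enabled block can pass a NULL query_info, as the
 * kernel-doc above describes.  Error handling is elided for brevity.
 *
 * .. code-block:: c
 *
 *	unsigned long ce_count = 0, ue_count = 0;
 *
 *	if (!amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL))
 *		dev_info(adev->dev, "RAS totals: %lu ce, %lu ue\n",
 *			 ce_count, ue_count);
 */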
1287c349dbc7Sjsg /* query/inject/cure end */
1288c349dbc7Sjsg 
1289c349dbc7Sjsg #ifdef __linux__
1290c349dbc7Sjsg 
1291c349dbc7Sjsg /* sysfs begin */
1292c349dbc7Sjsg 
1293c349dbc7Sjsg static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1294c349dbc7Sjsg 		struct ras_badpage **bps, unsigned int *count);
1295c349dbc7Sjsg 
1296c349dbc7Sjsg static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1297c349dbc7Sjsg {
1298c349dbc7Sjsg 	switch (flags) {
1299c349dbc7Sjsg 	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1300c349dbc7Sjsg 		return "R";
1301c349dbc7Sjsg 	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1302c349dbc7Sjsg 		return "P";
1303c349dbc7Sjsg 	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1304c349dbc7Sjsg 	default:
1305c349dbc7Sjsg 		return "F";
13065ca02815Sjsg 	}
1307c349dbc7Sjsg }
1308c349dbc7Sjsg 
1309c349dbc7Sjsg /**
1310c349dbc7Sjsg  * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
1311c349dbc7Sjsg  *
1312c349dbc7Sjsg  * It allows the user to read the bad pages of vram on the gpu through
1313c349dbc7Sjsg  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1314c349dbc7Sjsg  *
1315c349dbc7Sjsg  * It outputs multiple lines, and each line stands for one gpu page.
1316c349dbc7Sjsg  *
1317c349dbc7Sjsg  * The format of one line is below,
1318c349dbc7Sjsg  * gpu pfn : gpu page size : flags
1319c349dbc7Sjsg  *
1320c349dbc7Sjsg  * gpu pfn and gpu page size are printed in hex format.
1321c349dbc7Sjsg  * flags can be one of the characters below,
1322c349dbc7Sjsg  *
1323c349dbc7Sjsg  * R: reserved, this gpu page is reserved and not available for use.
1324c349dbc7Sjsg  *
1325c349dbc7Sjsg  * P: pending for reserve, this gpu page is marked as bad and will be reserved
1326c349dbc7Sjsg  * in the next window of page_reserve.
1327c349dbc7Sjsg  *
1328c349dbc7Sjsg  * F: unable to reserve. This gpu page can't be reserved for some reason.
1329c349dbc7Sjsg  *
1330c349dbc7Sjsg  * Examples:
1331c349dbc7Sjsg  *
1332c349dbc7Sjsg  * .. code-block:: bash
1333c349dbc7Sjsg  *
1334c349dbc7Sjsg  *	0x00000001 : 0x00001000 : R
1335c349dbc7Sjsg  *	0x00000002 : 0x00001000 : P
1336c349dbc7Sjsg  *
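 * A hypothetical way to read the file from userspace (the card index and
 * path may differ per system):
 *
 * .. code-block:: bash
 *
 *	cat /sys/class/drm/card0/device/ras/gpu_vram_bad_pages
 *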
1337c349dbc7Sjsg  */
1338c349dbc7Sjsg 
1339c349dbc7Sjsg static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1340c349dbc7Sjsg 		struct kobject *kobj, struct bin_attribute *attr,
1341c349dbc7Sjsg 		char *buf, loff_t ppos, size_t count)
1342c349dbc7Sjsg {
1343c349dbc7Sjsg 	struct amdgpu_ras *con =
1344c349dbc7Sjsg 		container_of(attr, struct amdgpu_ras, badpages_attr);
1345c349dbc7Sjsg 	struct amdgpu_device *adev = con->adev;
1346c349dbc7Sjsg 	const unsigned int element_size =
1347c349dbc7Sjsg 		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1348c349dbc7Sjsg 	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1349c349dbc7Sjsg 	unsigned int end = div64_ul(ppos + count - 1, element_size);
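	/*
	 * Each bad page is emitted as one fixed-width 28-byte line
	 * ("0x%08x : 0x%08x : %1s\n"), so ppos/count are translated into a
	 * [start, end) window of line indices here.  As an illustrative
	 * example, ppos = 0 and count = 4096 cover lines 0..145, i.e.
	 * 146 * 28 = 4088 bytes.
	 */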
1350c349dbc7Sjsg 	ssize_t s = 0;
1351c349dbc7Sjsg 	struct ras_badpage *bps = NULL;
1352c349dbc7Sjsg 	unsigned int bps_count = 0;
1353c349dbc7Sjsg 
1354c349dbc7Sjsg 	memset(buf, 0, count);
1355c349dbc7Sjsg 
1356c349dbc7Sjsg 	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1357c349dbc7Sjsg 		return 0;
1358c349dbc7Sjsg 
1359c349dbc7Sjsg 	for (; start < end && start < bps_count; start++)
1360c349dbc7Sjsg 		s += scnprintf(&buf[s], element_size + 1,
1361c349dbc7Sjsg 				"0x%08x : 0x%08x : %1s\n",
1362c349dbc7Sjsg 				bps[start].bp,
1363c349dbc7Sjsg 				bps[start].size,
1364c349dbc7Sjsg 				amdgpu_ras_badpage_flags_str(bps[start].flags));
1365c349dbc7Sjsg 
1366c349dbc7Sjsg 	kfree(bps);
1367c349dbc7Sjsg 
1368c349dbc7Sjsg 	return s;
1369c349dbc7Sjsg }
1370c349dbc7Sjsg 
1371c349dbc7Sjsg static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1372c349dbc7Sjsg 		struct device_attribute *attr, char *buf)
1373c349dbc7Sjsg {
1374c349dbc7Sjsg 	struct amdgpu_ras *con =
1375c349dbc7Sjsg 		container_of(attr, struct amdgpu_ras, features_attr);
1376c349dbc7Sjsg 
1377f005ef32Sjsg 	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
1378c349dbc7Sjsg }
1379c349dbc7Sjsg 
1380ad8b1aafSjsg static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1381c349dbc7Sjsg {
1382c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1383c349dbc7Sjsg 
138475aab5aaSjsg 	if (adev->dev->kobj.sd)
1385ad8b1aafSjsg 		sysfs_remove_file_from_group(&adev->dev->kobj,
1386ad8b1aafSjsg 				&con->badpages_attr.attr,
1387ad8b1aafSjsg 				RAS_FS_NAME);
1388c349dbc7Sjsg }
1389c349dbc7Sjsg 
1390c349dbc7Sjsg static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1391c349dbc7Sjsg {
1392c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1393c349dbc7Sjsg 	struct attribute *attrs[] = {
1394c349dbc7Sjsg 		&con->features_attr.attr,
1395c349dbc7Sjsg 		NULL
1396c349dbc7Sjsg 	};
1397c349dbc7Sjsg 	struct attribute_group group = {
1398ad8b1aafSjsg 		.name = RAS_FS_NAME,
1399c349dbc7Sjsg 		.attrs = attrs,
1400c349dbc7Sjsg 	};
1401c349dbc7Sjsg 
140275aab5aaSjsg 	if (adev->dev->kobj.sd)
1403c349dbc7Sjsg 		sysfs_remove_group(&adev->dev->kobj, &group);
1404c349dbc7Sjsg 
1405c349dbc7Sjsg 	return 0;
1406c349dbc7Sjsg }
1407c349dbc7Sjsg 
1408c349dbc7Sjsg #endif /* __linux__ */
1409c349dbc7Sjsg 
1410c349dbc7Sjsg int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
14111bb76ff1Sjsg 		struct ras_common_if *head)
1412c349dbc7Sjsg {
14131bb76ff1Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1414c349dbc7Sjsg 
1415c349dbc7Sjsg 	if (!obj || obj->attr_inuse)
1416c349dbc7Sjsg 		return -EINVAL;
1417c349dbc7Sjsg 
1418c349dbc7Sjsg 	STUB();
1419c349dbc7Sjsg 	return -ENOSYS;
1420c349dbc7Sjsg #ifdef notyet
1421c349dbc7Sjsg 	get_obj(obj);
1422c349dbc7Sjsg 
14231bb76ff1Sjsg 	snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
14241bb76ff1Sjsg 		"%s_err_count", head->name);
1425c349dbc7Sjsg 
1426c349dbc7Sjsg 	obj->sysfs_attr = (struct device_attribute){
1427c349dbc7Sjsg 		.attr = {
1428c349dbc7Sjsg 			.name = obj->fs_data.sysfs_name,
1429c349dbc7Sjsg 			.mode = S_IRUGO,
1430c349dbc7Sjsg 		},
1431c349dbc7Sjsg 			.show = amdgpu_ras_sysfs_read,
1432c349dbc7Sjsg 	};
1433c349dbc7Sjsg 	sysfs_attr_init(&obj->sysfs_attr.attr);
1434c349dbc7Sjsg 
1435c349dbc7Sjsg 	if (sysfs_add_file_to_group(&adev->dev->kobj,
1436c349dbc7Sjsg 				&obj->sysfs_attr.attr,
1437ad8b1aafSjsg 				RAS_FS_NAME)) {
1438c349dbc7Sjsg 		put_obj(obj);
1439c349dbc7Sjsg 		return -EINVAL;
1440c349dbc7Sjsg 	}
1441c349dbc7Sjsg 
1442c349dbc7Sjsg 	obj->attr_inuse = 1;
1443c349dbc7Sjsg 
1444c349dbc7Sjsg 	return 0;
1445c349dbc7Sjsg #endif
1446c349dbc7Sjsg }
1447c349dbc7Sjsg 
1448c349dbc7Sjsg int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1449c349dbc7Sjsg 		struct ras_common_if *head)
1450c349dbc7Sjsg {
1451c349dbc7Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1452c349dbc7Sjsg 
1453c349dbc7Sjsg 	if (!obj || !obj->attr_inuse)
1454c349dbc7Sjsg 		return -EINVAL;
1455c349dbc7Sjsg 
145675aab5aaSjsg #ifdef __linux__
145775aab5aaSjsg 	if (adev->dev->kobj.sd)
1458c349dbc7Sjsg 		sysfs_remove_file_from_group(&adev->dev->kobj,
1459c349dbc7Sjsg 				&obj->sysfs_attr.attr,
1460ad8b1aafSjsg 				RAS_FS_NAME);
146175aab5aaSjsg #endif
1462c349dbc7Sjsg 	obj->attr_inuse = 0;
1463c349dbc7Sjsg 	put_obj(obj);
1464c349dbc7Sjsg 
1465c349dbc7Sjsg 	return 0;
1466c349dbc7Sjsg }
1467c349dbc7Sjsg 
1468c349dbc7Sjsg #ifdef __linux__
1469c349dbc7Sjsg 
1470c349dbc7Sjsg static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1471c349dbc7Sjsg {
1472c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1473c349dbc7Sjsg 	struct ras_manager *obj, *tmp;
1474c349dbc7Sjsg 
1475c349dbc7Sjsg 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
1476c349dbc7Sjsg 		amdgpu_ras_sysfs_remove(adev, &obj->head);
1477c349dbc7Sjsg 	}
1478c349dbc7Sjsg 
1479ad8b1aafSjsg 	if (amdgpu_bad_page_threshold != 0)
1480ad8b1aafSjsg 		amdgpu_ras_sysfs_remove_bad_page_node(adev);
1481ad8b1aafSjsg 
1482c349dbc7Sjsg 	amdgpu_ras_sysfs_remove_feature_node(adev);
1483c349dbc7Sjsg 
1484c349dbc7Sjsg 	return 0;
1485c349dbc7Sjsg }
1486c349dbc7Sjsg /* sysfs end */
1487c349dbc7Sjsg 
1488c349dbc7Sjsg /**
1489c349dbc7Sjsg  * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
1490c349dbc7Sjsg  *
1491c349dbc7Sjsg  * Normally when there is an uncorrectable error, the driver will reset
1492c349dbc7Sjsg  * the GPU to recover.  However, in the event of an unrecoverable error,
1493c349dbc7Sjsg  * the driver provides an interface to reboot the system automatically
1494c349dbc7Sjsg  * instead.
1495c349dbc7Sjsg  *
1496c349dbc7Sjsg  * The following file in debugfs provides that interface:
1497c349dbc7Sjsg  * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1498c349dbc7Sjsg  *
1499c349dbc7Sjsg  * Usage:
1500c349dbc7Sjsg  *
1501c349dbc7Sjsg  * .. code-block:: bash
1502c349dbc7Sjsg  *
1503c349dbc7Sjsg  *	echo true > .../ras/auto_reboot
1504c349dbc7Sjsg  *
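 * The current setting can presumably be read back the same way, since the
 * node is created readable (debugfs bool attributes report Y or N):
 *
 * .. code-block:: bash
 *
 *	cat .../ras/auto_reboot
 *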
1505c349dbc7Sjsg  */
1506c349dbc7Sjsg /* debugfs begin */
15075ca02815Sjsg static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1508c349dbc7Sjsg {
1509c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1510f005ef32Sjsg 	struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control;
1511ad8b1aafSjsg 	struct drm_minor  *minor = adev_to_drm(adev)->primary;
15125ca02815Sjsg 	struct dentry     *dir;
1513c349dbc7Sjsg 
15145ca02815Sjsg 	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
15155ca02815Sjsg 	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
15165ca02815Sjsg 			    &amdgpu_ras_debugfs_ctrl_ops);
15175ca02815Sjsg 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
15185ca02815Sjsg 			    &amdgpu_ras_debugfs_eeprom_ops);
15195ca02815Sjsg 	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
15205ca02815Sjsg 			   &con->bad_page_cnt_threshold);
1521f005ef32Sjsg 	debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs);
15225ca02815Sjsg 	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
15235ca02815Sjsg 	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
15245ca02815Sjsg 	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
15255ca02815Sjsg 			    &amdgpu_ras_debugfs_eeprom_size_ops);
15265ca02815Sjsg 	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
15275ca02815Sjsg 						       S_IRUGO, dir, adev,
15285ca02815Sjsg 						       &amdgpu_ras_debugfs_eeprom_table_ops);
15295ca02815Sjsg 	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
1530c349dbc7Sjsg 
1531c349dbc7Sjsg 	/*
1532c349dbc7Sjsg 	 * After an uncorrectable error happens, GPU recovery is usually
1533c349dbc7Sjsg 	 * scheduled. But because GPU recovery is known to fail to bring the
1534c349dbc7Sjsg 	 * GPU back, the interface below provides the user one direct way to
1535c349dbc7Sjsg 	 * reboot the system automatically when such an error arrives with
1536c349dbc7Sjsg 	 * ERREVENT_ATHUB_INTERRUPT. In that case the normal GPU recovery
1537c349dbc7Sjsg 	 * routine will never be called.
1538c349dbc7Sjsg 	 */
15395ca02815Sjsg 	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
1540ad8b1aafSjsg 
1541ad8b1aafSjsg 	/*
1542ad8b1aafSjsg 	 * The user could set this to skip cleaning up the hardware's error count
1543ad8b1aafSjsg 	 * registers of RAS IPs during ras recovery.
1544ad8b1aafSjsg 	 */
15455ca02815Sjsg 	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
15465ca02815Sjsg 			    &con->disable_ras_err_cnt_harvest);
15475ca02815Sjsg 	return dir;
1548c349dbc7Sjsg }
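/*
 * For orientation only (not a contract): with debugfs enabled, the node
 * created above is expected to show up along the lines of
 *
 *	/sys/kernel/debug/dri/<minor>/ras/ras_ctrl
 *	/sys/kernel/debug/dri/<minor>/ras/ras_eeprom_reset
 *	/sys/kernel/debug/dri/<minor>/ras/bad_page_cnt_threshold
 *	/sys/kernel/debug/dri/<minor>/ras/auto_reboot
 *
 * plus the eeprom table/size, enable-mask and harvest-control files; the
 * exact names come from the debugfs_create_*() calls in this function.
 */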
1549c349dbc7Sjsg 
1550ad8b1aafSjsg static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
15515ca02815Sjsg 				      struct ras_fs_if *head,
15525ca02815Sjsg 				      struct dentry *dir)
1553c349dbc7Sjsg {
1554c349dbc7Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1555c349dbc7Sjsg 
15565ca02815Sjsg 	if (!obj || !dir)
1557c349dbc7Sjsg 		return;
1558c349dbc7Sjsg 
1559c349dbc7Sjsg 	get_obj(obj);
1560c349dbc7Sjsg 
1561c349dbc7Sjsg 	memcpy(obj->fs_data.debugfs_name,
1562c349dbc7Sjsg 			head->debugfs_name,
1563c349dbc7Sjsg 			sizeof(obj->fs_data.debugfs_name));
1564c349dbc7Sjsg 
15655ca02815Sjsg 	debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
15665ca02815Sjsg 			    obj, &amdgpu_ras_debugfs_ops);
1567c349dbc7Sjsg }
1568c349dbc7Sjsg 
1569c349dbc7Sjsg void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1570c349dbc7Sjsg {
1571c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
15725ca02815Sjsg 	struct dentry *dir;
1573c349dbc7Sjsg 	struct ras_manager *obj;
1574c349dbc7Sjsg 	struct ras_fs_if fs_info;
1575c349dbc7Sjsg 
1576c349dbc7Sjsg 	/*
1577c349dbc7Sjsg 	 * it won't be called in resume path, no need to check
1578c349dbc7Sjsg 	 * suspend and gpu reset status
1579c349dbc7Sjsg 	 */
1580ad8b1aafSjsg 	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1581c349dbc7Sjsg 		return;
1582c349dbc7Sjsg 
15835ca02815Sjsg 	dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
1584c349dbc7Sjsg 
1585c349dbc7Sjsg 	list_for_each_entry(obj, &con->head, node) {
1586c349dbc7Sjsg 		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1587c349dbc7Sjsg 			(obj->attr_inuse == 1)) {
1588f005ef32Sjsg 			snprintf(fs_info.debugfs_name, sizeof(fs_info.debugfs_name), "%s_err_inject",
15891bb76ff1Sjsg 					get_ras_block_str(&obj->head));
1590c349dbc7Sjsg 			fs_info.head = obj->head;
15915ca02815Sjsg 			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
1592c349dbc7Sjsg 		}
1593c349dbc7Sjsg 	}
1594c349dbc7Sjsg }
1595c349dbc7Sjsg 
1596c349dbc7Sjsg /* debugfs end */
1597c349dbc7Sjsg 
1598c349dbc7Sjsg /* ras fs */
1599ad8b1aafSjsg static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1600ad8b1aafSjsg 		amdgpu_ras_sysfs_badpages_read, NULL, 0);
1601ad8b1aafSjsg #endif /* __linux__ */
1602ad8b1aafSjsg static DEVICE_ATTR(features, S_IRUGO,
1603ad8b1aafSjsg 		amdgpu_ras_sysfs_features_read, NULL);
1604c349dbc7Sjsg static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1605c349dbc7Sjsg {
1606c349dbc7Sjsg #ifdef __linux__
1607ad8b1aafSjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1608ad8b1aafSjsg 	struct attribute_group group = {
1609ad8b1aafSjsg 		.name = RAS_FS_NAME,
1610ad8b1aafSjsg 	};
1611ad8b1aafSjsg 	struct attribute *attrs[] = {
1612ad8b1aafSjsg 		&con->features_attr.attr,
1613ad8b1aafSjsg 		NULL
1614ad8b1aafSjsg 	};
1615ad8b1aafSjsg 	struct bin_attribute *bin_attrs[] = {
1616ad8b1aafSjsg 		NULL,
1617ad8b1aafSjsg 		NULL,
1618ad8b1aafSjsg 	};
1619ad8b1aafSjsg 	int r;
1620ad8b1aafSjsg 
1621ad8b1aafSjsg 	/* add features entry */
1622ad8b1aafSjsg 	con->features_attr = dev_attr_features;
1623ad8b1aafSjsg 	group.attrs = attrs;
1624ad8b1aafSjsg 	sysfs_attr_init(attrs[0]);
1625ad8b1aafSjsg 
1626ad8b1aafSjsg 	if (amdgpu_bad_page_threshold != 0) {
1627ad8b1aafSjsg 		/* add bad_page_features entry */
1628ad8b1aafSjsg 		bin_attr_gpu_vram_bad_pages.private = NULL;
1629ad8b1aafSjsg 		con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1630ad8b1aafSjsg 		bin_attrs[0] = &con->badpages_attr;
1631ad8b1aafSjsg 		group.bin_attrs = bin_attrs;
1632ad8b1aafSjsg 		sysfs_bin_attr_init(bin_attrs[0]);
1633ad8b1aafSjsg 	}
1634ad8b1aafSjsg 
1635ad8b1aafSjsg 	r = sysfs_create_group(&adev->dev->kobj, &group);
1636ad8b1aafSjsg 	if (r)
1637ad8b1aafSjsg 		dev_err(adev->dev, "Failed to create RAS sysfs group!");
1638c349dbc7Sjsg #endif
1639c349dbc7Sjsg 
1640c349dbc7Sjsg 	return 0;
1641c349dbc7Sjsg }
1642c349dbc7Sjsg 
1643c349dbc7Sjsg static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1644c349dbc7Sjsg {
1645c349dbc7Sjsg #ifdef __linux__
16465ca02815Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
16475ca02815Sjsg 	struct ras_manager *con_obj, *ip_obj, *tmp;
16485ca02815Sjsg 
16495ca02815Sjsg 	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
16505ca02815Sjsg 		list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
16515ca02815Sjsg 			ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
16525ca02815Sjsg 			if (ip_obj)
16535ca02815Sjsg 				put_obj(ip_obj);
16545ca02815Sjsg 		}
16555ca02815Sjsg 	}
16565ca02815Sjsg 
1657c349dbc7Sjsg 	amdgpu_ras_sysfs_remove_all(adev);
1658c349dbc7Sjsg #endif
1659c349dbc7Sjsg 	return 0;
1660c349dbc7Sjsg }
1661c349dbc7Sjsg /* ras fs end */
1662c349dbc7Sjsg 
1663c349dbc7Sjsg /* ih begin */
16641bb76ff1Sjsg 
16651bb76ff1Sjsg /* For hardware that cannot enable the bif ring for both the ras_controller_irq
16661bb76ff1Sjsg  * and ras_err_event_athub_irq ih cookies, the driver has to poll a status
16671bb76ff1Sjsg  * register to check whether the interrupt has triggered, and properly
16681bb76ff1Sjsg  * ack the interrupt if it is there.
16691bb76ff1Sjsg  */
16701bb76ff1Sjsg void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
16711bb76ff1Sjsg {
16721bb76ff1Sjsg 	/* Fatal error events are handled on host side */
1673f005ef32Sjsg 	if (amdgpu_sriov_vf(adev))
16741bb76ff1Sjsg 		return;
16751bb76ff1Sjsg 
16761bb76ff1Sjsg 	if (adev->nbio.ras &&
16771bb76ff1Sjsg 	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
16781bb76ff1Sjsg 		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
16791bb76ff1Sjsg 
16801bb76ff1Sjsg 	if (adev->nbio.ras &&
16811bb76ff1Sjsg 	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
16821bb76ff1Sjsg 		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
16831bb76ff1Sjsg }
16841bb76ff1Sjsg 
16851bb76ff1Sjsg static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
16861bb76ff1Sjsg 				struct amdgpu_iv_entry *entry)
16871bb76ff1Sjsg {
16881bb76ff1Sjsg 	bool poison_stat = false;
16891bb76ff1Sjsg 	struct amdgpu_device *adev = obj->adev;
16901bb76ff1Sjsg 	struct amdgpu_ras_block_object *block_obj =
16911bb76ff1Sjsg 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
16921bb76ff1Sjsg 
1693f005ef32Sjsg 	if (!block_obj)
16941bb76ff1Sjsg 		return;
16951bb76ff1Sjsg 
16961bb76ff1Sjsg 	/* both query_poison_status and handle_poison_consumption are optional,
16971bb76ff1Sjsg 	 * but at least one of them should be implemented if we need poison
16981bb76ff1Sjsg 	 * consumption handler
16991bb76ff1Sjsg 	 */
1700f005ef32Sjsg 	if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
17011bb76ff1Sjsg 		poison_stat = block_obj->hw_ops->query_poison_status(adev);
17021bb76ff1Sjsg 		if (!poison_stat) {
17031bb76ff1Sjsg 			/* Not poison consumption interrupt, no need to handle it */
17041bb76ff1Sjsg 			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
17051bb76ff1Sjsg 					block_obj->ras_comm.name);
17061bb76ff1Sjsg 
17071bb76ff1Sjsg 			return;
17081bb76ff1Sjsg 		}
17091bb76ff1Sjsg 	}
17101bb76ff1Sjsg 
1711f005ef32Sjsg 	amdgpu_umc_poison_handler(adev, false);
17121bb76ff1Sjsg 
1713f005ef32Sjsg 	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
17141bb76ff1Sjsg 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
17151bb76ff1Sjsg 
17161bb76ff1Sjsg 	/* gpu reset is fallback for failed and default cases */
17171bb76ff1Sjsg 	if (poison_stat) {
17181bb76ff1Sjsg 		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
17191bb76ff1Sjsg 				block_obj->ras_comm.name);
17201bb76ff1Sjsg 		amdgpu_ras_reset_gpu(adev);
1721f005ef32Sjsg 	} else {
1722f005ef32Sjsg 		amdgpu_gfx_poison_consumption_handler(adev, entry);
17231bb76ff1Sjsg 	}
17241bb76ff1Sjsg }
17251bb76ff1Sjsg 
17261bb76ff1Sjsg static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
17271bb76ff1Sjsg 				struct amdgpu_iv_entry *entry)
17281bb76ff1Sjsg {
17291bb76ff1Sjsg 	dev_info(obj->adev->dev,
17301bb76ff1Sjsg 		"Poison is created, no user action is needed.\n");
17311bb76ff1Sjsg }
17321bb76ff1Sjsg 
17331bb76ff1Sjsg static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
17341bb76ff1Sjsg 				struct amdgpu_iv_entry *entry)
1735c349dbc7Sjsg {
1736c349dbc7Sjsg 	struct ras_ih_data *data = &obj->ih_data;
1737c349dbc7Sjsg 	struct ras_err_data err_data = {0, 0, 0, NULL};
17381bb76ff1Sjsg 	int ret;
1739c349dbc7Sjsg 
17401bb76ff1Sjsg 	if (!data->cb)
17411bb76ff1Sjsg 		return;
1742c349dbc7Sjsg 
1743c349dbc7Sjsg 	/* Let the IP handle its data; maybe we need to get the output
17441bb76ff1Sjsg 	 * from the callback to update the error type/count, etc.
1745c349dbc7Sjsg 	 */
17461bb76ff1Sjsg 	ret = data->cb(obj->adev, &err_data, entry);
1747c349dbc7Sjsg 	/* ue will trigger an interrupt, and in that case
1748c349dbc7Sjsg 	 * we need to do a reset to recover the whole system.
1749c349dbc7Sjsg 	 * But leave that recovery to the IP; here we just dispatch
1750c349dbc7Sjsg 	 * the error.
1751c349dbc7Sjsg 	 */
1752c349dbc7Sjsg 	if (ret == AMDGPU_RAS_SUCCESS) {
1753c349dbc7Sjsg 		/* these counts could be left as 0 if
1754c349dbc7Sjsg 		 * some blocks do not count error numbers
1755c349dbc7Sjsg 		 */
1756c349dbc7Sjsg 		obj->err_data.ue_count += err_data.ue_count;
1757c349dbc7Sjsg 		obj->err_data.ce_count += err_data.ce_count;
1758c349dbc7Sjsg 	}
1759c349dbc7Sjsg }
17601bb76ff1Sjsg 
17611bb76ff1Sjsg static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
17621bb76ff1Sjsg {
17631bb76ff1Sjsg 	struct ras_ih_data *data = &obj->ih_data;
17641bb76ff1Sjsg 	struct amdgpu_iv_entry entry;
17651bb76ff1Sjsg 
17661bb76ff1Sjsg 	while (data->rptr != data->wptr) {
17671bb76ff1Sjsg 		rmb();
17681bb76ff1Sjsg 		memcpy(&entry, &data->ring[data->rptr],
17691bb76ff1Sjsg 				data->element_size);
17701bb76ff1Sjsg 
17711bb76ff1Sjsg 		wmb();
17721bb76ff1Sjsg 		data->rptr = (data->aligned_element_size +
17731bb76ff1Sjsg 				data->rptr) % data->ring_size;
17741bb76ff1Sjsg 
17751bb76ff1Sjsg 		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
17761bb76ff1Sjsg 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
17771bb76ff1Sjsg 				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
17781bb76ff1Sjsg 			else
17791bb76ff1Sjsg 				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
17801bb76ff1Sjsg 		} else {
17811bb76ff1Sjsg 			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
17821bb76ff1Sjsg 				amdgpu_ras_interrupt_umc_handler(obj, &entry);
17831bb76ff1Sjsg 			else
17841bb76ff1Sjsg 				dev_warn(obj->adev->dev,
17851bb76ff1Sjsg 					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
17861bb76ff1Sjsg 		}
1787c349dbc7Sjsg 	}
1788c349dbc7Sjsg }
1789c349dbc7Sjsg 
1790c349dbc7Sjsg static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1791c349dbc7Sjsg {
1792c349dbc7Sjsg 	struct ras_ih_data *data =
1793c349dbc7Sjsg 		container_of(work, struct ras_ih_data, ih_work);
1794c349dbc7Sjsg 	struct ras_manager *obj =
1795c349dbc7Sjsg 		container_of(data, struct ras_manager, ih_data);
1796c349dbc7Sjsg 
1797c349dbc7Sjsg 	amdgpu_ras_interrupt_handler(obj);
1798c349dbc7Sjsg }
1799c349dbc7Sjsg 
1800c349dbc7Sjsg int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1801c349dbc7Sjsg 		struct ras_dispatch_if *info)
1802c349dbc7Sjsg {
1803*8602cf8bSjsg 	struct ras_manager *obj;
1804*8602cf8bSjsg 	struct ras_ih_data *data;
1805c349dbc7Sjsg 
1806*8602cf8bSjsg 	obj = amdgpu_ras_find_obj(adev, &info->head);
1807c349dbc7Sjsg 	if (!obj)
1808c349dbc7Sjsg 		return -EINVAL;
1809c349dbc7Sjsg 
1810*8602cf8bSjsg 	data = &obj->ih_data;
1811*8602cf8bSjsg 
1812c349dbc7Sjsg 	if (data->inuse == 0)
1813c349dbc7Sjsg 		return 0;
1814c349dbc7Sjsg 
1815c349dbc7Sjsg 	/* Might be overflow... */
1816c349dbc7Sjsg 	memcpy(&data->ring[data->wptr], info->entry,
1817c349dbc7Sjsg 			data->element_size);
1818c349dbc7Sjsg 
1819c349dbc7Sjsg 	wmb();
1820c349dbc7Sjsg 	data->wptr = (data->aligned_element_size +
1821c349dbc7Sjsg 			data->wptr) % data->ring_size;
1822c349dbc7Sjsg 
1823c349dbc7Sjsg 	schedule_work(&data->ih_work);
1824c349dbc7Sjsg 
1825c349dbc7Sjsg 	return 0;
1826c349dbc7Sjsg }
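/*
 * Producer/consumer note with illustrative numbers: the dispatch path above
 * advances wptr by aligned_element_size modulo ring_size, and
 * amdgpu_ras_interrupt_handler() advances rptr the same way.  If, say,
 * aligned_element_size were 96 bytes, ring_size would be 64 * 96 = 6144,
 * and a wptr of 6048 would wrap to (6048 + 96) % 6144 = 0 on the next
 * dispatch.  As the "Might be overflow" comment above notes, nothing stops
 * the producer from lapping a slow consumer.
 */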
1827c349dbc7Sjsg 
1828c349dbc7Sjsg int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
18291bb76ff1Sjsg 		struct ras_common_if *head)
1830c349dbc7Sjsg {
18311bb76ff1Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1832c349dbc7Sjsg 	struct ras_ih_data *data;
1833c349dbc7Sjsg 
1834c349dbc7Sjsg 	if (!obj)
1835c349dbc7Sjsg 		return -EINVAL;
1836c349dbc7Sjsg 
1837c349dbc7Sjsg 	data = &obj->ih_data;
1838c349dbc7Sjsg 	if (data->inuse == 0)
1839c349dbc7Sjsg 		return 0;
1840c349dbc7Sjsg 
1841c349dbc7Sjsg 	cancel_work_sync(&data->ih_work);
1842c349dbc7Sjsg 
1843c349dbc7Sjsg 	kfree(data->ring);
1844c349dbc7Sjsg 	memset(data, 0, sizeof(*data));
1845c349dbc7Sjsg 	put_obj(obj);
1846c349dbc7Sjsg 
1847c349dbc7Sjsg 	return 0;
1848c349dbc7Sjsg }
1849c349dbc7Sjsg 
1850c349dbc7Sjsg int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
18511bb76ff1Sjsg 		struct ras_common_if *head)
1852c349dbc7Sjsg {
18531bb76ff1Sjsg 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1854c349dbc7Sjsg 	struct ras_ih_data *data;
18551bb76ff1Sjsg 	struct amdgpu_ras_block_object *ras_obj;
1856c349dbc7Sjsg 
1857c349dbc7Sjsg 	if (!obj) {
1858c349dbc7Sjsg 		/* in case we register the IH before enabling the ras feature */
18591bb76ff1Sjsg 		obj = amdgpu_ras_create_obj(adev, head);
1860c349dbc7Sjsg 		if (!obj)
1861c349dbc7Sjsg 			return -EINVAL;
1862c349dbc7Sjsg 	} else
1863c349dbc7Sjsg 		get_obj(obj);
1864c349dbc7Sjsg 
18651bb76ff1Sjsg 	ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
18661bb76ff1Sjsg 
1867c349dbc7Sjsg 	data = &obj->ih_data;
1868c349dbc7Sjsg 	/* add the callback, etc. */
1869c349dbc7Sjsg 	*data = (struct ras_ih_data) {
1870c349dbc7Sjsg 		.inuse = 0,
18711bb76ff1Sjsg 		.cb = ras_obj->ras_cb,
1872c349dbc7Sjsg 		.element_size = sizeof(struct amdgpu_iv_entry),
1873c349dbc7Sjsg 		.rptr = 0,
1874c349dbc7Sjsg 		.wptr = 0,
1875c349dbc7Sjsg 	};
1876c349dbc7Sjsg 
1877c349dbc7Sjsg 	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1878c349dbc7Sjsg 
1879f005ef32Sjsg 	data->aligned_element_size = ALIGN(data->element_size, 8);
1880c349dbc7Sjsg 	/* the ring can store 64 iv entries. */
1881c349dbc7Sjsg 	data->ring_size = 64 * data->aligned_element_size;
1882c349dbc7Sjsg 	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1883c349dbc7Sjsg 	if (!data->ring) {
1884c349dbc7Sjsg 		put_obj(obj);
1885c349dbc7Sjsg 		return -ENOMEM;
1886c349dbc7Sjsg 	}
1887c349dbc7Sjsg 
1888c349dbc7Sjsg 	/* IH is ready */
1889c349dbc7Sjsg 	data->inuse = 1;
1890c349dbc7Sjsg 
1891c349dbc7Sjsg 	return 0;
1892c349dbc7Sjsg }
1893c349dbc7Sjsg 
1894c349dbc7Sjsg static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1895c349dbc7Sjsg {
1896c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1897c349dbc7Sjsg 	struct ras_manager *obj, *tmp;
1898c349dbc7Sjsg 
1899c349dbc7Sjsg 	list_for_each_entry_safe(obj, tmp, &con->head, node) {
19001bb76ff1Sjsg 		amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
1901c349dbc7Sjsg 	}
1902c349dbc7Sjsg 
1903c349dbc7Sjsg 	return 0;
1904c349dbc7Sjsg }
1905c349dbc7Sjsg /* ih end */
1906c349dbc7Sjsg 
1907c349dbc7Sjsg /* traverse all IPs except NBIO to query error counters */
1908c349dbc7Sjsg static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1909c349dbc7Sjsg {
1910c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1911c349dbc7Sjsg 	struct ras_manager *obj;
1912c349dbc7Sjsg 
19135ca02815Sjsg 	if (!adev->ras_enabled || !con)
1914c349dbc7Sjsg 		return;
1915c349dbc7Sjsg 
1916c349dbc7Sjsg 	list_for_each_entry(obj, &con->head, node) {
1917c349dbc7Sjsg 		struct ras_query_if info = {
1918c349dbc7Sjsg 			.head = obj->head,
1919c349dbc7Sjsg 		};
1920c349dbc7Sjsg 
1921c349dbc7Sjsg 		/*
1922c349dbc7Sjsg 		 * The PCIE_BIF IP has a separate isr for the ras controller
1923c349dbc7Sjsg 		 * interrupt; the specific ras counter query will be
1924c349dbc7Sjsg 		 * done in that isr. So skip such a block from the common
1925c349dbc7Sjsg 		 * sync flood interrupt isr call.
1926c349dbc7Sjsg 		 */
1927c349dbc7Sjsg 		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1928c349dbc7Sjsg 			continue;
1929c349dbc7Sjsg 
19301bb76ff1Sjsg 		/*
19311bb76ff1Sjsg 		 * this is a workaround for aldebaran: skip sending the msg to
19321bb76ff1Sjsg 		 * the smu to get the ecc_info table, because the smu currently
19331bb76ff1Sjsg 		 * fails to handle that request.
19341bb76ff1Sjsg 		 * this should be removed once the smu handles the ecc_info table.
19351bb76ff1Sjsg 		 */
19361bb76ff1Sjsg 		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
19371bb76ff1Sjsg 			(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
19381bb76ff1Sjsg 			continue;
19391bb76ff1Sjsg 
19405ca02815Sjsg 		amdgpu_ras_query_error_status(adev, &info);
19411bb76ff1Sjsg 
19421bb76ff1Sjsg 		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
19431bb76ff1Sjsg 		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
19441bb76ff1Sjsg 		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
19451bb76ff1Sjsg 			if (amdgpu_ras_reset_error_status(adev, info.head.block))
19461bb76ff1Sjsg 				dev_warn(adev->dev, "Failed to reset error counter and error status");
19471bb76ff1Sjsg 		}
1948c349dbc7Sjsg 	}
1949c349dbc7Sjsg }
1950c349dbc7Sjsg 
1951ad8b1aafSjsg /* Parse RdRspStatus and WrRspStatus */
19525ca02815Sjsg static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
1953ad8b1aafSjsg 					  struct ras_query_if *info)
1954ad8b1aafSjsg {
19551bb76ff1Sjsg 	struct amdgpu_ras_block_object *block_obj;
1956ad8b1aafSjsg 	/*
1957ad8b1aafSjsg 	 * Only two blocks need to query the read/write
1958ad8b1aafSjsg 	 * RspStatus at the current state
1959ad8b1aafSjsg 	 */
19601bb76ff1Sjsg 	if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
19611bb76ff1Sjsg 		(info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
19621bb76ff1Sjsg 		return;
19631bb76ff1Sjsg 
19641bb76ff1Sjsg 	block_obj = amdgpu_ras_get_ras_block(adev,
19651bb76ff1Sjsg 					info->head.block,
19661bb76ff1Sjsg 					info->head.sub_block_index);
19671bb76ff1Sjsg 
19681bb76ff1Sjsg 	if (!block_obj || !block_obj->hw_ops) {
19691bb76ff1Sjsg 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
19701bb76ff1Sjsg 			     get_ras_block_str(&info->head));
19711bb76ff1Sjsg 		return;
1972ad8b1aafSjsg 	}
19731bb76ff1Sjsg 
19741bb76ff1Sjsg 	if (block_obj->hw_ops->query_ras_error_status)
19751bb76ff1Sjsg 		block_obj->hw_ops->query_ras_error_status(adev);
19761bb76ff1Sjsg 
1977ad8b1aafSjsg }
1978ad8b1aafSjsg 
1979ad8b1aafSjsg static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
1980ad8b1aafSjsg {
1981ad8b1aafSjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1982ad8b1aafSjsg 	struct ras_manager *obj;
1983ad8b1aafSjsg 
19845ca02815Sjsg 	if (!adev->ras_enabled || !con)
1985ad8b1aafSjsg 		return;
1986ad8b1aafSjsg 
1987ad8b1aafSjsg 	list_for_each_entry(obj, &con->head, node) {
1988ad8b1aafSjsg 		struct ras_query_if info = {
1989ad8b1aafSjsg 			.head = obj->head,
1990ad8b1aafSjsg 		};
1991ad8b1aafSjsg 
1992ad8b1aafSjsg 		amdgpu_ras_error_status_query(adev, &info);
1993ad8b1aafSjsg 	}
1994ad8b1aafSjsg }
1995ad8b1aafSjsg 
1996c349dbc7Sjsg /* recovery begin */
1997c349dbc7Sjsg 
1998c349dbc7Sjsg /* return 0 on success.
1999c349dbc7Sjsg  * the caller needs to free bps.
2000c349dbc7Sjsg  */
2001c349dbc7Sjsg static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
2002c349dbc7Sjsg 		struct ras_badpage **bps, unsigned int *count)
2003c349dbc7Sjsg {
2004c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2005c349dbc7Sjsg 	struct ras_err_handler_data *data;
2006c349dbc7Sjsg 	int i = 0;
20075ca02815Sjsg 	int ret = 0, status;
2008c349dbc7Sjsg 
2009c349dbc7Sjsg 	if (!con || !con->eh_data || !bps || !count)
2010c349dbc7Sjsg 		return -EINVAL;
2011c349dbc7Sjsg 
2012c349dbc7Sjsg 	mutex_lock(&con->recovery_lock);
2013c349dbc7Sjsg 	data = con->eh_data;
2014c349dbc7Sjsg 	if (!data || data->count == 0) {
2015c349dbc7Sjsg 		*bps = NULL;
2016c349dbc7Sjsg 		ret = -EINVAL;
2017c349dbc7Sjsg 		goto out;
2018c349dbc7Sjsg 	}
2019c349dbc7Sjsg 
2020c349dbc7Sjsg 	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
2021c349dbc7Sjsg 	if (!*bps) {
2022c349dbc7Sjsg 		ret = -ENOMEM;
2023c349dbc7Sjsg 		goto out;
2024c349dbc7Sjsg 	}
2025c349dbc7Sjsg 
2026c349dbc7Sjsg 	for (; i < data->count; i++) {
2027c349dbc7Sjsg 		(*bps)[i] = (struct ras_badpage){
2028c349dbc7Sjsg 			.bp = data->bps[i].retired_page,
2029c349dbc7Sjsg 			.size = AMDGPU_GPU_PAGE_SIZE,
2030c349dbc7Sjsg 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
2031c349dbc7Sjsg 		};
20321bb76ff1Sjsg 		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
20335ca02815Sjsg 				data->bps[i].retired_page);
20345ca02815Sjsg 		if (status == -EBUSY)
2035c349dbc7Sjsg 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
20365ca02815Sjsg 		else if (status == -ENOENT)
2037c349dbc7Sjsg 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
2038c349dbc7Sjsg 	}
2039c349dbc7Sjsg 
2040c349dbc7Sjsg 	*count = data->count;
2041c349dbc7Sjsg out:
2042c349dbc7Sjsg 	mutex_unlock(&con->recovery_lock);
2043c349dbc7Sjsg 	return ret;
2044c349dbc7Sjsg }
2045c349dbc7Sjsg 
2046c349dbc7Sjsg static void amdgpu_ras_do_recovery(struct work_struct *work)
2047c349dbc7Sjsg {
2048c349dbc7Sjsg 	struct amdgpu_ras *ras =
2049c349dbc7Sjsg 		container_of(work, struct amdgpu_ras, recovery_work);
2050c349dbc7Sjsg 	struct amdgpu_device *remote_adev = NULL;
2051c349dbc7Sjsg 	struct amdgpu_device *adev = ras->adev;
2052c349dbc7Sjsg 	struct list_head device_list, *device_list_handle =  NULL;
2053ad8b1aafSjsg 
2054ad8b1aafSjsg 	if (!ras->disable_ras_err_cnt_harvest) {
2055ad8b1aafSjsg 		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2056c349dbc7Sjsg 
2057c349dbc7Sjsg 		/* Build list of devices to query RAS related errors */
2058ad8b1aafSjsg 		if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
2059c349dbc7Sjsg 			device_list_handle = &hive->device_list;
2060ad8b1aafSjsg 		} else {
2061204a49e6Sjsg 			INIT_LIST_HEAD(&device_list);
2062c349dbc7Sjsg 			list_add_tail(&adev->gmc.xgmi.head, &device_list);
2063c349dbc7Sjsg 			device_list_handle = &device_list;
2064c349dbc7Sjsg 		}
2065c349dbc7Sjsg 
2066ad8b1aafSjsg 		list_for_each_entry(remote_adev,
2067ad8b1aafSjsg 				device_list_handle, gmc.xgmi.head) {
2068ad8b1aafSjsg 			amdgpu_ras_query_err_status(remote_adev);
2069c349dbc7Sjsg 			amdgpu_ras_log_on_err_counter(remote_adev);
2070c349dbc7Sjsg 		}
2071c349dbc7Sjsg 
2072ad8b1aafSjsg 		amdgpu_put_xgmi_hive(hive);
2073ad8b1aafSjsg 	}
2074ad8b1aafSjsg 
20751bb76ff1Sjsg 	if (amdgpu_device_should_recover_gpu(ras->adev)) {
20761bb76ff1Sjsg 		struct amdgpu_reset_context reset_context;
20771bb76ff1Sjsg 		memset(&reset_context, 0, sizeof(reset_context));
20781bb76ff1Sjsg 
20791bb76ff1Sjsg 		reset_context.method = AMD_RESET_METHOD_NONE;
20801bb76ff1Sjsg 		reset_context.reset_req_dev = adev;
2081f005ef32Sjsg 
2082f005ef32Sjsg 		/* Perform full reset in fatal error mode */
2083f005ef32Sjsg 		if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
2084f005ef32Sjsg 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2085f005ef32Sjsg 		else {
20861bb76ff1Sjsg 			clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
20871bb76ff1Sjsg 
2088f005ef32Sjsg 			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
2089f005ef32Sjsg 				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
2090f005ef32Sjsg 				reset_context.method = AMD_RESET_METHOD_MODE2;
2091f005ef32Sjsg 			}
2092f005ef32Sjsg 
2093f005ef32Sjsg 			/* Fatal error occurs in poison mode, mode1 reset is used to
2094f005ef32Sjsg 			 * recover gpu.
2095f005ef32Sjsg 			 */
2096f005ef32Sjsg 			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
2097f005ef32Sjsg 				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
2098f005ef32Sjsg 				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2099f005ef32Sjsg 
2100f005ef32Sjsg 				psp_fatal_error_recovery_quirk(&adev->psp);
2101f005ef32Sjsg 			}
2102f005ef32Sjsg 		}
2103f005ef32Sjsg 
21041bb76ff1Sjsg 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
21051bb76ff1Sjsg 	}
2106c349dbc7Sjsg 	atomic_set(&ras->in_recovery, 0);
2107c349dbc7Sjsg }
2108c349dbc7Sjsg 
2109c349dbc7Sjsg /* alloc/realloc bps array */
2110c349dbc7Sjsg static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
2111c349dbc7Sjsg 		struct ras_err_handler_data *data, int pages)
2112c349dbc7Sjsg {
2113c349dbc7Sjsg 	unsigned int old_space = data->count + data->space_left;
2114c349dbc7Sjsg 	unsigned int new_space = old_space + pages;
2115f005ef32Sjsg 	unsigned int align_space = ALIGN(new_space, 512);
2116c349dbc7Sjsg 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
2117c349dbc7Sjsg 
21185ca02815Sjsg 	if (!bps) {
2119c349dbc7Sjsg 		return -ENOMEM;
2120c349dbc7Sjsg 	}
2121c349dbc7Sjsg 
2122c349dbc7Sjsg 	if (data->bps) {
2123c349dbc7Sjsg 		memcpy(bps, data->bps,
2124c349dbc7Sjsg 				data->count * sizeof(*data->bps));
2125c349dbc7Sjsg 		kfree(data->bps);
2126c349dbc7Sjsg 	}
2127c349dbc7Sjsg 
2128c349dbc7Sjsg 	data->bps = bps;
2129c349dbc7Sjsg 	data->space_left += align_space - old_space;
2130c349dbc7Sjsg 	return 0;
2131c349dbc7Sjsg }
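/*
 * Worked example with made-up numbers: if data->count were 250 and
 * data->space_left 50 (old_space = 300) and pages = 256, new_space is 556,
 * align_space rounds up to 1024, and space_left grows by 1024 - 300 = 724,
 * giving a total capacity of 1024 records.
 */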
2132c349dbc7Sjsg 
2133c349dbc7Sjsg /* it deals with vram only. */
2134c349dbc7Sjsg int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
2135c349dbc7Sjsg 		struct eeprom_table_record *bps, int pages)
2136c349dbc7Sjsg {
2137c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2138c349dbc7Sjsg 	struct ras_err_handler_data *data;
2139c349dbc7Sjsg 	int ret = 0;
21405ca02815Sjsg 	uint32_t i;
2141c349dbc7Sjsg 
2142c349dbc7Sjsg 	if (!con || !con->eh_data || !bps || pages <= 0)
2143c349dbc7Sjsg 		return 0;
2144c349dbc7Sjsg 
2145c349dbc7Sjsg 	mutex_lock(&con->recovery_lock);
2146c349dbc7Sjsg 	data = con->eh_data;
2147c349dbc7Sjsg 	if (!data)
2148c349dbc7Sjsg 		goto out;
2149c349dbc7Sjsg 
21505ca02815Sjsg 	for (i = 0; i < pages; i++) {
21515ca02815Sjsg 		if (amdgpu_ras_check_bad_page_unlock(con,
21525ca02815Sjsg 			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
21535ca02815Sjsg 			continue;
21545ca02815Sjsg 
21555ca02815Sjsg 		if (!data->space_left &&
21565ca02815Sjsg 			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
2157c349dbc7Sjsg 			ret = -ENOMEM;
2158c349dbc7Sjsg 			goto out;
2159c349dbc7Sjsg 		}
2160c349dbc7Sjsg 
21611bb76ff1Sjsg 		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
21625ca02815Sjsg 			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
21635ca02815Sjsg 			AMDGPU_GPU_PAGE_SIZE);
2164c349dbc7Sjsg 
21655ca02815Sjsg 		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
21665ca02815Sjsg 		data->count++;
21675ca02815Sjsg 		data->space_left--;
21685ca02815Sjsg 	}
2169c349dbc7Sjsg out:
2170c349dbc7Sjsg 	mutex_unlock(&con->recovery_lock);
2171c349dbc7Sjsg 
2172c349dbc7Sjsg 	return ret;
2173c349dbc7Sjsg }
2174c349dbc7Sjsg 
2175c349dbc7Sjsg /*
2176c349dbc7Sjsg  * write error record array to eeprom, the function should be
2177c349dbc7Sjsg  * protected by recovery_lock
2178f005ef32Sjsg  * new_cnt: newly added UE count, excluding reserved bad pages, can be NULL
2179c349dbc7Sjsg  */
2180f005ef32Sjsg int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
2181f005ef32Sjsg 		unsigned long *new_cnt)
2182c349dbc7Sjsg {
2183c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2184c349dbc7Sjsg 	struct ras_err_handler_data *data;
2185c349dbc7Sjsg 	struct amdgpu_ras_eeprom_control *control;
2186c349dbc7Sjsg 	int save_count;
2187c349dbc7Sjsg 
2188f005ef32Sjsg 	if (!con || !con->eh_data) {
2189f005ef32Sjsg 		if (new_cnt)
2190f005ef32Sjsg 			*new_cnt = 0;
2191f005ef32Sjsg 
2192c349dbc7Sjsg 		return 0;
2193f005ef32Sjsg 	}
2194c349dbc7Sjsg 
21951bb76ff1Sjsg 	mutex_lock(&con->recovery_lock);
2196c349dbc7Sjsg 	control = &con->eeprom_control;
2197c349dbc7Sjsg 	data = con->eh_data;
21985ca02815Sjsg 	save_count = data->count - control->ras_num_recs;
21991bb76ff1Sjsg 	mutex_unlock(&con->recovery_lock);
2200f005ef32Sjsg 
2201f005ef32Sjsg 	if (new_cnt)
2202f005ef32Sjsg 		*new_cnt = save_count / adev->umc.retire_unit;
2203f005ef32Sjsg 
2204c349dbc7Sjsg 	/* only new entries are saved */
2205ad8b1aafSjsg 	if (save_count > 0) {
22065ca02815Sjsg 		if (amdgpu_ras_eeprom_append(control,
22075ca02815Sjsg 					     &data->bps[control->ras_num_recs],
2208c349dbc7Sjsg 					     save_count)) {
2209ad8b1aafSjsg 			dev_err(adev->dev, "Failed to save EEPROM table data!");
2210c349dbc7Sjsg 			return -EIO;
2211c349dbc7Sjsg 		}
2212c349dbc7Sjsg 
2213ad8b1aafSjsg 		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
2214ad8b1aafSjsg 	}
2215ad8b1aafSjsg 
2216c349dbc7Sjsg 	return 0;
2217c349dbc7Sjsg }
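/*
 * Example of the new_cnt math above, with hypothetical numbers: if
 * data->count is 20 and control->ras_num_recs is 12, save_count is 8;
 * with a retire_unit of 2 pages per uncorrectable error, *new_cnt would
 * be reported as 4 newly added UEs.
 */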
2218c349dbc7Sjsg 
2219c349dbc7Sjsg /*
2220c349dbc7Sjsg  * read error record array in eeprom and reserve enough space for
2221c349dbc7Sjsg  * storing new bad pages
2222c349dbc7Sjsg  */
2223c349dbc7Sjsg static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
2224c349dbc7Sjsg {
2225c349dbc7Sjsg 	struct amdgpu_ras_eeprom_control *control =
22265ca02815Sjsg 		&adev->psp.ras_context.ras->eeprom_control;
22275ca02815Sjsg 	struct eeprom_table_record *bps;
22285ca02815Sjsg 	int ret;
2229c349dbc7Sjsg 
2230c349dbc7Sjsg 	/* no bad page record, skip eeprom access */
22315ca02815Sjsg 	if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
22325ca02815Sjsg 		return 0;
2233c349dbc7Sjsg 
22345ca02815Sjsg 	bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
2235c349dbc7Sjsg 	if (!bps)
2236c349dbc7Sjsg 		return -ENOMEM;
2237c349dbc7Sjsg 
22385ca02815Sjsg 	ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
22395ca02815Sjsg 	if (ret)
2240ad8b1aafSjsg 		dev_err(adev->dev, "Failed to load EEPROM table records!");
22415ca02815Sjsg 	else
22425ca02815Sjsg 		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
2243c349dbc7Sjsg 
2244c349dbc7Sjsg 	kfree(bps);
2245c349dbc7Sjsg 	return ret;
2246c349dbc7Sjsg }
2247c349dbc7Sjsg 
22485ca02815Sjsg static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
22495ca02815Sjsg 				uint64_t addr)
22505ca02815Sjsg {
22515ca02815Sjsg 	struct ras_err_handler_data *data = con->eh_data;
22525ca02815Sjsg 	int i;
22535ca02815Sjsg 
22545ca02815Sjsg 	addr >>= AMDGPU_GPU_PAGE_SHIFT;
22555ca02815Sjsg 	for (i = 0; i < data->count; i++)
22565ca02815Sjsg 		if (addr == data->bps[i].retired_page)
22575ca02815Sjsg 			return true;
22585ca02815Sjsg 
22595ca02815Sjsg 	return false;
22605ca02815Sjsg }
22615ca02815Sjsg 
2262c349dbc7Sjsg /*
2263c349dbc7Sjsg  * check if an address belongs to bad page
2264c349dbc7Sjsg  *
2265c349dbc7Sjsg  * Note: this check is only for umc block
2266c349dbc7Sjsg  */
2267c349dbc7Sjsg static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
2268c349dbc7Sjsg 				uint64_t addr)
2269c349dbc7Sjsg {
2270c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2271c349dbc7Sjsg 	bool ret = false;
2272c349dbc7Sjsg 
2273c349dbc7Sjsg 	if (!con || !con->eh_data)
2274c349dbc7Sjsg 		return ret;
2275c349dbc7Sjsg 
2276c349dbc7Sjsg 	mutex_lock(&con->recovery_lock);
22775ca02815Sjsg 	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
2278c349dbc7Sjsg 	mutex_unlock(&con->recovery_lock);
2279c349dbc7Sjsg 	return ret;
2280c349dbc7Sjsg }
2281c349dbc7Sjsg 
2282ad8b1aafSjsg static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
22835ca02815Sjsg 					  uint32_t max_count)
2284ad8b1aafSjsg {
2285ad8b1aafSjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2286ad8b1aafSjsg 
2287ad8b1aafSjsg 	/*
2288ad8b1aafSjsg 	 * Justification of value bad_page_cnt_threshold in ras structure
2289ad8b1aafSjsg 	 *
2290f005ef32Sjsg 	 * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
2291f005ef32Sjsg 	 * in eeprom, or amdgpu_bad_page_threshold == -2, which introduces the
2292f005ef32Sjsg 	 * two scenarios below.
2293ad8b1aafSjsg 	 *
2294ad8b1aafSjsg 	 * Bad page retirement enablement:
2295f005ef32Sjsg 	 *    - If amdgpu_bad_page_threshold = -2,
2296ad8b1aafSjsg 	 *      bad_page_cnt_threshold = typical value by formula.
2297ad8b1aafSjsg 	 *
2298ad8b1aafSjsg 	 *    - When the value from user is 0 < amdgpu_bad_page_threshold <
2299ad8b1aafSjsg 	 *      max record length in eeprom, use it directly.
2300ad8b1aafSjsg 	 *
2301ad8b1aafSjsg 	 * Bad page retirement disablement:
2302ad8b1aafSjsg 	 *    - If amdgpu_bad_page_threshold = 0, bad page retirement
2303ad8b1aafSjsg 	 *      functionality is disabled, and bad_page_cnt_threshold will
2304ad8b1aafSjsg 	 *      take no effect.
2305ad8b1aafSjsg 	 */
2306ad8b1aafSjsg 
23075ca02815Sjsg 	if (amdgpu_bad_page_threshold < 0) {
23085ca02815Sjsg 		u64 val = adev->gmc.mc_vram_size;
2309ad8b1aafSjsg 
23105ca02815Sjsg 		do_div(val, RAS_BAD_PAGE_COVER);
2311ad8b1aafSjsg 		con->bad_page_cnt_threshold = min(lower_32_bits(val),
23125ca02815Sjsg 						  max_count);
2313ad8b1aafSjsg 	} else {
23145ca02815Sjsg 		con->bad_page_cnt_threshold = min_t(int, max_count,
23155ca02815Sjsg 						    amdgpu_bad_page_threshold);
2316ad8b1aafSjsg 	}
2317ad8b1aafSjsg }
2318ad8b1aafSjsg 
2319c349dbc7Sjsg int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
2320c349dbc7Sjsg {
2321c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2322c349dbc7Sjsg 	struct ras_err_handler_data **data;
23235ca02815Sjsg 	u32  max_eeprom_records_count = 0;
2324ad8b1aafSjsg 	bool exc_err_limit = false;
2325c349dbc7Sjsg 	int ret;
2326c349dbc7Sjsg 
23271bb76ff1Sjsg 	if (!con || amdgpu_sriov_vf(adev))
2328c349dbc7Sjsg 		return 0;
2329c349dbc7Sjsg 
23305ca02815Sjsg 	/* Allow access to RAS EEPROM via debugfs, when the ASIC
23315ca02815Sjsg 	 * supports RAS and debugfs is enabled, but when
23325ca02815Sjsg 	 * adev->ras_enabled is unset, i.e. when "ras_enable"
23335ca02815Sjsg 	 * module parameter is set to 0.
23345ca02815Sjsg 	 */
23355ca02815Sjsg 	con->adev = adev;
23365ca02815Sjsg 
23375ca02815Sjsg 	if (!adev->ras_enabled)
23385ca02815Sjsg 		return 0;
23395ca02815Sjsg 
23405ca02815Sjsg 	data = &con->eh_data;
2341c349dbc7Sjsg 	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
2342c349dbc7Sjsg 	if (!*data) {
2343c349dbc7Sjsg 		ret = -ENOMEM;
2344c349dbc7Sjsg 		goto out;
2345c349dbc7Sjsg 	}
2346c349dbc7Sjsg 
2347c349dbc7Sjsg 	rw_init(&con->recovery_lock, "rasrec");
2348c349dbc7Sjsg 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
2349c349dbc7Sjsg 	atomic_set(&con->in_recovery, 0);
23501bb76ff1Sjsg 	con->eeprom_control.bad_channel_bitmap = 0;
2351c349dbc7Sjsg 
2352f005ef32Sjsg 	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
23535ca02815Sjsg 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
2354ad8b1aafSjsg 
23555ca02815Sjsg 	/* Todo: During testing the SMU might fail to read the eeprom through I2C
23565ca02815Sjsg 	 * when the GPU is pending on an XGMI reset during probe time
23575ca02815Sjsg 	 * (mostly after a second bus reset), so skip it for now
23585ca02815Sjsg 	 */
23595ca02815Sjsg 	if (adev->gmc.xgmi.pending_reset)
23605ca02815Sjsg 		return 0;
2361ad8b1aafSjsg 	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
2362ad8b1aafSjsg 	/*
2363ad8b1aafSjsg 	 * This call fails when exc_err_limit is true or
2364ad8b1aafSjsg 	 * ret != 0.
2365ad8b1aafSjsg 	 */
2366ad8b1aafSjsg 	if (exc_err_limit || ret)
2367c349dbc7Sjsg 		goto free;
2368c349dbc7Sjsg 
23695ca02815Sjsg 	if (con->eeprom_control.ras_num_recs) {
2370c349dbc7Sjsg 		ret = amdgpu_ras_load_bad_pages(adev);
2371c349dbc7Sjsg 		if (ret)
2372c349dbc7Sjsg 			goto free;
23735ca02815Sjsg 
23741bb76ff1Sjsg 		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
23751bb76ff1Sjsg 
23761bb76ff1Sjsg 		if (con->update_channel_flag == true) {
23771bb76ff1Sjsg 			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
23781bb76ff1Sjsg 			con->update_channel_flag = false;
23791bb76ff1Sjsg 		}
2380c349dbc7Sjsg 	}
2381c349dbc7Sjsg 
23821bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD
23831bb76ff1Sjsg 	if ((adev->asic_type == CHIP_ALDEBARAN) &&
23841bb76ff1Sjsg 	    (adev->gmc.xgmi.connected_to_cpu))
23851bb76ff1Sjsg 		amdgpu_register_bad_pages_mca_notifier(adev);
23861bb76ff1Sjsg #endif
2387c349dbc7Sjsg 	return 0;
2388c349dbc7Sjsg 
2389c349dbc7Sjsg free:
2390c349dbc7Sjsg 	kfree((*data)->bps);
2391c349dbc7Sjsg 	kfree(*data);
2392c349dbc7Sjsg 	con->eh_data = NULL;
2393c349dbc7Sjsg out:
23945ca02815Sjsg 	dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
2395ad8b1aafSjsg 
2396ad8b1aafSjsg 	/*
2397ad8b1aafSjsg 	 * Except for the error-threshold-exceeded case, other failure cases in
2398ad8b1aafSjsg 	 * this function do not fail amdgpu driver init.
2399ad8b1aafSjsg 	 */
2400ad8b1aafSjsg 	if (!exc_err_limit)
2401ad8b1aafSjsg 		ret = 0;
2402ad8b1aafSjsg 	else
2403ad8b1aafSjsg 		ret = -EINVAL;
2404c349dbc7Sjsg 
2405c349dbc7Sjsg 	return ret;
2406c349dbc7Sjsg }
2407c349dbc7Sjsg 
2408c349dbc7Sjsg static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2409c349dbc7Sjsg {
2410c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2411c349dbc7Sjsg 	struct ras_err_handler_data *data = con->eh_data;
2412c349dbc7Sjsg 
2413c349dbc7Sjsg 	/* recovery_init failed to init it, fini is useless */
2414c349dbc7Sjsg 	if (!data)
2415c349dbc7Sjsg 		return 0;
2416c349dbc7Sjsg 
2417c349dbc7Sjsg 	cancel_work_sync(&con->recovery_work);
2418c349dbc7Sjsg 
2419c349dbc7Sjsg 	mutex_lock(&con->recovery_lock);
2420c349dbc7Sjsg 	con->eh_data = NULL;
2421c349dbc7Sjsg 	kfree(data->bps);
2422c349dbc7Sjsg 	kfree(data);
2423c349dbc7Sjsg 	mutex_unlock(&con->recovery_lock);
2424c349dbc7Sjsg 
2425c349dbc7Sjsg 	return 0;
2426c349dbc7Sjsg }
2427c349dbc7Sjsg /* recovery end */
2428c349dbc7Sjsg 
24295ca02815Sjsg static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
2430ad8b1aafSjsg {
24311bb76ff1Sjsg 	if (amdgpu_sriov_vf(adev)) {
24321bb76ff1Sjsg 		switch (adev->ip_versions[MP0_HWIP][0]) {
24331bb76ff1Sjsg 		case IP_VERSION(13, 0, 2):
2434f005ef32Sjsg 		case IP_VERSION(13, 0, 6):
24351bb76ff1Sjsg 			return true;
24361bb76ff1Sjsg 		default:
24371bb76ff1Sjsg 			return false;
24381bb76ff1Sjsg 		}
24391bb76ff1Sjsg 	}
24401bb76ff1Sjsg 
24411bb76ff1Sjsg 	if (adev->asic_type == CHIP_IP_DISCOVERY) {
24421bb76ff1Sjsg 		switch (adev->ip_versions[MP0_HWIP][0]) {
24431bb76ff1Sjsg 		case IP_VERSION(13, 0, 0):
2444f005ef32Sjsg 		case IP_VERSION(13, 0, 6):
24451bb76ff1Sjsg 		case IP_VERSION(13, 0, 10):
24461bb76ff1Sjsg 			return true;
24471bb76ff1Sjsg 		default:
24481bb76ff1Sjsg 			return false;
24491bb76ff1Sjsg 		}
24501bb76ff1Sjsg 	}
24511bb76ff1Sjsg 
24525ca02815Sjsg 	return adev->asic_type == CHIP_VEGA10 ||
24535ca02815Sjsg 		adev->asic_type == CHIP_VEGA20 ||
24545ca02815Sjsg 		adev->asic_type == CHIP_ARCTURUS ||
24555ca02815Sjsg 		adev->asic_type == CHIP_ALDEBARAN ||
24565ca02815Sjsg 		adev->asic_type == CHIP_SIENNA_CICHLID;
24575ca02815Sjsg }
24585ca02815Sjsg 
24595ca02815Sjsg /*
24605ca02815Sjsg  * this is a workaround for the vega20 workstation sku:
24615ca02815Sjsg  * force enable gfx ras and ignore the vbios gfx ras flag,
24625ca02815Sjsg  * because GC EDC can not be written
24635ca02815Sjsg  */
24645ca02815Sjsg static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
24655ca02815Sjsg {
24665ca02815Sjsg 	struct atom_context *ctx = adev->mode_info.atom_context;
24675ca02815Sjsg 
24685ca02815Sjsg 	if (!ctx)
24695ca02815Sjsg 		return;
24705ca02815Sjsg 
2471f005ef32Sjsg 	if (strnstr(ctx->vbios_pn, "D16406",
2472f005ef32Sjsg 		    sizeof(ctx->vbios_pn)) ||
2473f005ef32Sjsg 		strnstr(ctx->vbios_pn, "D36002",
2474f005ef32Sjsg 			sizeof(ctx->vbios_pn)))
24755ca02815Sjsg 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
2476ad8b1aafSjsg }
2477ad8b1aafSjsg 
2478c349dbc7Sjsg /*
2479c349dbc7Sjsg  * check the hardware's ras ability, which will be saved in hw_supported.
2480c349dbc7Sjsg  * if the hardware does not support ras, we can skip some ras initialization
2481c349dbc7Sjsg  * and forbid some ras operations from the IPs.
2482c349dbc7Sjsg  * if software itself, say a boot parameter, limits the ras ability, we still
2483c349dbc7Sjsg  * need to allow the IPs to do some limited operations, like disable. In such
2484c349dbc7Sjsg  * a case, we have to initialize ras as normal, but need to check whether an
2485c349dbc7Sjsg  * operation is allowed in each function.
2486c349dbc7Sjsg  */
24875ca02815Sjsg static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
2488c349dbc7Sjsg {
24895ca02815Sjsg 	adev->ras_hw_enabled = adev->ras_enabled = 0;
2490c349dbc7Sjsg 
2491f005ef32Sjsg 	if (!amdgpu_ras_asic_supported(adev))
2492c349dbc7Sjsg 		return;
2493c349dbc7Sjsg 
2494f005ef32Sjsg 	if (!adev->gmc.xgmi.connected_to_cpu &&	!adev->gmc.is_app_apu) {
2495c349dbc7Sjsg 		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
24965ca02815Sjsg 			dev_info(adev->dev, "MEM ECC is active.\n");
24975ca02815Sjsg 			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
2498c349dbc7Sjsg 						   1 << AMDGPU_RAS_BLOCK__DF);
24995ca02815Sjsg 		} else {
25005ca02815Sjsg 			dev_info(adev->dev, "MEM ECC is not presented.\n");
25015ca02815Sjsg 		}
2502c349dbc7Sjsg 
2503c349dbc7Sjsg 		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
2504ad8b1aafSjsg 			dev_info(adev->dev, "SRAM ECC is active.\n");
2505f005ef32Sjsg 			if (!amdgpu_sriov_vf(adev))
25065ca02815Sjsg 				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
2507c349dbc7Sjsg 							    1 << AMDGPU_RAS_BLOCK__DF);
2508f005ef32Sjsg 			else
2509f005ef32Sjsg 				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
2510f005ef32Sjsg 								1 << AMDGPU_RAS_BLOCK__SDMA |
2511f005ef32Sjsg 								1 << AMDGPU_RAS_BLOCK__GFX);
25121bb76ff1Sjsg 
2513f005ef32Sjsg 			/* VCN/JPEG RAS can be supported on both bare metal and
2514f005ef32Sjsg 			 * SRIOV environment
2515f005ef32Sjsg 			 */
2516f005ef32Sjsg 			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
2517f005ef32Sjsg 			    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
25181bb76ff1Sjsg 				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
25191bb76ff1Sjsg 							1 << AMDGPU_RAS_BLOCK__JPEG);
25201bb76ff1Sjsg 			else
25211bb76ff1Sjsg 				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
25221bb76ff1Sjsg 							1 << AMDGPU_RAS_BLOCK__JPEG);
2523f005ef32Sjsg 
2524f005ef32Sjsg 			/*
2525f005ef32Sjsg 			 * XGMI RAS is not supported if xgmi num physical nodes
2526f005ef32Sjsg 			 * is zero
2527f005ef32Sjsg 			 */
2528f005ef32Sjsg 			if (!adev->gmc.xgmi.num_physical_nodes)
2529f005ef32Sjsg 				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
25305ca02815Sjsg 		} else {
2531ad8b1aafSjsg 			dev_info(adev->dev, "SRAM ECC is not presented.\n");
25325ca02815Sjsg 		}
25335ca02815Sjsg 	} else {
25345ca02815Sjsg 		/* the driver only manages a few IP blocks' RAS features
25355ca02815Sjsg 		 * when the GPU is connected to the CPU through XGMI */
25365ca02815Sjsg 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
25375ca02815Sjsg 					   1 << AMDGPU_RAS_BLOCK__SDMA |
25385ca02815Sjsg 					   1 << AMDGPU_RAS_BLOCK__MMHUB);
25395ca02815Sjsg 	}
25405ca02815Sjsg 
25415ca02815Sjsg 	amdgpu_ras_get_quirks(adev);
2542c349dbc7Sjsg 
2543c349dbc7Sjsg 	/* hw_supported needs to be aligned with RAS block mask. */
25445ca02815Sjsg 	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
2545c349dbc7Sjsg 
2546f005ef32Sjsg 
2547f005ef32Sjsg 	/*
2548f005ef32Sjsg 	 * Disable ras feature for aqua vanjaram
2549f005ef32Sjsg 	 * by default on apu platform.
2550f005ef32Sjsg 	 */
2551f005ef32Sjsg 	if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) &&
2552f005ef32Sjsg 	    adev->gmc.is_app_apu)
2553f005ef32Sjsg 		adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 :
2554f005ef32Sjsg 			adev->ras_hw_enabled & amdgpu_ras_mask;
2555f005ef32Sjsg 	else
25565ca02815Sjsg 		adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
25575ca02815Sjsg 			adev->ras_hw_enabled & amdgpu_ras_mask;
25585ca02815Sjsg }
25595ca02815Sjsg 
25605ca02815Sjsg static void amdgpu_ras_counte_dw(struct work_struct *work)
25615ca02815Sjsg {
25625ca02815Sjsg 	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
25635ca02815Sjsg 					      ras_counte_delay_work.work);
25645ca02815Sjsg 	struct amdgpu_device *adev = con->adev;
25655ca02815Sjsg 	struct drm_device *dev = adev_to_drm(adev);
25665ca02815Sjsg 	unsigned long ce_count, ue_count;
25675ca02815Sjsg 	int res;
25685ca02815Sjsg 
25695ca02815Sjsg 	res = pm_runtime_get_sync(dev->dev);
25705ca02815Sjsg 	if (res < 0)
25715ca02815Sjsg 		goto Out;
25725ca02815Sjsg 
25735ca02815Sjsg 	/* Cache new values.
25745ca02815Sjsg 	 */
2575f005ef32Sjsg 	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
25765ca02815Sjsg 		atomic_set(&con->ras_ce_count, ce_count);
25775ca02815Sjsg 		atomic_set(&con->ras_ue_count, ue_count);
25785ca02815Sjsg 	}
25795ca02815Sjsg 
25805ca02815Sjsg 	pm_runtime_mark_last_busy(dev->dev);
25815ca02815Sjsg Out:
25825ca02815Sjsg 	pm_runtime_put_autosuspend(dev->dev);
2583c349dbc7Sjsg }
2584c349dbc7Sjsg 
2585f005ef32Sjsg static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
2586f005ef32Sjsg {
2587f005ef32Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2588f005ef32Sjsg 	bool df_poison, umc_poison;
2589f005ef32Sjsg 
2590f005ef32Sjsg 	/* poison setting is useless on SRIOV guest */
2591f005ef32Sjsg 	if (amdgpu_sriov_vf(adev) || !con)
2592f005ef32Sjsg 		return;
2593f005ef32Sjsg 
2594f005ef32Sjsg 	/* Init poison supported flag, the default value is false */
2595f005ef32Sjsg 	if (adev->gmc.xgmi.connected_to_cpu) {
2596f005ef32Sjsg 		/* enabled by default when GPU is connected to CPU */
2597f005ef32Sjsg 		con->poison_supported = true;
2598f005ef32Sjsg 	} else if (adev->df.funcs &&
2599f005ef32Sjsg 	    adev->df.funcs->query_ras_poison_mode &&
2600f005ef32Sjsg 	    adev->umc.ras &&
2601f005ef32Sjsg 	    adev->umc.ras->query_ras_poison_mode) {
2602f005ef32Sjsg 		df_poison =
2603f005ef32Sjsg 			adev->df.funcs->query_ras_poison_mode(adev);
2604f005ef32Sjsg 		umc_poison =
2605f005ef32Sjsg 			adev->umc.ras->query_ras_poison_mode(adev);
2606f005ef32Sjsg 
2607f005ef32Sjsg 		/* Only when poison is set in both DF and UMC can we support it */
2608f005ef32Sjsg 		if (df_poison && umc_poison)
2609f005ef32Sjsg 			con->poison_supported = true;
2610f005ef32Sjsg 		else if (df_poison != umc_poison)
2611f005ef32Sjsg 			dev_warn(adev->dev,
2612f005ef32Sjsg 				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
2613f005ef32Sjsg 				df_poison, umc_poison);
2614f005ef32Sjsg 	}
2615f005ef32Sjsg }
2616f005ef32Sjsg 
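/* Allocate and set up the RAS context: detect which blocks support RAS,
 * hook up nbio RAS early so fatal error interrupts can be serviced,
 * cache the poison mode and create the RAS fs nodes.
 */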
2617c349dbc7Sjsg int amdgpu_ras_init(struct amdgpu_device *adev)
2618c349dbc7Sjsg {
2619c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2620c349dbc7Sjsg 	int r;
2621c349dbc7Sjsg 
2622c349dbc7Sjsg 	if (con)
2623c349dbc7Sjsg 		return 0;
2624c349dbc7Sjsg 
2625c349dbc7Sjsg 	con = kmalloc(sizeof(struct amdgpu_ras) +
26261bb76ff1Sjsg 			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
26271bb76ff1Sjsg 			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2628c349dbc7Sjsg 			GFP_KERNEL|__GFP_ZERO);
2629c349dbc7Sjsg 	if (!con)
2630c349dbc7Sjsg 		return -ENOMEM;
2631c349dbc7Sjsg 
26325ca02815Sjsg 	con->adev = adev;
26335ca02815Sjsg 	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
26345ca02815Sjsg 	atomic_set(&con->ras_ce_count, 0);
26355ca02815Sjsg 	atomic_set(&con->ras_ue_count, 0);
26365ca02815Sjsg 
2637c349dbc7Sjsg 	con->objs = (struct ras_manager *)(con + 1);
2638c349dbc7Sjsg 
2639c349dbc7Sjsg 	amdgpu_ras_set_context(adev, con);
2640c349dbc7Sjsg 
26415ca02815Sjsg 	amdgpu_ras_check_supported(adev);
26425ca02815Sjsg 
26435ca02815Sjsg 	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
26445ca02815Sjsg 		/* set gfx block ras context feature for VEGA20 Gaming, so that
26455ca02815Sjsg 		 * a ras disable cmd is sent to the ras ta during ras late init.
26465ca02815Sjsg 		 */
26475ca02815Sjsg 		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
26485ca02815Sjsg 			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
26495ca02815Sjsg 
26505ca02815Sjsg 			return 0;
26515ca02815Sjsg 		}
26525ca02815Sjsg 
2653ad8b1aafSjsg 		r = 0;
2654ad8b1aafSjsg 		goto release_con;
2655c349dbc7Sjsg 	}
2656c349dbc7Sjsg 
26571bb76ff1Sjsg 	con->update_channel_flag = false;
2658c349dbc7Sjsg 	con->features = 0;
2659c349dbc7Sjsg 	INIT_LIST_HEAD(&con->head);
2660c349dbc7Sjsg 	/* Might need to get this flag from vbios. */
2661c349dbc7Sjsg 	con->flags = RAS_DEFAULT_FLAGS;
2662c349dbc7Sjsg 
26635ca02815Sjsg 	/* initialize nbio ras function ahead of any other
26645ca02815Sjsg 	 * ras functions so hardware fatal error interrupt
26655ca02815Sjsg 	 * can be enabled as early as possible */
2666f005ef32Sjsg 	switch (adev->ip_versions[NBIO_HWIP][0]) {
2667f005ef32Sjsg 	case IP_VERSION(7, 4, 0):
2668f005ef32Sjsg 	case IP_VERSION(7, 4, 1):
2669f005ef32Sjsg 	case IP_VERSION(7, 4, 4):
2670f005ef32Sjsg 		if (!adev->gmc.xgmi.connected_to_cpu)
26711bb76ff1Sjsg 			adev->nbio.ras = &nbio_v7_4_ras;
2672f005ef32Sjsg 		break;
2673f005ef32Sjsg 	case IP_VERSION(4, 3, 0):
2674f005ef32Sjsg 		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
2675f005ef32Sjsg 			/* unlike other generations of nbio ras,
2676f005ef32Sjsg 			 * nbio v4_3 only supports the fatal error interrupt
2677f005ef32Sjsg 			 * to inform software that DF is frozen due to a
2678f005ef32Sjsg 			 * system fatal error event. The driver should not
2679f005ef32Sjsg 			 * enable nbio ras in such a case. Instead,
2680f005ef32Sjsg 			 * check DF RAS */
2681f005ef32Sjsg 			adev->nbio.ras = &nbio_v4_3_ras;
2682f005ef32Sjsg 		break;
2683f005ef32Sjsg 	case IP_VERSION(7, 9, 0):
2684f005ef32Sjsg 		if (!adev->gmc.is_app_apu)
2685f005ef32Sjsg 			adev->nbio.ras = &nbio_v7_9_ras;
26865ca02815Sjsg 		break;
26875ca02815Sjsg 	default:
26885ca02815Sjsg 		/* nbio ras is not available */
26895ca02815Sjsg 		break;
26905ca02815Sjsg 	}
26915ca02815Sjsg 
2692f005ef32Sjsg 	/* nbio ras block needs to be enabled ahead of other ras blocks
2693f005ef32Sjsg 	 * to handle fatal errors */
2694f005ef32Sjsg 	r = amdgpu_nbio_ras_sw_init(adev);
2695f005ef32Sjsg 	if (r)
2696f005ef32Sjsg 		return r;
2697f005ef32Sjsg 
26981bb76ff1Sjsg 	if (adev->nbio.ras &&
26991bb76ff1Sjsg 	    adev->nbio.ras->init_ras_controller_interrupt) {
27001bb76ff1Sjsg 		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
2701c349dbc7Sjsg 		if (r)
2702ad8b1aafSjsg 			goto release_con;
2703c349dbc7Sjsg 	}
2704c349dbc7Sjsg 
27051bb76ff1Sjsg 	if (adev->nbio.ras &&
27061bb76ff1Sjsg 	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
27071bb76ff1Sjsg 		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
2708c349dbc7Sjsg 		if (r)
2709ad8b1aafSjsg 			goto release_con;
2710c349dbc7Sjsg 	}
2711c349dbc7Sjsg 
2712f005ef32Sjsg 	amdgpu_ras_query_poison_mode(adev);
27131bb76ff1Sjsg 
2714ad8b1aafSjsg 	if (amdgpu_ras_fs_init(adev)) {
2715ad8b1aafSjsg 		r = -EINVAL;
2716ad8b1aafSjsg 		goto release_con;
2717ad8b1aafSjsg 	}
2718c349dbc7Sjsg 
2719ad8b1aafSjsg 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
2720c349dbc7Sjsg 		 "hardware ability[%x] ras_mask[%x]\n",
27215ca02815Sjsg 		 adev->ras_hw_enabled, adev->ras_enabled);
27225ca02815Sjsg 
2723c349dbc7Sjsg 	return 0;
2724ad8b1aafSjsg release_con:
2725c349dbc7Sjsg 	amdgpu_ras_set_context(adev, NULL);
2726c349dbc7Sjsg 	kfree(con);
2727c349dbc7Sjsg 
2728ad8b1aafSjsg 	return r;
2729c349dbc7Sjsg }
2730c349dbc7Sjsg 
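/* Persistent EDC harvesting: on ASICs where EDC error counts can persist
 * across a warm reset (CPU-connected GPUs or app APUs), query and then
 * clear the block's error status at init time so stale counts are not
 * reported.
 */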
27315ca02815Sjsg int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
27325ca02815Sjsg {
2733f005ef32Sjsg 	if (adev->gmc.xgmi.connected_to_cpu ||
2734f005ef32Sjsg 	    adev->gmc.is_app_apu)
27355ca02815Sjsg 		return 1;
27365ca02815Sjsg 	return 0;
27375ca02815Sjsg }
27385ca02815Sjsg 
27395ca02815Sjsg static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
27405ca02815Sjsg 					struct ras_common_if *ras_block)
27415ca02815Sjsg {
27425ca02815Sjsg 	struct ras_query_if info = {
27435ca02815Sjsg 		.head = *ras_block,
27445ca02815Sjsg 	};
27455ca02815Sjsg 
27465ca02815Sjsg 	if (!amdgpu_persistent_edc_harvesting_supported(adev))
27475ca02815Sjsg 		return 0;
27485ca02815Sjsg 
27495ca02815Sjsg 	if (amdgpu_ras_query_error_status(adev, &info) != 0)
27505ca02815Sjsg 		DRM_WARN("RAS init harvest failure");
27515ca02815Sjsg 
27525ca02815Sjsg 	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
27535ca02815Sjsg 		DRM_WARN("RAS init harvest reset failure");
27545ca02815Sjsg 
27555ca02815Sjsg 	return 0;
27565ca02815Sjsg }
27575ca02815Sjsg 
27581bb76ff1Sjsg bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
2759c349dbc7Sjsg {
27605ca02815Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
27611bb76ff1Sjsg 
27621bb76ff1Sjsg 	if (!con)
27631bb76ff1Sjsg 		return false;
27641bb76ff1Sjsg 
27651bb76ff1Sjsg 	return con->poison_supported;
27661bb76ff1Sjsg }
27671bb76ff1Sjsg 
27681bb76ff1Sjsg /* helper function to handle common stuff in ip late init phase */
27691bb76ff1Sjsg int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
27701bb76ff1Sjsg 			 struct ras_common_if *ras_block)
27711bb76ff1Sjsg {
27721bb76ff1Sjsg 	struct amdgpu_ras_block_object *ras_obj = NULL;
27731bb76ff1Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2774f005ef32Sjsg 	struct ras_query_if *query_info;
27755ca02815Sjsg 	unsigned long ue_count, ce_count;
2776c349dbc7Sjsg 	int r;
2777c349dbc7Sjsg 
2778c349dbc7Sjsg 	/* disable RAS feature per IP block if it is not supported */
2779c349dbc7Sjsg 	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
2780c349dbc7Sjsg 		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
2781c349dbc7Sjsg 		return 0;
2782c349dbc7Sjsg 	}
2783c349dbc7Sjsg 
2784c349dbc7Sjsg 	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
2785c349dbc7Sjsg 	if (r) {
27861bb76ff1Sjsg 		if (adev->in_suspend || amdgpu_in_reset(adev)) {
2787c349dbc7Sjsg 			/* in the resume phase, if we fail to enable ras,
2788c349dbc7Sjsg 			 * clean up all ras fs nodes and disable ras */
2789c349dbc7Sjsg 			goto cleanup;
2790c349dbc7Sjsg 		} else
2791c349dbc7Sjsg 			return r;
2792c349dbc7Sjsg 	}
2793c349dbc7Sjsg 
27945ca02815Sjsg 	/* check for errors on warm reset for ASICs with persistent EDC harvesting support */
27955ca02815Sjsg 	amdgpu_persistent_edc_harvesting(adev, ras_block);
27965ca02815Sjsg 
2797c349dbc7Sjsg 	/* in resume phase, no need to create ras fs node */
2798ad8b1aafSjsg 	if (adev->in_suspend || amdgpu_in_reset(adev))
2799c349dbc7Sjsg 		return 0;
2800c349dbc7Sjsg 
28011bb76ff1Sjsg 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
28021bb76ff1Sjsg 	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
28031bb76ff1Sjsg 	    (ras_obj->hw_ops->query_poison_status ||
28041bb76ff1Sjsg 	    ras_obj->hw_ops->handle_poison_consumption))) {
28051bb76ff1Sjsg 		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
2806c349dbc7Sjsg 		if (r)
28071bb76ff1Sjsg 			goto cleanup;
2808c349dbc7Sjsg 	}
2809c349dbc7Sjsg 
2810f005ef32Sjsg 	if (ras_obj->hw_ops &&
2811f005ef32Sjsg 	    (ras_obj->hw_ops->query_ras_error_count ||
2812f005ef32Sjsg 	     ras_obj->hw_ops->query_ras_error_status)) {
28131bb76ff1Sjsg 		r = amdgpu_ras_sysfs_create(adev, ras_block);
2814c349dbc7Sjsg 		if (r)
28151bb76ff1Sjsg 			goto interrupt;
2816c349dbc7Sjsg 
28175ca02815Sjsg 		/* Those are the cached values at init.
28185ca02815Sjsg 		 */
2819f005ef32Sjsg 		query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
2820f005ef32Sjsg 		if (!query_info)
2821f005ef32Sjsg 			return -ENOMEM;
2822f005ef32Sjsg 		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
2823f005ef32Sjsg 
2824f005ef32Sjsg 		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
28255ca02815Sjsg 			atomic_set(&con->ras_ce_count, ce_count);
28265ca02815Sjsg 			atomic_set(&con->ras_ue_count, ue_count);
28275ca02815Sjsg 		}
28285ca02815Sjsg 
2829f005ef32Sjsg 		kfree(query_info);
2830f005ef32Sjsg 	}
2831f005ef32Sjsg 
2832c349dbc7Sjsg 	return 0;
28331bb76ff1Sjsg 
2834c349dbc7Sjsg interrupt:
28351bb76ff1Sjsg 	if (ras_obj->ras_cb)
28361bb76ff1Sjsg 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
28371bb76ff1Sjsg cleanup:
2838c349dbc7Sjsg 	amdgpu_ras_feature_enable(adev, ras_block, 0);
2839c349dbc7Sjsg 	return r;
2840c349dbc7Sjsg }
2841c349dbc7Sjsg 
28421bb76ff1Sjsg static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
28431bb76ff1Sjsg 			 struct ras_common_if *ras_block)
2844c349dbc7Sjsg {
28451bb76ff1Sjsg 	return amdgpu_ras_block_late_init(adev, ras_block);
28461bb76ff1Sjsg }
28471bb76ff1Sjsg 
28481bb76ff1Sjsg /* helper function to remove ras fs node and interrupt handler */
28491bb76ff1Sjsg void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
28501bb76ff1Sjsg 			  struct ras_common_if *ras_block)
28511bb76ff1Sjsg {
28521bb76ff1Sjsg 	struct amdgpu_ras_block_object *ras_obj;
28531bb76ff1Sjsg 	if (!ras_block)
2854c349dbc7Sjsg 		return;
2855c349dbc7Sjsg 
2856c349dbc7Sjsg 	amdgpu_ras_sysfs_remove(adev, ras_block);
28571bb76ff1Sjsg 
28581bb76ff1Sjsg 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
28591bb76ff1Sjsg 	if (ras_obj->ras_cb)
28601bb76ff1Sjsg 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
28611bb76ff1Sjsg }
28621bb76ff1Sjsg 
28631bb76ff1Sjsg static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
28641bb76ff1Sjsg 			  struct ras_common_if *ras_block)
28651bb76ff1Sjsg {
28661bb76ff1Sjsg 	return amdgpu_ras_block_late_fini(adev, ras_block);
2867c349dbc7Sjsg }
2868c349dbc7Sjsg 
2869c349dbc7Sjsg /* do some init work after IP late init as a dependency.
2870c349dbc7Sjsg  * It runs in the resume/gpu reset/boot-up cases.
2871c349dbc7Sjsg  */
2872c349dbc7Sjsg void amdgpu_ras_resume(struct amdgpu_device *adev)
2873c349dbc7Sjsg {
2874c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2875c349dbc7Sjsg 	struct ras_manager *obj, *tmp;
2876c349dbc7Sjsg 
28775ca02815Sjsg 	if (!adev->ras_enabled || !con) {
28785ca02815Sjsg 		/* clean ras context for VEGA20 Gaming after sending the ras disable cmd */
28795ca02815Sjsg 		amdgpu_release_ras_context(adev);
28805ca02815Sjsg 
2881c349dbc7Sjsg 		return;
28825ca02815Sjsg 	}
2883c349dbc7Sjsg 
2884c349dbc7Sjsg 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
2885c349dbc7Sjsg 		/* Set up all other IPs which are not implemented. There is a
2886c349dbc7Sjsg 		 * tricky point: an IP's actual ras error type should be
2887c349dbc7Sjsg 		 * MULTI_UNCORRECTABLE, but since the driver does not handle it,
2888c349dbc7Sjsg 		 * ERROR_NONE makes sense anyway.
2889c349dbc7Sjsg 		 */
2890c349dbc7Sjsg 		amdgpu_ras_enable_all_features(adev, 1);
2891c349dbc7Sjsg 
2892c349dbc7Sjsg 		/* We enable ras on all hw_supported blocks, but the boot
2893c349dbc7Sjsg 		 * parameter might disable some of them, and one or more IPs
2894c349dbc7Sjsg 		 * may not be implemented yet. So we disable them on their behalf.
2895c349dbc7Sjsg 		 */
2896c349dbc7Sjsg 		list_for_each_entry_safe(obj, tmp, &con->head, node) {
2897c349dbc7Sjsg 			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
2898c349dbc7Sjsg 				amdgpu_ras_feature_enable(adev, &obj->head, 0);
2899c349dbc7Sjsg 				/* there should not be any remaining references. */
2900c349dbc7Sjsg 				WARN_ON(alive_obj(obj));
2901c349dbc7Sjsg 			}
2902c349dbc7Sjsg 		}
2903c349dbc7Sjsg 	}
2904c349dbc7Sjsg }
2905c349dbc7Sjsg 
2906c349dbc7Sjsg void amdgpu_ras_suspend(struct amdgpu_device *adev)
2907c349dbc7Sjsg {
2908c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2909c349dbc7Sjsg 
29105ca02815Sjsg 	if (!adev->ras_enabled || !con)
2911c349dbc7Sjsg 		return;
2912c349dbc7Sjsg 
2913c349dbc7Sjsg 	amdgpu_ras_disable_all_features(adev, 0);
2914c349dbc7Sjsg 	/* Make sure all ras objects are disabled. */
2915c349dbc7Sjsg 	if (con->features)
2916c349dbc7Sjsg 		amdgpu_ras_disable_all_features(adev, 1);
2917c349dbc7Sjsg }
2918c349dbc7Sjsg 
29191bb76ff1Sjsg int amdgpu_ras_late_init(struct amdgpu_device *adev)
29201bb76ff1Sjsg {
29211bb76ff1Sjsg 	struct amdgpu_ras_block_list *node, *tmp;
29221bb76ff1Sjsg 	struct amdgpu_ras_block_object *obj;
29231bb76ff1Sjsg 	int r;
29241bb76ff1Sjsg 
29251bb76ff1Sjsg 	/* Guest side doesn't need to init the ras feature */
29261bb76ff1Sjsg 	if (amdgpu_sriov_vf(adev))
29271bb76ff1Sjsg 		return 0;
29281bb76ff1Sjsg 
29291bb76ff1Sjsg 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
29301bb76ff1Sjsg 		if (!node->ras_obj) {
29311bb76ff1Sjsg 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
29321bb76ff1Sjsg 			continue;
29331bb76ff1Sjsg 		}
29341bb76ff1Sjsg 
29351bb76ff1Sjsg 		obj = node->ras_obj;
29361bb76ff1Sjsg 		if (obj->ras_late_init) {
29371bb76ff1Sjsg 			r = obj->ras_late_init(adev, &obj->ras_comm);
29381bb76ff1Sjsg 			if (r) {
29391bb76ff1Sjsg 				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
29401bb76ff1Sjsg 					obj->ras_comm.name, r);
29411bb76ff1Sjsg 				return r;
29421bb76ff1Sjsg 			}
29431bb76ff1Sjsg 		} else
29441bb76ff1Sjsg 			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
29451bb76ff1Sjsg 	}
29461bb76ff1Sjsg 
29471bb76ff1Sjsg 	return 0;
29481bb76ff1Sjsg }
29491bb76ff1Sjsg 
2950c349dbc7Sjsg /* do some fini work before IP fini as a dependency */
2951c349dbc7Sjsg int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
2952c349dbc7Sjsg {
2953c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2954c349dbc7Sjsg 
29555ca02815Sjsg 	if (!adev->ras_enabled || !con)
2956c349dbc7Sjsg 		return 0;
2957c349dbc7Sjsg 
29585ca02815Sjsg 
2959c349dbc7Sjsg 	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
29601bb76ff1Sjsg 	if (con->features)
2961c349dbc7Sjsg 		amdgpu_ras_disable_all_features(adev, 0);
2962c349dbc7Sjsg 	amdgpu_ras_recovery_fini(adev);
2963c349dbc7Sjsg 	return 0;
2964c349dbc7Sjsg }
2965c349dbc7Sjsg 
2966c349dbc7Sjsg int amdgpu_ras_fini(struct amdgpu_device *adev)
2967c349dbc7Sjsg {
29681bb76ff1Sjsg 	struct amdgpu_ras_block_list *ras_node, *tmp;
29691bb76ff1Sjsg 	struct amdgpu_ras_block_object *obj = NULL;
2970c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2971c349dbc7Sjsg 
29725ca02815Sjsg 	if (!adev->ras_enabled || !con)
2973c349dbc7Sjsg 		return 0;
2974c349dbc7Sjsg 
29751bb76ff1Sjsg 	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
29761bb76ff1Sjsg 		if (ras_node->ras_obj) {
29771bb76ff1Sjsg 			obj = ras_node->ras_obj;
29781bb76ff1Sjsg 			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
29791bb76ff1Sjsg 			    obj->ras_fini)
29801bb76ff1Sjsg 				obj->ras_fini(adev, &obj->ras_comm);
29811bb76ff1Sjsg 			else
29821bb76ff1Sjsg 				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
29831bb76ff1Sjsg 		}
29841bb76ff1Sjsg 
29851bb76ff1Sjsg 		/* Clear ras blocks from ras_list and free ras block list node */
29861bb76ff1Sjsg 		list_del(&ras_node->node);
29871bb76ff1Sjsg 		kfree(ras_node);
29881bb76ff1Sjsg 	}
29891bb76ff1Sjsg 
2990c349dbc7Sjsg 	amdgpu_ras_fs_fini(adev);
2991c349dbc7Sjsg 	amdgpu_ras_interrupt_remove_all(adev);
2992c349dbc7Sjsg 
2993c349dbc7Sjsg 	WARN(con->features, "Feature mask is not cleared");
2994c349dbc7Sjsg 
2995c349dbc7Sjsg 	if (con->features)
2996c349dbc7Sjsg 		amdgpu_ras_disable_all_features(adev, 1);
2997c349dbc7Sjsg 
29985ca02815Sjsg 	cancel_delayed_work_sync(&con->ras_counte_delay_work);
29995ca02815Sjsg 
3000c349dbc7Sjsg 	amdgpu_ras_set_context(adev, NULL);
3001c349dbc7Sjsg 	kfree(con);
3002c349dbc7Sjsg 
3003c349dbc7Sjsg 	return 0;
3004c349dbc7Sjsg }
3005c349dbc7Sjsg 
3006c349dbc7Sjsg void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
3007c349dbc7Sjsg {
3008c349dbc7Sjsg 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
3009f005ef32Sjsg 		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
3010f005ef32Sjsg 
3011ad8b1aafSjsg 		dev_info(adev->dev, "uncorrectable hardware error "
3012ad8b1aafSjsg 			"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
3013c349dbc7Sjsg 
3014f005ef32Sjsg 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
3015c349dbc7Sjsg 		amdgpu_ras_reset_gpu(adev);
3016c349dbc7Sjsg 	}
3017c349dbc7Sjsg }
3018ad8b1aafSjsg 
3019ad8b1aafSjsg bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
3020ad8b1aafSjsg {
3021ad8b1aafSjsg 	if (adev->asic_type == CHIP_VEGA20 &&
3022ad8b1aafSjsg 	    adev->pm.fw_version <= 0x283400) {
3023ad8b1aafSjsg 		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
3024ad8b1aafSjsg 				amdgpu_ras_intr_triggered();
3025ad8b1aafSjsg 	}
3026ad8b1aafSjsg 
3027ad8b1aafSjsg 	return false;
3028ad8b1aafSjsg }
3029ad8b1aafSjsg 
30305ca02815Sjsg void amdgpu_release_ras_context(struct amdgpu_device *adev)
3031ad8b1aafSjsg {
3032ad8b1aafSjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3033ad8b1aafSjsg 
30345ca02815Sjsg 	if (!con)
30355ca02815Sjsg 		return;
3036ad8b1aafSjsg 
30375ca02815Sjsg 	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
30385ca02815Sjsg 		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
30395ca02815Sjsg 		amdgpu_ras_set_context(adev, NULL);
30405ca02815Sjsg 		kfree(con);
30415ca02815Sjsg 	}
3042ad8b1aafSjsg }
30431bb76ff1Sjsg 
30441bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD
30451bb76ff1Sjsg static struct amdgpu_device *find_adev(uint32_t node_id)
30461bb76ff1Sjsg {
30471bb76ff1Sjsg 	int i;
30481bb76ff1Sjsg 	struct amdgpu_device *adev = NULL;
30491bb76ff1Sjsg 
30501bb76ff1Sjsg 	for (i = 0; i < mce_adev_list.num_gpu; i++) {
30511bb76ff1Sjsg 		adev = mce_adev_list.devs[i];
30521bb76ff1Sjsg 
30531bb76ff1Sjsg 		if (adev && adev->gmc.xgmi.connected_to_cpu &&
30541bb76ff1Sjsg 		    adev->gmc.xgmi.physical_node_id == node_id)
30551bb76ff1Sjsg 			break;
30561bb76ff1Sjsg 		adev = NULL;
30571bb76ff1Sjsg 	}
30581bb76ff1Sjsg 
30591bb76ff1Sjsg 	return adev;
30601bb76ff1Sjsg }
30611bb76ff1Sjsg 
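/* Helpers to decode the MCA_IPID register value reported by the x86 MCE
 * notifier: the GPU id, UMC instance and channel index of the faulting UMC.
 */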
30621bb76ff1Sjsg #define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
30631bb76ff1Sjsg #define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
30641bb76ff1Sjsg #define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
30651bb76ff1Sjsg #define GPU_ID_OFFSET		8
30661bb76ff1Sjsg 
30671bb76ff1Sjsg static int amdgpu_bad_page_notifier(struct notifier_block *nb,
30681bb76ff1Sjsg 				    unsigned long val, void *data)
30691bb76ff1Sjsg {
30701bb76ff1Sjsg 	struct mce *m = (struct mce *)data;
30711bb76ff1Sjsg 	struct amdgpu_device *adev = NULL;
30721bb76ff1Sjsg 	uint32_t gpu_id = 0;
30731bb76ff1Sjsg 	uint32_t umc_inst = 0, ch_inst = 0;
30741bb76ff1Sjsg 
30751bb76ff1Sjsg 	/*
30761bb76ff1Sjsg 	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
30771bb76ff1Sjsg 	 * and the error occurred in DramECC (Extended error code = 0), then
30781bb76ff1Sjsg 	 * process the error; otherwise bail out.
30791bb76ff1Sjsg 	 */
30801bb76ff1Sjsg 	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
30811bb76ff1Sjsg 		    (XEC(m->status, 0x3f) == 0x0)))
30821bb76ff1Sjsg 		return NOTIFY_DONE;
30831bb76ff1Sjsg 
30841bb76ff1Sjsg 	/*
30851bb76ff1Sjsg 	 * If it is correctable error, return.
30861bb76ff1Sjsg 	 */
30871bb76ff1Sjsg 	if (mce_is_correctable(m))
30881bb76ff1Sjsg 		return NOTIFY_OK;
30891bb76ff1Sjsg 
30901bb76ff1Sjsg 	/*
30911bb76ff1Sjsg 	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
30921bb76ff1Sjsg 	 */
30931bb76ff1Sjsg 	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
30941bb76ff1Sjsg 
30951bb76ff1Sjsg 	adev = find_adev(gpu_id);
30961bb76ff1Sjsg 	if (!adev) {
30971bb76ff1Sjsg 		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
30981bb76ff1Sjsg 								gpu_id);
30991bb76ff1Sjsg 		return NOTIFY_DONE;
31001bb76ff1Sjsg 	}
31011bb76ff1Sjsg 
31021bb76ff1Sjsg 	/*
31031bb76ff1Sjsg 	 * If it is uncorrectable error, then find out UMC instance and
31041bb76ff1Sjsg 	 * channel index.
31051bb76ff1Sjsg 	 */
31061bb76ff1Sjsg 	umc_inst = GET_UMC_INST(m->ipid);
31071bb76ff1Sjsg 	ch_inst = GET_CHAN_INDEX(m->ipid);
31081bb76ff1Sjsg 
31091bb76ff1Sjsg 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
31101bb76ff1Sjsg 			     umc_inst, ch_inst);
31111bb76ff1Sjsg 
3112f005ef32Sjsg 	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
31131bb76ff1Sjsg 		return NOTIFY_OK;
3114f005ef32Sjsg 	else
3115f005ef32Sjsg 		return NOTIFY_DONE;
31161bb76ff1Sjsg }
31171bb76ff1Sjsg 
31181bb76ff1Sjsg static struct notifier_block amdgpu_bad_page_nb = {
31191bb76ff1Sjsg 	.notifier_call  = amdgpu_bad_page_notifier,
31201bb76ff1Sjsg 	.priority       = MCE_PRIO_UC,
31211bb76ff1Sjsg };
31221bb76ff1Sjsg 
31231bb76ff1Sjsg static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
31241bb76ff1Sjsg {
31251bb76ff1Sjsg 	/*
31261bb76ff1Sjsg 	 * Add the adev to the mce_adev_list.
31271bb76ff1Sjsg 	 * During mode2 reset, amdgpu device is temporarily
31281bb76ff1Sjsg 	 * removed from the mgpu_info list which can cause
31291bb76ff1Sjsg 	 * page retirement to fail.
31301bb76ff1Sjsg 	 * Use this list instead of mgpu_info to find the amdgpu
31311bb76ff1Sjsg 	 * device on which the UMC error was reported.
31321bb76ff1Sjsg 	 */
31331bb76ff1Sjsg 	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
31341bb76ff1Sjsg 
31351bb76ff1Sjsg 	/*
31361bb76ff1Sjsg 	 * Register the x86 notifier only once
31371bb76ff1Sjsg 	 * with MCE subsystem.
31381bb76ff1Sjsg 	 */
31391bb76ff1Sjsg 	if (notifier_registered == false) {
31401bb76ff1Sjsg 		mce_register_decode_chain(&amdgpu_bad_page_nb);
31411bb76ff1Sjsg 		notifier_registered = true;
31421bb76ff1Sjsg 	}
31431bb76ff1Sjsg }
31441bb76ff1Sjsg #endif
31451bb76ff1Sjsg 
31461bb76ff1Sjsg struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
31471bb76ff1Sjsg {
31481bb76ff1Sjsg 	if (!adev)
31491bb76ff1Sjsg 		return NULL;
31501bb76ff1Sjsg 
31511bb76ff1Sjsg 	return adev->psp.ras_context.ras;
31521bb76ff1Sjsg }
31531bb76ff1Sjsg 
31541bb76ff1Sjsg int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
31551bb76ff1Sjsg {
31561bb76ff1Sjsg 	if (!adev)
31571bb76ff1Sjsg 		return -EINVAL;
31581bb76ff1Sjsg 
31591bb76ff1Sjsg 	adev->psp.ras_context.ras = ras_con;
31601bb76ff1Sjsg 	return 0;
31611bb76ff1Sjsg }
31621bb76ff1Sjsg 
31631bb76ff1Sjsg /* check if ras is supported on block, say, sdma, gfx */
31641bb76ff1Sjsg int amdgpu_ras_is_supported(struct amdgpu_device *adev,
31651bb76ff1Sjsg 		unsigned int block)
31661bb76ff1Sjsg {
3167f005ef32Sjsg 	int ret = 0;
31681bb76ff1Sjsg 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31691bb76ff1Sjsg 
31701bb76ff1Sjsg 	if (block >= AMDGPU_RAS_BLOCK_COUNT)
31711bb76ff1Sjsg 		return 0;
3172f005ef32Sjsg 
3173f005ef32Sjsg 	ret = ras && (adev->ras_enabled & (1 << block));
3174f005ef32Sjsg 
3175f005ef32Sjsg 	/* For the special asic with mem ecc enabled but sram ecc
3176f005ef32Sjsg 	 * not enabled, even if the ras block is not marked as supported
3177f005ef32Sjsg 	 * in .ras_enabled, the ras block can be considered to support the
3178f005ef32Sjsg 	 * ras function as long as the asic supports poison mode and the
3179f005ef32Sjsg 	 * ras block has a ras configuration.
3180f005ef32Sjsg 	 */
3181f005ef32Sjsg 	if (!ret &&
3182f005ef32Sjsg 	    (block == AMDGPU_RAS_BLOCK__GFX ||
3183f005ef32Sjsg 	     block == AMDGPU_RAS_BLOCK__SDMA ||
3184f005ef32Sjsg 	     block == AMDGPU_RAS_BLOCK__VCN ||
3185f005ef32Sjsg 	     block == AMDGPU_RAS_BLOCK__JPEG) &&
3186f005ef32Sjsg 	    amdgpu_ras_is_poison_mode_supported(adev) &&
3187f005ef32Sjsg 	    amdgpu_ras_get_ras_block(adev, block, 0))
3188f005ef32Sjsg 		ret = 1;
3189f005ef32Sjsg 
3190f005ef32Sjsg 	return ret;
31911bb76ff1Sjsg }
31921bb76ff1Sjsg 
31931bb76ff1Sjsg int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
31941bb76ff1Sjsg {
31951bb76ff1Sjsg 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31961bb76ff1Sjsg 
31971bb76ff1Sjsg 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
31981bb76ff1Sjsg 		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
31991bb76ff1Sjsg 	return 0;
32001bb76ff1Sjsg }
32011bb76ff1Sjsg 
32021bb76ff1Sjsg 
32031bb76ff1Sjsg /* Register each ip ras block into amdgpu ras */
32041bb76ff1Sjsg int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
32051bb76ff1Sjsg 		struct amdgpu_ras_block_object *ras_block_obj)
32061bb76ff1Sjsg {
32071bb76ff1Sjsg 	struct amdgpu_ras_block_list *ras_node;
32081bb76ff1Sjsg 	if (!adev || !ras_block_obj)
32091bb76ff1Sjsg 		return -EINVAL;
32101bb76ff1Sjsg 
32111bb76ff1Sjsg 	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
32121bb76ff1Sjsg 	if (!ras_node)
32131bb76ff1Sjsg 		return -ENOMEM;
32141bb76ff1Sjsg 
32151bb76ff1Sjsg 	INIT_LIST_HEAD(&ras_node->node);
32161bb76ff1Sjsg 	ras_node->ras_obj = ras_block_obj;
32171bb76ff1Sjsg 	list_add_tail(&ras_node->node, &adev->ras_list);
32181bb76ff1Sjsg 
32191bb76ff1Sjsg 	return 0;
32201bb76ff1Sjsg }
3221f005ef32Sjsg 
3222f005ef32Sjsg void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name)
3223f005ef32Sjsg {
3224f005ef32Sjsg 	if (!err_type_name)
3225f005ef32Sjsg 		return;
3226f005ef32Sjsg 
3227f005ef32Sjsg 	switch (err_type) {
3228f005ef32Sjsg 	case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
3229f005ef32Sjsg 		snprintf(err_type_name, 16, "correctable");
3230f005ef32Sjsg 		break;
3231f005ef32Sjsg 	case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
3232f005ef32Sjsg 		snprintf(err_type_name, 16, "uncorrectable");
3233f005ef32Sjsg 		break;
3234f005ef32Sjsg 	default:
3235f005ef32Sjsg 		snprintf(err_type_name, 16, "unknown");
3236f005ef32Sjsg 		break;
3237f005ef32Sjsg 	}
3238f005ef32Sjsg }
3239f005ef32Sjsg 
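/* Read the per-instance ERR_STATUS_LO register for a register list entry
 * and return its MEMORY_ID field, checking the ERR_STATUS_VALID_FLAG when
 * the entry requires it.
 */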
3240f005ef32Sjsg bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev,
3241f005ef32Sjsg 					 const struct amdgpu_ras_err_status_reg_entry *reg_entry,
3242f005ef32Sjsg 					 uint32_t instance,
3243f005ef32Sjsg 					 uint32_t *memory_id)
3244f005ef32Sjsg {
3245f005ef32Sjsg 	uint32_t err_status_lo_data, err_status_lo_offset;
3246f005ef32Sjsg 
3247f005ef32Sjsg 	if (!reg_entry)
3248f005ef32Sjsg 		return false;
3249f005ef32Sjsg 
3250f005ef32Sjsg 	err_status_lo_offset =
3251f005ef32Sjsg 		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
3252f005ef32Sjsg 					    reg_entry->seg_lo, reg_entry->reg_lo);
3253f005ef32Sjsg 	err_status_lo_data = RREG32(err_status_lo_offset);
3254f005ef32Sjsg 
3255f005ef32Sjsg 	if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) &&
3256f005ef32Sjsg 	    !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG))
3257f005ef32Sjsg 		return false;
3258f005ef32Sjsg 
3259f005ef32Sjsg 	*memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID);
3260f005ef32Sjsg 
3261f005ef32Sjsg 	return true;
3262f005ef32Sjsg }
3263f005ef32Sjsg 
3264f005ef32Sjsg bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
3265f005ef32Sjsg 				       const struct amdgpu_ras_err_status_reg_entry *reg_entry,
3266f005ef32Sjsg 				       uint32_t instance,
3267f005ef32Sjsg 				       unsigned long *err_cnt)
3268f005ef32Sjsg {
3269f005ef32Sjsg 	uint32_t err_status_hi_data, err_status_hi_offset;
3270f005ef32Sjsg 
3271f005ef32Sjsg 	if (!reg_entry)
3272f005ef32Sjsg 		return false;
3273f005ef32Sjsg 
3274f005ef32Sjsg 	err_status_hi_offset =
3275f005ef32Sjsg 		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
3276f005ef32Sjsg 					    reg_entry->seg_hi, reg_entry->reg_hi);
3277f005ef32Sjsg 	err_status_hi_data = RREG32(err_status_hi_offset);
3278f005ef32Sjsg 
3279f005ef32Sjsg 	if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
3280f005ef32Sjsg 	    !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
3281f005ef32Sjsg 		/* keep the check here in case we need to refer to the result later */
3282f005ef32Sjsg 		dev_dbg(adev->dev, "Invalid err_info field\n");
3283f005ef32Sjsg 
3284f005ef32Sjsg 	/* read err count */
3285f005ef32Sjsg 	*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
3286f005ef32Sjsg 
3287f005ef32Sjsg 	return true;
3288f005ef32Sjsg }
3289f005ef32Sjsg 
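/* Walk the error status register list for one instance, accumulate the
 * reported error counts into *err_count, and log each non-zero count with
 * its memory id (or memory block name when a memory id list is provided).
 */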
3290f005ef32Sjsg void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
3291f005ef32Sjsg 					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
3292f005ef32Sjsg 					   uint32_t reg_list_size,
3293f005ef32Sjsg 					   const struct amdgpu_ras_memory_id_entry *mem_list,
3294f005ef32Sjsg 					   uint32_t mem_list_size,
3295f005ef32Sjsg 					   uint32_t instance,
3296f005ef32Sjsg 					   uint32_t err_type,
3297f005ef32Sjsg 					   unsigned long *err_count)
3298f005ef32Sjsg {
3299f005ef32Sjsg 	uint32_t memory_id;
3300f005ef32Sjsg 	unsigned long err_cnt;
3301f005ef32Sjsg 	char err_type_name[16];
3302f005ef32Sjsg 	uint32_t i, j;
3303f005ef32Sjsg 
3304f005ef32Sjsg 	for (i = 0; i < reg_list_size; i++) {
3305f005ef32Sjsg 		/* query memory_id from err_status_lo */
3306f005ef32Sjsg 		if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
3307f005ef32Sjsg 							 instance, &memory_id))
3308f005ef32Sjsg 			continue;
3309f005ef32Sjsg 
3310f005ef32Sjsg 		/* query err_cnt from err_status_hi */
3311f005ef32Sjsg 		if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
3312f005ef32Sjsg 						       instance, &err_cnt) ||
3313f005ef32Sjsg 		    !err_cnt)
3314f005ef32Sjsg 			continue;
3315f005ef32Sjsg 
3316f005ef32Sjsg 		*err_count += err_cnt;
3317f005ef32Sjsg 
3318f005ef32Sjsg 		/* log the errors */
3319f005ef32Sjsg 		amdgpu_ras_get_error_type_name(err_type, err_type_name);
3320f005ef32Sjsg 		if (!mem_list) {
3321f005ef32Sjsg 			/* memory_list is not supported */
3322f005ef32Sjsg 			dev_info(adev->dev,
3323f005ef32Sjsg 				 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
3324f005ef32Sjsg 				 err_cnt, err_type_name,
3325f005ef32Sjsg 				 reg_list[i].block_name,
3326f005ef32Sjsg 				 instance, memory_id);
3327f005ef32Sjsg 		} else {
3328f005ef32Sjsg 			for (j = 0; j < mem_list_size; j++) {
3329f005ef32Sjsg 				if (memory_id == mem_list[j].memory_id) {
3330f005ef32Sjsg 					dev_info(adev->dev,
3331f005ef32Sjsg 						 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
3332f005ef32Sjsg 						 err_cnt, err_type_name,
3333f005ef32Sjsg 						 reg_list[i].block_name,
3334f005ef32Sjsg 						 instance, mem_list[j].name);
3335f005ef32Sjsg 					break;
3336f005ef32Sjsg 				}
3337f005ef32Sjsg 			}
3338f005ef32Sjsg 		}
3339f005ef32Sjsg 	}
3340f005ef32Sjsg }
3341f005ef32Sjsg 
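/* Clear the per-instance ERR_STATUS_LO/HI registers for every entry in the
 * register list.
 */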
3342f005ef32Sjsg void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
3343f005ef32Sjsg 					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
3344f005ef32Sjsg 					   uint32_t reg_list_size,
3345f005ef32Sjsg 					   uint32_t instance)
3346f005ef32Sjsg {
3347f005ef32Sjsg 	uint32_t err_status_lo_offset, err_status_hi_offset;
3348f005ef32Sjsg 	uint32_t i;
3349f005ef32Sjsg 
3350f005ef32Sjsg 	for (i = 0; i < reg_list_size; i++) {
3351f005ef32Sjsg 		err_status_lo_offset =
3352f005ef32Sjsg 			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
3353f005ef32Sjsg 						    reg_list[i].seg_lo, reg_list[i].reg_lo);
3354f005ef32Sjsg 		err_status_hi_offset =
3355f005ef32Sjsg 			AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance,
3356f005ef32Sjsg 						    reg_list[i].seg_hi, reg_list[i].reg_hi);
3357f005ef32Sjsg 		WREG32(err_status_lo_offset, 0);
3358f005ef32Sjsg 		WREG32(err_status_hi_offset, 0);
3359f005ef32Sjsg 	}
3360f005ef32Sjsg }
3361