1c349dbc7Sjsg /* 2c349dbc7Sjsg * Copyright 2018 Advanced Micro Devices, Inc. 3c349dbc7Sjsg * 4c349dbc7Sjsg * Permission is hereby granted, free of charge, to any person obtaining a 5c349dbc7Sjsg * copy of this software and associated documentation files (the "Software"), 6c349dbc7Sjsg * to deal in the Software without restriction, including without limitation 7c349dbc7Sjsg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8c349dbc7Sjsg * and/or sell copies of the Software, and to permit persons to whom the 9c349dbc7Sjsg * Software is furnished to do so, subject to the following conditions: 10c349dbc7Sjsg * 11c349dbc7Sjsg * The above copyright notice and this permission notice shall be included in 12c349dbc7Sjsg * all copies or substantial portions of the Software. 13c349dbc7Sjsg * 14c349dbc7Sjsg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15c349dbc7Sjsg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16c349dbc7Sjsg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17c349dbc7Sjsg * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18c349dbc7Sjsg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19c349dbc7Sjsg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20c349dbc7Sjsg * OTHER DEALINGS IN THE SOFTWARE. 21c349dbc7Sjsg * 22c349dbc7Sjsg * 23c349dbc7Sjsg */ 24c349dbc7Sjsg #include <linux/debugfs.h> 25c349dbc7Sjsg #include <linux/list.h> 26c349dbc7Sjsg #include <linux/module.h> 27c349dbc7Sjsg #include <linux/uaccess.h> 28c349dbc7Sjsg #include <linux/reboot.h> 29c349dbc7Sjsg #include <linux/syscalls.h> 305ca02815Sjsg #include <linux/pm_runtime.h> 31c349dbc7Sjsg 32c349dbc7Sjsg #include "amdgpu.h" 33c349dbc7Sjsg #include "amdgpu_ras.h" 34c349dbc7Sjsg #include "amdgpu_atomfirmware.h" 35c349dbc7Sjsg #include "amdgpu_xgmi.h" 36c349dbc7Sjsg #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" 37f005ef32Sjsg #include "nbio_v4_3.h" 38f005ef32Sjsg #include "nbio_v7_9.h" 395ca02815Sjsg #include "atom.h" 401bb76ff1Sjsg #include "amdgpu_reset.h" 41c349dbc7Sjsg 421bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD 431bb76ff1Sjsg #include <asm/mce.h> 441bb76ff1Sjsg 451bb76ff1Sjsg static bool notifier_registered; 461bb76ff1Sjsg #endif 47ad8b1aafSjsg static const char *RAS_FS_NAME = "ras"; 48ad8b1aafSjsg 49c349dbc7Sjsg const char *ras_error_string[] = { 50c349dbc7Sjsg "none", 51c349dbc7Sjsg "parity", 52c349dbc7Sjsg "single_correctable", 53c349dbc7Sjsg "multi_uncorrectable", 54c349dbc7Sjsg "poison", 55c349dbc7Sjsg }; 56c349dbc7Sjsg 57c349dbc7Sjsg const char *ras_block_string[] = { 58c349dbc7Sjsg "umc", 59c349dbc7Sjsg "sdma", 60c349dbc7Sjsg "gfx", 61c349dbc7Sjsg "mmhub", 62c349dbc7Sjsg "athub", 63c349dbc7Sjsg "pcie_bif", 64c349dbc7Sjsg "hdp", 65c349dbc7Sjsg "xgmi_wafl", 66c349dbc7Sjsg "df", 67c349dbc7Sjsg "smn", 68c349dbc7Sjsg "sem", 69c349dbc7Sjsg "mp0", 70c349dbc7Sjsg "mp1", 71c349dbc7Sjsg "fuse", 721bb76ff1Sjsg "mca", 731bb76ff1Sjsg "vcn", 741bb76ff1Sjsg "jpeg", 75c349dbc7Sjsg }; 76c349dbc7Sjsg 771bb76ff1Sjsg const char *ras_mca_block_string[] = { 781bb76ff1Sjsg "mca_mp0", 791bb76ff1Sjsg "mca_mp1", 801bb76ff1Sjsg "mca_mpio", 811bb76ff1Sjsg "mca_iohc", 821bb76ff1Sjsg }; 831bb76ff1Sjsg 841bb76ff1Sjsg struct amdgpu_ras_block_list { 851bb76ff1Sjsg /* ras block link */ 861bb76ff1Sjsg struct list_head node; 871bb76ff1Sjsg 881bb76ff1Sjsg struct amdgpu_ras_block_object *ras_obj; 891bb76ff1Sjsg }; 901bb76ff1Sjsg 911bb76ff1Sjsg const char *get_ras_block_str(struct 
ras_common_if *ras_block) 921bb76ff1Sjsg { 931bb76ff1Sjsg if (!ras_block) 941bb76ff1Sjsg return "NULL"; 951bb76ff1Sjsg 961bb76ff1Sjsg if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT) 971bb76ff1Sjsg return "OUT OF RANGE"; 981bb76ff1Sjsg 991bb76ff1Sjsg if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) 1001bb76ff1Sjsg return ras_mca_block_string[ras_block->sub_block_index]; 1011bb76ff1Sjsg 1021bb76ff1Sjsg return ras_block_string[ras_block->block]; 1031bb76ff1Sjsg } 1041bb76ff1Sjsg 1051bb76ff1Sjsg #define ras_block_str(_BLOCK_) \ 1061bb76ff1Sjsg (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range") 1071bb76ff1Sjsg 108c349dbc7Sjsg #define ras_err_str(i) (ras_error_string[ffs(i)]) 109c349dbc7Sjsg 110c349dbc7Sjsg #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) 111c349dbc7Sjsg 112c349dbc7Sjsg /* inject address is 52 bits */ 113c349dbc7Sjsg #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) 114c349dbc7Sjsg 1155ca02815Sjsg /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ 1165ca02815Sjsg #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) 117ad8b1aafSjsg 118c349dbc7Sjsg enum amdgpu_ras_retire_page_reservation { 119c349dbc7Sjsg AMDGPU_RAS_RETIRE_PAGE_RESERVED, 120c349dbc7Sjsg AMDGPU_RAS_RETIRE_PAGE_PENDING, 121c349dbc7Sjsg AMDGPU_RAS_RETIRE_PAGE_FAULT, 122c349dbc7Sjsg }; 123c349dbc7Sjsg 124c349dbc7Sjsg atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); 125c349dbc7Sjsg 1265ca02815Sjsg static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 1275ca02815Sjsg uint64_t addr); 128c349dbc7Sjsg static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 129c349dbc7Sjsg uint64_t addr); 1301bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD 1311bb76ff1Sjsg static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); 1321bb76ff1Sjsg struct mce_notifier_adev_list { 1331bb76ff1Sjsg struct amdgpu_device *devs[MAX_GPU_INSTANCE]; 1341bb76ff1Sjsg int num_gpu; 1351bb76ff1Sjsg }; 1361bb76ff1Sjsg static struct mce_notifier_adev_list mce_adev_list; 1371bb76ff1Sjsg #endif 138c349dbc7Sjsg 139af8ed3f7Sjsg void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) 140af8ed3f7Sjsg { 14157896dd2Sjsg if (adev && amdgpu_ras_get_context(adev)) 142af8ed3f7Sjsg amdgpu_ras_get_context(adev)->error_query_ready = ready; 143af8ed3f7Sjsg } 144af8ed3f7Sjsg 145ad8b1aafSjsg static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) 146af8ed3f7Sjsg { 14757896dd2Sjsg if (adev && amdgpu_ras_get_context(adev)) 148af8ed3f7Sjsg return amdgpu_ras_get_context(adev)->error_query_ready; 149af8ed3f7Sjsg 150af8ed3f7Sjsg return false; 151af8ed3f7Sjsg } 152af8ed3f7Sjsg 1535ca02815Sjsg static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) 1545ca02815Sjsg { 1555ca02815Sjsg struct ras_err_data err_data = {0, 0, 0, NULL}; 1565ca02815Sjsg struct eeprom_table_record err_rec; 1575ca02815Sjsg 1585ca02815Sjsg if ((address >= adev->gmc.mc_vram_size) || 1595ca02815Sjsg (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { 1605ca02815Sjsg dev_warn(adev->dev, 1615ca02815Sjsg "RAS WARN: input address 0x%llx is invalid.\n", 1625ca02815Sjsg address); 1635ca02815Sjsg return -EINVAL; 1645ca02815Sjsg } 1655ca02815Sjsg 1665ca02815Sjsg if (amdgpu_ras_check_bad_page(adev, address)) { 1675ca02815Sjsg dev_warn(adev->dev, 1685ca02815Sjsg "RAS WARN: 0x%llx has already been marked as bad page!\n", 1695ca02815Sjsg address); 1705ca02815Sjsg return 0; 1715ca02815Sjsg } 1725ca02815Sjsg 1735ca02815Sjsg memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); 
1745ca02815Sjsg err_data.err_addr = &err_rec; 1758ad1b843Sjsg amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); 1765ca02815Sjsg 1775ca02815Sjsg if (amdgpu_bad_page_threshold != 0) { 1785ca02815Sjsg amdgpu_ras_add_bad_pages(adev, err_data.err_addr, 1795ca02815Sjsg err_data.err_addr_cnt); 180f005ef32Sjsg amdgpu_ras_save_bad_pages(adev, NULL); 1815ca02815Sjsg } 1825ca02815Sjsg 1835ca02815Sjsg dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); 1845ca02815Sjsg dev_warn(adev->dev, "Clear EEPROM:\n"); 1855ca02815Sjsg dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); 1865ca02815Sjsg 1875ca02815Sjsg return 0; 1885ca02815Sjsg } 1895ca02815Sjsg 190c349dbc7Sjsg #ifdef __linux__ 191c349dbc7Sjsg 192c349dbc7Sjsg static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, 193c349dbc7Sjsg size_t size, loff_t *pos) 194c349dbc7Sjsg { 195c349dbc7Sjsg struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; 196c349dbc7Sjsg struct ras_query_if info = { 197c349dbc7Sjsg .head = obj->head, 198c349dbc7Sjsg }; 199c349dbc7Sjsg ssize_t s; 200c349dbc7Sjsg char val[128]; 201c349dbc7Sjsg 2025ca02815Sjsg if (amdgpu_ras_query_error_status(obj->adev, &info)) 203c349dbc7Sjsg return -EINVAL; 204c349dbc7Sjsg 2051bb76ff1Sjsg /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */ 2061bb76ff1Sjsg if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && 2071bb76ff1Sjsg obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { 2081bb76ff1Sjsg if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) 2091bb76ff1Sjsg dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); 2101bb76ff1Sjsg } 2111bb76ff1Sjsg 212c349dbc7Sjsg s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", 213c349dbc7Sjsg "ue", info.ue_count, 214c349dbc7Sjsg "ce", info.ce_count); 215c349dbc7Sjsg if (*pos >= s) 216c349dbc7Sjsg return 0; 217c349dbc7Sjsg 218c349dbc7Sjsg s -= *pos; 219c349dbc7Sjsg s = min_t(u64, s, size); 220c349dbc7Sjsg 221c349dbc7Sjsg 222c349dbc7Sjsg if (copy_to_user(buf, &val[*pos], s)) 223c349dbc7Sjsg return -EINVAL; 224c349dbc7Sjsg 225c349dbc7Sjsg *pos += s; 226c349dbc7Sjsg 227c349dbc7Sjsg return s; 228c349dbc7Sjsg } 229c349dbc7Sjsg 230c349dbc7Sjsg static const struct file_operations amdgpu_ras_debugfs_ops = { 231c349dbc7Sjsg .owner = THIS_MODULE, 232c349dbc7Sjsg .read = amdgpu_ras_debugfs_read, 233c349dbc7Sjsg .write = NULL, 234c349dbc7Sjsg .llseek = default_llseek 235c349dbc7Sjsg }; 236c349dbc7Sjsg 237c349dbc7Sjsg static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) 238c349dbc7Sjsg { 239c349dbc7Sjsg int i; 240c349dbc7Sjsg 241c349dbc7Sjsg for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { 242c349dbc7Sjsg *block_id = i; 2431bb76ff1Sjsg if (strcmp(name, ras_block_string[i]) == 0) 244c349dbc7Sjsg return 0; 245c349dbc7Sjsg } 246c349dbc7Sjsg return -EINVAL; 247c349dbc7Sjsg } 248c349dbc7Sjsg 249c349dbc7Sjsg static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, 250c349dbc7Sjsg const char __user *buf, size_t size, 251c349dbc7Sjsg loff_t *pos, struct ras_debug_if *data) 252c349dbc7Sjsg { 253c349dbc7Sjsg ssize_t s = min_t(u64, 64, size); 254c349dbc7Sjsg char str[65]; 255c349dbc7Sjsg char block_name[33]; 256c349dbc7Sjsg char err[9] = "ue"; 257c349dbc7Sjsg int op = -1; 258c349dbc7Sjsg int block_id; 259c349dbc7Sjsg uint32_t sub_block; 260c349dbc7Sjsg u64 address, value; 261f005ef32Sjsg /* default value is 0 if the mask is not set by user 
*/ 262f005ef32Sjsg u32 instance_mask = 0; 263c349dbc7Sjsg 264c349dbc7Sjsg if (*pos) 265c349dbc7Sjsg return -EINVAL; 266c349dbc7Sjsg *pos = size; 267c349dbc7Sjsg 268c349dbc7Sjsg memset(str, 0, sizeof(str)); 269c349dbc7Sjsg memset(data, 0, sizeof(*data)); 270c349dbc7Sjsg 271c349dbc7Sjsg if (copy_from_user(str, buf, s)) 272c349dbc7Sjsg return -EINVAL; 273c349dbc7Sjsg 274c349dbc7Sjsg if (sscanf(str, "disable %32s", block_name) == 1) 275c349dbc7Sjsg op = 0; 276c349dbc7Sjsg else if (sscanf(str, "enable %32s %8s", block_name, err) == 2) 277c349dbc7Sjsg op = 1; 278c349dbc7Sjsg else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) 279c349dbc7Sjsg op = 2; 2805ca02815Sjsg else if (strstr(str, "retire_page") != NULL) 2815ca02815Sjsg op = 3; 282c349dbc7Sjsg else if (str[0] && str[1] && str[2] && str[3]) 283c349dbc7Sjsg /* ascii string, but commands are not matched. */ 284c349dbc7Sjsg return -EINVAL; 285c349dbc7Sjsg 286c349dbc7Sjsg if (op != -1) { 2875ca02815Sjsg if (op == 3) { 2885ca02815Sjsg if (sscanf(str, "%*s 0x%llx", &address) != 1 && 2895ca02815Sjsg sscanf(str, "%*s %llu", &address) != 1) 2905ca02815Sjsg return -EINVAL; 2915ca02815Sjsg 2925ca02815Sjsg data->op = op; 2935ca02815Sjsg data->inject.address = address; 2945ca02815Sjsg 2955ca02815Sjsg return 0; 2965ca02815Sjsg } 2975ca02815Sjsg 298c349dbc7Sjsg if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) 299c349dbc7Sjsg return -EINVAL; 300c349dbc7Sjsg 301c349dbc7Sjsg data->head.block = block_id; 302c349dbc7Sjsg /* only ue and ce errors are supported */ 303c349dbc7Sjsg if (!memcmp("ue", err, 2)) 304c349dbc7Sjsg data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 305c349dbc7Sjsg else if (!memcmp("ce", err, 2)) 306c349dbc7Sjsg data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; 307c349dbc7Sjsg else 308c349dbc7Sjsg return -EINVAL; 309c349dbc7Sjsg 310c349dbc7Sjsg data->op = op; 311c349dbc7Sjsg 312c349dbc7Sjsg if (op == 2) { 313f005ef32Sjsg if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", 314f005ef32Sjsg &sub_block, &address, &value, &instance_mask) != 4 && 315f005ef32Sjsg sscanf(str, "%*s %*s %*s %u %llu %llu %u", 316f005ef32Sjsg &sub_block, &address, &value, &instance_mask) != 4 && 317f005ef32Sjsg sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", 3185ca02815Sjsg &sub_block, &address, &value) != 3 && 3195ca02815Sjsg sscanf(str, "%*s %*s %*s %u %llu %llu", 320c349dbc7Sjsg &sub_block, &address, &value) != 3) 321c349dbc7Sjsg return -EINVAL; 322c349dbc7Sjsg data->head.sub_block_index = sub_block; 323c349dbc7Sjsg data->inject.address = address; 324c349dbc7Sjsg data->inject.value = value; 325f005ef32Sjsg data->inject.instance_mask = instance_mask; 326c349dbc7Sjsg } 327c349dbc7Sjsg } else { 328c349dbc7Sjsg if (size < sizeof(*data)) 329c349dbc7Sjsg return -EINVAL; 330c349dbc7Sjsg 331c349dbc7Sjsg if (copy_from_user(data, buf, sizeof(*data))) 332c349dbc7Sjsg return -EINVAL; 333c349dbc7Sjsg } 334c349dbc7Sjsg 335c349dbc7Sjsg return 0; 336c349dbc7Sjsg } 337c349dbc7Sjsg 338f005ef32Sjsg static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, 339f005ef32Sjsg struct ras_debug_if *data) 340f005ef32Sjsg { 341f005ef32Sjsg int num_xcc = adev->gfx.xcc_mask ? 
NUM_XCC(adev->gfx.xcc_mask) : 1; 342f005ef32Sjsg uint32_t mask, inst_mask = data->inject.instance_mask; 343f005ef32Sjsg 344f005ef32Sjsg /* no need to set instance mask if there is only one instance */ 345f005ef32Sjsg if (num_xcc <= 1 && inst_mask) { 346f005ef32Sjsg data->inject.instance_mask = 0; 347f005ef32Sjsg dev_dbg(adev->dev, 348f005ef32Sjsg "RAS inject mask(0x%x) isn't supported and force it to 0.\n", 349f005ef32Sjsg inst_mask); 350f005ef32Sjsg 351f005ef32Sjsg return; 352f005ef32Sjsg } 353f005ef32Sjsg 354f005ef32Sjsg switch (data->head.block) { 355f005ef32Sjsg case AMDGPU_RAS_BLOCK__GFX: 356f005ef32Sjsg mask = GENMASK(num_xcc - 1, 0); 357f005ef32Sjsg break; 358f005ef32Sjsg case AMDGPU_RAS_BLOCK__SDMA: 359f005ef32Sjsg mask = GENMASK(adev->sdma.num_instances - 1, 0); 360f005ef32Sjsg break; 361f005ef32Sjsg case AMDGPU_RAS_BLOCK__VCN: 362f005ef32Sjsg case AMDGPU_RAS_BLOCK__JPEG: 363f005ef32Sjsg mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); 364f005ef32Sjsg break; 365f005ef32Sjsg default: 366f005ef32Sjsg mask = inst_mask; 367f005ef32Sjsg break; 368f005ef32Sjsg } 369f005ef32Sjsg 370f005ef32Sjsg /* remove invalid bits in instance mask */ 371f005ef32Sjsg data->inject.instance_mask &= mask; 372f005ef32Sjsg if (inst_mask != data->inject.instance_mask) 373f005ef32Sjsg dev_dbg(adev->dev, 374f005ef32Sjsg "Adjust RAS inject mask 0x%x to 0x%x\n", 375f005ef32Sjsg inst_mask, data->inject.instance_mask); 376f005ef32Sjsg } 377f005ef32Sjsg 378c349dbc7Sjsg /** 379c349dbc7Sjsg * DOC: AMDGPU RAS debugfs control interface 380c349dbc7Sjsg * 3815ca02815Sjsg * The control interface accepts struct ras_debug_if which has two members. 382c349dbc7Sjsg * 383c349dbc7Sjsg * First member: ras_debug_if::head or ras_debug_if::inject. 384c349dbc7Sjsg * 385c349dbc7Sjsg * head is used to indicate which IP block will be under control. 386c349dbc7Sjsg * 387c349dbc7Sjsg * head has four members, they are block, type, sub_block_index, name. 388c349dbc7Sjsg * block: which IP will be under control. 389c349dbc7Sjsg * type: what kind of error will be enabled/disabled/injected. 390c349dbc7Sjsg * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. 391c349dbc7Sjsg * name: the name of IP. 392c349dbc7Sjsg * 393f005ef32Sjsg * inject has three more members than head, they are address, value and mask. 394c349dbc7Sjsg * As their names indicate, inject operation will write the 395c349dbc7Sjsg * value to the address. 396c349dbc7Sjsg * 397c349dbc7Sjsg * The second member: struct ras_debug_if::op. 398c349dbc7Sjsg * It has three kinds of operations. 399c349dbc7Sjsg * 400c349dbc7Sjsg * - 0: disable RAS on the block. Take ::head as its data. 401c349dbc7Sjsg * - 1: enable RAS on the block. Take ::head as its data. 402c349dbc7Sjsg * - 2: inject errors on the block. Take ::inject as its data. 403c349dbc7Sjsg * 404c349dbc7Sjsg * How to use the interface? 405c349dbc7Sjsg * 4065ca02815Sjsg * In a program 407c349dbc7Sjsg * 4085ca02815Sjsg * Copy the struct ras_debug_if in your code and initialize it. 4095ca02815Sjsg * Write the struct to the control interface. 410c349dbc7Sjsg * 4115ca02815Sjsg * From shell 412c349dbc7Sjsg * 413c349dbc7Sjsg * .. 
code-block:: bash 414c349dbc7Sjsg  * 4155ca02815Sjsg  *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl 4165ca02815Sjsg  *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl 417f005ef32Sjsg  *	echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl 418c349dbc7Sjsg  * 4195ca02815Sjsg  * Where N is the card which you want to affect. 420c349dbc7Sjsg  * 4215ca02815Sjsg  * "disable" requires only the block. 4225ca02815Sjsg  * "enable" requires the block and error type. 4235ca02815Sjsg  * "inject" requires the block, error type, address, and value. 4245ca02815Sjsg  * 4255ca02815Sjsg  * The block is one of: umc, sdma, gfx, etc. 426c349dbc7Sjsg  *	see ras_block_string[] for details 427c349dbc7Sjsg  * 4285ca02815Sjsg  * The error type is one of: ue, ce, where 4295ca02815Sjsg  *	ue is multi-uncorrectable 4305ca02815Sjsg  *	ce is single-correctable 4315ca02815Sjsg  * 4325ca02815Sjsg  * The sub-block is the sub-block index; pass 0 if there is no sub-block. 4335ca02815Sjsg  * The address and value are hexadecimal numbers, leading 0x is optional. 434f005ef32Sjsg  * The mask is the instance mask; it is optional and defaults to 0x1. 4355ca02815Sjsg  * 4365ca02815Sjsg  * For instance, 437c349dbc7Sjsg  * 438c349dbc7Sjsg  * .. code-block:: bash 439c349dbc7Sjsg  * 440c349dbc7Sjsg  *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl 441f005ef32Sjsg  *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl 442c349dbc7Sjsg  *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl 443c349dbc7Sjsg  * 4445ca02815Sjsg  * How to check the result of the operation? 445c349dbc7Sjsg  * 4465ca02815Sjsg  * To check disable/enable, see "ras" features at, 447c349dbc7Sjsg  * /sys/class/drm/card[0/1/2...]/device/ras/features 448c349dbc7Sjsg  * 4495ca02815Sjsg  * To check inject, see the corresponding error count at, 4505ca02815Sjsg  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count 451c349dbc7Sjsg  * 452c349dbc7Sjsg  * .. note:: 453c349dbc7Sjsg  *	Operations are only allowed on blocks which are supported. 4545ca02815Sjsg  *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask 455c349dbc7Sjsg  *	to see which blocks support RAS on a particular asic.
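 *
 * A minimal sketch of the "in a program" path mentioned above (userspace
 * code, not part of this driver). It assumes struct ras_debug_if and the
 * AMDGPU_RAS_BLOCK__* / AMDGPU_RAS_ERROR__* values have been copied from
 * amdgpu_ras.h as suggested, and that card 0 is the target; only the op,
 * head and inject members consumed by amdgpu_ras_debugfs_ctrl_parse_data()
 * are filled in. Writing the textual commands shown in the bash examples
 * to the same node works equally well.
 *
 * .. code-block:: c
 *
 *	struct ras_debug_if data;
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *	memset(&data, 0, sizeof(data));
 *	data.op = 2;	/* 2 == inject, see the op list above */
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	data.head.sub_block_index = 0;
 *	data.inject.address = 0x0;
 *	data.inject.value = 0x0;
 *	if (write(fd, &data, sizeof(data)) < 0)
 *		perror("ras_ctrl");
 *	close(fd);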
456c349dbc7Sjsg * 457c349dbc7Sjsg */ 4585ca02815Sjsg static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, 4595ca02815Sjsg const char __user *buf, 460c349dbc7Sjsg size_t size, loff_t *pos) 461c349dbc7Sjsg { 462c349dbc7Sjsg struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 463c349dbc7Sjsg struct ras_debug_if data; 464c349dbc7Sjsg int ret = 0; 465c349dbc7Sjsg 466af8ed3f7Sjsg if (!amdgpu_ras_get_error_query_ready(adev)) { 467ad8b1aafSjsg dev_warn(adev->dev, "RAS WARN: error injection " 468ad8b1aafSjsg "currently inaccessible\n"); 469c349dbc7Sjsg return size; 470c349dbc7Sjsg } 471c349dbc7Sjsg 472c349dbc7Sjsg ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); 473c349dbc7Sjsg if (ret) 4745ca02815Sjsg return ret; 4755ca02815Sjsg 4765ca02815Sjsg if (data.op == 3) { 4775ca02815Sjsg ret = amdgpu_reserve_page_direct(adev, data.inject.address); 4785ca02815Sjsg if (!ret) 4795ca02815Sjsg return size; 4805ca02815Sjsg else 4815ca02815Sjsg return ret; 4825ca02815Sjsg } 483c349dbc7Sjsg 484c349dbc7Sjsg if (!amdgpu_ras_is_supported(adev, data.head.block)) 485c349dbc7Sjsg return -EINVAL; 486c349dbc7Sjsg 487c349dbc7Sjsg switch (data.op) { 488c349dbc7Sjsg case 0: 489c349dbc7Sjsg ret = amdgpu_ras_feature_enable(adev, &data.head, 0); 490c349dbc7Sjsg break; 491c349dbc7Sjsg case 1: 492c349dbc7Sjsg ret = amdgpu_ras_feature_enable(adev, &data.head, 1); 493c349dbc7Sjsg break; 494c349dbc7Sjsg case 2: 495f005ef32Sjsg if ((data.inject.address >= adev->gmc.mc_vram_size && 496f005ef32Sjsg adev->gmc.mc_vram_size) || 497c349dbc7Sjsg (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { 498ad8b1aafSjsg dev_warn(adev->dev, "RAS WARN: input address " 499ad8b1aafSjsg "0x%llx is invalid.", 500ad8b1aafSjsg data.inject.address); 501c349dbc7Sjsg ret = -EINVAL; 502c349dbc7Sjsg break; 503c349dbc7Sjsg } 504c349dbc7Sjsg 505c349dbc7Sjsg /* umc ce/ue error injection for a bad page is not allowed */ 506c349dbc7Sjsg if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && 507c349dbc7Sjsg amdgpu_ras_check_bad_page(adev, data.inject.address)) { 5085ca02815Sjsg dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " 5095ca02815Sjsg "already been marked as bad!\n", 510c349dbc7Sjsg data.inject.address); 511c349dbc7Sjsg break; 512c349dbc7Sjsg } 513c349dbc7Sjsg 514f005ef32Sjsg amdgpu_ras_instance_mask_check(adev, &data); 515f005ef32Sjsg 516c349dbc7Sjsg /* data.inject.address is offset instead of absolute gpu address */ 517c349dbc7Sjsg ret = amdgpu_ras_error_inject(adev, &data.inject); 518c349dbc7Sjsg break; 519c349dbc7Sjsg default: 520c349dbc7Sjsg ret = -EINVAL; 521c349dbc7Sjsg break; 522c349dbc7Sjsg } 523c349dbc7Sjsg 524c349dbc7Sjsg if (ret) 5251bb76ff1Sjsg return ret; 526c349dbc7Sjsg 527c349dbc7Sjsg return size; 528c349dbc7Sjsg } 529c349dbc7Sjsg 530c349dbc7Sjsg /** 531c349dbc7Sjsg * DOC: AMDGPU RAS debugfs EEPROM table reset interface 532c349dbc7Sjsg * 533c349dbc7Sjsg * Some boards contain an EEPROM which is used to persistently store a list of 534c349dbc7Sjsg * bad pages which experiences ECC errors in vram. This interface provides 535c349dbc7Sjsg * a way to reset the EEPROM, e.g., after testing error injection. 536c349dbc7Sjsg * 537c349dbc7Sjsg * Usage: 538c349dbc7Sjsg * 539c349dbc7Sjsg * .. code-block:: bash 540c349dbc7Sjsg * 541c349dbc7Sjsg * echo 1 > ../ras/ras_eeprom_reset 542c349dbc7Sjsg * 543c349dbc7Sjsg * will reset EEPROM table to 0 entries. 
544c349dbc7Sjsg * 545c349dbc7Sjsg */ 5465ca02815Sjsg static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, 5475ca02815Sjsg const char __user *buf, 548c349dbc7Sjsg size_t size, loff_t *pos) 549c349dbc7Sjsg { 550ad8b1aafSjsg struct amdgpu_device *adev = 551ad8b1aafSjsg (struct amdgpu_device *)file_inode(f)->i_private; 552c349dbc7Sjsg int ret; 553c349dbc7Sjsg 554ad8b1aafSjsg ret = amdgpu_ras_eeprom_reset_table( 555ad8b1aafSjsg &(amdgpu_ras_get_context(adev)->eeprom_control)); 556c349dbc7Sjsg 5575ca02815Sjsg if (!ret) { 5585ca02815Sjsg /* Something was written to EEPROM. 5595ca02815Sjsg */ 560ad8b1aafSjsg amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; 561ad8b1aafSjsg return size; 562ad8b1aafSjsg } else { 5635ca02815Sjsg return ret; 564ad8b1aafSjsg } 565c349dbc7Sjsg } 566c349dbc7Sjsg 567c349dbc7Sjsg static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { 568c349dbc7Sjsg .owner = THIS_MODULE, 569c349dbc7Sjsg .read = NULL, 570c349dbc7Sjsg .write = amdgpu_ras_debugfs_ctrl_write, 571c349dbc7Sjsg .llseek = default_llseek 572c349dbc7Sjsg }; 573c349dbc7Sjsg 574c349dbc7Sjsg static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { 575c349dbc7Sjsg .owner = THIS_MODULE, 576c349dbc7Sjsg .read = NULL, 577c349dbc7Sjsg .write = amdgpu_ras_debugfs_eeprom_write, 578c349dbc7Sjsg .llseek = default_llseek 579c349dbc7Sjsg }; 580c349dbc7Sjsg 581c349dbc7Sjsg /** 582c349dbc7Sjsg * DOC: AMDGPU RAS sysfs Error Count Interface 583c349dbc7Sjsg * 584c349dbc7Sjsg * It allows the user to read the error count for each IP block on the gpu through 585c349dbc7Sjsg * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count 586c349dbc7Sjsg * 587c349dbc7Sjsg * It outputs the multiple lines which report the uncorrected (ue) and corrected 588c349dbc7Sjsg * (ce) error counts. 589c349dbc7Sjsg * 590c349dbc7Sjsg * The format of one line is below, 591c349dbc7Sjsg * 592c349dbc7Sjsg * [ce|ue]: count 593c349dbc7Sjsg * 594c349dbc7Sjsg * Example: 595c349dbc7Sjsg * 596c349dbc7Sjsg * .. 
code-block:: bash 597c349dbc7Sjsg * 598c349dbc7Sjsg * ue: 0 599c349dbc7Sjsg * ce: 1 600c349dbc7Sjsg * 601c349dbc7Sjsg */ 602c349dbc7Sjsg static ssize_t amdgpu_ras_sysfs_read(struct device *dev, 603c349dbc7Sjsg struct device_attribute *attr, char *buf) 604c349dbc7Sjsg { 605c349dbc7Sjsg struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); 606c349dbc7Sjsg struct ras_query_if info = { 607c349dbc7Sjsg .head = obj->head, 608c349dbc7Sjsg }; 609c349dbc7Sjsg 610af8ed3f7Sjsg if (!amdgpu_ras_get_error_query_ready(obj->adev)) 6115ca02815Sjsg return sysfs_emit(buf, "Query currently inaccessible\n"); 612c349dbc7Sjsg 6135ca02815Sjsg if (amdgpu_ras_query_error_status(obj->adev, &info)) 614c349dbc7Sjsg return -EINVAL; 615c349dbc7Sjsg 6161bb76ff1Sjsg if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && 6171bb76ff1Sjsg obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { 6185ca02815Sjsg if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) 6191bb76ff1Sjsg dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); 6205ca02815Sjsg } 6215ca02815Sjsg 6225ca02815Sjsg return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, 623c349dbc7Sjsg "ce", info.ce_count); 624c349dbc7Sjsg } 625c349dbc7Sjsg 626c349dbc7Sjsg #endif /* __linux__ */ 627c349dbc7Sjsg 628c349dbc7Sjsg /* obj begin */ 629c349dbc7Sjsg 630c349dbc7Sjsg #define get_obj(obj) do { (obj)->use++; } while (0) 631c349dbc7Sjsg #define alive_obj(obj) ((obj)->use) 632c349dbc7Sjsg 633c349dbc7Sjsg static inline void put_obj(struct ras_manager *obj) 634c349dbc7Sjsg { 6355ca02815Sjsg if (obj && (--obj->use == 0)) 636c349dbc7Sjsg list_del(&obj->node); 6375ca02815Sjsg if (obj && (obj->use < 0)) 6381bb76ff1Sjsg DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); 639c349dbc7Sjsg } 640c349dbc7Sjsg 641c349dbc7Sjsg /* make one obj and return it. */ 642c349dbc7Sjsg static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, 643c349dbc7Sjsg struct ras_common_if *head) 644c349dbc7Sjsg { 645c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 646c349dbc7Sjsg struct ras_manager *obj; 647c349dbc7Sjsg 6485ca02815Sjsg if (!adev->ras_enabled || !con) 649c349dbc7Sjsg return NULL; 650c349dbc7Sjsg 651c349dbc7Sjsg if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 652c349dbc7Sjsg return NULL; 653c349dbc7Sjsg 6541bb76ff1Sjsg if (head->block == AMDGPU_RAS_BLOCK__MCA) { 6551bb76ff1Sjsg if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) 6561bb76ff1Sjsg return NULL; 6571bb76ff1Sjsg 6581bb76ff1Sjsg obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; 6591bb76ff1Sjsg } else 660c349dbc7Sjsg obj = &con->objs[head->block]; 6611bb76ff1Sjsg 662c349dbc7Sjsg /* already exist. return obj? 
*/ 663c349dbc7Sjsg if (alive_obj(obj)) 664c349dbc7Sjsg return NULL; 665c349dbc7Sjsg 666c349dbc7Sjsg obj->head = *head; 667c349dbc7Sjsg obj->adev = adev; 668c349dbc7Sjsg list_add(&obj->node, &con->head); 669c349dbc7Sjsg get_obj(obj); 670c349dbc7Sjsg 671c349dbc7Sjsg return obj; 672c349dbc7Sjsg } 673c349dbc7Sjsg 674c349dbc7Sjsg /* return an obj equal to head, or the first when head is NULL */ 675c349dbc7Sjsg struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, 676c349dbc7Sjsg struct ras_common_if *head) 677c349dbc7Sjsg { 678c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 679c349dbc7Sjsg struct ras_manager *obj; 680c349dbc7Sjsg int i; 681c349dbc7Sjsg 6825ca02815Sjsg if (!adev->ras_enabled || !con) 683c349dbc7Sjsg return NULL; 684c349dbc7Sjsg 685c349dbc7Sjsg if (head) { 686c349dbc7Sjsg if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 687c349dbc7Sjsg return NULL; 688c349dbc7Sjsg 6891bb76ff1Sjsg if (head->block == AMDGPU_RAS_BLOCK__MCA) { 6901bb76ff1Sjsg if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) 6911bb76ff1Sjsg return NULL; 6921bb76ff1Sjsg 6931bb76ff1Sjsg obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; 6941bb76ff1Sjsg } else 695c349dbc7Sjsg obj = &con->objs[head->block]; 696c349dbc7Sjsg 6971bb76ff1Sjsg if (alive_obj(obj)) 698c349dbc7Sjsg return obj; 699c349dbc7Sjsg } else { 7001bb76ff1Sjsg for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { 701c349dbc7Sjsg obj = &con->objs[i]; 7021bb76ff1Sjsg if (alive_obj(obj)) 703c349dbc7Sjsg return obj; 704c349dbc7Sjsg } 705c349dbc7Sjsg } 706c349dbc7Sjsg 707c349dbc7Sjsg return NULL; 708c349dbc7Sjsg } 709c349dbc7Sjsg /* obj end */ 710c349dbc7Sjsg 711c349dbc7Sjsg /* feature ctl begin */ 712c349dbc7Sjsg static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, 713c349dbc7Sjsg struct ras_common_if *head) 714c349dbc7Sjsg { 7155ca02815Sjsg return adev->ras_hw_enabled & BIT(head->block); 716c349dbc7Sjsg } 717c349dbc7Sjsg 718c349dbc7Sjsg static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, 719c349dbc7Sjsg struct ras_common_if *head) 720c349dbc7Sjsg { 721c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 722c349dbc7Sjsg 723c349dbc7Sjsg return con->features & BIT(head->block); 724c349dbc7Sjsg } 725c349dbc7Sjsg 726c349dbc7Sjsg /* 727c349dbc7Sjsg * if obj is not created, then create one. 728c349dbc7Sjsg * set feature enable flag. 729c349dbc7Sjsg */ 730c349dbc7Sjsg static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, 731c349dbc7Sjsg struct ras_common_if *head, int enable) 732c349dbc7Sjsg { 733c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 734c349dbc7Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 735c349dbc7Sjsg 736c349dbc7Sjsg /* If hardware does not support ras, then do not create obj. 737c349dbc7Sjsg * But if hardware support ras, we can create the obj. 738c349dbc7Sjsg * Ras framework checks con->hw_supported to see if it need do 739c349dbc7Sjsg * corresponding initialization. 740c349dbc7Sjsg * IP checks con->support to see if it need disable ras. 
741c349dbc7Sjsg */ 742c349dbc7Sjsg if (!amdgpu_ras_is_feature_allowed(adev, head)) 743c349dbc7Sjsg return 0; 744c349dbc7Sjsg 745c349dbc7Sjsg if (enable) { 746c349dbc7Sjsg if (!obj) { 747c349dbc7Sjsg obj = amdgpu_ras_create_obj(adev, head); 748c349dbc7Sjsg if (!obj) 749c349dbc7Sjsg return -EINVAL; 750c349dbc7Sjsg } else { 751c349dbc7Sjsg /* In case we create obj somewhere else */ 752c349dbc7Sjsg get_obj(obj); 753c349dbc7Sjsg } 754c349dbc7Sjsg con->features |= BIT(head->block); 755c349dbc7Sjsg } else { 756c349dbc7Sjsg if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { 757c349dbc7Sjsg con->features &= ~BIT(head->block); 758c349dbc7Sjsg put_obj(obj); 759c349dbc7Sjsg } 760c349dbc7Sjsg } 761c349dbc7Sjsg 762c349dbc7Sjsg return 0; 763c349dbc7Sjsg } 764c349dbc7Sjsg 765c349dbc7Sjsg /* wrapper of psp_ras_enable_features */ 766c349dbc7Sjsg int amdgpu_ras_feature_enable(struct amdgpu_device *adev, 767c349dbc7Sjsg struct ras_common_if *head, bool enable) 768c349dbc7Sjsg { 769c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 770ad8b1aafSjsg union ta_ras_cmd_input *info; 771c349dbc7Sjsg int ret; 772c349dbc7Sjsg 773c349dbc7Sjsg if (!con) 774c349dbc7Sjsg return -EINVAL; 775c349dbc7Sjsg 776f005ef32Sjsg /* Do not enable ras feature if it is not allowed */ 777f005ef32Sjsg if (enable && 778f005ef32Sjsg head->block != AMDGPU_RAS_BLOCK__GFX && 779f005ef32Sjsg !amdgpu_ras_is_feature_allowed(adev, head)) 780f005ef32Sjsg return 0; 781f005ef32Sjsg 782f005ef32Sjsg /* Only enable gfx ras feature from host side */ 783f005ef32Sjsg if (head->block == AMDGPU_RAS_BLOCK__GFX && 784f005ef32Sjsg !amdgpu_sriov_vf(adev) && 785f005ef32Sjsg !amdgpu_ras_intr_triggered()) { 786ad8b1aafSjsg info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); 787ad8b1aafSjsg if (!info) 788ad8b1aafSjsg return -ENOMEM; 789ad8b1aafSjsg 790c349dbc7Sjsg if (!enable) { 791ad8b1aafSjsg info->disable_features = (struct ta_ras_disable_features_input) { 792c349dbc7Sjsg .block_id = amdgpu_ras_block_to_ta(head->block), 793c349dbc7Sjsg .error_type = amdgpu_ras_error_to_ta(head->type), 794c349dbc7Sjsg }; 795c349dbc7Sjsg } else { 796ad8b1aafSjsg info->enable_features = (struct ta_ras_enable_features_input) { 797c349dbc7Sjsg .block_id = amdgpu_ras_block_to_ta(head->block), 798c349dbc7Sjsg .error_type = amdgpu_ras_error_to_ta(head->type), 799c349dbc7Sjsg }; 800c349dbc7Sjsg } 801c349dbc7Sjsg 802ad8b1aafSjsg ret = psp_ras_enable_features(&adev->psp, info, enable); 803c349dbc7Sjsg if (ret) { 8041bb76ff1Sjsg dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", 805c349dbc7Sjsg enable ? "enable":"disable", 8061bb76ff1Sjsg get_ras_block_str(head), 8071bb76ff1Sjsg amdgpu_ras_is_poison_mode_supported(adev), ret); 808f005ef32Sjsg kfree(info); 809f005ef32Sjsg return ret; 810c349dbc7Sjsg } 811f005ef32Sjsg 812f005ef32Sjsg kfree(info); 813c349dbc7Sjsg } 814c349dbc7Sjsg 815c349dbc7Sjsg /* setup the obj */ 816c349dbc7Sjsg __amdgpu_ras_feature_enable(adev, head, enable); 817f005ef32Sjsg 818f005ef32Sjsg return 0; 819c349dbc7Sjsg } 820c349dbc7Sjsg 821c349dbc7Sjsg /* Only used in device probe stage and called only once. 
*/ 822c349dbc7Sjsg int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, 823c349dbc7Sjsg struct ras_common_if *head, bool enable) 824c349dbc7Sjsg { 825c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 826c349dbc7Sjsg int ret; 827c349dbc7Sjsg 828c349dbc7Sjsg if (!con) 829c349dbc7Sjsg return -EINVAL; 830c349dbc7Sjsg 831c349dbc7Sjsg if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 832c349dbc7Sjsg if (enable) { 833c349dbc7Sjsg /* There is no harm to issue a ras TA cmd regardless of 834c349dbc7Sjsg * the currecnt ras state. 835c349dbc7Sjsg * If current state == target state, it will do nothing 836c349dbc7Sjsg * But sometimes it requests driver to reset and repost 837c349dbc7Sjsg * with error code -EAGAIN. 838c349dbc7Sjsg */ 839c349dbc7Sjsg ret = amdgpu_ras_feature_enable(adev, head, 1); 840c349dbc7Sjsg /* With old ras TA, we might fail to enable ras. 841c349dbc7Sjsg * Log it and just setup the object. 842c349dbc7Sjsg * TODO need remove this WA in the future. 843c349dbc7Sjsg */ 844c349dbc7Sjsg if (ret == -EINVAL) { 845c349dbc7Sjsg ret = __amdgpu_ras_feature_enable(adev, head, 1); 846c349dbc7Sjsg if (!ret) 847ad8b1aafSjsg dev_info(adev->dev, 848ad8b1aafSjsg "RAS INFO: %s setup object\n", 8491bb76ff1Sjsg get_ras_block_str(head)); 850c349dbc7Sjsg } 851c349dbc7Sjsg } else { 852c349dbc7Sjsg /* setup the object then issue a ras TA disable cmd.*/ 853c349dbc7Sjsg ret = __amdgpu_ras_feature_enable(adev, head, 1); 854c349dbc7Sjsg if (ret) 855c349dbc7Sjsg return ret; 856c349dbc7Sjsg 8575ca02815Sjsg /* gfx block ras dsiable cmd must send to ras-ta */ 8585ca02815Sjsg if (head->block == AMDGPU_RAS_BLOCK__GFX) 8595ca02815Sjsg con->features |= BIT(head->block); 8605ca02815Sjsg 861c349dbc7Sjsg ret = amdgpu_ras_feature_enable(adev, head, 0); 8625ca02815Sjsg 8635ca02815Sjsg /* clean gfx block ras features flag */ 8645ca02815Sjsg if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) 8655ca02815Sjsg con->features &= ~BIT(head->block); 866c349dbc7Sjsg } 867c349dbc7Sjsg } else 868c349dbc7Sjsg ret = amdgpu_ras_feature_enable(adev, head, enable); 869c349dbc7Sjsg 870c349dbc7Sjsg return ret; 871c349dbc7Sjsg } 872c349dbc7Sjsg 873c349dbc7Sjsg static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, 874c349dbc7Sjsg bool bypass) 875c349dbc7Sjsg { 876c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 877c349dbc7Sjsg struct ras_manager *obj, *tmp; 878c349dbc7Sjsg 879c349dbc7Sjsg list_for_each_entry_safe(obj, tmp, &con->head, node) { 880c349dbc7Sjsg /* bypass psp. 
881c349dbc7Sjsg * aka just release the obj and corresponding flags 882c349dbc7Sjsg */ 883c349dbc7Sjsg if (bypass) { 884c349dbc7Sjsg if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) 885c349dbc7Sjsg break; 886c349dbc7Sjsg } else { 887c349dbc7Sjsg if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) 888c349dbc7Sjsg break; 889c349dbc7Sjsg } 890c349dbc7Sjsg } 891c349dbc7Sjsg 892c349dbc7Sjsg return con->features; 893c349dbc7Sjsg } 894c349dbc7Sjsg 895c349dbc7Sjsg static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, 896c349dbc7Sjsg bool bypass) 897c349dbc7Sjsg { 898c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 899c349dbc7Sjsg int i; 9001bb76ff1Sjsg const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE; 901c349dbc7Sjsg 9021bb76ff1Sjsg for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { 903c349dbc7Sjsg struct ras_common_if head = { 904c349dbc7Sjsg .block = i, 905c349dbc7Sjsg .type = default_ras_type, 906c349dbc7Sjsg .sub_block_index = 0, 907c349dbc7Sjsg }; 9081bb76ff1Sjsg 9091bb76ff1Sjsg if (i == AMDGPU_RAS_BLOCK__MCA) 9101bb76ff1Sjsg continue; 9111bb76ff1Sjsg 9121bb76ff1Sjsg if (bypass) { 9131bb76ff1Sjsg /* 9141bb76ff1Sjsg * bypass psp. vbios enable ras for us. 9151bb76ff1Sjsg * so just create the obj 9161bb76ff1Sjsg */ 9171bb76ff1Sjsg if (__amdgpu_ras_feature_enable(adev, &head, 1)) 9181bb76ff1Sjsg break; 9191bb76ff1Sjsg } else { 9201bb76ff1Sjsg if (amdgpu_ras_feature_enable(adev, &head, 1)) 9211bb76ff1Sjsg break; 9221bb76ff1Sjsg } 9231bb76ff1Sjsg } 9241bb76ff1Sjsg 9251bb76ff1Sjsg for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { 9261bb76ff1Sjsg struct ras_common_if head = { 9271bb76ff1Sjsg .block = AMDGPU_RAS_BLOCK__MCA, 9281bb76ff1Sjsg .type = default_ras_type, 9291bb76ff1Sjsg .sub_block_index = i, 9301bb76ff1Sjsg }; 9311bb76ff1Sjsg 932c349dbc7Sjsg if (bypass) { 933c349dbc7Sjsg /* 934c349dbc7Sjsg * bypass psp. vbios enable ras for us. 
935c349dbc7Sjsg * so just create the obj 936c349dbc7Sjsg */ 937c349dbc7Sjsg if (__amdgpu_ras_feature_enable(adev, &head, 1)) 938c349dbc7Sjsg break; 939c349dbc7Sjsg } else { 940c349dbc7Sjsg if (amdgpu_ras_feature_enable(adev, &head, 1)) 941c349dbc7Sjsg break; 942c349dbc7Sjsg } 943c349dbc7Sjsg } 944c349dbc7Sjsg 945c349dbc7Sjsg return con->features; 946c349dbc7Sjsg } 947c349dbc7Sjsg /* feature ctl end */ 948c349dbc7Sjsg 9491bb76ff1Sjsg static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj, 9501bb76ff1Sjsg enum amdgpu_ras_block block) 9511bb76ff1Sjsg { 9521bb76ff1Sjsg if (!block_obj) 9531bb76ff1Sjsg return -EINVAL; 9541bb76ff1Sjsg 9551bb76ff1Sjsg if (block_obj->ras_comm.block == block) 9561bb76ff1Sjsg return 0; 9571bb76ff1Sjsg 9581bb76ff1Sjsg return -EINVAL; 9591bb76ff1Sjsg } 9601bb76ff1Sjsg 9611bb76ff1Sjsg static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, 9621bb76ff1Sjsg enum amdgpu_ras_block block, uint32_t sub_block_index) 9631bb76ff1Sjsg { 9641bb76ff1Sjsg struct amdgpu_ras_block_list *node, *tmp; 9651bb76ff1Sjsg struct amdgpu_ras_block_object *obj; 9661bb76ff1Sjsg 9671bb76ff1Sjsg if (block >= AMDGPU_RAS_BLOCK__LAST) 9681bb76ff1Sjsg return NULL; 9691bb76ff1Sjsg 9701bb76ff1Sjsg list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { 9711bb76ff1Sjsg if (!node->ras_obj) { 9721bb76ff1Sjsg dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); 9731bb76ff1Sjsg continue; 9741bb76ff1Sjsg } 9751bb76ff1Sjsg 9761bb76ff1Sjsg obj = node->ras_obj; 9771bb76ff1Sjsg if (obj->ras_block_match) { 9781bb76ff1Sjsg if (obj->ras_block_match(obj, block, sub_block_index) == 0) 9791bb76ff1Sjsg return obj; 9801bb76ff1Sjsg } else { 9811bb76ff1Sjsg if (amdgpu_ras_block_match_default(obj, block) == 0) 9821bb76ff1Sjsg return obj; 9831bb76ff1Sjsg } 9841bb76ff1Sjsg } 9851bb76ff1Sjsg 9861bb76ff1Sjsg return NULL; 9871bb76ff1Sjsg } 9881bb76ff1Sjsg 9891bb76ff1Sjsg static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data) 9901bb76ff1Sjsg { 9911bb76ff1Sjsg struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 9921bb76ff1Sjsg int ret = 0; 9931bb76ff1Sjsg 9941bb76ff1Sjsg /* 9951bb76ff1Sjsg * choosing right query method according to 9961bb76ff1Sjsg * whether smu support query error information 9971bb76ff1Sjsg */ 9981bb76ff1Sjsg ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); 9991bb76ff1Sjsg if (ret == -EOPNOTSUPP) { 10001bb76ff1Sjsg if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 10011bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_count) 10021bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); 10031bb76ff1Sjsg 10041bb76ff1Sjsg /* umc query_ras_error_address is also responsible for clearing 10051bb76ff1Sjsg * error status 10061bb76ff1Sjsg */ 10071bb76ff1Sjsg if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 10081bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_address) 10091bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); 10101bb76ff1Sjsg } else if (!ret) { 10111bb76ff1Sjsg if (adev->umc.ras && 10121bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_count) 10131bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); 10141bb76ff1Sjsg 10151bb76ff1Sjsg if (adev->umc.ras && 10161bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_address) 10171bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); 10181bb76ff1Sjsg } 10191bb76ff1Sjsg } 10201bb76ff1Sjsg 
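/*
 * Illustrative sketch only (kept out of the build on purpose): an IP block
 * that does not provide its own ->ras_block_match callback just fills in
 * ras_comm.block and lets amdgpu_ras_block_match_default() above match it
 * when amdgpu_ras_get_ras_block() walks adev->ras_list. The registration
 * helper name (amdgpu_ras_register_ras_block(), declared in amdgpu_ras.h)
 * and the hw_ops table are assumptions here, not taken from this file.
 */
#if 0
static struct amdgpu_ras_block_object example_ras_block = {
	.ras_comm = {
		.name = "example",
		.block = AMDGPU_RAS_BLOCK__MMHUB,
		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
	},
	/* .hw_ops = &example_ras_hw_ops, query/reset callbacks go here */
};

static int example_ras_sw_init(struct amdgpu_device *adev)
{
	/* adds the object to adev->ras_list so the lookups above can find it */
	return amdgpu_ras_register_ras_block(adev, &example_ras_block);
}
#endif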
1021c349dbc7Sjsg /* query/inject/cure begin */ 10225ca02815Sjsg int amdgpu_ras_query_error_status(struct amdgpu_device *adev, 1023c349dbc7Sjsg struct ras_query_if *info) 1024c349dbc7Sjsg { 10251bb76ff1Sjsg struct amdgpu_ras_block_object *block_obj = NULL; 1026c349dbc7Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1027c349dbc7Sjsg struct ras_err_data err_data = {0, 0, 0, NULL}; 1028c349dbc7Sjsg 1029c349dbc7Sjsg if (!obj) 1030c349dbc7Sjsg return -EINVAL; 1031c349dbc7Sjsg 1032b2bc41bbSjsg if (!info || info->head.block == AMDGPU_RAS_BLOCK_COUNT) 1033b2bc41bbSjsg return -EINVAL; 1034b2bc41bbSjsg 10351bb76ff1Sjsg if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { 10361bb76ff1Sjsg amdgpu_ras_get_ecc_info(adev, &err_data); 10371bb76ff1Sjsg } else { 10381bb76ff1Sjsg block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); 10391bb76ff1Sjsg if (!block_obj || !block_obj->hw_ops) { 10401bb76ff1Sjsg dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 10411bb76ff1Sjsg get_ras_block_str(&info->head)); 10421bb76ff1Sjsg return -EINVAL; 1043c349dbc7Sjsg } 10445ca02815Sjsg 10451bb76ff1Sjsg if (block_obj->hw_ops->query_ras_error_count) 10461bb76ff1Sjsg block_obj->hw_ops->query_ras_error_count(adev, &err_data); 10475ca02815Sjsg 10481bb76ff1Sjsg if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || 10491bb76ff1Sjsg (info->head.block == AMDGPU_RAS_BLOCK__GFX) || 10501bb76ff1Sjsg (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { 10511bb76ff1Sjsg if (block_obj->hw_ops->query_ras_error_status) 10521bb76ff1Sjsg block_obj->hw_ops->query_ras_error_status(adev); 10531bb76ff1Sjsg } 1054c349dbc7Sjsg } 1055c349dbc7Sjsg 1056c349dbc7Sjsg obj->err_data.ue_count += err_data.ue_count; 1057c349dbc7Sjsg obj->err_data.ce_count += err_data.ce_count; 1058c349dbc7Sjsg 1059c349dbc7Sjsg info->ue_count = obj->err_data.ue_count; 1060c349dbc7Sjsg info->ce_count = obj->err_data.ce_count; 1061c349dbc7Sjsg 1062c349dbc7Sjsg if (err_data.ce_count) { 1063f005ef32Sjsg if (!adev->aid_mask && 1064f005ef32Sjsg adev->smuio.funcs && 10655ca02815Sjsg adev->smuio.funcs->get_socket_id && 10665ca02815Sjsg adev->smuio.funcs->get_die_id) { 10675ca02815Sjsg dev_info(adev->dev, "socket: %d, die: %d " 10685ca02815Sjsg "%ld correctable hardware errors " 10695ca02815Sjsg "detected in %s block, no user " 10705ca02815Sjsg "action is needed.\n", 10715ca02815Sjsg adev->smuio.funcs->get_socket_id(adev), 10725ca02815Sjsg adev->smuio.funcs->get_die_id(adev), 10735ca02815Sjsg obj->err_data.ce_count, 10741bb76ff1Sjsg get_ras_block_str(&info->head)); 10755ca02815Sjsg } else { 1076ad8b1aafSjsg dev_info(adev->dev, "%ld correctable hardware errors " 1077ad8b1aafSjsg "detected in %s block, no user " 1078ad8b1aafSjsg "action is needed.\n", 1079ad8b1aafSjsg obj->err_data.ce_count, 10801bb76ff1Sjsg get_ras_block_str(&info->head)); 1081c349dbc7Sjsg } 10825ca02815Sjsg } 1083c349dbc7Sjsg if (err_data.ue_count) { 1084f005ef32Sjsg if (!adev->aid_mask && 1085f005ef32Sjsg adev->smuio.funcs && 10865ca02815Sjsg adev->smuio.funcs->get_socket_id && 10875ca02815Sjsg adev->smuio.funcs->get_die_id) { 10885ca02815Sjsg dev_info(adev->dev, "socket: %d, die: %d " 10895ca02815Sjsg "%ld uncorrectable hardware errors " 10905ca02815Sjsg "detected in %s block\n", 10915ca02815Sjsg adev->smuio.funcs->get_socket_id(adev), 10925ca02815Sjsg adev->smuio.funcs->get_die_id(adev), 10935ca02815Sjsg obj->err_data.ue_count, 10941bb76ff1Sjsg get_ras_block_str(&info->head)); 10955ca02815Sjsg } else { 1096ad8b1aafSjsg dev_info(adev->dev, "%ld uncorrectable hardware errors " 
1097ad8b1aafSjsg "detected in %s block\n", 1098ad8b1aafSjsg obj->err_data.ue_count, 10991bb76ff1Sjsg get_ras_block_str(&info->head)); 1100c349dbc7Sjsg } 11015ca02815Sjsg } 11025ca02815Sjsg 11035ca02815Sjsg return 0; 11045ca02815Sjsg } 11055ca02815Sjsg 11065ca02815Sjsg int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, 11075ca02815Sjsg enum amdgpu_ras_block block) 11085ca02815Sjsg { 11091bb76ff1Sjsg struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); 11101bb76ff1Sjsg 11115ca02815Sjsg if (!amdgpu_ras_is_supported(adev, block)) 11125ca02815Sjsg return -EINVAL; 11135ca02815Sjsg 11141bb76ff1Sjsg if (!block_obj || !block_obj->hw_ops) { 11151bb76ff1Sjsg dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 11161bb76ff1Sjsg ras_block_str(block)); 11171bb76ff1Sjsg return -EINVAL; 11181bb76ff1Sjsg } 11195ca02815Sjsg 11201bb76ff1Sjsg if (block_obj->hw_ops->reset_ras_error_count) 11211bb76ff1Sjsg block_obj->hw_ops->reset_ras_error_count(adev); 11225ca02815Sjsg 11231bb76ff1Sjsg if ((block == AMDGPU_RAS_BLOCK__GFX) || 11241bb76ff1Sjsg (block == AMDGPU_RAS_BLOCK__MMHUB)) { 11251bb76ff1Sjsg if (block_obj->hw_ops->reset_ras_error_status) 11261bb76ff1Sjsg block_obj->hw_ops->reset_ras_error_status(adev); 11275ca02815Sjsg } 1128c349dbc7Sjsg 1129c349dbc7Sjsg return 0; 1130c349dbc7Sjsg } 1131c349dbc7Sjsg 1132c349dbc7Sjsg /* wrapper of psp_ras_trigger_error */ 1133c349dbc7Sjsg int amdgpu_ras_error_inject(struct amdgpu_device *adev, 1134c349dbc7Sjsg struct ras_inject_if *info) 1135c349dbc7Sjsg { 1136c349dbc7Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1137c349dbc7Sjsg struct ta_ras_trigger_error_input block_info = { 1138c349dbc7Sjsg .block_id = amdgpu_ras_block_to_ta(info->head.block), 1139c349dbc7Sjsg .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), 1140c349dbc7Sjsg .sub_block_index = info->head.sub_block_index, 1141c349dbc7Sjsg .address = info->address, 1142c349dbc7Sjsg .value = info->value, 1143c349dbc7Sjsg }; 11441bb76ff1Sjsg int ret = -EINVAL; 11451bb76ff1Sjsg struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, 11461bb76ff1Sjsg info->head.block, 11471bb76ff1Sjsg info->head.sub_block_index); 1148c349dbc7Sjsg 1149f005ef32Sjsg /* inject on guest isn't allowed, return success directly */ 1150f005ef32Sjsg if (amdgpu_sriov_vf(adev)) 1151f005ef32Sjsg return 0; 1152f005ef32Sjsg 1153c349dbc7Sjsg if (!obj) 1154c349dbc7Sjsg return -EINVAL; 1155c349dbc7Sjsg 11561bb76ff1Sjsg if (!block_obj || !block_obj->hw_ops) { 11571bb76ff1Sjsg dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 11581bb76ff1Sjsg get_ras_block_str(&info->head)); 11591bb76ff1Sjsg return -EINVAL; 11601bb76ff1Sjsg } 11611bb76ff1Sjsg 1162c349dbc7Sjsg /* Calculate XGMI relative offset */ 1163f005ef32Sjsg if (adev->gmc.xgmi.num_physical_nodes > 1 && 1164f005ef32Sjsg info->head.block != AMDGPU_RAS_BLOCK__GFX) { 1165c349dbc7Sjsg block_info.address = 1166c349dbc7Sjsg amdgpu_xgmi_get_relative_phy_addr(adev, 1167c349dbc7Sjsg block_info.address); 1168c349dbc7Sjsg } 1169c349dbc7Sjsg 1170f005ef32Sjsg if (block_obj->hw_ops->ras_error_inject) { 1171f005ef32Sjsg if (info->head.block == AMDGPU_RAS_BLOCK__GFX) 1172f005ef32Sjsg ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); 1173f005ef32Sjsg else /* Special ras_error_inject is defined (e.g: xgmi) */ 1174f005ef32Sjsg ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, 1175f005ef32Sjsg info->instance_mask); 11761bb76ff1Sjsg } else { 1177f005ef32Sjsg /* default path */ 
1178f005ef32Sjsg ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); 1179c349dbc7Sjsg } 1180c349dbc7Sjsg 11815ca02815Sjsg if (ret) 11825ca02815Sjsg dev_err(adev->dev, "ras inject %s failed %d\n", 11831bb76ff1Sjsg get_ras_block_str(&info->head), ret); 1184c349dbc7Sjsg 1185c349dbc7Sjsg return ret; 1186c349dbc7Sjsg } 1187c349dbc7Sjsg 11885ca02815Sjsg /** 1189f005ef32Sjsg * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP 1190f005ef32Sjsg * @adev: pointer to AMD GPU device 1191f005ef32Sjsg * @ce_count: pointer to an integer to be set to the count of correctible errors. 1192f005ef32Sjsg * @ue_count: pointer to an integer to be set to the count of uncorrectible errors. 1193f005ef32Sjsg * @query_info: pointer to ras_query_if 1194f005ef32Sjsg * 1195f005ef32Sjsg * Return 0 for query success or do nothing, otherwise return an error 1196f005ef32Sjsg * on failures 1197f005ef32Sjsg */ 1198f005ef32Sjsg static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev, 1199f005ef32Sjsg unsigned long *ce_count, 1200f005ef32Sjsg unsigned long *ue_count, 1201f005ef32Sjsg struct ras_query_if *query_info) 1202f005ef32Sjsg { 1203f005ef32Sjsg int ret; 1204f005ef32Sjsg 1205f005ef32Sjsg if (!query_info) 1206f005ef32Sjsg /* do nothing if query_info is not specified */ 1207f005ef32Sjsg return 0; 1208f005ef32Sjsg 1209f005ef32Sjsg ret = amdgpu_ras_query_error_status(adev, query_info); 1210f005ef32Sjsg if (ret) 1211f005ef32Sjsg return ret; 1212f005ef32Sjsg 1213f005ef32Sjsg *ce_count += query_info->ce_count; 1214f005ef32Sjsg *ue_count += query_info->ue_count; 1215f005ef32Sjsg 1216f005ef32Sjsg /* some hardware/IP supports read to clear 1217f005ef32Sjsg * no need to explictly reset the err status after the query call */ 1218f005ef32Sjsg if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && 1219f005ef32Sjsg adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { 1220f005ef32Sjsg if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) 1221f005ef32Sjsg dev_warn(adev->dev, 1222f005ef32Sjsg "Failed to reset error counter and error status\n"); 1223f005ef32Sjsg } 1224f005ef32Sjsg 1225f005ef32Sjsg return 0; 1226f005ef32Sjsg } 1227f005ef32Sjsg 1228f005ef32Sjsg /** 1229f005ef32Sjsg * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP 12301bb76ff1Sjsg * @adev: pointer to AMD GPU device 12311bb76ff1Sjsg * @ce_count: pointer to an integer to be set to the count of correctible errors. 12321bb76ff1Sjsg * @ue_count: pointer to an integer to be set to the count of uncorrectible 12335ca02815Sjsg * errors. 1234f005ef32Sjsg * @query_info: pointer to ras_query_if if the query request is only for 1235f005ef32Sjsg * specific ip block; if info is NULL, then the qurey request is for 1236f005ef32Sjsg * all the ip blocks that support query ras error counters/status 12375ca02815Sjsg * 12385ca02815Sjsg * If set, @ce_count or @ue_count, count and return the corresponding 12395ca02815Sjsg * error counts in those integer pointers. Return 0 if the device 12405ca02815Sjsg * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS. 
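 *
 * A minimal caller sketch (illustrative, not taken from this file): passing
 * a NULL @query_info totals the counts across every registered block.
 *
 * .. code-block:: c
 *
 *	unsigned long ce_count = 0, ue_count = 0;
 *
 *	if (!amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL))
 *		dev_info(adev->dev, "ce %lu ue %lu\n", ce_count, ue_count);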
12415ca02815Sjsg */ 12425ca02815Sjsg int amdgpu_ras_query_error_count(struct amdgpu_device *adev, 12435ca02815Sjsg unsigned long *ce_count, 1244f005ef32Sjsg unsigned long *ue_count, 1245f005ef32Sjsg struct ras_query_if *query_info) 1246c349dbc7Sjsg { 1247c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1248c349dbc7Sjsg struct ras_manager *obj; 12495ca02815Sjsg unsigned long ce, ue; 1250f005ef32Sjsg int ret; 1251c349dbc7Sjsg 12525ca02815Sjsg if (!adev->ras_enabled || !con) 12535ca02815Sjsg return -EOPNOTSUPP; 12545ca02815Sjsg 12555ca02815Sjsg /* Don't count since no reporting. 12565ca02815Sjsg */ 12575ca02815Sjsg if (!ce_count && !ue_count) 1258c349dbc7Sjsg return 0; 1259c349dbc7Sjsg 12605ca02815Sjsg ce = 0; 12615ca02815Sjsg ue = 0; 1262f005ef32Sjsg if (!query_info) { 1263f005ef32Sjsg /* query all the ip blocks that support ras query interface */ 1264c349dbc7Sjsg list_for_each_entry(obj, &con->head, node) { 1265c349dbc7Sjsg struct ras_query_if info = { 1266c349dbc7Sjsg .head = obj->head, 1267c349dbc7Sjsg }; 1268c349dbc7Sjsg 1269f005ef32Sjsg ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info); 1270f005ef32Sjsg } 1271f005ef32Sjsg } else { 1272f005ef32Sjsg /* query specific ip block */ 1273f005ef32Sjsg ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info); 12741bb76ff1Sjsg } 12751bb76ff1Sjsg 1276f005ef32Sjsg if (ret) 1277f005ef32Sjsg return ret; 1278c349dbc7Sjsg 12795ca02815Sjsg if (ce_count) 12805ca02815Sjsg *ce_count = ce; 12815ca02815Sjsg 12825ca02815Sjsg if (ue_count) 12835ca02815Sjsg *ue_count = ue; 12845ca02815Sjsg 12855ca02815Sjsg return 0; 1286c349dbc7Sjsg } 1287c349dbc7Sjsg /* query/inject/cure end */ 1288c349dbc7Sjsg 1289c349dbc7Sjsg #ifdef __linux__ 1290c349dbc7Sjsg 1291c349dbc7Sjsg /* sysfs begin */ 1292c349dbc7Sjsg 1293c349dbc7Sjsg static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 1294c349dbc7Sjsg struct ras_badpage **bps, unsigned int *count); 1295c349dbc7Sjsg 1296c349dbc7Sjsg static char *amdgpu_ras_badpage_flags_str(unsigned int flags) 1297c349dbc7Sjsg { 1298c349dbc7Sjsg switch (flags) { 1299c349dbc7Sjsg case AMDGPU_RAS_RETIRE_PAGE_RESERVED: 1300c349dbc7Sjsg return "R"; 1301c349dbc7Sjsg case AMDGPU_RAS_RETIRE_PAGE_PENDING: 1302c349dbc7Sjsg return "P"; 1303c349dbc7Sjsg case AMDGPU_RAS_RETIRE_PAGE_FAULT: 1304c349dbc7Sjsg default: 1305c349dbc7Sjsg return "F"; 13065ca02815Sjsg } 1307c349dbc7Sjsg } 1308c349dbc7Sjsg 1309c349dbc7Sjsg /** 1310c349dbc7Sjsg * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface 1311c349dbc7Sjsg * 1312c349dbc7Sjsg * It allows user to read the bad pages of vram on the gpu through 1313c349dbc7Sjsg * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages 1314c349dbc7Sjsg * 1315c349dbc7Sjsg * It outputs multiple lines, and each line stands for one gpu page. 1316c349dbc7Sjsg * 1317c349dbc7Sjsg * The format of one line is below, 1318c349dbc7Sjsg * gpu pfn : gpu page size : flags 1319c349dbc7Sjsg * 1320c349dbc7Sjsg * gpu pfn and gpu page size are printed in hex format. 1321c349dbc7Sjsg * flags can be one of below character, 1322c349dbc7Sjsg * 1323c349dbc7Sjsg * R: reserved, this gpu page is reserved and not able to use. 1324c349dbc7Sjsg * 1325c349dbc7Sjsg * P: pending for reserve, this gpu page is marked as bad, will be reserved 1326c349dbc7Sjsg * in next window of page_reserve. 1327c349dbc7Sjsg * 1328c349dbc7Sjsg * F: unable to reserve. this gpu page can't be reserved due to some reasons. 1329c349dbc7Sjsg * 1330c349dbc7Sjsg * Examples: 1331c349dbc7Sjsg * 1332c349dbc7Sjsg * .. 
code-block:: bash 1333c349dbc7Sjsg * 1334c349dbc7Sjsg * 0x00000001 : 0x00001000 : R 1335c349dbc7Sjsg * 0x00000002 : 0x00001000 : P 1336c349dbc7Sjsg * 1337c349dbc7Sjsg */ 1338c349dbc7Sjsg 1339c349dbc7Sjsg static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, 1340c349dbc7Sjsg struct kobject *kobj, struct bin_attribute *attr, 1341c349dbc7Sjsg char *buf, loff_t ppos, size_t count) 1342c349dbc7Sjsg { 1343c349dbc7Sjsg struct amdgpu_ras *con = 1344c349dbc7Sjsg container_of(attr, struct amdgpu_ras, badpages_attr); 1345c349dbc7Sjsg struct amdgpu_device *adev = con->adev; 1346c349dbc7Sjsg const unsigned int element_size = 1347c349dbc7Sjsg sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; 1348c349dbc7Sjsg unsigned int start = div64_ul(ppos + element_size - 1, element_size); 1349c349dbc7Sjsg unsigned int end = div64_ul(ppos + count - 1, element_size); 1350c349dbc7Sjsg ssize_t s = 0; 1351c349dbc7Sjsg struct ras_badpage *bps = NULL; 1352c349dbc7Sjsg unsigned int bps_count = 0; 1353c349dbc7Sjsg 1354c349dbc7Sjsg memset(buf, 0, count); 1355c349dbc7Sjsg 1356c349dbc7Sjsg if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) 1357c349dbc7Sjsg return 0; 1358c349dbc7Sjsg 1359c349dbc7Sjsg for (; start < end && start < bps_count; start++) 1360c349dbc7Sjsg s += scnprintf(&buf[s], element_size + 1, 1361c349dbc7Sjsg "0x%08x : 0x%08x : %1s\n", 1362c349dbc7Sjsg bps[start].bp, 1363c349dbc7Sjsg bps[start].size, 1364c349dbc7Sjsg amdgpu_ras_badpage_flags_str(bps[start].flags)); 1365c349dbc7Sjsg 1366c349dbc7Sjsg kfree(bps); 1367c349dbc7Sjsg 1368c349dbc7Sjsg return s; 1369c349dbc7Sjsg } 1370c349dbc7Sjsg 1371c349dbc7Sjsg static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 1372c349dbc7Sjsg struct device_attribute *attr, char *buf) 1373c349dbc7Sjsg { 1374c349dbc7Sjsg struct amdgpu_ras *con = 1375c349dbc7Sjsg container_of(attr, struct amdgpu_ras, features_attr); 1376c349dbc7Sjsg 1377f005ef32Sjsg return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); 1378c349dbc7Sjsg } 1379c349dbc7Sjsg 1380ad8b1aafSjsg static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) 1381c349dbc7Sjsg { 1382c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1383c349dbc7Sjsg 138475aab5aaSjsg if (adev->dev->kobj.sd) 1385ad8b1aafSjsg sysfs_remove_file_from_group(&adev->dev->kobj, 1386ad8b1aafSjsg &con->badpages_attr.attr, 1387ad8b1aafSjsg RAS_FS_NAME); 1388c349dbc7Sjsg } 1389c349dbc7Sjsg 1390c349dbc7Sjsg static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) 1391c349dbc7Sjsg { 1392c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1393c349dbc7Sjsg struct attribute *attrs[] = { 1394c349dbc7Sjsg &con->features_attr.attr, 1395c349dbc7Sjsg NULL 1396c349dbc7Sjsg }; 1397c349dbc7Sjsg struct attribute_group group = { 1398ad8b1aafSjsg .name = RAS_FS_NAME, 1399c349dbc7Sjsg .attrs = attrs, 1400c349dbc7Sjsg }; 1401c349dbc7Sjsg 140275aab5aaSjsg if (adev->dev->kobj.sd) 1403c349dbc7Sjsg sysfs_remove_group(&adev->dev->kobj, &group); 1404c349dbc7Sjsg 1405c349dbc7Sjsg return 0; 1406c349dbc7Sjsg } 1407c349dbc7Sjsg 1408c349dbc7Sjsg #endif /* __linux__ */ 1409c349dbc7Sjsg 1410c349dbc7Sjsg int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 14111bb76ff1Sjsg struct ras_common_if *head) 1412c349dbc7Sjsg { 14131bb76ff1Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 1414c349dbc7Sjsg 1415c349dbc7Sjsg if (!obj || obj->attr_inuse) 1416c349dbc7Sjsg return -EINVAL; 1417c349dbc7Sjsg 1418c349dbc7Sjsg STUB(); 1419c349dbc7Sjsg return -ENOSYS; 1420c349dbc7Sjsg #ifdef notyet 
1421c349dbc7Sjsg get_obj(obj); 1422c349dbc7Sjsg 14231bb76ff1Sjsg snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), 14241bb76ff1Sjsg "%s_err_count", head->name); 1425c349dbc7Sjsg 1426c349dbc7Sjsg obj->sysfs_attr = (struct device_attribute){ 1427c349dbc7Sjsg .attr = { 1428c349dbc7Sjsg .name = obj->fs_data.sysfs_name, 1429c349dbc7Sjsg .mode = S_IRUGO, 1430c349dbc7Sjsg }, 1431c349dbc7Sjsg .show = amdgpu_ras_sysfs_read, 1432c349dbc7Sjsg }; 1433c349dbc7Sjsg sysfs_attr_init(&obj->sysfs_attr.attr); 1434c349dbc7Sjsg 1435c349dbc7Sjsg if (sysfs_add_file_to_group(&adev->dev->kobj, 1436c349dbc7Sjsg &obj->sysfs_attr.attr, 1437ad8b1aafSjsg RAS_FS_NAME)) { 1438c349dbc7Sjsg put_obj(obj); 1439c349dbc7Sjsg return -EINVAL; 1440c349dbc7Sjsg } 1441c349dbc7Sjsg 1442c349dbc7Sjsg obj->attr_inuse = 1; 1443c349dbc7Sjsg 1444c349dbc7Sjsg return 0; 1445c349dbc7Sjsg #endif 1446c349dbc7Sjsg } 1447c349dbc7Sjsg 1448c349dbc7Sjsg int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 1449c349dbc7Sjsg struct ras_common_if *head) 1450c349dbc7Sjsg { 1451c349dbc7Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 1452c349dbc7Sjsg 1453c349dbc7Sjsg if (!obj || !obj->attr_inuse) 1454c349dbc7Sjsg return -EINVAL; 1455c349dbc7Sjsg 145675aab5aaSjsg #ifdef __linux__ 145775aab5aaSjsg if (adev->dev->kobj.sd) 1458c349dbc7Sjsg sysfs_remove_file_from_group(&adev->dev->kobj, 1459c349dbc7Sjsg &obj->sysfs_attr.attr, 1460ad8b1aafSjsg RAS_FS_NAME); 146175aab5aaSjsg #endif 1462c349dbc7Sjsg obj->attr_inuse = 0; 1463c349dbc7Sjsg put_obj(obj); 1464c349dbc7Sjsg 1465c349dbc7Sjsg return 0; 1466c349dbc7Sjsg } 1467c349dbc7Sjsg 1468c349dbc7Sjsg #ifdef __linux__ 1469c349dbc7Sjsg 1470c349dbc7Sjsg static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) 1471c349dbc7Sjsg { 1472c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1473c349dbc7Sjsg struct ras_manager *obj, *tmp; 1474c349dbc7Sjsg 1475c349dbc7Sjsg list_for_each_entry_safe(obj, tmp, &con->head, node) { 1476c349dbc7Sjsg amdgpu_ras_sysfs_remove(adev, &obj->head); 1477c349dbc7Sjsg } 1478c349dbc7Sjsg 1479ad8b1aafSjsg if (amdgpu_bad_page_threshold != 0) 1480ad8b1aafSjsg amdgpu_ras_sysfs_remove_bad_page_node(adev); 1481ad8b1aafSjsg 1482c349dbc7Sjsg amdgpu_ras_sysfs_remove_feature_node(adev); 1483c349dbc7Sjsg 1484c349dbc7Sjsg return 0; 1485c349dbc7Sjsg } 1486c349dbc7Sjsg /* sysfs end */ 1487c349dbc7Sjsg 1488c349dbc7Sjsg /** 1489c349dbc7Sjsg * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors 1490c349dbc7Sjsg * 1491c349dbc7Sjsg * Normally when there is an uncorrectable error, the driver will reset 1492c349dbc7Sjsg * the GPU to recover. However, in the event of an unrecoverable error, 1493c349dbc7Sjsg * the driver provides an interface to reboot the system automatically 1494c349dbc7Sjsg * in that event. 1495c349dbc7Sjsg * 1496c349dbc7Sjsg * The following file in debugfs provides that interface: 1497c349dbc7Sjsg * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot 1498c349dbc7Sjsg * 1499c349dbc7Sjsg * Usage: 1500c349dbc7Sjsg * 1501c349dbc7Sjsg * .. 
code-block:: bash 1502c349dbc7Sjsg * 1503c349dbc7Sjsg * echo true > .../ras/auto_reboot 1504c349dbc7Sjsg * 1505c349dbc7Sjsg */ 1506c349dbc7Sjsg /* debugfs begin */ 15075ca02815Sjsg static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) 1508c349dbc7Sjsg { 1509c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1510f005ef32Sjsg struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; 1511ad8b1aafSjsg struct drm_minor *minor = adev_to_drm(adev)->primary; 15125ca02815Sjsg struct dentry *dir; 1513c349dbc7Sjsg 15145ca02815Sjsg dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); 15155ca02815Sjsg debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, 15165ca02815Sjsg &amdgpu_ras_debugfs_ctrl_ops); 15175ca02815Sjsg debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, 15185ca02815Sjsg &amdgpu_ras_debugfs_eeprom_ops); 15195ca02815Sjsg debugfs_create_u32("bad_page_cnt_threshold", 0444, dir, 15205ca02815Sjsg &con->bad_page_cnt_threshold); 1521f005ef32Sjsg debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); 15225ca02815Sjsg debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); 15235ca02815Sjsg debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); 15245ca02815Sjsg debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, 15255ca02815Sjsg &amdgpu_ras_debugfs_eeprom_size_ops); 15265ca02815Sjsg con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", 15275ca02815Sjsg S_IRUGO, dir, adev, 15285ca02815Sjsg &amdgpu_ras_debugfs_eeprom_table_ops); 15295ca02815Sjsg amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); 1530c349dbc7Sjsg 1531c349dbc7Sjsg /* 1532c349dbc7Sjsg * After one uncorrectable error happens, usually GPU recovery will 1533c349dbc7Sjsg * be scheduled. But due to the known problem in GPU recovery failing 1534c349dbc7Sjsg * to bring GPU back, below interface provides one direct way to 1535c349dbc7Sjsg * user to reboot system automatically in such case within 1536c349dbc7Sjsg * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine 1537c349dbc7Sjsg * will never be called. 1538c349dbc7Sjsg */ 15395ca02815Sjsg debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); 1540ad8b1aafSjsg 1541ad8b1aafSjsg /* 1542ad8b1aafSjsg * User could set this not to clean up hardware's error count register 1543ad8b1aafSjsg * of RAS IPs during ras recovery. 
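	 *
	 * Example (path assumed from the debugfs layout created above, same
	 * convention as the auto_reboot node):
	 *
	 *   echo 1 > /sys/kernel/debug/dri/0/ras/disable_ras_err_cnt_harvest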
1544ad8b1aafSjsg */ 15455ca02815Sjsg debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir, 15465ca02815Sjsg &con->disable_ras_err_cnt_harvest); 15475ca02815Sjsg return dir; 1548c349dbc7Sjsg } 1549c349dbc7Sjsg 1550ad8b1aafSjsg static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 15515ca02815Sjsg struct ras_fs_if *head, 15525ca02815Sjsg struct dentry *dir) 1553c349dbc7Sjsg { 1554c349dbc7Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 1555c349dbc7Sjsg 15565ca02815Sjsg if (!obj || !dir) 1557c349dbc7Sjsg return; 1558c349dbc7Sjsg 1559c349dbc7Sjsg get_obj(obj); 1560c349dbc7Sjsg 1561c349dbc7Sjsg memcpy(obj->fs_data.debugfs_name, 1562c349dbc7Sjsg head->debugfs_name, 1563c349dbc7Sjsg sizeof(obj->fs_data.debugfs_name)); 1564c349dbc7Sjsg 15655ca02815Sjsg debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, 15665ca02815Sjsg obj, &amdgpu_ras_debugfs_ops); 1567c349dbc7Sjsg } 1568c349dbc7Sjsg 1569c349dbc7Sjsg void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) 1570c349dbc7Sjsg { 1571c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 15725ca02815Sjsg struct dentry *dir; 1573c349dbc7Sjsg struct ras_manager *obj; 1574c349dbc7Sjsg struct ras_fs_if fs_info; 1575c349dbc7Sjsg 1576c349dbc7Sjsg /* 1577c349dbc7Sjsg * it won't be called in resume path, no need to check 1578c349dbc7Sjsg * suspend and gpu reset status 1579c349dbc7Sjsg */ 1580ad8b1aafSjsg if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) 1581c349dbc7Sjsg return; 1582c349dbc7Sjsg 15835ca02815Sjsg dir = amdgpu_ras_debugfs_create_ctrl_node(adev); 1584c349dbc7Sjsg 1585c349dbc7Sjsg list_for_each_entry(obj, &con->head, node) { 1586c349dbc7Sjsg if (amdgpu_ras_is_supported(adev, obj->head.block) && 1587c349dbc7Sjsg (obj->attr_inuse == 1)) { 1588f005ef32Sjsg snprintf(fs_info.debugfs_name, sizeof(fs_info.debugfs_name), "%s_err_inject", 15891bb76ff1Sjsg get_ras_block_str(&obj->head)); 1590c349dbc7Sjsg fs_info.head = obj->head; 15915ca02815Sjsg amdgpu_ras_debugfs_create(adev, &fs_info, dir); 1592c349dbc7Sjsg } 1593c349dbc7Sjsg } 1594c349dbc7Sjsg } 1595c349dbc7Sjsg 1596c349dbc7Sjsg /* debugfs end */ 1597c349dbc7Sjsg 1598c349dbc7Sjsg /* ras fs */ 1599ad8b1aafSjsg static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, 1600ad8b1aafSjsg amdgpu_ras_sysfs_badpages_read, NULL, 0); 1601ad8b1aafSjsg #endif /* __linux__ */ 1602ad8b1aafSjsg static DEVICE_ATTR(features, S_IRUGO, 1603ad8b1aafSjsg amdgpu_ras_sysfs_features_read, NULL); 1604c349dbc7Sjsg static int amdgpu_ras_fs_init(struct amdgpu_device *adev) 1605c349dbc7Sjsg { 1606c349dbc7Sjsg #ifdef __linux__ 1607ad8b1aafSjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1608ad8b1aafSjsg struct attribute_group group = { 1609ad8b1aafSjsg .name = RAS_FS_NAME, 1610ad8b1aafSjsg }; 1611ad8b1aafSjsg struct attribute *attrs[] = { 1612ad8b1aafSjsg &con->features_attr.attr, 1613ad8b1aafSjsg NULL 1614ad8b1aafSjsg }; 1615ad8b1aafSjsg struct bin_attribute *bin_attrs[] = { 1616ad8b1aafSjsg NULL, 1617ad8b1aafSjsg NULL, 1618ad8b1aafSjsg }; 1619ad8b1aafSjsg int r; 1620ad8b1aafSjsg 1621ad8b1aafSjsg /* add features entry */ 1622ad8b1aafSjsg con->features_attr = dev_attr_features; 1623ad8b1aafSjsg group.attrs = attrs; 1624ad8b1aafSjsg sysfs_attr_init(attrs[0]); 1625ad8b1aafSjsg 1626ad8b1aafSjsg if (amdgpu_bad_page_threshold != 0) { 1627ad8b1aafSjsg /* add bad_page_features entry */ 1628ad8b1aafSjsg bin_attr_gpu_vram_bad_pages.private = NULL; 1629ad8b1aafSjsg con->badpages_attr = bin_attr_gpu_vram_bad_pages; 1630ad8b1aafSjsg bin_attrs[0] = &con->badpages_attr; 
1631ad8b1aafSjsg group.bin_attrs = bin_attrs; 1632ad8b1aafSjsg sysfs_bin_attr_init(bin_attrs[0]); 1633ad8b1aafSjsg } 1634ad8b1aafSjsg 1635ad8b1aafSjsg r = sysfs_create_group(&adev->dev->kobj, &group); 1636ad8b1aafSjsg if (r) 1637ad8b1aafSjsg dev_err(adev->dev, "Failed to create RAS sysfs group!"); 1638c349dbc7Sjsg #endif 1639c349dbc7Sjsg 1640c349dbc7Sjsg return 0; 1641c349dbc7Sjsg } 1642c349dbc7Sjsg 1643c349dbc7Sjsg static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) 1644c349dbc7Sjsg { 1645c349dbc7Sjsg #ifdef __linux__ 16465ca02815Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 16475ca02815Sjsg struct ras_manager *con_obj, *ip_obj, *tmp; 16485ca02815Sjsg 16495ca02815Sjsg if (IS_ENABLED(CONFIG_DEBUG_FS)) { 16505ca02815Sjsg list_for_each_entry_safe(con_obj, tmp, &con->head, node) { 16515ca02815Sjsg ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); 16525ca02815Sjsg if (ip_obj) 16535ca02815Sjsg put_obj(ip_obj); 16545ca02815Sjsg } 16555ca02815Sjsg } 16565ca02815Sjsg 1657c349dbc7Sjsg amdgpu_ras_sysfs_remove_all(adev); 1658c349dbc7Sjsg #endif 1659c349dbc7Sjsg return 0; 1660c349dbc7Sjsg } 1661c349dbc7Sjsg /* ras fs end */ 1662c349dbc7Sjsg 1663c349dbc7Sjsg /* ih begin */ 16641bb76ff1Sjsg 16651bb76ff1Sjsg /* For the hardware that cannot enable bif ring for both ras_controller_irq 16661bb76ff1Sjsg * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status 16671bb76ff1Sjsg * register to check whether the interrupt is triggered or not, and properly 16681bb76ff1Sjsg * ack the interrupt if it is there 16691bb76ff1Sjsg */ 16701bb76ff1Sjsg void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) 16711bb76ff1Sjsg { 16721bb76ff1Sjsg /* Fatal error events are handled on host side */ 1673f005ef32Sjsg if (amdgpu_sriov_vf(adev)) 16741bb76ff1Sjsg return; 16751bb76ff1Sjsg 16761bb76ff1Sjsg if (adev->nbio.ras && 16771bb76ff1Sjsg adev->nbio.ras->handle_ras_controller_intr_no_bifring) 16781bb76ff1Sjsg adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); 16791bb76ff1Sjsg 16801bb76ff1Sjsg if (adev->nbio.ras && 16811bb76ff1Sjsg adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) 16821bb76ff1Sjsg adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); 16831bb76ff1Sjsg } 16841bb76ff1Sjsg 16851bb76ff1Sjsg static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, 16861bb76ff1Sjsg struct amdgpu_iv_entry *entry) 16871bb76ff1Sjsg { 16881bb76ff1Sjsg bool poison_stat = false; 16891bb76ff1Sjsg struct amdgpu_device *adev = obj->adev; 16901bb76ff1Sjsg struct amdgpu_ras_block_object *block_obj = 16911bb76ff1Sjsg amdgpu_ras_get_ras_block(adev, obj->head.block, 0); 16921bb76ff1Sjsg 1693f005ef32Sjsg if (!block_obj) 16941bb76ff1Sjsg return; 16951bb76ff1Sjsg 16961bb76ff1Sjsg /* both query_poison_status and handle_poison_consumption are optional, 16971bb76ff1Sjsg * but at least one of them should be implemented if we need poison 16981bb76ff1Sjsg * consumption handler 16991bb76ff1Sjsg */ 1700f005ef32Sjsg if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { 17011bb76ff1Sjsg poison_stat = block_obj->hw_ops->query_poison_status(adev); 17021bb76ff1Sjsg if (!poison_stat) { 17031bb76ff1Sjsg /* Not poison consumption interrupt, no need to handle it */ 17041bb76ff1Sjsg dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", 17051bb76ff1Sjsg block_obj->ras_comm.name); 17061bb76ff1Sjsg 17071bb76ff1Sjsg return; 17081bb76ff1Sjsg } 17091bb76ff1Sjsg } 17101bb76ff1Sjsg 1711f005ef32Sjsg amdgpu_umc_poison_handler(adev, false); 
17121bb76ff1Sjsg 1713f005ef32Sjsg if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) 17141bb76ff1Sjsg poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); 17151bb76ff1Sjsg 17161bb76ff1Sjsg /* gpu reset is fallback for failed and default cases */ 17171bb76ff1Sjsg if (poison_stat) { 17181bb76ff1Sjsg dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", 17191bb76ff1Sjsg block_obj->ras_comm.name); 17201bb76ff1Sjsg amdgpu_ras_reset_gpu(adev); 1721f005ef32Sjsg } else { 1722f005ef32Sjsg amdgpu_gfx_poison_consumption_handler(adev, entry); 17231bb76ff1Sjsg } 17241bb76ff1Sjsg } 17251bb76ff1Sjsg 17261bb76ff1Sjsg static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, 17271bb76ff1Sjsg struct amdgpu_iv_entry *entry) 17281bb76ff1Sjsg { 17291bb76ff1Sjsg dev_info(obj->adev->dev, 17301bb76ff1Sjsg "Poison is created, no user action is needed.\n"); 17311bb76ff1Sjsg } 17321bb76ff1Sjsg 17331bb76ff1Sjsg static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, 17341bb76ff1Sjsg struct amdgpu_iv_entry *entry) 1735c349dbc7Sjsg { 1736c349dbc7Sjsg struct ras_ih_data *data = &obj->ih_data; 1737c349dbc7Sjsg struct ras_err_data err_data = {0, 0, 0, NULL}; 17381bb76ff1Sjsg int ret; 1739c349dbc7Sjsg 17401bb76ff1Sjsg if (!data->cb) 17411bb76ff1Sjsg return; 1742c349dbc7Sjsg 1743c349dbc7Sjsg /* Let IP handle its data, maybe we need get the output 17441bb76ff1Sjsg * from the callback to update the error type/count, etc 1745c349dbc7Sjsg */ 17461bb76ff1Sjsg ret = data->cb(obj->adev, &err_data, entry); 1747c349dbc7Sjsg /* ue will trigger an interrupt, and in that case 1748c349dbc7Sjsg * we need do a reset to recovery the whole system. 1749c349dbc7Sjsg * But leave IP do that recovery, here we just dispatch 1750c349dbc7Sjsg * the error. 
1751c349dbc7Sjsg */ 1752c349dbc7Sjsg if (ret == AMDGPU_RAS_SUCCESS) { 1753c349dbc7Sjsg /* these counts could be left as 0 if 1754c349dbc7Sjsg * some blocks do not count error number 1755c349dbc7Sjsg */ 1756c349dbc7Sjsg obj->err_data.ue_count += err_data.ue_count; 1757c349dbc7Sjsg obj->err_data.ce_count += err_data.ce_count; 1758c349dbc7Sjsg } 1759c349dbc7Sjsg } 17601bb76ff1Sjsg 17611bb76ff1Sjsg static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) 17621bb76ff1Sjsg { 17631bb76ff1Sjsg struct ras_ih_data *data = &obj->ih_data; 17641bb76ff1Sjsg struct amdgpu_iv_entry entry; 17651bb76ff1Sjsg 17661bb76ff1Sjsg while (data->rptr != data->wptr) { 17671bb76ff1Sjsg rmb(); 17681bb76ff1Sjsg memcpy(&entry, &data->ring[data->rptr], 17691bb76ff1Sjsg data->element_size); 17701bb76ff1Sjsg 17711bb76ff1Sjsg wmb(); 17721bb76ff1Sjsg data->rptr = (data->aligned_element_size + 17731bb76ff1Sjsg data->rptr) % data->ring_size; 17741bb76ff1Sjsg 17751bb76ff1Sjsg if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { 17761bb76ff1Sjsg if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 17771bb76ff1Sjsg amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); 17781bb76ff1Sjsg else 17791bb76ff1Sjsg amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); 17801bb76ff1Sjsg } else { 17811bb76ff1Sjsg if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 17821bb76ff1Sjsg amdgpu_ras_interrupt_umc_handler(obj, &entry); 17831bb76ff1Sjsg else 17841bb76ff1Sjsg dev_warn(obj->adev->dev, 17851bb76ff1Sjsg "No RAS interrupt handler for non-UMC block with poison disabled.\n"); 17861bb76ff1Sjsg } 1787c349dbc7Sjsg } 1788c349dbc7Sjsg } 1789c349dbc7Sjsg 1790c349dbc7Sjsg static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) 1791c349dbc7Sjsg { 1792c349dbc7Sjsg struct ras_ih_data *data = 1793c349dbc7Sjsg container_of(work, struct ras_ih_data, ih_work); 1794c349dbc7Sjsg struct ras_manager *obj = 1795c349dbc7Sjsg container_of(data, struct ras_manager, ih_data); 1796c349dbc7Sjsg 1797c349dbc7Sjsg amdgpu_ras_interrupt_handler(obj); 1798c349dbc7Sjsg } 1799c349dbc7Sjsg 1800c349dbc7Sjsg int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 1801c349dbc7Sjsg struct ras_dispatch_if *info) 1802c349dbc7Sjsg { 1803*8602cf8bSjsg struct ras_manager *obj; 1804*8602cf8bSjsg struct ras_ih_data *data; 1805c349dbc7Sjsg 1806*8602cf8bSjsg obj = amdgpu_ras_find_obj(adev, &info->head); 1807c349dbc7Sjsg if (!obj) 1808c349dbc7Sjsg return -EINVAL; 1809c349dbc7Sjsg 1810*8602cf8bSjsg data = &obj->ih_data; 1811*8602cf8bSjsg 1812c349dbc7Sjsg if (data->inuse == 0) 1813c349dbc7Sjsg return 0; 1814c349dbc7Sjsg 1815c349dbc7Sjsg /* Might be overflow... 
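	 * the ring is a fixed 64-slot buffer (sized in
	 * amdgpu_ras_interrupt_add_handler()) and wptr below advances without
	 * checking rptr, so a burst of IVs that outpaces the worker can
	 * overwrite entries that have not been consumed yet.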
*/ 1816c349dbc7Sjsg memcpy(&data->ring[data->wptr], info->entry, 1817c349dbc7Sjsg data->element_size); 1818c349dbc7Sjsg 1819c349dbc7Sjsg wmb(); 1820c349dbc7Sjsg data->wptr = (data->aligned_element_size + 1821c349dbc7Sjsg data->wptr) % data->ring_size; 1822c349dbc7Sjsg 1823c349dbc7Sjsg schedule_work(&data->ih_work); 1824c349dbc7Sjsg 1825c349dbc7Sjsg return 0; 1826c349dbc7Sjsg } 1827c349dbc7Sjsg 1828c349dbc7Sjsg int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, 18291bb76ff1Sjsg struct ras_common_if *head) 1830c349dbc7Sjsg { 18311bb76ff1Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 1832c349dbc7Sjsg struct ras_ih_data *data; 1833c349dbc7Sjsg 1834c349dbc7Sjsg if (!obj) 1835c349dbc7Sjsg return -EINVAL; 1836c349dbc7Sjsg 1837c349dbc7Sjsg data = &obj->ih_data; 1838c349dbc7Sjsg if (data->inuse == 0) 1839c349dbc7Sjsg return 0; 1840c349dbc7Sjsg 1841c349dbc7Sjsg cancel_work_sync(&data->ih_work); 1842c349dbc7Sjsg 1843c349dbc7Sjsg kfree(data->ring); 1844c349dbc7Sjsg memset(data, 0, sizeof(*data)); 1845c349dbc7Sjsg put_obj(obj); 1846c349dbc7Sjsg 1847c349dbc7Sjsg return 0; 1848c349dbc7Sjsg } 1849c349dbc7Sjsg 1850c349dbc7Sjsg int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, 18511bb76ff1Sjsg struct ras_common_if *head) 1852c349dbc7Sjsg { 18531bb76ff1Sjsg struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 1854c349dbc7Sjsg struct ras_ih_data *data; 18551bb76ff1Sjsg struct amdgpu_ras_block_object *ras_obj; 1856c349dbc7Sjsg 1857c349dbc7Sjsg if (!obj) { 1858c349dbc7Sjsg /* in case we registe the IH before enable ras feature */ 18591bb76ff1Sjsg obj = amdgpu_ras_create_obj(adev, head); 1860c349dbc7Sjsg if (!obj) 1861c349dbc7Sjsg return -EINVAL; 1862c349dbc7Sjsg } else 1863c349dbc7Sjsg get_obj(obj); 1864c349dbc7Sjsg 18651bb76ff1Sjsg ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm); 18661bb76ff1Sjsg 1867c349dbc7Sjsg data = &obj->ih_data; 1868c349dbc7Sjsg /* add the callback.etc */ 1869c349dbc7Sjsg *data = (struct ras_ih_data) { 1870c349dbc7Sjsg .inuse = 0, 18711bb76ff1Sjsg .cb = ras_obj->ras_cb, 1872c349dbc7Sjsg .element_size = sizeof(struct amdgpu_iv_entry), 1873c349dbc7Sjsg .rptr = 0, 1874c349dbc7Sjsg .wptr = 0, 1875c349dbc7Sjsg }; 1876c349dbc7Sjsg 1877c349dbc7Sjsg INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); 1878c349dbc7Sjsg 1879f005ef32Sjsg data->aligned_element_size = ALIGN(data->element_size, 8); 1880c349dbc7Sjsg /* the ring can store 64 iv entries. 
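	 * Each slot is one amdgpu_iv_entry rounded up to 8 bytes; the depth is
	 * fixed, see the overflow note in amdgpu_ras_interrupt_dispatch().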
*/ 1881c349dbc7Sjsg data->ring_size = 64 * data->aligned_element_size; 1882c349dbc7Sjsg data->ring = kmalloc(data->ring_size, GFP_KERNEL); 1883c349dbc7Sjsg if (!data->ring) { 1884c349dbc7Sjsg put_obj(obj); 1885c349dbc7Sjsg return -ENOMEM; 1886c349dbc7Sjsg } 1887c349dbc7Sjsg 1888c349dbc7Sjsg /* IH is ready */ 1889c349dbc7Sjsg data->inuse = 1; 1890c349dbc7Sjsg 1891c349dbc7Sjsg return 0; 1892c349dbc7Sjsg } 1893c349dbc7Sjsg 1894c349dbc7Sjsg static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) 1895c349dbc7Sjsg { 1896c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1897c349dbc7Sjsg struct ras_manager *obj, *tmp; 1898c349dbc7Sjsg 1899c349dbc7Sjsg list_for_each_entry_safe(obj, tmp, &con->head, node) { 19001bb76ff1Sjsg amdgpu_ras_interrupt_remove_handler(adev, &obj->head); 1901c349dbc7Sjsg } 1902c349dbc7Sjsg 1903c349dbc7Sjsg return 0; 1904c349dbc7Sjsg } 1905c349dbc7Sjsg /* ih end */ 1906c349dbc7Sjsg 1907c349dbc7Sjsg /* traversal all IPs except NBIO to query error counter */ 1908c349dbc7Sjsg static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) 1909c349dbc7Sjsg { 1910c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1911c349dbc7Sjsg struct ras_manager *obj; 1912c349dbc7Sjsg 19135ca02815Sjsg if (!adev->ras_enabled || !con) 1914c349dbc7Sjsg return; 1915c349dbc7Sjsg 1916c349dbc7Sjsg list_for_each_entry(obj, &con->head, node) { 1917c349dbc7Sjsg struct ras_query_if info = { 1918c349dbc7Sjsg .head = obj->head, 1919c349dbc7Sjsg }; 1920c349dbc7Sjsg 1921c349dbc7Sjsg /* 1922c349dbc7Sjsg * PCIE_BIF IP has one different isr by ras controller 1923c349dbc7Sjsg * interrupt, the specific ras counter query will be 1924c349dbc7Sjsg * done in that isr. So skip such block from common 1925c349dbc7Sjsg * sync flood interrupt isr calling. 1926c349dbc7Sjsg */ 1927c349dbc7Sjsg if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) 1928c349dbc7Sjsg continue; 1929c349dbc7Sjsg 19301bb76ff1Sjsg /* 19311bb76ff1Sjsg * this is a workaround for aldebaran, skip send msg to 19321bb76ff1Sjsg * smu to get ecc_info table due to smu handle get ecc 19331bb76ff1Sjsg * info table failed temporarily. 19341bb76ff1Sjsg * should be removed until smu fix handle ecc_info table. 
19351bb76ff1Sjsg */ 19361bb76ff1Sjsg if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) && 19371bb76ff1Sjsg (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))) 19381bb76ff1Sjsg continue; 19391bb76ff1Sjsg 19405ca02815Sjsg amdgpu_ras_query_error_status(adev, &info); 19411bb76ff1Sjsg 19421bb76ff1Sjsg if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && 19431bb76ff1Sjsg adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && 19441bb76ff1Sjsg adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { 19451bb76ff1Sjsg if (amdgpu_ras_reset_error_status(adev, info.head.block)) 19461bb76ff1Sjsg dev_warn(adev->dev, "Failed to reset error counter and error status"); 19471bb76ff1Sjsg } 1948c349dbc7Sjsg } 1949c349dbc7Sjsg } 1950c349dbc7Sjsg 1951ad8b1aafSjsg /* Parse RdRspStatus and WrRspStatus */ 19525ca02815Sjsg static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, 1953ad8b1aafSjsg struct ras_query_if *info) 1954ad8b1aafSjsg { 19551bb76ff1Sjsg struct amdgpu_ras_block_object *block_obj; 1956ad8b1aafSjsg /* 1957ad8b1aafSjsg * Only two block need to query read/write 1958ad8b1aafSjsg * RspStatus at current state 1959ad8b1aafSjsg */ 19601bb76ff1Sjsg if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && 19611bb76ff1Sjsg (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) 19621bb76ff1Sjsg return; 19631bb76ff1Sjsg 19641bb76ff1Sjsg block_obj = amdgpu_ras_get_ras_block(adev, 19651bb76ff1Sjsg info->head.block, 19661bb76ff1Sjsg info->head.sub_block_index); 19671bb76ff1Sjsg 19681bb76ff1Sjsg if (!block_obj || !block_obj->hw_ops) { 19691bb76ff1Sjsg dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 19701bb76ff1Sjsg get_ras_block_str(&info->head)); 19711bb76ff1Sjsg return; 1972ad8b1aafSjsg } 19731bb76ff1Sjsg 19741bb76ff1Sjsg if (block_obj->hw_ops->query_ras_error_status) 19751bb76ff1Sjsg block_obj->hw_ops->query_ras_error_status(adev); 19761bb76ff1Sjsg 1977ad8b1aafSjsg } 1978ad8b1aafSjsg 1979ad8b1aafSjsg static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) 1980ad8b1aafSjsg { 1981ad8b1aafSjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1982ad8b1aafSjsg struct ras_manager *obj; 1983ad8b1aafSjsg 19845ca02815Sjsg if (!adev->ras_enabled || !con) 1985ad8b1aafSjsg return; 1986ad8b1aafSjsg 1987ad8b1aafSjsg list_for_each_entry(obj, &con->head, node) { 1988ad8b1aafSjsg struct ras_query_if info = { 1989ad8b1aafSjsg .head = obj->head, 1990ad8b1aafSjsg }; 1991ad8b1aafSjsg 1992ad8b1aafSjsg amdgpu_ras_error_status_query(adev, &info); 1993ad8b1aafSjsg } 1994ad8b1aafSjsg } 1995ad8b1aafSjsg 1996c349dbc7Sjsg /* recovery begin */ 1997c349dbc7Sjsg 1998c349dbc7Sjsg /* return 0 on success. 1999c349dbc7Sjsg * caller need free bps. 
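 *
 * A minimal caller sketch (it mirrors amdgpu_ras_sysfs_badpages_read()
 * earlier in this file):
 *
 *	struct ras_badpage *bps = NULL;
 *	unsigned int count = 0;
 *
 *	if (!amdgpu_ras_badpages_read(adev, &bps, &count)) {
 *		use bps[0 .. count - 1], then kfree(bps);
 *	}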
2000c349dbc7Sjsg */ 2001c349dbc7Sjsg static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 2002c349dbc7Sjsg struct ras_badpage **bps, unsigned int *count) 2003c349dbc7Sjsg { 2004c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2005c349dbc7Sjsg struct ras_err_handler_data *data; 2006c349dbc7Sjsg int i = 0; 20075ca02815Sjsg int ret = 0, status; 2008c349dbc7Sjsg 2009c349dbc7Sjsg if (!con || !con->eh_data || !bps || !count) 2010c349dbc7Sjsg return -EINVAL; 2011c349dbc7Sjsg 2012c349dbc7Sjsg mutex_lock(&con->recovery_lock); 2013c349dbc7Sjsg data = con->eh_data; 2014c349dbc7Sjsg if (!data || data->count == 0) { 2015c349dbc7Sjsg *bps = NULL; 2016c349dbc7Sjsg ret = -EINVAL; 2017c349dbc7Sjsg goto out; 2018c349dbc7Sjsg } 2019c349dbc7Sjsg 2020c349dbc7Sjsg *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); 2021c349dbc7Sjsg if (!*bps) { 2022c349dbc7Sjsg ret = -ENOMEM; 2023c349dbc7Sjsg goto out; 2024c349dbc7Sjsg } 2025c349dbc7Sjsg 2026c349dbc7Sjsg for (; i < data->count; i++) { 2027c349dbc7Sjsg (*bps)[i] = (struct ras_badpage){ 2028c349dbc7Sjsg .bp = data->bps[i].retired_page, 2029c349dbc7Sjsg .size = AMDGPU_GPU_PAGE_SIZE, 2030c349dbc7Sjsg .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, 2031c349dbc7Sjsg }; 20321bb76ff1Sjsg status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, 20335ca02815Sjsg data->bps[i].retired_page); 20345ca02815Sjsg if (status == -EBUSY) 2035c349dbc7Sjsg (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; 20365ca02815Sjsg else if (status == -ENOENT) 2037c349dbc7Sjsg (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; 2038c349dbc7Sjsg } 2039c349dbc7Sjsg 2040c349dbc7Sjsg *count = data->count; 2041c349dbc7Sjsg out: 2042c349dbc7Sjsg mutex_unlock(&con->recovery_lock); 2043c349dbc7Sjsg return ret; 2044c349dbc7Sjsg } 2045c349dbc7Sjsg 2046c349dbc7Sjsg static void amdgpu_ras_do_recovery(struct work_struct *work) 2047c349dbc7Sjsg { 2048c349dbc7Sjsg struct amdgpu_ras *ras = 2049c349dbc7Sjsg container_of(work, struct amdgpu_ras, recovery_work); 2050c349dbc7Sjsg struct amdgpu_device *remote_adev = NULL; 2051c349dbc7Sjsg struct amdgpu_device *adev = ras->adev; 2052c349dbc7Sjsg struct list_head device_list, *device_list_handle = NULL; 2053ad8b1aafSjsg 2054ad8b1aafSjsg if (!ras->disable_ras_err_cnt_harvest) { 2055ad8b1aafSjsg struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2056c349dbc7Sjsg 2057c349dbc7Sjsg /* Build list of devices to query RAS related errors */ 2058ad8b1aafSjsg if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { 2059c349dbc7Sjsg device_list_handle = &hive->device_list; 2060ad8b1aafSjsg } else { 2061204a49e6Sjsg INIT_LIST_HEAD(&device_list); 2062c349dbc7Sjsg list_add_tail(&adev->gmc.xgmi.head, &device_list); 2063c349dbc7Sjsg device_list_handle = &device_list; 2064c349dbc7Sjsg } 2065c349dbc7Sjsg 2066ad8b1aafSjsg list_for_each_entry(remote_adev, 2067ad8b1aafSjsg device_list_handle, gmc.xgmi.head) { 2068ad8b1aafSjsg amdgpu_ras_query_err_status(remote_adev); 2069c349dbc7Sjsg amdgpu_ras_log_on_err_counter(remote_adev); 2070c349dbc7Sjsg } 2071c349dbc7Sjsg 2072ad8b1aafSjsg amdgpu_put_xgmi_hive(hive); 2073ad8b1aafSjsg } 2074ad8b1aafSjsg 20751bb76ff1Sjsg if (amdgpu_device_should_recover_gpu(ras->adev)) { 20761bb76ff1Sjsg struct amdgpu_reset_context reset_context; 20771bb76ff1Sjsg memset(&reset_context, 0, sizeof(reset_context)); 20781bb76ff1Sjsg 20791bb76ff1Sjsg reset_context.method = AMD_RESET_METHOD_NONE; 20801bb76ff1Sjsg reset_context.reset_req_dev = adev; 2081f005ef32Sjsg 2082f005ef32Sjsg /* Perform full reset in fatal error mode 
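		 * (i.e. when poison mode is not supported); otherwise the reset
		 * method is taken from gpu_reset_flags below: mode2 when
		 * requested, or a full mode1 reset for a fatal error raised
		 * while poison mode is active.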
*/ 2083f005ef32Sjsg if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) 2084f005ef32Sjsg set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2085f005ef32Sjsg else { 20861bb76ff1Sjsg clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 20871bb76ff1Sjsg 2088f005ef32Sjsg if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { 2089f005ef32Sjsg ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; 2090f005ef32Sjsg reset_context.method = AMD_RESET_METHOD_MODE2; 2091f005ef32Sjsg } 2092f005ef32Sjsg 2093f005ef32Sjsg /* Fatal error occurs in poison mode, mode1 reset is used to 2094f005ef32Sjsg * recover gpu. 2095f005ef32Sjsg */ 2096f005ef32Sjsg if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { 2097f005ef32Sjsg ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; 2098f005ef32Sjsg set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2099f005ef32Sjsg 2100f005ef32Sjsg psp_fatal_error_recovery_quirk(&adev->psp); 2101f005ef32Sjsg } 2102f005ef32Sjsg } 2103f005ef32Sjsg 21041bb76ff1Sjsg amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); 21051bb76ff1Sjsg } 2106c349dbc7Sjsg atomic_set(&ras->in_recovery, 0); 2107c349dbc7Sjsg } 2108c349dbc7Sjsg 2109c349dbc7Sjsg /* alloc/realloc bps array */ 2110c349dbc7Sjsg static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, 2111c349dbc7Sjsg struct ras_err_handler_data *data, int pages) 2112c349dbc7Sjsg { 2113c349dbc7Sjsg unsigned int old_space = data->count + data->space_left; 2114c349dbc7Sjsg unsigned int new_space = old_space + pages; 2115f005ef32Sjsg unsigned int align_space = ALIGN(new_space, 512); 2116c349dbc7Sjsg void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); 2117c349dbc7Sjsg 21185ca02815Sjsg if (!bps) { 2119c349dbc7Sjsg return -ENOMEM; 2120c349dbc7Sjsg } 2121c349dbc7Sjsg 2122c349dbc7Sjsg if (data->bps) { 2123c349dbc7Sjsg memcpy(bps, data->bps, 2124c349dbc7Sjsg data->count * sizeof(*data->bps)); 2125c349dbc7Sjsg kfree(data->bps); 2126c349dbc7Sjsg } 2127c349dbc7Sjsg 2128c349dbc7Sjsg data->bps = bps; 2129c349dbc7Sjsg data->space_left += align_space - old_space; 2130c349dbc7Sjsg return 0; 2131c349dbc7Sjsg } 2132c349dbc7Sjsg 2133c349dbc7Sjsg /* it deal with vram only. 
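 * Each bps[i].retired_page is a VRAM page frame number; entries already
 * tracked in eh_data are skipped, and every new one reserves a single GPU
 * page through amdgpu_vram_mgr_reserve_range().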
*/ 2134c349dbc7Sjsg int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 2135c349dbc7Sjsg struct eeprom_table_record *bps, int pages) 2136c349dbc7Sjsg { 2137c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2138c349dbc7Sjsg struct ras_err_handler_data *data; 2139c349dbc7Sjsg int ret = 0; 21405ca02815Sjsg uint32_t i; 2141c349dbc7Sjsg 2142c349dbc7Sjsg if (!con || !con->eh_data || !bps || pages <= 0) 2143c349dbc7Sjsg return 0; 2144c349dbc7Sjsg 2145c349dbc7Sjsg mutex_lock(&con->recovery_lock); 2146c349dbc7Sjsg data = con->eh_data; 2147c349dbc7Sjsg if (!data) 2148c349dbc7Sjsg goto out; 2149c349dbc7Sjsg 21505ca02815Sjsg for (i = 0; i < pages; i++) { 21515ca02815Sjsg if (amdgpu_ras_check_bad_page_unlock(con, 21525ca02815Sjsg bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 21535ca02815Sjsg continue; 21545ca02815Sjsg 21555ca02815Sjsg if (!data->space_left && 21565ca02815Sjsg amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 2157c349dbc7Sjsg ret = -ENOMEM; 2158c349dbc7Sjsg goto out; 2159c349dbc7Sjsg } 2160c349dbc7Sjsg 21611bb76ff1Sjsg amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, 21625ca02815Sjsg bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT, 21635ca02815Sjsg AMDGPU_GPU_PAGE_SIZE); 2164c349dbc7Sjsg 21655ca02815Sjsg memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); 21665ca02815Sjsg data->count++; 21675ca02815Sjsg data->space_left--; 21685ca02815Sjsg } 2169c349dbc7Sjsg out: 2170c349dbc7Sjsg mutex_unlock(&con->recovery_lock); 2171c349dbc7Sjsg 2172c349dbc7Sjsg return ret; 2173c349dbc7Sjsg } 2174c349dbc7Sjsg 2175c349dbc7Sjsg /* 2176c349dbc7Sjsg * write error record array to eeprom, the function should be 2177c349dbc7Sjsg * protected by recovery_lock 2178f005ef32Sjsg * new_cnt: new added UE count, excluding reserved bad pages, can be NULL 2179c349dbc7Sjsg */ 2180f005ef32Sjsg int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, 2181f005ef32Sjsg unsigned long *new_cnt) 2182c349dbc7Sjsg { 2183c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2184c349dbc7Sjsg struct ras_err_handler_data *data; 2185c349dbc7Sjsg struct amdgpu_ras_eeprom_control *control; 2186c349dbc7Sjsg int save_count; 2187c349dbc7Sjsg 2188f005ef32Sjsg if (!con || !con->eh_data) { 2189f005ef32Sjsg if (new_cnt) 2190f005ef32Sjsg *new_cnt = 0; 2191f005ef32Sjsg 2192c349dbc7Sjsg return 0; 2193f005ef32Sjsg } 2194c349dbc7Sjsg 21951bb76ff1Sjsg mutex_lock(&con->recovery_lock); 2196c349dbc7Sjsg control = &con->eeprom_control; 2197c349dbc7Sjsg data = con->eh_data; 21985ca02815Sjsg save_count = data->count - control->ras_num_recs; 21991bb76ff1Sjsg mutex_unlock(&con->recovery_lock); 2200f005ef32Sjsg 2201f005ef32Sjsg if (new_cnt) 2202f005ef32Sjsg *new_cnt = save_count / adev->umc.retire_unit; 2203f005ef32Sjsg 2204c349dbc7Sjsg /* only new entries are saved */ 2205ad8b1aafSjsg if (save_count > 0) { 22065ca02815Sjsg if (amdgpu_ras_eeprom_append(control, 22075ca02815Sjsg &data->bps[control->ras_num_recs], 2208c349dbc7Sjsg save_count)) { 2209ad8b1aafSjsg dev_err(adev->dev, "Failed to save EEPROM table data!"); 2210c349dbc7Sjsg return -EIO; 2211c349dbc7Sjsg } 2212c349dbc7Sjsg 2213ad8b1aafSjsg dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); 2214ad8b1aafSjsg } 2215ad8b1aafSjsg 2216c349dbc7Sjsg return 0; 2217c349dbc7Sjsg } 2218c349dbc7Sjsg 2219c349dbc7Sjsg /* 2220c349dbc7Sjsg * read error record array in eeprom and reserve enough space for 2221c349dbc7Sjsg * storing new bad pages 2222c349dbc7Sjsg */ 2223c349dbc7Sjsg static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) 
2224c349dbc7Sjsg { 2225c349dbc7Sjsg struct amdgpu_ras_eeprom_control *control = 22265ca02815Sjsg &adev->psp.ras_context.ras->eeprom_control; 22275ca02815Sjsg struct eeprom_table_record *bps; 22285ca02815Sjsg int ret; 2229c349dbc7Sjsg 2230c349dbc7Sjsg /* no bad page record, skip eeprom access */ 22315ca02815Sjsg if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) 22325ca02815Sjsg return 0; 2233c349dbc7Sjsg 22345ca02815Sjsg bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); 2235c349dbc7Sjsg if (!bps) 2236c349dbc7Sjsg return -ENOMEM; 2237c349dbc7Sjsg 22385ca02815Sjsg ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); 22395ca02815Sjsg if (ret) 2240ad8b1aafSjsg dev_err(adev->dev, "Failed to load EEPROM table records!"); 22415ca02815Sjsg else 22425ca02815Sjsg ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); 2243c349dbc7Sjsg 2244c349dbc7Sjsg kfree(bps); 2245c349dbc7Sjsg return ret; 2246c349dbc7Sjsg } 2247c349dbc7Sjsg 22485ca02815Sjsg static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 22495ca02815Sjsg uint64_t addr) 22505ca02815Sjsg { 22515ca02815Sjsg struct ras_err_handler_data *data = con->eh_data; 22525ca02815Sjsg int i; 22535ca02815Sjsg 22545ca02815Sjsg addr >>= AMDGPU_GPU_PAGE_SHIFT; 22555ca02815Sjsg for (i = 0; i < data->count; i++) 22565ca02815Sjsg if (addr == data->bps[i].retired_page) 22575ca02815Sjsg return true; 22585ca02815Sjsg 22595ca02815Sjsg return false; 22605ca02815Sjsg } 22615ca02815Sjsg 2262c349dbc7Sjsg /* 2263c349dbc7Sjsg * check if an address belongs to bad page 2264c349dbc7Sjsg * 2265c349dbc7Sjsg * Note: this check is only for umc block 2266c349dbc7Sjsg */ 2267c349dbc7Sjsg static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 2268c349dbc7Sjsg uint64_t addr) 2269c349dbc7Sjsg { 2270c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2271c349dbc7Sjsg bool ret = false; 2272c349dbc7Sjsg 2273c349dbc7Sjsg if (!con || !con->eh_data) 2274c349dbc7Sjsg return ret; 2275c349dbc7Sjsg 2276c349dbc7Sjsg mutex_lock(&con->recovery_lock); 22775ca02815Sjsg ret = amdgpu_ras_check_bad_page_unlock(con, addr); 2278c349dbc7Sjsg mutex_unlock(&con->recovery_lock); 2279c349dbc7Sjsg return ret; 2280c349dbc7Sjsg } 2281c349dbc7Sjsg 2282ad8b1aafSjsg static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, 22835ca02815Sjsg uint32_t max_count) 2284ad8b1aafSjsg { 2285ad8b1aafSjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2286ad8b1aafSjsg 2287ad8b1aafSjsg /* 2288ad8b1aafSjsg * Justification of value bad_page_cnt_threshold in ras structure 2289ad8b1aafSjsg * 2290f005ef32Sjsg * Generally, 0 <= amdgpu_bad_page_threshold <= max record length 2291f005ef32Sjsg * in eeprom or amdgpu_bad_page_threshold == -2, introduce two 2292f005ef32Sjsg * scenarios accordingly. 2293ad8b1aafSjsg * 2294ad8b1aafSjsg * Bad page retirement enablement: 2295f005ef32Sjsg * - If amdgpu_bad_page_threshold = -2, 2296ad8b1aafSjsg * bad_page_cnt_threshold = typical value by formula. 2297ad8b1aafSjsg * 2298ad8b1aafSjsg * - When the value from user is 0 < amdgpu_bad_page_threshold < 2299ad8b1aafSjsg * max record length in eeprom, use it directly. 2300ad8b1aafSjsg * 2301ad8b1aafSjsg * Bad page retirement disablement: 2302ad8b1aafSjsg * - If amdgpu_bad_page_threshold = 0, bad page retirement 2303ad8b1aafSjsg * functionality is disabled, and bad_page_cnt_threshold will 2304ad8b1aafSjsg * take no effect. 
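	 *
	 * Worked example of the default formula (the VRAM size is only an
	 * illustration): with RAS_BAD_PAGE_COVER = 100MB, a 16GB board yields
	 * 16384 / 100 ~= 163 pages, further capped by the EEPROM record limit
	 * passed in as max_count.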
2305ad8b1aafSjsg */ 2306ad8b1aafSjsg 23075ca02815Sjsg if (amdgpu_bad_page_threshold < 0) { 23085ca02815Sjsg u64 val = adev->gmc.mc_vram_size; 2309ad8b1aafSjsg 23105ca02815Sjsg do_div(val, RAS_BAD_PAGE_COVER); 2311ad8b1aafSjsg con->bad_page_cnt_threshold = min(lower_32_bits(val), 23125ca02815Sjsg max_count); 2313ad8b1aafSjsg } else { 23145ca02815Sjsg con->bad_page_cnt_threshold = min_t(int, max_count, 23155ca02815Sjsg amdgpu_bad_page_threshold); 2316ad8b1aafSjsg } 2317ad8b1aafSjsg } 2318ad8b1aafSjsg 2319c349dbc7Sjsg int amdgpu_ras_recovery_init(struct amdgpu_device *adev) 2320c349dbc7Sjsg { 2321c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2322c349dbc7Sjsg struct ras_err_handler_data **data; 23235ca02815Sjsg u32 max_eeprom_records_count = 0; 2324ad8b1aafSjsg bool exc_err_limit = false; 2325c349dbc7Sjsg int ret; 2326c349dbc7Sjsg 23271bb76ff1Sjsg if (!con || amdgpu_sriov_vf(adev)) 2328c349dbc7Sjsg return 0; 2329c349dbc7Sjsg 23305ca02815Sjsg /* Allow access to RAS EEPROM via debugfs, when the ASIC 23315ca02815Sjsg * supports RAS and debugfs is enabled, but when 23325ca02815Sjsg * adev->ras_enabled is unset, i.e. when "ras_enable" 23335ca02815Sjsg * module parameter is set to 0. 23345ca02815Sjsg */ 23355ca02815Sjsg con->adev = adev; 23365ca02815Sjsg 23375ca02815Sjsg if (!adev->ras_enabled) 23385ca02815Sjsg return 0; 23395ca02815Sjsg 23405ca02815Sjsg data = &con->eh_data; 2341c349dbc7Sjsg *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); 2342c349dbc7Sjsg if (!*data) { 2343c349dbc7Sjsg ret = -ENOMEM; 2344c349dbc7Sjsg goto out; 2345c349dbc7Sjsg } 2346c349dbc7Sjsg 2347c349dbc7Sjsg rw_init(&con->recovery_lock, "rasrec"); 2348c349dbc7Sjsg INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 2349c349dbc7Sjsg atomic_set(&con->in_recovery, 0); 23501bb76ff1Sjsg con->eeprom_control.bad_channel_bitmap = 0; 2351c349dbc7Sjsg 2352f005ef32Sjsg max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); 23535ca02815Sjsg amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); 2354ad8b1aafSjsg 23555ca02815Sjsg /* Todo: During test the SMU might fail to read the eeprom through I2C 23565ca02815Sjsg * when the GPU is pending on XGMI reset during probe time 23575ca02815Sjsg * (Mostly after second bus reset), skip it now 23585ca02815Sjsg */ 23595ca02815Sjsg if (adev->gmc.xgmi.pending_reset) 23605ca02815Sjsg return 0; 2361ad8b1aafSjsg ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); 2362ad8b1aafSjsg /* 2363ad8b1aafSjsg * This calling fails when exc_err_limit is true or 2364ad8b1aafSjsg * ret != 0. 
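	 * exc_err_limit is expected to be set when the records already stored
	 * in EEPROM exceed bad_page_cnt_threshold; only that case fails driver
	 * init on purpose (see the error path at the end of this function).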
2365ad8b1aafSjsg */ 2366ad8b1aafSjsg if (exc_err_limit || ret) 2367c349dbc7Sjsg goto free; 2368c349dbc7Sjsg 23695ca02815Sjsg if (con->eeprom_control.ras_num_recs) { 2370c349dbc7Sjsg ret = amdgpu_ras_load_bad_pages(adev); 2371c349dbc7Sjsg if (ret) 2372c349dbc7Sjsg goto free; 23735ca02815Sjsg 23741bb76ff1Sjsg amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); 23751bb76ff1Sjsg 23761bb76ff1Sjsg if (con->update_channel_flag == true) { 23771bb76ff1Sjsg amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); 23781bb76ff1Sjsg con->update_channel_flag = false; 23791bb76ff1Sjsg } 2380c349dbc7Sjsg } 2381c349dbc7Sjsg 23821bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD 23831bb76ff1Sjsg if ((adev->asic_type == CHIP_ALDEBARAN) && 23841bb76ff1Sjsg (adev->gmc.xgmi.connected_to_cpu)) 23851bb76ff1Sjsg amdgpu_register_bad_pages_mca_notifier(adev); 23861bb76ff1Sjsg #endif 2387c349dbc7Sjsg return 0; 2388c349dbc7Sjsg 2389c349dbc7Sjsg free: 2390c349dbc7Sjsg kfree((*data)->bps); 2391c349dbc7Sjsg kfree(*data); 2392c349dbc7Sjsg con->eh_data = NULL; 2393c349dbc7Sjsg out: 23945ca02815Sjsg dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); 2395ad8b1aafSjsg 2396ad8b1aafSjsg /* 2397ad8b1aafSjsg * Except error threshold exceeding case, other failure cases in this 2398ad8b1aafSjsg * function would not fail amdgpu driver init. 2399ad8b1aafSjsg */ 2400ad8b1aafSjsg if (!exc_err_limit) 2401ad8b1aafSjsg ret = 0; 2402ad8b1aafSjsg else 2403ad8b1aafSjsg ret = -EINVAL; 2404c349dbc7Sjsg 2405c349dbc7Sjsg return ret; 2406c349dbc7Sjsg } 2407c349dbc7Sjsg 2408c349dbc7Sjsg static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 2409c349dbc7Sjsg { 2410c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2411c349dbc7Sjsg struct ras_err_handler_data *data = con->eh_data; 2412c349dbc7Sjsg 2413c349dbc7Sjsg /* recovery_init failed to init it, fini is useless */ 2414c349dbc7Sjsg if (!data) 2415c349dbc7Sjsg return 0; 2416c349dbc7Sjsg 2417c349dbc7Sjsg cancel_work_sync(&con->recovery_work); 2418c349dbc7Sjsg 2419c349dbc7Sjsg mutex_lock(&con->recovery_lock); 2420c349dbc7Sjsg con->eh_data = NULL; 2421c349dbc7Sjsg kfree(data->bps); 2422c349dbc7Sjsg kfree(data); 2423c349dbc7Sjsg mutex_unlock(&con->recovery_lock); 2424c349dbc7Sjsg 2425c349dbc7Sjsg return 0; 2426c349dbc7Sjsg } 2427c349dbc7Sjsg /* recovery end */ 2428c349dbc7Sjsg 24295ca02815Sjsg static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) 2430ad8b1aafSjsg { 24311bb76ff1Sjsg if (amdgpu_sriov_vf(adev)) { 24321bb76ff1Sjsg switch (adev->ip_versions[MP0_HWIP][0]) { 24331bb76ff1Sjsg case IP_VERSION(13, 0, 2): 2434f005ef32Sjsg case IP_VERSION(13, 0, 6): 24351bb76ff1Sjsg return true; 24361bb76ff1Sjsg default: 24371bb76ff1Sjsg return false; 24381bb76ff1Sjsg } 24391bb76ff1Sjsg } 24401bb76ff1Sjsg 24411bb76ff1Sjsg if (adev->asic_type == CHIP_IP_DISCOVERY) { 24421bb76ff1Sjsg switch (adev->ip_versions[MP0_HWIP][0]) { 24431bb76ff1Sjsg case IP_VERSION(13, 0, 0): 2444f005ef32Sjsg case IP_VERSION(13, 0, 6): 24451bb76ff1Sjsg case IP_VERSION(13, 0, 10): 24461bb76ff1Sjsg return true; 24471bb76ff1Sjsg default: 24481bb76ff1Sjsg return false; 24491bb76ff1Sjsg } 24501bb76ff1Sjsg } 24511bb76ff1Sjsg 24525ca02815Sjsg return adev->asic_type == CHIP_VEGA10 || 24535ca02815Sjsg adev->asic_type == CHIP_VEGA20 || 24545ca02815Sjsg adev->asic_type == CHIP_ARCTURUS || 24555ca02815Sjsg adev->asic_type == CHIP_ALDEBARAN || 24565ca02815Sjsg adev->asic_type == CHIP_SIENNA_CICHLID; 24575ca02815Sjsg } 24585ca02815Sjsg 24595ca02815Sjsg /* 
24605ca02815Sjsg * this is workaround for vega20 workstation sku, 24615ca02815Sjsg * force enable gfx ras, ignore vbios gfx ras flag 24625ca02815Sjsg * due to GC EDC can not write 24635ca02815Sjsg */ 24645ca02815Sjsg static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) 24655ca02815Sjsg { 24665ca02815Sjsg struct atom_context *ctx = adev->mode_info.atom_context; 24675ca02815Sjsg 24685ca02815Sjsg if (!ctx) 24695ca02815Sjsg return; 24705ca02815Sjsg 2471f005ef32Sjsg if (strnstr(ctx->vbios_pn, "D16406", 2472f005ef32Sjsg sizeof(ctx->vbios_pn)) || 2473f005ef32Sjsg strnstr(ctx->vbios_pn, "D36002", 2474f005ef32Sjsg sizeof(ctx->vbios_pn))) 24755ca02815Sjsg adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); 2476ad8b1aafSjsg } 2477ad8b1aafSjsg 2478c349dbc7Sjsg /* 2479c349dbc7Sjsg * check hardware's ras ability which will be saved in hw_supported. 2480c349dbc7Sjsg * if hardware does not support ras, we can skip some ras initializtion and 2481c349dbc7Sjsg * forbid some ras operations from IP. 2482c349dbc7Sjsg * if software itself, say boot parameter, limit the ras ability. We still 2483c349dbc7Sjsg * need allow IP do some limited operations, like disable. In such case, 2484c349dbc7Sjsg * we have to initialize ras as normal. but need check if operation is 2485c349dbc7Sjsg * allowed or not in each function. 2486c349dbc7Sjsg */ 24875ca02815Sjsg static void amdgpu_ras_check_supported(struct amdgpu_device *adev) 2488c349dbc7Sjsg { 24895ca02815Sjsg adev->ras_hw_enabled = adev->ras_enabled = 0; 2490c349dbc7Sjsg 2491f005ef32Sjsg if (!amdgpu_ras_asic_supported(adev)) 2492c349dbc7Sjsg return; 2493c349dbc7Sjsg 2494f005ef32Sjsg if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 2495c349dbc7Sjsg if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { 24965ca02815Sjsg dev_info(adev->dev, "MEM ECC is active.\n"); 24975ca02815Sjsg adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | 2498c349dbc7Sjsg 1 << AMDGPU_RAS_BLOCK__DF); 24995ca02815Sjsg } else { 25005ca02815Sjsg dev_info(adev->dev, "MEM ECC is not presented.\n"); 25015ca02815Sjsg } 2502c349dbc7Sjsg 2503c349dbc7Sjsg if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { 2504ad8b1aafSjsg dev_info(adev->dev, "SRAM ECC is active.\n"); 2505f005ef32Sjsg if (!amdgpu_sriov_vf(adev)) 25065ca02815Sjsg adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 2507c349dbc7Sjsg 1 << AMDGPU_RAS_BLOCK__DF); 2508f005ef32Sjsg else 2509f005ef32Sjsg adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | 2510f005ef32Sjsg 1 << AMDGPU_RAS_BLOCK__SDMA | 2511f005ef32Sjsg 1 << AMDGPU_RAS_BLOCK__GFX); 25121bb76ff1Sjsg 2513f005ef32Sjsg /* VCN/JPEG RAS can be supported on both bare metal and 2514f005ef32Sjsg * SRIOV environment 2515f005ef32Sjsg */ 2516f005ef32Sjsg if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || 2517f005ef32Sjsg adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0)) 25181bb76ff1Sjsg adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | 25191bb76ff1Sjsg 1 << AMDGPU_RAS_BLOCK__JPEG); 25201bb76ff1Sjsg else 25211bb76ff1Sjsg adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | 25221bb76ff1Sjsg 1 << AMDGPU_RAS_BLOCK__JPEG); 2523f005ef32Sjsg 2524f005ef32Sjsg /* 2525f005ef32Sjsg * XGMI RAS is not supported if xgmi num physical nodes 2526f005ef32Sjsg * is zero 2527f005ef32Sjsg */ 2528f005ef32Sjsg if (!adev->gmc.xgmi.num_physical_nodes) 2529f005ef32Sjsg adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); 25305ca02815Sjsg } else { 2531ad8b1aafSjsg dev_info(adev->dev, "SRAM ECC is not presented.\n"); 25325ca02815Sjsg } 25335ca02815Sjsg } else 
{ 25345ca02815Sjsg /* driver only manages a few IP blocks RAS feature 25355ca02815Sjsg * when GPU is connected cpu through XGMI */ 25365ca02815Sjsg adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | 25375ca02815Sjsg 1 << AMDGPU_RAS_BLOCK__SDMA | 25385ca02815Sjsg 1 << AMDGPU_RAS_BLOCK__MMHUB); 25395ca02815Sjsg } 25405ca02815Sjsg 25415ca02815Sjsg amdgpu_ras_get_quirks(adev); 2542c349dbc7Sjsg 2543c349dbc7Sjsg /* hw_supported needs to be aligned with RAS block mask. */ 25445ca02815Sjsg adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; 2545c349dbc7Sjsg 2546f005ef32Sjsg 2547f005ef32Sjsg /* 2548f005ef32Sjsg * Disable ras feature for aqua vanjaram 2549f005ef32Sjsg * by default on apu platform. 2550f005ef32Sjsg */ 2551f005ef32Sjsg if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) && 2552f005ef32Sjsg adev->gmc.is_app_apu) 2553f005ef32Sjsg adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 : 2554f005ef32Sjsg adev->ras_hw_enabled & amdgpu_ras_mask; 2555f005ef32Sjsg else 25565ca02815Sjsg adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : 25575ca02815Sjsg adev->ras_hw_enabled & amdgpu_ras_mask; 25585ca02815Sjsg } 25595ca02815Sjsg 25605ca02815Sjsg static void amdgpu_ras_counte_dw(struct work_struct *work) 25615ca02815Sjsg { 25625ca02815Sjsg struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, 25635ca02815Sjsg ras_counte_delay_work.work); 25645ca02815Sjsg struct amdgpu_device *adev = con->adev; 25655ca02815Sjsg struct drm_device *dev = adev_to_drm(adev); 25665ca02815Sjsg unsigned long ce_count, ue_count; 25675ca02815Sjsg int res; 25685ca02815Sjsg 25695ca02815Sjsg res = pm_runtime_get_sync(dev->dev); 25705ca02815Sjsg if (res < 0) 25715ca02815Sjsg goto Out; 25725ca02815Sjsg 25735ca02815Sjsg /* Cache new values. 25745ca02815Sjsg */ 2575f005ef32Sjsg if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { 25765ca02815Sjsg atomic_set(&con->ras_ce_count, ce_count); 25775ca02815Sjsg atomic_set(&con->ras_ue_count, ue_count); 25785ca02815Sjsg } 25795ca02815Sjsg 25805ca02815Sjsg pm_runtime_mark_last_busy(dev->dev); 25815ca02815Sjsg Out: 25825ca02815Sjsg pm_runtime_put_autosuspend(dev->dev); 2583c349dbc7Sjsg } 2584c349dbc7Sjsg 2585f005ef32Sjsg static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) 2586f005ef32Sjsg { 2587f005ef32Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2588f005ef32Sjsg bool df_poison, umc_poison; 2589f005ef32Sjsg 2590f005ef32Sjsg /* poison setting is useless on SRIOV guest */ 2591f005ef32Sjsg if (amdgpu_sriov_vf(adev) || !con) 2592f005ef32Sjsg return; 2593f005ef32Sjsg 2594f005ef32Sjsg /* Init poison supported flag, the default value is false */ 2595f005ef32Sjsg if (adev->gmc.xgmi.connected_to_cpu) { 2596f005ef32Sjsg /* enabled by default when GPU is connected to CPU */ 2597f005ef32Sjsg con->poison_supported = true; 2598f005ef32Sjsg } else if (adev->df.funcs && 2599f005ef32Sjsg adev->df.funcs->query_ras_poison_mode && 2600f005ef32Sjsg adev->umc.ras && 2601f005ef32Sjsg adev->umc.ras->query_ras_poison_mode) { 2602f005ef32Sjsg df_poison = 2603f005ef32Sjsg adev->df.funcs->query_ras_poison_mode(adev); 2604f005ef32Sjsg umc_poison = 2605f005ef32Sjsg adev->umc.ras->query_ras_poison_mode(adev); 2606f005ef32Sjsg 2607f005ef32Sjsg /* Only poison is set in both DF and UMC, we can support it */ 2608f005ef32Sjsg if (df_poison && umc_poison) 2609f005ef32Sjsg con->poison_supported = true; 2610f005ef32Sjsg else if (df_poison != umc_poison) 2611f005ef32Sjsg dev_warn(adev->dev, 2612f005ef32Sjsg "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", 
2613f005ef32Sjsg df_poison, umc_poison); 2614f005ef32Sjsg } 2615f005ef32Sjsg } 2616f005ef32Sjsg 2617c349dbc7Sjsg int amdgpu_ras_init(struct amdgpu_device *adev) 2618c349dbc7Sjsg { 2619c349dbc7Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2620c349dbc7Sjsg int r; 2621c349dbc7Sjsg 2622c349dbc7Sjsg if (con) 2623c349dbc7Sjsg return 0; 2624c349dbc7Sjsg 2625c349dbc7Sjsg con = kmalloc(sizeof(struct amdgpu_ras) + 26261bb76ff1Sjsg sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT + 26271bb76ff1Sjsg sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT, 2628c349dbc7Sjsg GFP_KERNEL|__GFP_ZERO); 2629c349dbc7Sjsg if (!con) 2630c349dbc7Sjsg return -ENOMEM; 2631c349dbc7Sjsg 26325ca02815Sjsg con->adev = adev; 26335ca02815Sjsg INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); 26345ca02815Sjsg atomic_set(&con->ras_ce_count, 0); 26355ca02815Sjsg atomic_set(&con->ras_ue_count, 0); 26365ca02815Sjsg 2637c349dbc7Sjsg con->objs = (struct ras_manager *)(con + 1); 2638c349dbc7Sjsg 2639c349dbc7Sjsg amdgpu_ras_set_context(adev, con); 2640c349dbc7Sjsg 26415ca02815Sjsg amdgpu_ras_check_supported(adev); 26425ca02815Sjsg 26435ca02815Sjsg if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { 26445ca02815Sjsg /* set gfx block ras context feature for VEGA20 Gaming 26455ca02815Sjsg * send ras disable cmd to ras ta during ras late init. 26465ca02815Sjsg */ 26475ca02815Sjsg if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { 26485ca02815Sjsg con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); 26495ca02815Sjsg 26505ca02815Sjsg return 0; 26515ca02815Sjsg } 26525ca02815Sjsg 2653ad8b1aafSjsg r = 0; 2654ad8b1aafSjsg goto release_con; 2655c349dbc7Sjsg } 2656c349dbc7Sjsg 26571bb76ff1Sjsg con->update_channel_flag = false; 2658c349dbc7Sjsg con->features = 0; 2659c349dbc7Sjsg INIT_LIST_HEAD(&con->head); 2660c349dbc7Sjsg /* Might need get this flag from vbios. */ 2661c349dbc7Sjsg con->flags = RAS_DEFAULT_FLAGS; 2662c349dbc7Sjsg 26635ca02815Sjsg /* initialize nbio ras function ahead of any other 26645ca02815Sjsg * ras functions so hardware fatal error interrupt 26655ca02815Sjsg * can be enabled as early as possible */ 2666f005ef32Sjsg switch (adev->ip_versions[NBIO_HWIP][0]) { 2667f005ef32Sjsg case IP_VERSION(7, 4, 0): 2668f005ef32Sjsg case IP_VERSION(7, 4, 1): 2669f005ef32Sjsg case IP_VERSION(7, 4, 4): 2670f005ef32Sjsg if (!adev->gmc.xgmi.connected_to_cpu) 26711bb76ff1Sjsg adev->nbio.ras = &nbio_v7_4_ras; 2672f005ef32Sjsg break; 2673f005ef32Sjsg case IP_VERSION(4, 3, 0): 2674f005ef32Sjsg if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) 2675f005ef32Sjsg /* unlike other generation of nbio ras, 2676f005ef32Sjsg * nbio v4_3 only support fatal error interrupt 2677f005ef32Sjsg * to inform software that DF is freezed due to 2678f005ef32Sjsg * system fatal error event. driver should not 2679f005ef32Sjsg * enable nbio ras in such case. 
Instead, 2680f005ef32Sjsg * check DF RAS */ 2681f005ef32Sjsg adev->nbio.ras = &nbio_v4_3_ras; 2682f005ef32Sjsg break; 2683f005ef32Sjsg case IP_VERSION(7, 9, 0): 2684f005ef32Sjsg if (!adev->gmc.is_app_apu) 2685f005ef32Sjsg adev->nbio.ras = &nbio_v7_9_ras; 26865ca02815Sjsg break; 26875ca02815Sjsg default: 26885ca02815Sjsg /* nbio ras is not available */ 26895ca02815Sjsg break; 26905ca02815Sjsg } 26915ca02815Sjsg 2692f005ef32Sjsg /* nbio ras block needs to be enabled ahead of other ras blocks 2693f005ef32Sjsg * to handle fatal error */ 2694f005ef32Sjsg r = amdgpu_nbio_ras_sw_init(adev); 2695f005ef32Sjsg if (r) 2696f005ef32Sjsg return r; 2697f005ef32Sjsg 26981bb76ff1Sjsg if (adev->nbio.ras && 26991bb76ff1Sjsg adev->nbio.ras->init_ras_controller_interrupt) { 27001bb76ff1Sjsg r = adev->nbio.ras->init_ras_controller_interrupt(adev); 2701c349dbc7Sjsg if (r) 2702ad8b1aafSjsg goto release_con; 2703c349dbc7Sjsg } 2704c349dbc7Sjsg 27051bb76ff1Sjsg if (adev->nbio.ras && 27061bb76ff1Sjsg adev->nbio.ras->init_ras_err_event_athub_interrupt) { 27071bb76ff1Sjsg r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); 2708c349dbc7Sjsg if (r) 2709ad8b1aafSjsg goto release_con; 2710c349dbc7Sjsg } 2711c349dbc7Sjsg 2712f005ef32Sjsg amdgpu_ras_query_poison_mode(adev); 27131bb76ff1Sjsg 2714ad8b1aafSjsg if (amdgpu_ras_fs_init(adev)) { 2715ad8b1aafSjsg r = -EINVAL; 2716ad8b1aafSjsg goto release_con; 2717ad8b1aafSjsg } 2718c349dbc7Sjsg 2719ad8b1aafSjsg dev_info(adev->dev, "RAS INFO: ras initialized successfully, " 2720c349dbc7Sjsg "hardware ability[%x] ras_mask[%x]\n", 27215ca02815Sjsg adev->ras_hw_enabled, adev->ras_enabled); 27225ca02815Sjsg 2723c349dbc7Sjsg return 0; 2724ad8b1aafSjsg release_con: 2725c349dbc7Sjsg amdgpu_ras_set_context(adev, NULL); 2726c349dbc7Sjsg kfree(con); 2727c349dbc7Sjsg 2728ad8b1aafSjsg return r; 2729c349dbc7Sjsg } 2730c349dbc7Sjsg 27315ca02815Sjsg int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) 27325ca02815Sjsg { 2733f005ef32Sjsg if (adev->gmc.xgmi.connected_to_cpu || 2734f005ef32Sjsg adev->gmc.is_app_apu) 27355ca02815Sjsg return 1; 27365ca02815Sjsg return 0; 27375ca02815Sjsg } 27385ca02815Sjsg 27395ca02815Sjsg static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, 27405ca02815Sjsg struct ras_common_if *ras_block) 27415ca02815Sjsg { 27425ca02815Sjsg struct ras_query_if info = { 27435ca02815Sjsg .head = *ras_block, 27445ca02815Sjsg }; 27455ca02815Sjsg 27465ca02815Sjsg if (!amdgpu_persistent_edc_harvesting_supported(adev)) 27475ca02815Sjsg return 0; 27485ca02815Sjsg 27495ca02815Sjsg if (amdgpu_ras_query_error_status(adev, &info) != 0) 27505ca02815Sjsg DRM_WARN("RAS init harvest failure"); 27515ca02815Sjsg 27525ca02815Sjsg if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) 27535ca02815Sjsg DRM_WARN("RAS init harvest reset failure"); 27545ca02815Sjsg 27555ca02815Sjsg return 0; 27565ca02815Sjsg } 27575ca02815Sjsg 27581bb76ff1Sjsg bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) 2759c349dbc7Sjsg { 27605ca02815Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 27611bb76ff1Sjsg 27621bb76ff1Sjsg if (!con) 27631bb76ff1Sjsg return false; 27641bb76ff1Sjsg 27651bb76ff1Sjsg return con->poison_supported; 27661bb76ff1Sjsg } 27671bb76ff1Sjsg 27681bb76ff1Sjsg /* helper function to handle common stuff in ip late init phase */ 27691bb76ff1Sjsg int amdgpu_ras_block_late_init(struct amdgpu_device *adev, 27701bb76ff1Sjsg struct ras_common_if *ras_block) 27711bb76ff1Sjsg { 27721bb76ff1Sjsg struct amdgpu_ras_block_object 
*ras_obj = NULL; 27731bb76ff1Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2774f005ef32Sjsg struct ras_query_if *query_info; 27755ca02815Sjsg unsigned long ue_count, ce_count; 2776c349dbc7Sjsg int r; 2777c349dbc7Sjsg 2778c349dbc7Sjsg /* disable RAS feature per IP block if it is not supported */ 2779c349dbc7Sjsg if (!amdgpu_ras_is_supported(adev, ras_block->block)) { 2780c349dbc7Sjsg amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); 2781c349dbc7Sjsg return 0; 2782c349dbc7Sjsg } 2783c349dbc7Sjsg 2784c349dbc7Sjsg r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); 2785c349dbc7Sjsg if (r) { 27861bb76ff1Sjsg if (adev->in_suspend || amdgpu_in_reset(adev)) { 2787c349dbc7Sjsg /* in resume phase, if fail to enable ras, 2788c349dbc7Sjsg * clean up all ras fs nodes, and disable ras */ 2789c349dbc7Sjsg goto cleanup; 2790c349dbc7Sjsg } else 2791c349dbc7Sjsg return r; 2792c349dbc7Sjsg } 2793c349dbc7Sjsg 27945ca02815Sjsg /* check for errors on warm reset edc persisant supported ASIC */ 27955ca02815Sjsg amdgpu_persistent_edc_harvesting(adev, ras_block); 27965ca02815Sjsg 2797c349dbc7Sjsg /* in resume phase, no need to create ras fs node */ 2798ad8b1aafSjsg if (adev->in_suspend || amdgpu_in_reset(adev)) 2799c349dbc7Sjsg return 0; 2800c349dbc7Sjsg 28011bb76ff1Sjsg ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); 28021bb76ff1Sjsg if (ras_obj->ras_cb || (ras_obj->hw_ops && 28031bb76ff1Sjsg (ras_obj->hw_ops->query_poison_status || 28041bb76ff1Sjsg ras_obj->hw_ops->handle_poison_consumption))) { 28051bb76ff1Sjsg r = amdgpu_ras_interrupt_add_handler(adev, ras_block); 2806c349dbc7Sjsg if (r) 28071bb76ff1Sjsg goto cleanup; 2808c349dbc7Sjsg } 2809c349dbc7Sjsg 2810f005ef32Sjsg if (ras_obj->hw_ops && 2811f005ef32Sjsg (ras_obj->hw_ops->query_ras_error_count || 2812f005ef32Sjsg ras_obj->hw_ops->query_ras_error_status)) { 28131bb76ff1Sjsg r = amdgpu_ras_sysfs_create(adev, ras_block); 2814c349dbc7Sjsg if (r) 28151bb76ff1Sjsg goto interrupt; 2816c349dbc7Sjsg 28175ca02815Sjsg /* Those are the cached values at init. 
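		 * They seed con->ras_ce_count / con->ras_ue_count, the same
		 * counters amdgpu_ras_counte_dw() refreshes at runtime.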
28185ca02815Sjsg 		 */
2819f005ef32Sjsg 		query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
2820f005ef32Sjsg 		if (!query_info)
2821f005ef32Sjsg 			return -ENOMEM;
2822f005ef32Sjsg 		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
2823f005ef32Sjsg 
2824f005ef32Sjsg 		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
28255ca02815Sjsg 			atomic_set(&con->ras_ce_count, ce_count);
28265ca02815Sjsg 			atomic_set(&con->ras_ue_count, ue_count);
28275ca02815Sjsg 		}
28285ca02815Sjsg 
2829f005ef32Sjsg 		kfree(query_info);
2830f005ef32Sjsg 	}
2831f005ef32Sjsg 
2832c349dbc7Sjsg 	return 0;
28331bb76ff1Sjsg 
2834c349dbc7Sjsg interrupt:
28351bb76ff1Sjsg 	if (ras_obj->ras_cb)
28361bb76ff1Sjsg 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
28371bb76ff1Sjsg cleanup:
2838c349dbc7Sjsg 	amdgpu_ras_feature_enable(adev, ras_block, 0);
2839c349dbc7Sjsg 	return r;
2840c349dbc7Sjsg }
2841c349dbc7Sjsg 
28421bb76ff1Sjsg static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
28431bb76ff1Sjsg 			struct ras_common_if *ras_block)
2844c349dbc7Sjsg {
28451bb76ff1Sjsg 	return amdgpu_ras_block_late_init(adev, ras_block);
28461bb76ff1Sjsg }
28471bb76ff1Sjsg 
28481bb76ff1Sjsg /* helper function to remove ras fs node and interrupt handler */
28491bb76ff1Sjsg void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
28501bb76ff1Sjsg 			  struct ras_common_if *ras_block)
28511bb76ff1Sjsg {
28521bb76ff1Sjsg 	struct amdgpu_ras_block_object *ras_obj;
28531bb76ff1Sjsg 	if (!ras_block)
2854c349dbc7Sjsg 		return;
2855c349dbc7Sjsg 
2856c349dbc7Sjsg 	amdgpu_ras_sysfs_remove(adev, ras_block);
28571bb76ff1Sjsg 
28581bb76ff1Sjsg 	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
28591bb76ff1Sjsg 	if (ras_obj->ras_cb)
28601bb76ff1Sjsg 		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
28611bb76ff1Sjsg }
28621bb76ff1Sjsg 
28631bb76ff1Sjsg static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
28641bb76ff1Sjsg 			  struct ras_common_if *ras_block)
28651bb76ff1Sjsg {
28661bb76ff1Sjsg 	return amdgpu_ras_block_late_fini(adev, ras_block);
2867c349dbc7Sjsg }
2868c349dbc7Sjsg 
2869c349dbc7Sjsg /* Do some init work after IP late init, as a dependence.
2870c349dbc7Sjsg  * It runs in the resume/gpu reset/boot-up cases.
2871c349dbc7Sjsg  */
2872c349dbc7Sjsg void amdgpu_ras_resume(struct amdgpu_device *adev)
2873c349dbc7Sjsg {
2874c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2875c349dbc7Sjsg 	struct ras_manager *obj, *tmp;
2876c349dbc7Sjsg 
28775ca02815Sjsg 	if (!adev->ras_enabled || !con) {
28785ca02815Sjsg 		/* clean the ras context for VEGA20 Gaming after sending the ras disable cmd */
28795ca02815Sjsg 		amdgpu_release_ras_context(adev);
28805ca02815Sjsg 
2881c349dbc7Sjsg 		return;
28825ca02815Sjsg 	}
2883c349dbc7Sjsg 
2884c349dbc7Sjsg 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
2885c349dbc7Sjsg 		/* Set up all other IPs which are not implemented. One tricky
2886c349dbc7Sjsg 		 * point: the IP's actual ras error type should be
2887c349dbc7Sjsg 		 * MULTI_UNCORRECTABLE, but since the driver does not handle it,
2888c349dbc7Sjsg 		 * ERROR_NONE makes sense anyway.
2889c349dbc7Sjsg 		 */
2890c349dbc7Sjsg 		amdgpu_ras_enable_all_features(adev, 1);
2891c349dbc7Sjsg 
2892c349dbc7Sjsg 		/* We enable ras on all hw_supported blocks, but the boot
2893c349dbc7Sjsg 		 * parameter might disable some of them, and one or more IPs
2894c349dbc7Sjsg 		 * may not be implemented yet. So we disable them on their behalf.
2895c349dbc7Sjsg 		 */
2896c349dbc7Sjsg 		list_for_each_entry_safe(obj, tmp, &con->head, node) {
2897c349dbc7Sjsg 			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
2898c349dbc7Sjsg 				amdgpu_ras_feature_enable(adev, &obj->head, 0);
2899c349dbc7Sjsg 				/* there should not be any reference. */
2900c349dbc7Sjsg 				WARN_ON(alive_obj(obj));
2901c349dbc7Sjsg 			}
2902c349dbc7Sjsg 		}
2903c349dbc7Sjsg 	}
2904c349dbc7Sjsg }
2905c349dbc7Sjsg 
2906c349dbc7Sjsg void amdgpu_ras_suspend(struct amdgpu_device *adev)
2907c349dbc7Sjsg {
2908c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2909c349dbc7Sjsg 
29105ca02815Sjsg 	if (!adev->ras_enabled || !con)
2911c349dbc7Sjsg 		return;
2912c349dbc7Sjsg 
2913c349dbc7Sjsg 	amdgpu_ras_disable_all_features(adev, 0);
2914c349dbc7Sjsg 	/* Make sure all ras objects are disabled. */
2915c349dbc7Sjsg 	if (con->features)
2916c349dbc7Sjsg 		amdgpu_ras_disable_all_features(adev, 1);
2917c349dbc7Sjsg }
2918c349dbc7Sjsg 
29191bb76ff1Sjsg int amdgpu_ras_late_init(struct amdgpu_device *adev)
29201bb76ff1Sjsg {
29211bb76ff1Sjsg 	struct amdgpu_ras_block_list *node, *tmp;
29221bb76ff1Sjsg 	struct amdgpu_ras_block_object *obj;
29231bb76ff1Sjsg 	int r;
29241bb76ff1Sjsg 
29251bb76ff1Sjsg 	/* Guest side doesn't need to init ras features */
29261bb76ff1Sjsg 	if (amdgpu_sriov_vf(adev))
29271bb76ff1Sjsg 		return 0;
29281bb76ff1Sjsg 
29291bb76ff1Sjsg 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
29301bb76ff1Sjsg 		if (!node->ras_obj) {
29311bb76ff1Sjsg 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
29321bb76ff1Sjsg 			continue;
29331bb76ff1Sjsg 		}
29341bb76ff1Sjsg 
29351bb76ff1Sjsg 		obj = node->ras_obj;
29361bb76ff1Sjsg 		if (obj->ras_late_init) {
29371bb76ff1Sjsg 			r = obj->ras_late_init(adev, &obj->ras_comm);
29381bb76ff1Sjsg 			if (r) {
29391bb76ff1Sjsg 				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
29401bb76ff1Sjsg 					obj->ras_comm.name, r);
29411bb76ff1Sjsg 				return r;
29421bb76ff1Sjsg 			}
29431bb76ff1Sjsg 		} else
29441bb76ff1Sjsg 			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
29451bb76ff1Sjsg 	}
29461bb76ff1Sjsg 
29471bb76ff1Sjsg 	return 0;
29481bb76ff1Sjsg }
29491bb76ff1Sjsg 
2950c349dbc7Sjsg /* Do some fini work before IP fini, as a dependence */
2951c349dbc7Sjsg int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
2952c349dbc7Sjsg {
2953c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2954c349dbc7Sjsg 
29555ca02815Sjsg 	if (!adev->ras_enabled || !con)
2956c349dbc7Sjsg 		return 0;
2957c349dbc7Sjsg 
29585ca02815Sjsg 
2959c349dbc7Sjsg 	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
29601bb76ff1Sjsg 	if (con->features)
2961c349dbc7Sjsg 		amdgpu_ras_disable_all_features(adev, 0);
2962c349dbc7Sjsg 	amdgpu_ras_recovery_fini(adev);
2963c349dbc7Sjsg 	return 0;
2964c349dbc7Sjsg }
2965c349dbc7Sjsg 
2966c349dbc7Sjsg int amdgpu_ras_fini(struct amdgpu_device *adev)
2967c349dbc7Sjsg {
29681bb76ff1Sjsg 	struct amdgpu_ras_block_list *ras_node, *tmp;
29691bb76ff1Sjsg 	struct amdgpu_ras_block_object *obj = NULL;
2970c349dbc7Sjsg 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2971c349dbc7Sjsg 
29725ca02815Sjsg 	if (!adev->ras_enabled || !con)
2973c349dbc7Sjsg 		return 0;
2974c349dbc7Sjsg 
29751bb76ff1Sjsg 	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
29761bb76ff1Sjsg 		if (ras_node->ras_obj) {
29771bb76ff1Sjsg 			obj = ras_node->ras_obj;
29781bb76ff1Sjsg 			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
29791bb76ff1Sjsg 			    obj->ras_fini)
29801bb76ff1Sjsg 				obj->ras_fini(adev, &obj->ras_comm);
29811bb76ff1Sjsg 			else
29821bb76ff1Sjsg 				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
29831bb76ff1Sjsg 		}
29841bb76ff1Sjsg 
29851bb76ff1Sjsg 		/* Clear ras blocks from ras_list and free ras block list node */
29861bb76ff1Sjsg 		list_del(&ras_node->node);
29871bb76ff1Sjsg 		kfree(ras_node);
29881bb76ff1Sjsg 	}
29891bb76ff1Sjsg 
2990c349dbc7Sjsg 	amdgpu_ras_fs_fini(adev);
2991c349dbc7Sjsg 	amdgpu_ras_interrupt_remove_all(adev);
2992c349dbc7Sjsg 
2993c349dbc7Sjsg 	WARN(con->features, "Feature mask is not cleared");
2994c349dbc7Sjsg 
2995c349dbc7Sjsg 	if (con->features)
2996c349dbc7Sjsg 		amdgpu_ras_disable_all_features(adev, 1);
2997c349dbc7Sjsg 
29985ca02815Sjsg 	cancel_delayed_work_sync(&con->ras_counte_delay_work);
29995ca02815Sjsg 
3000c349dbc7Sjsg 	amdgpu_ras_set_context(adev, NULL);
3001c349dbc7Sjsg 	kfree(con);
3002c349dbc7Sjsg 
3003c349dbc7Sjsg 	return 0;
3004c349dbc7Sjsg }
3005c349dbc7Sjsg 
3006c349dbc7Sjsg void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
3007c349dbc7Sjsg {
3008c349dbc7Sjsg 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
3009f005ef32Sjsg 		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
3010f005ef32Sjsg 
3011ad8b1aafSjsg 		dev_info(adev->dev, "uncorrectable hardware error "
3012ad8b1aafSjsg 			"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
3013c349dbc7Sjsg 
3014f005ef32Sjsg 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
3015c349dbc7Sjsg 		amdgpu_ras_reset_gpu(adev);
3016c349dbc7Sjsg 	}
3017c349dbc7Sjsg }
3018ad8b1aafSjsg 
3019ad8b1aafSjsg bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
3020ad8b1aafSjsg {
3021ad8b1aafSjsg 	if (adev->asic_type == CHIP_VEGA20 &&
3022ad8b1aafSjsg 	    adev->pm.fw_version <= 0x283400) {
3023ad8b1aafSjsg 		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
3024ad8b1aafSjsg 				amdgpu_ras_intr_triggered();
3025ad8b1aafSjsg 	}
3026ad8b1aafSjsg 
3027ad8b1aafSjsg 	return false;
3028ad8b1aafSjsg }
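
/*
 * Usage sketch (illustrative only, not taken from this file): the helper
 * above is meant to be consulted from the reset/teardown path. Based on the
 * checks it performs, the intended flow at a hypothetical call site is
 * roughly:
 *
 *	if (amdgpu_ras_need_emergency_restart(adev)) {
 *		// a fatal RAS interrupt fired and BACO reset is not
 *		// available on this old VEGA20 SMU firmware, so skip the
 *		// normal recovery and restart immediately
 *		emergency_restart();
 *	}
 *
 * emergency_restart() is the generic helper from <linux/reboot.h>; the
 * surrounding call site is an assumption made purely for illustration.
 */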
3029ad8b1aafSjsg 30305ca02815Sjsg void amdgpu_release_ras_context(struct amdgpu_device *adev) 3031ad8b1aafSjsg { 3032ad8b1aafSjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3033ad8b1aafSjsg 30345ca02815Sjsg if (!con) 30355ca02815Sjsg return; 3036ad8b1aafSjsg 30375ca02815Sjsg if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { 30385ca02815Sjsg con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); 30395ca02815Sjsg amdgpu_ras_set_context(adev, NULL); 30405ca02815Sjsg kfree(con); 30415ca02815Sjsg } 3042ad8b1aafSjsg } 30431bb76ff1Sjsg 30441bb76ff1Sjsg #ifdef CONFIG_X86_MCE_AMD 30451bb76ff1Sjsg static struct amdgpu_device *find_adev(uint32_t node_id) 30461bb76ff1Sjsg { 30471bb76ff1Sjsg int i; 30481bb76ff1Sjsg struct amdgpu_device *adev = NULL; 30491bb76ff1Sjsg 30501bb76ff1Sjsg for (i = 0; i < mce_adev_list.num_gpu; i++) { 30511bb76ff1Sjsg adev = mce_adev_list.devs[i]; 30521bb76ff1Sjsg 30531bb76ff1Sjsg if (adev && adev->gmc.xgmi.connected_to_cpu && 30541bb76ff1Sjsg adev->gmc.xgmi.physical_node_id == node_id) 30551bb76ff1Sjsg break; 30561bb76ff1Sjsg adev = NULL; 30571bb76ff1Sjsg } 30581bb76ff1Sjsg 30591bb76ff1Sjsg return adev; 30601bb76ff1Sjsg } 30611bb76ff1Sjsg 30621bb76ff1Sjsg #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF) 30631bb76ff1Sjsg #define GET_UMC_INST(m) (((m) >> 21) & 0x7) 30641bb76ff1Sjsg #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4)) 30651bb76ff1Sjsg #define GPU_ID_OFFSET 8 30661bb76ff1Sjsg 30671bb76ff1Sjsg static int amdgpu_bad_page_notifier(struct notifier_block *nb, 30681bb76ff1Sjsg unsigned long val, void *data) 30691bb76ff1Sjsg { 30701bb76ff1Sjsg struct mce *m = (struct mce *)data; 30711bb76ff1Sjsg struct amdgpu_device *adev = NULL; 30721bb76ff1Sjsg uint32_t gpu_id = 0; 30731bb76ff1Sjsg uint32_t umc_inst = 0, ch_inst = 0; 30741bb76ff1Sjsg 30751bb76ff1Sjsg /* 30761bb76ff1Sjsg * If the error was generated in UMC_V2, which belongs to GPU UMCs, 30771bb76ff1Sjsg * and error occurred in DramECC (Extended error code = 0) then only 30781bb76ff1Sjsg * process the error, else bail out. 30791bb76ff1Sjsg */ 30801bb76ff1Sjsg if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && 30811bb76ff1Sjsg (XEC(m->status, 0x3f) == 0x0))) 30821bb76ff1Sjsg return NOTIFY_DONE; 30831bb76ff1Sjsg 30841bb76ff1Sjsg /* 30851bb76ff1Sjsg * If it is correctable error, return. 30861bb76ff1Sjsg */ 30871bb76ff1Sjsg if (mce_is_correctable(m)) 30881bb76ff1Sjsg return NOTIFY_OK; 30891bb76ff1Sjsg 30901bb76ff1Sjsg /* 30911bb76ff1Sjsg * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register. 30921bb76ff1Sjsg */ 30931bb76ff1Sjsg gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; 30941bb76ff1Sjsg 30951bb76ff1Sjsg adev = find_adev(gpu_id); 30961bb76ff1Sjsg if (!adev) { 30971bb76ff1Sjsg DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__, 30981bb76ff1Sjsg gpu_id); 30991bb76ff1Sjsg return NOTIFY_DONE; 31001bb76ff1Sjsg } 31011bb76ff1Sjsg 31021bb76ff1Sjsg /* 31031bb76ff1Sjsg * If it is uncorrectable error, then find out UMC instance and 31041bb76ff1Sjsg * channel index. 
31051bb76ff1Sjsg 	 */
31061bb76ff1Sjsg 	umc_inst = GET_UMC_INST(m->ipid);
31071bb76ff1Sjsg 	ch_inst = GET_CHAN_INDEX(m->ipid);
31081bb76ff1Sjsg 
31091bb76ff1Sjsg 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
31101bb76ff1Sjsg 			umc_inst, ch_inst);
31111bb76ff1Sjsg 
3112f005ef32Sjsg 	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
31131bb76ff1Sjsg 		return NOTIFY_OK;
3114f005ef32Sjsg 	else
3115f005ef32Sjsg 		return NOTIFY_DONE;
31161bb76ff1Sjsg }
31171bb76ff1Sjsg 
31181bb76ff1Sjsg static struct notifier_block amdgpu_bad_page_nb = {
31191bb76ff1Sjsg 	.notifier_call = amdgpu_bad_page_notifier,
31201bb76ff1Sjsg 	.priority = MCE_PRIO_UC,
31211bb76ff1Sjsg };
31221bb76ff1Sjsg 
31231bb76ff1Sjsg static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
31241bb76ff1Sjsg {
31251bb76ff1Sjsg 	/*
31261bb76ff1Sjsg 	 * Add the adev to the mce_adev_list.
31271bb76ff1Sjsg 	 * During mode2 reset, the amdgpu device is temporarily
31281bb76ff1Sjsg 	 * removed from the mgpu_info list, which can cause
31291bb76ff1Sjsg 	 * page retirement to fail.
31301bb76ff1Sjsg 	 * Use this list instead of mgpu_info to find the amdgpu
31311bb76ff1Sjsg 	 * device on which the UMC error was reported.
31321bb76ff1Sjsg 	 */
31331bb76ff1Sjsg 	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
31341bb76ff1Sjsg 
31351bb76ff1Sjsg 	/*
31361bb76ff1Sjsg 	 * Register the x86 notifier only once
31371bb76ff1Sjsg 	 * with the MCE subsystem.
31381bb76ff1Sjsg 	 */
31391bb76ff1Sjsg 	if (notifier_registered == false) {
31401bb76ff1Sjsg 		mce_register_decode_chain(&amdgpu_bad_page_nb);
31411bb76ff1Sjsg 		notifier_registered = true;
31421bb76ff1Sjsg 	}
31431bb76ff1Sjsg }
31441bb76ff1Sjsg #endif
31451bb76ff1Sjsg 
31461bb76ff1Sjsg struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
31471bb76ff1Sjsg {
31481bb76ff1Sjsg 	if (!adev)
31491bb76ff1Sjsg 		return NULL;
31501bb76ff1Sjsg 
31511bb76ff1Sjsg 	return adev->psp.ras_context.ras;
31521bb76ff1Sjsg }
31531bb76ff1Sjsg 
31541bb76ff1Sjsg int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
31551bb76ff1Sjsg {
31561bb76ff1Sjsg 	if (!adev)
31571bb76ff1Sjsg 		return -EINVAL;
31581bb76ff1Sjsg 
31591bb76ff1Sjsg 	adev->psp.ras_context.ras = ras_con;
31601bb76ff1Sjsg 	return 0;
31611bb76ff1Sjsg }
31621bb76ff1Sjsg 
31631bb76ff1Sjsg /* check if ras is supported on a block, say, sdma, gfx */
31641bb76ff1Sjsg int amdgpu_ras_is_supported(struct amdgpu_device *adev,
31651bb76ff1Sjsg 		unsigned int block)
31661bb76ff1Sjsg {
3167f005ef32Sjsg 	int ret = 0;
31681bb76ff1Sjsg 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
31691bb76ff1Sjsg 
31701bb76ff1Sjsg 	if (block >= AMDGPU_RAS_BLOCK_COUNT)
31711bb76ff1Sjsg 		return 0;
3172f005ef32Sjsg 
3173f005ef32Sjsg 	ret = ras && (adev->ras_enabled & (1 << block));
3174f005ef32Sjsg 
3175f005ef32Sjsg 	/* For a special asic with mem ecc enabled but sram ecc
3176f005ef32Sjsg 	 * not enabled, the ras block may not be marked supported
3177f005ef32Sjsg 	 * in .ras_enabled; even so, if the asic supports poison
3178f005ef32Sjsg 	 * mode and the ras block has a ras configuration, the
3179f005ef32Sjsg 	 * block can be considered to support the ras function.
3180f005ef32Sjsg */ 3181f005ef32Sjsg if (!ret && 3182f005ef32Sjsg (block == AMDGPU_RAS_BLOCK__GFX || 3183f005ef32Sjsg block == AMDGPU_RAS_BLOCK__SDMA || 3184f005ef32Sjsg block == AMDGPU_RAS_BLOCK__VCN || 3185f005ef32Sjsg block == AMDGPU_RAS_BLOCK__JPEG) && 3186f005ef32Sjsg amdgpu_ras_is_poison_mode_supported(adev) && 3187f005ef32Sjsg amdgpu_ras_get_ras_block(adev, block, 0)) 3188f005ef32Sjsg ret = 1; 3189f005ef32Sjsg 3190f005ef32Sjsg return ret; 31911bb76ff1Sjsg } 31921bb76ff1Sjsg 31931bb76ff1Sjsg int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) 31941bb76ff1Sjsg { 31951bb76ff1Sjsg struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 31961bb76ff1Sjsg 31971bb76ff1Sjsg if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) 31981bb76ff1Sjsg amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 31991bb76ff1Sjsg return 0; 32001bb76ff1Sjsg } 32011bb76ff1Sjsg 32021bb76ff1Sjsg 32031bb76ff1Sjsg /* Register each ip ras block into amdgpu ras */ 32041bb76ff1Sjsg int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, 32051bb76ff1Sjsg struct amdgpu_ras_block_object *ras_block_obj) 32061bb76ff1Sjsg { 32071bb76ff1Sjsg struct amdgpu_ras_block_list *ras_node; 32081bb76ff1Sjsg if (!adev || !ras_block_obj) 32091bb76ff1Sjsg return -EINVAL; 32101bb76ff1Sjsg 32111bb76ff1Sjsg ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL); 32121bb76ff1Sjsg if (!ras_node) 32131bb76ff1Sjsg return -ENOMEM; 32141bb76ff1Sjsg 32151bb76ff1Sjsg INIT_LIST_HEAD(&ras_node->node); 32161bb76ff1Sjsg ras_node->ras_obj = ras_block_obj; 32171bb76ff1Sjsg list_add_tail(&ras_node->node, &adev->ras_list); 32181bb76ff1Sjsg 32191bb76ff1Sjsg return 0; 32201bb76ff1Sjsg } 3221f005ef32Sjsg 3222f005ef32Sjsg void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) 3223f005ef32Sjsg { 3224f005ef32Sjsg if (!err_type_name) 3225f005ef32Sjsg return; 3226f005ef32Sjsg 3227f005ef32Sjsg switch (err_type) { 3228f005ef32Sjsg case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: 3229f005ef32Sjsg snprintf(err_type_name, 16, "correctable"); 3230f005ef32Sjsg break; 3231f005ef32Sjsg case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: 3232f005ef32Sjsg snprintf(err_type_name, 16, "uncorrectable"); 3233f005ef32Sjsg break; 3234f005ef32Sjsg default: 3235f005ef32Sjsg snprintf(err_type_name, 16, "unknown"); 3236f005ef32Sjsg break; 3237f005ef32Sjsg } 3238f005ef32Sjsg } 3239f005ef32Sjsg 3240f005ef32Sjsg bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, 3241f005ef32Sjsg const struct amdgpu_ras_err_status_reg_entry *reg_entry, 3242f005ef32Sjsg uint32_t instance, 3243f005ef32Sjsg uint32_t *memory_id) 3244f005ef32Sjsg { 3245f005ef32Sjsg uint32_t err_status_lo_data, err_status_lo_offset; 3246f005ef32Sjsg 3247f005ef32Sjsg if (!reg_entry) 3248f005ef32Sjsg return false; 3249f005ef32Sjsg 3250f005ef32Sjsg err_status_lo_offset = 3251f005ef32Sjsg AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 3252f005ef32Sjsg reg_entry->seg_lo, reg_entry->reg_lo); 3253f005ef32Sjsg err_status_lo_data = RREG32(err_status_lo_offset); 3254f005ef32Sjsg 3255f005ef32Sjsg if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && 3256f005ef32Sjsg !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) 3257f005ef32Sjsg return false; 3258f005ef32Sjsg 3259f005ef32Sjsg *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); 3260f005ef32Sjsg 3261f005ef32Sjsg return true; 3262f005ef32Sjsg } 3263f005ef32Sjsg 3264f005ef32Sjsg bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, 3265f005ef32Sjsg const struct 
amdgpu_ras_err_status_reg_entry *reg_entry,
3266f005ef32Sjsg 				       uint32_t instance,
3267f005ef32Sjsg 				       unsigned long *err_cnt)
3268f005ef32Sjsg {
3269f005ef32Sjsg 	uint32_t err_status_hi_data, err_status_hi_offset;
3270f005ef32Sjsg 
3271f005ef32Sjsg 	if (!reg_entry)
3272f005ef32Sjsg 		return false;
3273f005ef32Sjsg 
3274f005ef32Sjsg 	err_status_hi_offset =
3275f005ef32Sjsg 		AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance,
3276f005ef32Sjsg 					    reg_entry->seg_hi, reg_entry->reg_hi);
3277f005ef32Sjsg 	err_status_hi_data = RREG32(err_status_hi_offset);
3278f005ef32Sjsg 
3279f005ef32Sjsg 	if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
3280f005ef32Sjsg 	    !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
3281f005ef32Sjsg 		/* keep the check here in case we need to refer to the result later */
3282f005ef32Sjsg 		dev_dbg(adev->dev, "Invalid err_info field\n");
3283f005ef32Sjsg 
3284f005ef32Sjsg 	/* read err count */
3285f005ef32Sjsg 	*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
3286f005ef32Sjsg 
3287f005ef32Sjsg 	return true;
3288f005ef32Sjsg }
3289f005ef32Sjsg 
3290f005ef32Sjsg void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
3291f005ef32Sjsg 					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
3292f005ef32Sjsg 					   uint32_t reg_list_size,
3293f005ef32Sjsg 					   const struct amdgpu_ras_memory_id_entry *mem_list,
3294f005ef32Sjsg 					   uint32_t mem_list_size,
3295f005ef32Sjsg 					   uint32_t instance,
3296f005ef32Sjsg 					   uint32_t err_type,
3297f005ef32Sjsg 					   unsigned long *err_count)
3298f005ef32Sjsg {
3299f005ef32Sjsg 	uint32_t memory_id;
3300f005ef32Sjsg 	unsigned long err_cnt;
3301f005ef32Sjsg 	char err_type_name[16];
3302f005ef32Sjsg 	uint32_t i, j;
3303f005ef32Sjsg 
3304f005ef32Sjsg 	for (i = 0; i < reg_list_size; i++) {
3305f005ef32Sjsg 		/* query memory_id from err_status_lo */
3306f005ef32Sjsg 		if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
3307f005ef32Sjsg 							 instance, &memory_id))
3308f005ef32Sjsg 			continue;
3309f005ef32Sjsg 
3310f005ef32Sjsg 		/* query err_cnt from err_status_hi */
3311f005ef32Sjsg 		if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
3312f005ef32Sjsg 						       instance, &err_cnt) ||
3313f005ef32Sjsg 		    !err_cnt)
3314f005ef32Sjsg 			continue;
3315f005ef32Sjsg 
3316f005ef32Sjsg 		*err_count += err_cnt;
3317f005ef32Sjsg 
3318f005ef32Sjsg 		/* log the errors */
3319f005ef32Sjsg 		amdgpu_ras_get_error_type_name(err_type, err_type_name);
3320f005ef32Sjsg 		if (!mem_list) {
3321f005ef32Sjsg 			/* memory_list is not supported */
3322f005ef32Sjsg 			dev_info(adev->dev,
3323f005ef32Sjsg 				 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n",
3324f005ef32Sjsg 				 err_cnt, err_type_name,
3325f005ef32Sjsg 				 reg_list[i].block_name,
3326f005ef32Sjsg 				 instance, memory_id);
3327f005ef32Sjsg 		} else {
3328f005ef32Sjsg 			for (j = 0; j < mem_list_size; j++) {
3329f005ef32Sjsg 				if (memory_id == mem_list[j].memory_id) {
3330f005ef32Sjsg 					dev_info(adev->dev,
3331f005ef32Sjsg 						 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n",
3332f005ef32Sjsg 						 err_cnt, err_type_name,
3333f005ef32Sjsg 						 reg_list[i].block_name,
3334f005ef32Sjsg 						 instance, mem_list[j].name);
3335f005ef32Sjsg 					break;
3336f005ef32Sjsg 				}
3337f005ef32Sjsg 			}
3338f005ef32Sjsg 		}
3339f005ef32Sjsg 	}
3340f005ef32Sjsg }
3341f005ef32Sjsg 
3342f005ef32Sjsg void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
3343f005ef32Sjsg 					   const struct amdgpu_ras_err_status_reg_entry *reg_list,
3344f005ef32Sjsg 					   uint32_t reg_list_size,
3345f005ef32Sjsg 					   uint32_t instance)
3346f005ef32Sjsg {
3347f005ef32Sjsg 	uint32_t
err_status_lo_offset, err_status_hi_offset; 3348f005ef32Sjsg uint32_t i; 3349f005ef32Sjsg 3350f005ef32Sjsg for (i = 0; i < reg_list_size; i++) { 3351f005ef32Sjsg err_status_lo_offset = 3352f005ef32Sjsg AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 3353f005ef32Sjsg reg_list[i].seg_lo, reg_list[i].reg_lo); 3354f005ef32Sjsg err_status_hi_offset = 3355f005ef32Sjsg AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 3356f005ef32Sjsg reg_list[i].seg_hi, reg_list[i].reg_hi); 3357f005ef32Sjsg WREG32(err_status_lo_offset, 0); 3358f005ef32Sjsg WREG32(err_status_hi_offset, 0); 3359f005ef32Sjsg } 3360f005ef32Sjsg } 3361
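
/*
 * Usage sketch (illustrative only, not taken from this file): an IP block's
 * ras callbacks would typically drive the two instance helpers above with a
 * static register-entry table, one instance at a time. The table name
 * my_block_err_regs, the num_instances bound and the ue_count variable are
 * assumptions made purely for illustration:
 *
 *	static const struct amdgpu_ras_err_status_reg_entry my_block_err_regs[] = {
 *		// one { hwip, seg/reg offsets, flags, block_name } entry per register
 *	};
 *
 *	unsigned long ue_count = 0;
 *	uint32_t inst;
 *
 *	for (inst = 0; inst < num_instances; inst++) {
 *		amdgpu_ras_inst_query_ras_error_count(adev,
 *			my_block_err_regs, ARRAY_SIZE(my_block_err_regs),
 *			NULL, 0, inst,
 *			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, &ue_count);
 *		amdgpu_ras_inst_reset_ras_error_count(adev,
 *			my_block_err_regs, ARRAY_SIZE(my_block_err_regs), inst);
 *	}
 *
 * Passing NULL/0 for the mem_list arguments selects the plain memory_id
 * logging path in amdgpu_ras_inst_query_ras_error_count() above.
 */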