1*41ec0267Sriastradh /* $NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $ */
24e390cabSriastradh
34e390cabSriastradh /*
44e390cabSriastradh * Copyright 2019 Advanced Micro Devices, Inc.
54e390cabSriastradh *
64e390cabSriastradh * Permission is hereby granted, free of charge, to any person obtaining a
74e390cabSriastradh * copy of this software and associated documentation files (the "Software"),
84e390cabSriastradh * to deal in the Software without restriction, including without limitation
94e390cabSriastradh * the rights to use, copy, modify, merge, publish, distribute, sublicense,
104e390cabSriastradh * and/or sell copies of the Software, and to permit persons to whom the
114e390cabSriastradh * Software is furnished to do so, subject to the following conditions:
124e390cabSriastradh *
134e390cabSriastradh * The above copyright notice and this permission notice shall be included in
144e390cabSriastradh * all copies or substantial portions of the Software.
154e390cabSriastradh *
164e390cabSriastradh * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
174e390cabSriastradh * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
184e390cabSriastradh * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
194e390cabSriastradh * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
204e390cabSriastradh * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
214e390cabSriastradh * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
224e390cabSriastradh * OTHER DEALINGS IN THE SOFTWARE.
234e390cabSriastradh *
244e390cabSriastradh */
254e390cabSriastradh
264e390cabSriastradh #include <sys/cdefs.h>
27*41ec0267Sriastradh __KERNEL_RCSID(0, "$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");
284e390cabSriastradh
294e390cabSriastradh #include "amdgpu_ras.h"
304e390cabSriastradh
/*
 * amdgpu_umc_ras_late_init - late RAS initialization for the UMC block.
 *
 * Allocates (or re-uses) the UMC ras_if descriptor, registers the UMC
 * error-count sysfs / error-inject debugfs nodes plus the ECC interrupt
 * handler via amdgpu_ras_late_init(), and enables the ECC interrupt when
 * RAS is supported on this device.
 *
 * Returns 0 on success (including the "RAS not supported" case, where the
 * descriptor is freed again), or a negative errno on failure.
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
		.debugfs_name = "umc_err_inject",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	/*
	 * ras_if may already exist (e.g. after a GPU reset re-runs late
	 * init); only allocate and fill it on the first pass.
	 */
	if (!adev->umc.ras_if) {
		adev->umc.ras_if =
			kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
		strcpy(adev->umc.ras_if->name, "umc");
	}
	/* Both the fs and ih descriptors dispatch on the same RAS head. */
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/*
		 * RAS not supported on this ASIC: not an error, but the
		 * descriptor and fs/ih registration are no longer needed.
		 */
		r = 0;
		goto free;
	}

	/* ras init of specific umc version */
	if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
		adev->umc.funcs->err_cnt_init(adev);

	return 0;

late_fini:
	/* Undo the fs/ih registration done by amdgpu_ras_late_init(). */
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}
814e390cabSriastradh
amdgpu_umc_ras_fini(struct amdgpu_device * adev)824e390cabSriastradh void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
834e390cabSriastradh {
844e390cabSriastradh if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
854e390cabSriastradh adev->umc.ras_if) {
864e390cabSriastradh struct ras_common_if *ras_if = adev->umc.ras_if;
874e390cabSriastradh struct ras_ih_if ih_info = {
884e390cabSriastradh .head = *ras_if,
894e390cabSriastradh .cb = amdgpu_umc_process_ras_data_cb,
904e390cabSriastradh };
914e390cabSriastradh
924e390cabSriastradh amdgpu_ras_late_fini(adev, ras_if, &ih_info);
934e390cabSriastradh kfree(ras_if);
944e390cabSriastradh }
954e390cabSriastradh }
964e390cabSriastradh
amdgpu_umc_process_ras_data_cb(struct amdgpu_device * adev,void * ras_error_status,struct amdgpu_iv_entry * entry)974e390cabSriastradh int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
984e390cabSriastradh void *ras_error_status,
994e390cabSriastradh struct amdgpu_iv_entry *entry)
1004e390cabSriastradh {
1014e390cabSriastradh struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
1024e390cabSriastradh
1034e390cabSriastradh kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
1044e390cabSriastradh if (adev->umc.funcs &&
1054e390cabSriastradh adev->umc.funcs->query_ras_error_count)
1064e390cabSriastradh adev->umc.funcs->query_ras_error_count(adev, ras_error_status);
1074e390cabSriastradh
1084e390cabSriastradh if (adev->umc.funcs &&
1094e390cabSriastradh adev->umc.funcs->query_ras_error_address &&
1104e390cabSriastradh adev->umc.max_ras_err_cnt_per_query) {
1114e390cabSriastradh err_data->err_addr =
1124e390cabSriastradh kcalloc(adev->umc.max_ras_err_cnt_per_query,
1134e390cabSriastradh sizeof(struct eeprom_table_record), GFP_KERNEL);
1144e390cabSriastradh
1154e390cabSriastradh /* still call query_ras_error_address to clear error status
1164e390cabSriastradh * even NOMEM error is encountered
1174e390cabSriastradh */
1184e390cabSriastradh if(!err_data->err_addr)
1194e390cabSriastradh DRM_WARN("Failed to alloc memory for umc error address record!\n");
1204e390cabSriastradh
1214e390cabSriastradh /* umc query_ras_error_address is also responsible for clearing
1224e390cabSriastradh * error status
1234e390cabSriastradh */
1244e390cabSriastradh adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
1254e390cabSriastradh }
1264e390cabSriastradh
1274e390cabSriastradh /* only uncorrectable error needs gpu reset */
1284e390cabSriastradh if (err_data->ue_count) {
1294e390cabSriastradh if (err_data->err_addr_cnt &&
1304e390cabSriastradh amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
1314e390cabSriastradh err_data->err_addr_cnt))
1324e390cabSriastradh DRM_WARN("Failed to add ras bad page!\n");
1334e390cabSriastradh
1344e390cabSriastradh amdgpu_ras_reset_gpu(adev);
1354e390cabSriastradh }
1364e390cabSriastradh
1374e390cabSriastradh kfree(err_data->err_addr);
1384e390cabSriastradh return AMDGPU_RAS_SUCCESS;
1394e390cabSriastradh }
1404e390cabSriastradh
amdgpu_umc_process_ecc_irq(struct amdgpu_device * adev,struct amdgpu_irq_src * source,struct amdgpu_iv_entry * entry)1414e390cabSriastradh int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
1424e390cabSriastradh struct amdgpu_irq_src *source,
1434e390cabSriastradh struct amdgpu_iv_entry *entry)
1444e390cabSriastradh {
1454e390cabSriastradh struct ras_common_if *ras_if = adev->umc.ras_if;
1464e390cabSriastradh struct ras_dispatch_if ih_data = {
1474e390cabSriastradh .entry = entry,
1484e390cabSriastradh };
1494e390cabSriastradh
1504e390cabSriastradh if (!ras_if)
1514e390cabSriastradh return 0;
1524e390cabSriastradh
1534e390cabSriastradh ih_data.head = *ras_if;
1544e390cabSriastradh
1554e390cabSriastradh amdgpu_ras_interrupt_dispatch(adev, &ih_data);
1564e390cabSriastradh return 0;
1574e390cabSriastradh }
158