/*	$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include "amdgpu_ras.h"

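/*
 * amdgpu_umc_ras_late_init - late RAS initialization for the UMC block
 *
 * Allocates and registers the UMC ras_if (sysfs "umc_err_count" and
 * debugfs "umc_err_inject" entries), enables the ECC interrupt when RAS
 * is supported for the UMC block, and runs the UMC-version-specific
 * error-counter init.  On failure the ras_if is torn down and freed.
 *
 * Returns 0 on success or a negative errno on failure.
 */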
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
		.debugfs_name = "umc_err_inject",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	if (!adev->umc.ras_if) {
		adev->umc.ras_if =
			kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
		strcpy(adev->umc.ras_if->name, "umc");
	}
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		r = 0;
		goto free;
	}

	/* ras init of specific umc version */
	if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
		adev->umc.funcs->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}

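/*
 * amdgpu_umc_ras_fini - tear down UMC RAS state
 *
 * Unregisters the interrupt handler and fs entries set up by
 * amdgpu_umc_ras_late_init() and frees the UMC ras_if, provided RAS is
 * supported for the UMC block and the ras_if was allocated.
 */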
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
			adev->umc.ras_if) {
		struct ras_common_if *ras_if = adev->umc.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
			.cb = amdgpu_umc_process_ras_data_cb,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}

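/*
 * amdgpu_umc_process_ras_data_cb - RAS callback for UMC error data
 *
 * Queries the error count and, when available, the error addresses via
 * the UMC-version-specific callbacks, records uncorrectable error
 * addresses as bad pages, and triggers a GPU reset when uncorrectable
 * errors were seen.
 *
 * Returns AMDGPU_RAS_SUCCESS.
 */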
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_count)
		adev->umc.funcs->query_ras_error_count(adev, ras_error_status);

	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_address &&
	    adev->umc.max_ras_err_cnt_per_query) {
		err_data->err_addr =
			kcalloc(adev->umc.max_ras_err_cnt_per_query,
				sizeof(struct eeprom_table_record), GFP_KERNEL);

		/* still call query_ras_error_address to clear error status
		 * even if a NOMEM error is encountered
		 */
		if (!err_data->err_addr)
			DRM_WARN("Failed to alloc memory for umc error address record!\n");

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
	}

	/* only uncorrectable errors need a gpu reset */
	if (err_data->ue_count) {
		if (err_data->err_addr_cnt &&
		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt))
			DRM_WARN("Failed to add ras bad page!\n");

		amdgpu_ras_reset_gpu(adev);
	}

	kfree(err_data->err_addr);
	return AMDGPU_RAS_SUCCESS;
}

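/*
 * amdgpu_umc_process_ecc_irq - ECC interrupt handler for the UMC block
 *
 * Forwards the interrupt vector entry to the generic RAS interrupt
 * dispatcher, which is expected to eventually invoke the registered
 * amdgpu_umc_process_ras_data_cb().  Does nothing if the UMC ras_if has
 * not been set up.
 */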
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->umc.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}