/*	$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_umc.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include "amdgpu_ras.h"

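/*
 * amdgpu_umc_ras_late_init - late RAS setup for the UMC block.
 *
 * Allocates and fills the common RAS interface node for the UMC
 * (Unified Memory Controller) if it does not exist yet, registers it
 * with the RAS core, and enables the ECC interrupt; typically invoked
 * from the GMC IP block's late-init path.  If RAS is not supported on
 * this device, the node is torn down again and 0 is returned; otherwise
 * returns 0 on success or a negative errno on failure.
 */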
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_fs_if fs_info = {
		.sysfs_name = "umc_err_count",
		.debugfs_name = "umc_err_inject",
	};
	struct ras_ih_if ih_info = {
		.cb = amdgpu_umc_process_ras_data_cb,
	};

	if (!adev->umc.ras_if) {
		adev->umc.ras_if =
			kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->umc.ras_if)
			return -ENOMEM;
		adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
		adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->umc.ras_if->sub_block_index = 0;
		strcpy(adev->umc.ras_if->name, "umc");
	}
	ih_info.head = fs_info.head = *adev->umc.ras_if;

	r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
				 &fs_info, &ih_info);
	if (r)
		goto free;

	if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
		if (r)
			goto late_fini;
	} else {
		/* RAS on UMC is unsupported: drop the node and succeed */
		r = 0;
		goto free;
	}

	/* RAS init for the specific UMC version */
	if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
		adev->umc.funcs->err_cnt_init(adev);

	return 0;

late_fini:
	amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
free:
	kfree(adev->umc.ras_if);
	adev->umc.ras_if = NULL;
	return r;
}

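/*
 * amdgpu_umc_ras_fini - tear down UMC RAS state.
 *
 * Unregisters the RAS interface node set up by
 * amdgpu_umc_ras_late_init() and frees it.  A no-op when RAS is
 * unsupported or the node was never allocated.
 */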
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
			adev->umc.ras_if) {
		struct ras_common_if *ras_if = adev->umc.ras_if;
		struct ras_ih_if ih_info = {
			.head = *ras_if,
			.cb = amdgpu_umc_process_ras_data_cb,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
		/* clear the stale pointer so a repeated call is a no-op */
		adev->umc.ras_if = NULL;
	}
}

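/*
 * amdgpu_umc_process_ras_data_cb - RAS interrupt callback for the UMC block.
 *
 * Flags an SRAM ECC event for KFD, queries the version-specific UMC
 * hooks for error counts and error addresses, records any bad pages,
 * and schedules a GPU reset when uncorrectable errors were seen.
 */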
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_count)
		adev->umc.funcs->query_ras_error_count(adev, ras_error_status);

	if (adev->umc.funcs &&
	    adev->umc.funcs->query_ras_error_address &&
	    adev->umc.max_ras_err_cnt_per_query) {
		err_data->err_addr =
			kcalloc(adev->umc.max_ras_err_cnt_per_query,
				sizeof(struct eeprom_table_record), GFP_KERNEL);

		/* still call query_ras_error_address to clear the error
		 * status even if a NOMEM error is encountered
		 */
		if (!err_data->err_addr)
			DRM_WARN("Failed to alloc memory for umc error address record!\n");

		/* umc query_ras_error_address is also responsible for clearing
		 * the error status
		 */
		adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
	}

	/* only uncorrectable errors require a gpu reset */
	if (err_data->ue_count) {
		if (err_data->err_addr_cnt &&
		    amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt))
			DRM_WARN("Failed to add ras bad page!\n");

		amdgpu_ras_reset_gpu(adev);
	}

	kfree(err_data->err_addr);
	return AMDGPU_RAS_SUCCESS;
}

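/*
 * A minimal wiring sketch (hypothetical names: my_ecc_irq_funcs,
 * my_ecc_interrupt_state): per-ASIC GMC code is expected to install
 * amdgpu_umc_process_ecc_irq as the .process handler of
 * adev->gmc.ecc_irq so that the ECC interrupt lands here:
 *
 *	static const struct amdgpu_irq_src_funcs my_ecc_irq_funcs = {
 *		.set = my_ecc_interrupt_state,
 *		.process = amdgpu_umc_process_ecc_irq,
 *	};
 *	...
 *	adev->gmc.ecc_irq.funcs = &my_ecc_irq_funcs;
 */

/*
 * amdgpu_umc_process_ecc_irq - ECC interrupt handler for the UMC block.
 *
 * Forwards the interrupt vector entry to the RAS core, which dispatches
 * it to the callback registered at init time
 * (amdgpu_umc_process_ras_data_cb).  Returns 0; a missing RAS interface
 * node is silently ignored.
 */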
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
		struct amdgpu_irq_src *source,
		struct amdgpu_iv_entry *entry)
{
	struct ras_common_if *ras_if = adev->umc.ras_if;
	struct ras_dispatch_if ih_data = {
		.entry = entry,
	};

	if (!ras_if)
		return 0;

	ih_data.head = *ras_if;

	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	return 0;
}
158