1c349dbc7Sjsg /*
2c349dbc7Sjsg * Copyright 2019 Advanced Micro Devices, Inc.
3c349dbc7Sjsg *
4c349dbc7Sjsg * Permission is hereby granted, free of charge, to any person obtaining a
5c349dbc7Sjsg * copy of this software and associated documentation files (the "Software"),
6c349dbc7Sjsg * to deal in the Software without restriction, including without limitation
7c349dbc7Sjsg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c349dbc7Sjsg * and/or sell copies of the Software, and to permit persons to whom the
9c349dbc7Sjsg * Software is furnished to do so, subject to the following conditions:
10c349dbc7Sjsg *
11c349dbc7Sjsg * The above copyright notice and this permission notice shall be included in
12c349dbc7Sjsg * all copies or substantial portions of the Software.
13c349dbc7Sjsg *
14c349dbc7Sjsg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15c349dbc7Sjsg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16c349dbc7Sjsg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17c349dbc7Sjsg * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18c349dbc7Sjsg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19c349dbc7Sjsg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20c349dbc7Sjsg * OTHER DEALINGS IN THE SOFTWARE.
21c349dbc7Sjsg *
22c349dbc7Sjsg */
23c349dbc7Sjsg
241bb76ff1Sjsg #include "amdgpu.h"
25*f005ef32Sjsg #include "umc_v6_7.h"
26*f005ef32Sjsg
/*
 * Convert a UMC channel address into physical address records by handing
 * the request to the routine that matches the device's UMC IP version.
 *
 * Only UMC 6.7.0 has a conversion routine wired up here. For every other
 * version a warning is logged and AMDGPU_RAS_FAIL is returned; otherwise
 * the version-specific routine is invoked on @err_data and
 * AMDGPU_RAS_SUCCESS is returned.
 */
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
				    struct ras_err_data *err_data, uint64_t err_addr,
				    uint32_t ch_inst, uint32_t umc_inst)
{
	if (adev->ip_versions[UMC_HWIP][0] != IP_VERSION(6, 7, 0)) {
		dev_warn(adev->dev,
			 "UMC address to Physical address translation is not supported\n");
		return AMDGPU_RAS_FAIL;
	}

	umc_v6_7_convert_error_address(adev, err_data, err_addr,
				       ch_inst, umc_inst);

	return AMDGPU_RAS_SUCCESS;
}
44*f005ef32Sjsg
/*
 * Retire the page(s) behind an error address reported for a given UMC
 * instance and channel.
 *
 * A temporary record buffer sized by adev->umc.max_ras_err_cnt_per_query is
 * allocated, the channel address is converted into records, and - unless
 * amdgpu_bad_page_threshold is 0 - the records are passed to
 * amdgpu_ras_add_bad_pages() followed by amdgpu_ras_save_bad_pages().
 *
 * Returns AMDGPU_RAS_FAIL when the buffer cannot be allocated, otherwise
 * the result of the address conversion.
 */
int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
{
	struct ras_err_data err_data = { .err_addr = NULL };
	int ret;

	err_data.err_addr = kcalloc(adev->umc.max_ras_err_cnt_per_query,
				    sizeof(struct eeprom_table_record),
				    GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev,
			 "Failed to alloc memory for umc error record in MCA notifier!\n");
		return AMDGPU_RAS_FAIL;
	}

	/* Translate UMC channel address to Physical address */
	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
					       ch_inst, umc_inst);

	/* Records are only recorded when conversion worked and retirement is on */
	if (!ret && amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

	kfree(err_data.err_addr);
	return ret;
}
78c349dbc7Sjsg
amdgpu_umc_do_page_retirement(struct amdgpu_device * adev,void * ras_error_status,struct amdgpu_iv_entry * entry,bool reset)791bb76ff1Sjsg static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
80c349dbc7Sjsg void *ras_error_status,
811bb76ff1Sjsg struct amdgpu_iv_entry *entry,
821bb76ff1Sjsg bool reset)
83c349dbc7Sjsg {
84c349dbc7Sjsg struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
855ca02815Sjsg struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
861bb76ff1Sjsg int ret = 0;
87c349dbc7Sjsg
88c349dbc7Sjsg kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
891bb76ff1Sjsg ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
901bb76ff1Sjsg if (ret == -EOPNOTSUPP) {
911bb76ff1Sjsg if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
921bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
931bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
94c349dbc7Sjsg
951bb76ff1Sjsg if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
961bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
97c349dbc7Sjsg adev->umc.max_ras_err_cnt_per_query) {
98c349dbc7Sjsg err_data->err_addr =
99c349dbc7Sjsg kcalloc(adev->umc.max_ras_err_cnt_per_query,
100c349dbc7Sjsg sizeof(struct eeprom_table_record), GFP_KERNEL);
101c349dbc7Sjsg
102c349dbc7Sjsg /* still call query_ras_error_address to clear error status
103c349dbc7Sjsg * even NOMEM error is encountered
104c349dbc7Sjsg */
105c349dbc7Sjsg if(!err_data->err_addr)
106ad8b1aafSjsg dev_warn(adev->dev, "Failed to alloc memory for "
107ad8b1aafSjsg "umc error address record!\n");
108c349dbc7Sjsg
109c349dbc7Sjsg /* umc query_ras_error_address is also responsible for clearing
110c349dbc7Sjsg * error status
111c349dbc7Sjsg */
1121bb76ff1Sjsg adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
1131bb76ff1Sjsg }
1141bb76ff1Sjsg } else if (!ret) {
1151bb76ff1Sjsg if (adev->umc.ras &&
1161bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_count)
1171bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
1181bb76ff1Sjsg
1191bb76ff1Sjsg if (adev->umc.ras &&
1201bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_address &&
1211bb76ff1Sjsg adev->umc.max_ras_err_cnt_per_query) {
1221bb76ff1Sjsg err_data->err_addr =
1231bb76ff1Sjsg kcalloc(adev->umc.max_ras_err_cnt_per_query,
1241bb76ff1Sjsg sizeof(struct eeprom_table_record), GFP_KERNEL);
1251bb76ff1Sjsg
1261bb76ff1Sjsg /* still call query_ras_error_address to clear error status
1271bb76ff1Sjsg * even NOMEM error is encountered
1281bb76ff1Sjsg */
1291bb76ff1Sjsg if(!err_data->err_addr)
1301bb76ff1Sjsg dev_warn(adev->dev, "Failed to alloc memory for "
1311bb76ff1Sjsg "umc error address record!\n");
1321bb76ff1Sjsg
1331bb76ff1Sjsg /* umc query_ras_error_address is also responsible for clearing
1341bb76ff1Sjsg * error status
1351bb76ff1Sjsg */
1361bb76ff1Sjsg adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
1371bb76ff1Sjsg }
138c349dbc7Sjsg }
139c349dbc7Sjsg
140c349dbc7Sjsg /* only uncorrectable error needs gpu reset */
141c349dbc7Sjsg if (err_data->ue_count) {
142ad8b1aafSjsg dev_info(adev->dev, "%ld uncorrectable hardware errors "
143ad8b1aafSjsg "detected in UMC block\n",
144ad8b1aafSjsg err_data->ue_count);
145ad8b1aafSjsg
146ad8b1aafSjsg if ((amdgpu_bad_page_threshold != 0) &&
1475ca02815Sjsg err_data->err_addr_cnt) {
148c349dbc7Sjsg amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
1495ca02815Sjsg err_data->err_addr_cnt);
150*f005ef32Sjsg amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
1515ca02815Sjsg
1521bb76ff1Sjsg amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
1531bb76ff1Sjsg
1541bb76ff1Sjsg if (con->update_channel_flag == true) {
1551bb76ff1Sjsg amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
1561bb76ff1Sjsg con->update_channel_flag = false;
1571bb76ff1Sjsg }
1585ca02815Sjsg }
159c349dbc7Sjsg
1601bb76ff1Sjsg if (reset)
161c349dbc7Sjsg amdgpu_ras_reset_gpu(adev);
162c349dbc7Sjsg }
163c349dbc7Sjsg
164c349dbc7Sjsg kfree(err_data->err_addr);
165c349dbc7Sjsg return AMDGPU_RAS_SUCCESS;
166c349dbc7Sjsg }
167c349dbc7Sjsg
/*
 * Handle a UMC poison event.
 *
 * - Devices connected to the CPU over XGMI, or app APUs: only a GPU reset
 *   is performed here (when @reset is set); page retirement is left to
 *   the MCA notifier.
 * - SR-IOV virtual functions: the event is forwarded to the virt ops'
 *   ras_poison_handler, or a warning is logged if there is none.
 * - Everything else: page retirement runs locally and the resulting
 *   ue/ce counts are added to the UMC RAS object, if one is found.
 *
 * Returns AMDGPU_RAS_SUCCESS for the first two cases, otherwise the result
 * of the page retirement.
 */
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
{
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct ras_common_if head = {
		.block = AMDGPU_RAS_BLOCK__UMC,
	};
	struct ras_manager *obj;
	int ret;

	if (adev->gmc.xgmi.connected_to_cpu || adev->gmc.is_app_apu) {
		if (!reset)
			return AMDGPU_RAS_SUCCESS;

		/* MCA poison handler is only responsible for GPU reset,
		 * let MCA notifier do page retirement.
		 */
		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
		amdgpu_ras_reset_gpu(adev);
		return AMDGPU_RAS_SUCCESS;
	}

	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
			adev->virt.ops->ras_poison_handler(adev);
		else
			dev_warn(adev->dev,
				 "No ras_poison_handler interface in SRIOV!\n");
		return AMDGPU_RAS_SUCCESS;
	}

	/* Bare-metal path: look up the UMC RAS object, then retire pages */
	obj = amdgpu_ras_find_obj(adev, &head);
	ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
	if (ret == AMDGPU_RAS_SUCCESS && obj) {
		obj->err_data.ue_count += err_data.ue_count;
		obj->err_data.ce_count += err_data.ce_count;
	}

	return ret;
}
2071bb76ff1Sjsg
amdgpu_umc_process_ras_data_cb(struct amdgpu_device * adev,void * ras_error_status,struct amdgpu_iv_entry * entry)2081bb76ff1Sjsg int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
2091bb76ff1Sjsg void *ras_error_status,
2101bb76ff1Sjsg struct amdgpu_iv_entry *entry)
2111bb76ff1Sjsg {
2121bb76ff1Sjsg return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
2131bb76ff1Sjsg }
2141bb76ff1Sjsg
amdgpu_umc_ras_sw_init(struct amdgpu_device * adev)215*f005ef32Sjsg int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
216*f005ef32Sjsg {
217*f005ef32Sjsg int err;
218*f005ef32Sjsg struct amdgpu_umc_ras *ras;
219*f005ef32Sjsg
220*f005ef32Sjsg if (!adev->umc.ras)
221*f005ef32Sjsg return 0;
222*f005ef32Sjsg
223*f005ef32Sjsg ras = adev->umc.ras;
224*f005ef32Sjsg
225*f005ef32Sjsg err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
226*f005ef32Sjsg if (err) {
227*f005ef32Sjsg dev_err(adev->dev, "Failed to register umc ras block!\n");
228*f005ef32Sjsg return err;
229*f005ef32Sjsg }
230*f005ef32Sjsg
231*f005ef32Sjsg strlcpy(adev->umc.ras->ras_block.ras_comm.name, "umc",
232*f005ef32Sjsg sizeof(adev->umc.ras->ras_block.ras_comm.name));
233*f005ef32Sjsg ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
234*f005ef32Sjsg ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
235*f005ef32Sjsg adev->umc.ras_if = &ras->ras_block.ras_comm;
236*f005ef32Sjsg
237*f005ef32Sjsg if (!ras->ras_block.ras_late_init)
238*f005ef32Sjsg ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
239*f005ef32Sjsg
240*f005ef32Sjsg if (!ras->ras_block.ras_cb)
241*f005ef32Sjsg ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
242*f005ef32Sjsg
243*f005ef32Sjsg return 0;
244*f005ef32Sjsg }
245*f005ef32Sjsg
amdgpu_umc_ras_late_init(struct amdgpu_device * adev,struct ras_common_if * ras_block)2461bb76ff1Sjsg int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
2471bb76ff1Sjsg {
2481bb76ff1Sjsg int r;
2491bb76ff1Sjsg
2501bb76ff1Sjsg r = amdgpu_ras_block_late_init(adev, ras_block);
2511bb76ff1Sjsg if (r)
2521bb76ff1Sjsg return r;
2531bb76ff1Sjsg
2541bb76ff1Sjsg if (amdgpu_ras_is_supported(adev, ras_block->block)) {
2551bb76ff1Sjsg r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
2561bb76ff1Sjsg if (r)
2571bb76ff1Sjsg goto late_fini;
2581bb76ff1Sjsg }
2591bb76ff1Sjsg
2601bb76ff1Sjsg /* ras init of specific umc version */
2611bb76ff1Sjsg if (adev->umc.ras &&
2621bb76ff1Sjsg adev->umc.ras->err_cnt_init)
2631bb76ff1Sjsg adev->umc.ras->err_cnt_init(adev);
2641bb76ff1Sjsg
2651bb76ff1Sjsg return 0;
2661bb76ff1Sjsg
2671bb76ff1Sjsg late_fini:
2681bb76ff1Sjsg amdgpu_ras_block_late_fini(adev, ras_block);
2691bb76ff1Sjsg return r;
2701bb76ff1Sjsg }
2711bb76ff1Sjsg
amdgpu_umc_process_ecc_irq(struct amdgpu_device * adev,struct amdgpu_irq_src * source,struct amdgpu_iv_entry * entry)272c349dbc7Sjsg int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
273c349dbc7Sjsg struct amdgpu_irq_src *source,
274c349dbc7Sjsg struct amdgpu_iv_entry *entry)
275c349dbc7Sjsg {
276c349dbc7Sjsg struct ras_common_if *ras_if = adev->umc.ras_if;
277c349dbc7Sjsg struct ras_dispatch_if ih_data = {
278c349dbc7Sjsg .entry = entry,
279c349dbc7Sjsg };
280c349dbc7Sjsg
281c349dbc7Sjsg if (!ras_if)
282c349dbc7Sjsg return 0;
283c349dbc7Sjsg
284c349dbc7Sjsg ih_data.head = *ras_if;
285c349dbc7Sjsg
286c349dbc7Sjsg amdgpu_ras_interrupt_dispatch(adev, &ih_data);
287c349dbc7Sjsg return 0;
288c349dbc7Sjsg }
2891bb76ff1Sjsg
/*
 * Append one bad-page record to @err_data.
 *
 * @err_data:      record container; the new entry is written at index
 *                 err_addr_cnt, which is then incremented
 * @err_addr:      error address stored verbatim in the record
 * @retired_page:  address of the page to retire; stored as a page frame
 *                 number (shifted right by AMDGPU_GPU_PAGE_SHIFT)
 * @channel_index: memory channel stored in the record
 * @umc_inst:      UMC instance stored in the record
 *
 * Nothing is written when @err_data or its err_addr buffer is NULL: the
 * buffer allocation in the callers is allowed to fail, and the query still
 * runs afterwards. The capacity of the buffer is not known here, so the
 * caller must ensure err_addr_cnt stays below the number of records it
 * allocated.
 */
void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
		uint64_t err_addr,
		uint64_t retired_page,
		uint32_t channel_index,
		uint32_t umc_inst)
{
	struct eeprom_table_record *err_rec;

	if (!err_data || !err_data->err_addr)
		return;

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	err_rec->address = err_addr;
	/* page frame address is saved */
	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec->ts = (uint64_t)ktime_get_real_seconds();
	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
	err_rec->cu = 0;
	err_rec->mem_channel = channel_index;
	err_rec->mcumc_id = umc_inst;

	err_data->err_addr_cnt++;
}
310*f005ef32Sjsg
/*
 * Invoke @func for every UMC channel of the device.
 *
 * When adev->umc.node_inst_num is non-zero the walk goes over node, UMC
 * and channel instances; otherwise it goes over UMC and channel instances
 * only and @func receives 0 as the node instance. @data is passed through
 * to @func unchanged.
 *
 * The walk stops at the first non-zero result from @func, which is logged
 * and returned. Returns 0 when every call succeeded.
 */
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
			umc_func func, void *data)
{
	uint32_t node = 0;
	uint32_t umc = 0;
	uint32_t ch = 0;
	int ret = 0;

	if (!adev->umc.node_inst_num) {
		/* No node level: iterate umc/channel pairs on node 0 */
		LOOP_UMC_INST_AND_CH(umc, ch) {
			ret = func(adev, 0, umc, ch, data);
			if (!ret)
				continue;

			dev_err(adev->dev, "Umc %d ch %d func returns %d\n",
				umc, ch, ret);
			return ret;
		}

		return 0;
	}

	LOOP_UMC_EACH_NODE_INST_AND_CH(node, umc, ch) {
		ret = func(adev, node, umc, ch, data);
		if (!ret)
			continue;

		dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n",
			node, umc, ch, ret);
		return ret;
	}

	return 0;
}
341