1*e4a580baSriastradh /* $NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $ */
24e390cabSriastradh
34e390cabSriastradh /*
44e390cabSriastradh * Copyright 2018 Advanced Micro Devices, Inc.
54e390cabSriastradh *
64e390cabSriastradh * Permission is hereby granted, free of charge, to any person obtaining a
74e390cabSriastradh * copy of this software and associated documentation files (the "Software"),
84e390cabSriastradh * to deal in the Software without restriction, including without limitation
94e390cabSriastradh * the rights to use, copy, modify, merge, publish, distribute, sublicense,
104e390cabSriastradh * and/or sell copies of the Software, and to permit persons to whom the
114e390cabSriastradh * Software is furnished to do so, subject to the following conditions:
124e390cabSriastradh *
134e390cabSriastradh * The above copyright notice and this permission notice shall be included in
144e390cabSriastradh * all copies or substantial portions of the Software.
154e390cabSriastradh *
164e390cabSriastradh * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
174e390cabSriastradh * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
184e390cabSriastradh * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
194e390cabSriastradh * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
204e390cabSriastradh * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
214e390cabSriastradh * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
224e390cabSriastradh * OTHER DEALINGS IN THE SOFTWARE.
234e390cabSriastradh *
244e390cabSriastradh *
254e390cabSriastradh */
264e390cabSriastradh #include <sys/cdefs.h>
27*e4a580baSriastradh __KERNEL_RCSID(0, "$NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $");
284e390cabSriastradh
294e390cabSriastradh #include <linux/list.h>
304e390cabSriastradh #include "amdgpu.h"
314e390cabSriastradh #include "amdgpu_xgmi.h"
324e390cabSriastradh #include "amdgpu_smu.h"
334e390cabSriastradh #include "amdgpu_ras.h"
344e390cabSriastradh #include "df/df_3_6_offset.h"
354e390cabSriastradh
/* Serializes all access to the global hive table below. */
static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_HIVE 8
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4

/* Fixed-size table of known XGMI hives; protected by xgmi_mutex. */
static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
/* Number of entries in xgmi_hives currently in use; protected by xgmi_mutex. */
static unsigned hive_count = 0;
434e390cabSriastradh
/*
 * Return a pointer identifying this hive (its device-list head) for use
 * as an opaque token by callers.
 *
 * NOTE(review): despite the "try_lock" name, no lock is taken here —
 * the function only returns &hive->device_list; confirm callers do not
 * expect locking semantics.
 */
void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
{
	return &hive->device_list;
}
484e390cabSriastradh
494e390cabSriastradh /**
504e390cabSriastradh * DOC: AMDGPU XGMI Support
514e390cabSriastradh *
524e390cabSriastradh * XGMI is a high speed interconnect that joins multiple GPU cards
534e390cabSriastradh * into a homogeneous memory space that is organized by a collective
544e390cabSriastradh * hive ID and individual node IDs, both of which are 64-bit numbers.
554e390cabSriastradh *
564e390cabSriastradh * The file xgmi_device_id contains the unique per GPU device ID and
574e390cabSriastradh * is stored in the /sys/class/drm/card${cardno}/device/ directory.
584e390cabSriastradh *
594e390cabSriastradh * Inside the device directory a sub-directory 'xgmi_hive_info' is
604e390cabSriastradh * created which contains the hive ID and the list of nodes.
614e390cabSriastradh *
624e390cabSriastradh * The hive ID is stored in:
634e390cabSriastradh * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
644e390cabSriastradh *
654e390cabSriastradh * The node information is stored in numbered directories:
664e390cabSriastradh * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
674e390cabSriastradh *
684e390cabSriastradh * Each device has their own xgmi_hive_info direction with a mirror
694e390cabSriastradh * set of node sub-directories.
704e390cabSriastradh *
714e390cabSriastradh * The XGMI memory space is built by contiguously adding the power of
724e390cabSriastradh * two padded VRAM space from each node to each other.
734e390cabSriastradh *
744e390cabSriastradh */
754e390cabSriastradh
764e390cabSriastradh
amdgpu_xgmi_show_hive_id(struct device * dev,struct device_attribute * attr,char * buf)774e390cabSriastradh static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
784e390cabSriastradh struct device_attribute *attr, char *buf)
794e390cabSriastradh {
804e390cabSriastradh struct amdgpu_hive_info *hive =
814e390cabSriastradh container_of(attr, struct amdgpu_hive_info, dev_attr);
824e390cabSriastradh
834e390cabSriastradh return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
844e390cabSriastradh }
854e390cabSriastradh
/*
 * Create the per-hive "xgmi_hive_info" sysfs directory under the hive's
 * first device and populate it with the read-only xgmi_hive_id file.
 *
 * Returns 0 on success or a negative errno.  With sysfs support
 * compiled out this is a no-op that reports success.
 */
static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret = 0;

	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,

		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		/* Unwind the kobject so a retry starts from a clean state. */
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
#else
	/*
	 * Without sysfs there is nothing to publish; report success
	 * instead of falling off the end of a non-void function
	 * (which would be undefined behavior).
	 */
	return 0;
#endif
}
1214e390cabSriastradh
/*
 * Tear down what amdgpu_xgmi_sysfs_create() built: remove the
 * xgmi_hive_id attribute file, delete the hive kobject, and drop its
 * reference.  No-op when sysfs support is compiled out.
 */
static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
		struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
	kobject_del(hive->kobj);
	kobject_put(hive->kobj);
	hive->kobj = NULL;
#endif
}
1324e390cabSriastradh
amdgpu_xgmi_show_device_id(struct device * dev,struct device_attribute * attr,char * buf)1334e390cabSriastradh static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
1344e390cabSriastradh struct device_attribute *attr,
1354e390cabSriastradh char *buf)
1364e390cabSriastradh {
1374e390cabSriastradh struct drm_device *ddev = dev_get_drvdata(dev);
1384e390cabSriastradh struct amdgpu_device *adev = ddev->dev_private;
1394e390cabSriastradh
1404e390cabSriastradh return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
1414e390cabSriastradh
1424e390cabSriastradh }
1434e390cabSriastradh
/* Build a DF FICAA indirect-access command word from a register offset. */
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
/*
 * Sysfs "show" callback for the xgmi_error attribute: read the xGMI
 * error counters through the DF indirect-access (FICAA) interface,
 * print the accumulated error count, and clear the status register.
 */
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	/* 0x1f in the control register presumably means the counters
	 * are enabled — TODO confirm against the DF register spec. */
	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	/* NOTE(review): low 16 bits appear to be a validity field and
	 * the top two bits each carry one error event — verify against
	 * the hardware documentation. */
	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	/* Writing zero clears the status for the next read. */
	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
}
1714e390cabSriastradh
1724e390cabSriastradh
/* Per-device read-only attributes: XGMI node ID and error count. */
static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
1754e390cabSriastradh
/*
 * Publish this device's XGMI sysfs pieces: the xgmi_device_id and
 * xgmi_error files on the device itself, a link from the device to the
 * hive's info directory (on all but the hive's first device), and a
 * numbered "node%d" link from the hive directory back to the device.
 *
 * Returns 0 on success or a negative errno.  With sysfs support
 * compiled out this is a no-op that reports success.
 */
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file; treated as non-fatal, only logged. */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");


	/* Create sysfs link to hive info folder on the first device */
	if (adev != hive->adev) {
		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", hive->number_devices);
	/* Create sysfs link from the hive folder back to this device. */
	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;


remove_link:
	/*
	 * NOTE(review): the link created above is named "xgmi_hive_info",
	 * but this removes one named after adev->ddev->unique — looks like
	 * the wrong name is being removed; confirm against upstream.
	 */
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
#else
	/*
	 * Without sysfs there is nothing to publish; report success
	 * instead of falling off the end of a non-void function
	 * (which would be undefined behavior).
	 */
	return 0;
#endif
}
2274e390cabSriastradh
/*
 * Remove this device's sysfs pieces added by
 * amdgpu_xgmi_sysfs_add_dev_info(): the xgmi_device_id file and the
 * links between the device and the hive directory.
 *
 * NOTE(review): both links are removed under the name
 * adev->ddev->unique, whereas they were created as "xgmi_hive_info"
 * and "node%d" — verify these names against upstream.
 */
static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
		struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
	sysfs_remove_link(hive->kobj, adev->ddev->unique);
#endif
}
2374e390cabSriastradh
2384e390cabSriastradh
2394e390cabSriastradh
/*
 * Look up — and on first use create — the hive whose ID matches this
 * device's gmc.xgmi.hive_id in the global xgmi_hives table.
 *
 * @adev: device whose hive_id selects the hive
 * @lock: when non-zero, return with the hive's hive_lock held
 *
 * Returns the hive, or NULL when the device has no hive ID, the table
 * is full, or sysfs setup for a new hive fails.  The table itself is
 * protected by xgmi_mutex.
 */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
	int i;
	struct amdgpu_hive_info *tmp;

	/* A zero hive_id means the device is not part of any hive. */
	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	mutex_lock(&xgmi_mutex);

	for (i = 0 ; i < hive_count; ++i) {
		tmp = &xgmi_hives[i];
		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
			if (lock)
				mutex_lock(&tmp->hive_lock);
			mutex_unlock(&xgmi_mutex);
			return tmp;
		}
	}
	/* No match and no free slot left in the fixed-size table. */
	if (i >= AMDGPU_MAX_XGMI_HIVE) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* initialize new hive if not exist */
	tmp = &xgmi_hives[hive_count++];

	/*
	 * NOTE(review): hive_count was already incremented; on sysfs
	 * failure the slot stays consumed (with hive_id 0, so it will
	 * never match a lookup) — confirm this leak is acceptable.
	 */
	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	tmp->adev = adev;
	tmp->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&tmp->device_list);
	mutex_init(&tmp->hive_lock);
	mutex_init(&tmp->reset_lock);
	task_barrier_init(&tmp->tb);

	if (lock)
		mutex_lock(&tmp->hive_lock);
	/* -1 marks the hive pstate as not yet negotiated. */
	tmp->pstate = -1;
	mutex_unlock(&xgmi_mutex);

	return tmp;
}
2864e390cabSriastradh
/*
 * Request an XGMI pstate change for this device; once every device in
 * the hive has reached the same pstate, record it on the hive as well.
 *
 * Returns 0 on success (or when the device is not in a hive), else the
 * error from amdgpu_dpm_set_xgmi_pstate().
 */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
	struct amdgpu_device *tmp_adev;
	bool update_hive_pstate = true;
	/* On Vega20 any non-zero pstate request counts as "high". */
	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;

	if (!hive)
		return 0;

	mutex_lock(&hive->hive_lock);

	/* Hive already at the requested pstate: just sync our own copy. */
	if (hive->pstate == pstate) {
		adev->pstate = is_high_pstate ? pstate : adev->pstate;
		goto out;
	}

	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
	if (ret) {
		dev_err(adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	/* Update device pstate */
	adev->pstate = pstate;

	/*
	 * Update the hive pstate only when all devices of the hive
	 * are in the same pstate
	 */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		if (tmp_adev->pstate != adev->pstate) {
			update_hive_pstate = false;
			break;
		}
	}
	if (update_hive_pstate || is_high_pstate)
		hive->pstate = pstate;

out:
	mutex_unlock(&hive->hive_lock);

	return ret;
}
3374e390cabSriastradh
amdgpu_xgmi_update_topology(struct amdgpu_hive_info * hive,struct amdgpu_device * adev)3384e390cabSriastradh int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
3394e390cabSriastradh {
3404e390cabSriastradh int ret = -EINVAL;
3414e390cabSriastradh
3424e390cabSriastradh /* Each psp need to set the latest topology */
3434e390cabSriastradh ret = psp_xgmi_set_topology_info(&adev->psp,
3444e390cabSriastradh hive->number_devices,
3454e390cabSriastradh &adev->psp.xgmi_context.top_info);
3464e390cabSriastradh if (ret)
3474e390cabSriastradh dev_err(adev->dev,
3484e390cabSriastradh "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
3494e390cabSriastradh adev->gmc.xgmi.node_id,
3504e390cabSriastradh adev->gmc.xgmi.hive_id, ret);
3514e390cabSriastradh
3524e390cabSriastradh return ret;
3534e390cabSriastradh }
3544e390cabSriastradh
3554e390cabSriastradh
amdgpu_xgmi_get_hops_count(struct amdgpu_device * adev,struct amdgpu_device * peer_adev)3564e390cabSriastradh int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
3574e390cabSriastradh struct amdgpu_device *peer_adev)
3584e390cabSriastradh {
3594e390cabSriastradh struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
3604e390cabSriastradh int i;
3614e390cabSriastradh
3624e390cabSriastradh for (i = 0 ; i < top->num_nodes; ++i)
3634e390cabSriastradh if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
3644e390cabSriastradh return top->nodes[i].num_hops;
3654e390cabSriastradh return -EINVAL;
3664e390cabSriastradh }
3674e390cabSriastradh
/*
 * Register a device with its XGMI hive during driver init.
 *
 * Queries the hive and node IDs from the PSP (or fabricates fixed IDs
 * when no PSP block is present), joins the hive's device list, rebuilds
 * the node list for every hive member, pushes the topology to each
 * member's PSP, and finally publishes the sysfs entries.
 *
 * Returns 0 on success or a negative errno.
 */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		/* No PSP block: synthesize fixed IDs from the physical
		 * node number. */
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	/* On success this returns with hive->hive_lock held (lock=1). */
	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}

	/* Set default device pstate */
	adev->pstate = -1;

	top_info = &adev->psp.xgmi_context.top_info;

	/* Join the hive and rebuild our own node list from it. */
	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	hive->number_devices = count;

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other device in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit;
		}

		/* get latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* To do : continue with some node failed or disable the whole hive */
				goto exit;
			}
		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);


	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret)
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	else
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);

	return ret;
}
4664e390cabSriastradh
amdgpu_xgmi_remove_device(struct amdgpu_device * adev)4674e390cabSriastradh void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
4684e390cabSriastradh {
4694e390cabSriastradh struct amdgpu_hive_info *hive;
4704e390cabSriastradh
4714e390cabSriastradh if (!adev->gmc.xgmi.supported)
4724e390cabSriastradh return;
4734e390cabSriastradh
4744e390cabSriastradh hive = amdgpu_get_xgmi_hive(adev, 1);
4754e390cabSriastradh if (!hive)
4764e390cabSriastradh return;
4774e390cabSriastradh
4784e390cabSriastradh if (!(hive->number_devices--)) {
4794e390cabSriastradh amdgpu_xgmi_sysfs_destroy(adev, hive);
480*e4a580baSriastradh task_barrier_destroy(&tmp->tb);
4814e390cabSriastradh mutex_destroy(&hive->hive_lock);
4824e390cabSriastradh mutex_destroy(&hive->reset_lock);
4834e390cabSriastradh } else {
4844e390cabSriastradh task_barrier_rem_task(&hive->tb);
4854e390cabSriastradh amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
4864e390cabSriastradh mutex_unlock(&hive->hive_lock);
4874e390cabSriastradh }
4884e390cabSriastradh }
4894e390cabSriastradh
/*
 * Late RAS init for the XGMI/WAFL block: allocate and fill the
 * ras_common_if descriptor on first call, then register the block with
 * the RAS core.  If registration fails or the block is unsupported,
 * the descriptor is freed again and the cached pointer cleared.
 *
 * Returns 0 when XGMI is absent/unused or on success, -ENOMEM on
 * allocation failure, else the error from amdgpu_ras_late_init().
 */
int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
		.debugfs_name = "xgmi_wafl_err_inject",
	};

	/* Nothing to do without XGMI or with a single-node topology. */
	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	/* Allocate the RAS descriptor once; reuse it on re-init. */
	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
	}
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}
5244e390cabSriastradh
amdgpu_xgmi_ras_fini(struct amdgpu_device * adev)5254e390cabSriastradh void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
5264e390cabSriastradh {
5274e390cabSriastradh if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
5284e390cabSriastradh adev->gmc.xgmi.ras_if) {
5294e390cabSriastradh struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
5304e390cabSriastradh struct ras_ih_if ih_info = {
5314e390cabSriastradh .cb = NULL,
5324e390cabSriastradh };
5334e390cabSriastradh
5344e390cabSriastradh amdgpu_ras_late_fini(adev, ras_if, &ih_info);
5354e390cabSriastradh kfree(ras_if);
5364e390cabSriastradh }
5374e390cabSriastradh }
538