xref: /netbsd-src/sys/external/bsd/drm2/dist/drm/amd/amdgpu/amdgpu_xgmi.c (revision e4a580baf2598beeaae98d953ac7635b8700b80c)
1*e4a580baSriastradh /*	$NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $	*/
24e390cabSriastradh 
34e390cabSriastradh /*
44e390cabSriastradh  * Copyright 2018 Advanced Micro Devices, Inc.
54e390cabSriastradh  *
64e390cabSriastradh  * Permission is hereby granted, free of charge, to any person obtaining a
74e390cabSriastradh  * copy of this software and associated documentation files (the "Software"),
84e390cabSriastradh  * to deal in the Software without restriction, including without limitation
94e390cabSriastradh  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
104e390cabSriastradh  * and/or sell copies of the Software, and to permit persons to whom the
114e390cabSriastradh  * Software is furnished to do so, subject to the following conditions:
124e390cabSriastradh  *
134e390cabSriastradh  * The above copyright notice and this permission notice shall be included in
144e390cabSriastradh  * all copies or substantial portions of the Software.
154e390cabSriastradh  *
164e390cabSriastradh  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
174e390cabSriastradh  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
184e390cabSriastradh  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
194e390cabSriastradh  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
204e390cabSriastradh  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
214e390cabSriastradh  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
224e390cabSriastradh  * OTHER DEALINGS IN THE SOFTWARE.
234e390cabSriastradh  *
244e390cabSriastradh  *
254e390cabSriastradh  */
264e390cabSriastradh #include <sys/cdefs.h>
27*e4a580baSriastradh __KERNEL_RCSID(0, "$NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $");
284e390cabSriastradh 
294e390cabSriastradh #include <linux/list.h>
304e390cabSriastradh #include "amdgpu.h"
314e390cabSriastradh #include "amdgpu_xgmi.h"
324e390cabSriastradh #include "amdgpu_smu.h"
334e390cabSriastradh #include "amdgpu_ras.h"
344e390cabSriastradh #include "df/df_3_6_offset.h"
354e390cabSriastradh 
364e390cabSriastradh static DEFINE_MUTEX(xgmi_mutex);
374e390cabSriastradh 
384e390cabSriastradh #define AMDGPU_MAX_XGMI_HIVE			8
394e390cabSriastradh #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4
404e390cabSriastradh 
414e390cabSriastradh static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
424e390cabSriastradh static unsigned hive_count = 0;
434e390cabSriastradh 
amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info * hive)444e390cabSriastradh void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
454e390cabSriastradh {
464e390cabSriastradh 	return &hive->device_list;
474e390cabSriastradh }
484e390cabSriastradh 
494e390cabSriastradh /**
504e390cabSriastradh  * DOC: AMDGPU XGMI Support
514e390cabSriastradh  *
524e390cabSriastradh  * XGMI is a high speed interconnect that joins multiple GPU cards
534e390cabSriastradh  * into a homogeneous memory space that is organized by a collective
544e390cabSriastradh  * hive ID and individual node IDs, both of which are 64-bit numbers.
554e390cabSriastradh  *
564e390cabSriastradh  * The file xgmi_device_id contains the unique per GPU device ID and
574e390cabSriastradh  * is stored in the /sys/class/drm/card${cardno}/device/ directory.
584e390cabSriastradh  *
594e390cabSriastradh  * Inside the device directory a sub-directory 'xgmi_hive_info' is
604e390cabSriastradh  * created which contains the hive ID and the list of nodes.
614e390cabSriastradh  *
624e390cabSriastradh  * The hive ID is stored in:
634e390cabSriastradh  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
644e390cabSriastradh  *
654e390cabSriastradh  * The node information is stored in numbered directories:
664e390cabSriastradh  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
674e390cabSriastradh  *
 * Each device has its own xgmi_hive_info directory with a mirror
694e390cabSriastradh  * set of node sub-directories.
704e390cabSriastradh  *
714e390cabSriastradh  * The XGMI memory space is built by contiguously adding the power of
724e390cabSriastradh  * two padded VRAM space from each node to each other.
734e390cabSriastradh  *
744e390cabSriastradh  */
754e390cabSriastradh 
764e390cabSriastradh 
amdgpu_xgmi_show_hive_id(struct device * dev,struct device_attribute * attr,char * buf)774e390cabSriastradh static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
784e390cabSriastradh 		struct device_attribute *attr, char *buf)
794e390cabSriastradh {
804e390cabSriastradh 	struct amdgpu_hive_info *hive =
814e390cabSriastradh 			container_of(attr, struct amdgpu_hive_info, dev_attr);
824e390cabSriastradh 
834e390cabSriastradh 	return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
844e390cabSriastradh }
854e390cabSriastradh 
/*
 * amdgpu_xgmi_sysfs_create - create the per-hive "xgmi_hive_info"
 * sysfs directory and its read-only xgmi_hive_id attribute.
 *
 * Returns 0 on success, -EINVAL if the kobject already exists or
 * cannot be allocated, or the sysfs_create_file() error.
 *
 * Fix: the original fell off the end of this non-void function when
 * CONFIG_SYSFS was not defined (undefined behavior); the !CONFIG_SYSFS
 * build now explicitly reports success.
 */
static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret = 0;

	/* A second call for the same hive would leak the kobject. */
	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,
		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		/* Roll back the kobject so a retry starts clean. */
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
#else
	/* No sysfs support: nothing to publish, report success. */
	return 0;
#endif
}
1214e390cabSriastradh 
/*
 * amdgpu_xgmi_sysfs_destroy - tear down the hive's sysfs directory:
 * remove the xgmi_hive_id attribute, unlink and release the kobject.
 * No-op when built without sysfs support.
 */
static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
	kobject_del(hive->kobj);
	kobject_put(hive->kobj);
	/* Clear the pointer so amdgpu_xgmi_sysfs_create() can run again. */
	hive->kobj = NULL;
#endif
}
1324e390cabSriastradh 
amdgpu_xgmi_show_device_id(struct device * dev,struct device_attribute * attr,char * buf)1334e390cabSriastradh static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
1344e390cabSriastradh 				     struct device_attribute *attr,
1354e390cabSriastradh 				     char *buf)
1364e390cabSriastradh {
1374e390cabSriastradh 	struct drm_device *ddev = dev_get_drvdata(dev);
1384e390cabSriastradh 	struct amdgpu_device *adev = ddev->dev_private;
1394e390cabSriastradh 
1404e390cabSriastradh 	return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
1414e390cabSriastradh 
1424e390cabSriastradh }
1434e390cabSriastradh 
1444e390cabSriastradh #define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
amdgpu_xgmi_show_error(struct device * dev,struct device_attribute * attr,char * buf)1454e390cabSriastradh static ssize_t amdgpu_xgmi_show_error(struct device *dev,
1464e390cabSriastradh 				      struct device_attribute *attr,
1474e390cabSriastradh 				      char *buf)
1484e390cabSriastradh {
1494e390cabSriastradh 	struct drm_device *ddev = dev_get_drvdata(dev);
1504e390cabSriastradh 	struct amdgpu_device *adev = ddev->dev_private;
1514e390cabSriastradh 	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
1524e390cabSriastradh 	uint64_t fica_out;
1534e390cabSriastradh 	unsigned int error_count = 0;
1544e390cabSriastradh 
1554e390cabSriastradh 	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
1564e390cabSriastradh 	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
1574e390cabSriastradh 
1584e390cabSriastradh 	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
1594e390cabSriastradh 	if (fica_out != 0x1f)
1604e390cabSriastradh 		pr_err("xGMI error counters not enabled!\n");
1614e390cabSriastradh 
1624e390cabSriastradh 	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
1634e390cabSriastradh 
1644e390cabSriastradh 	if ((fica_out & 0xffff) == 2)
1654e390cabSriastradh 		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
1664e390cabSriastradh 
1674e390cabSriastradh 	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
1684e390cabSriastradh 
1694e390cabSriastradh 	return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
1704e390cabSriastradh }
1714e390cabSriastradh 
1724e390cabSriastradh 
1734e390cabSriastradh static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
1744e390cabSriastradh static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
1754e390cabSriastradh 
/*
 * amdgpu_xgmi_sysfs_add_dev_info - publish this device's XGMI sysfs
 * entries: the xgmi_device_id and xgmi_error attributes, a link from
 * the device to the hive info directory (on non-first devices), and a
 * node<N> back-link from the hive directory to the device.
 *
 * Returns 0 on success or a negative errno.  Failure to create the
 * xgmi_error file is logged but deliberately not fatal.
 *
 * Fixes: the original fell off the end of this non-void function when
 * CONFIG_SYSFS was not defined (undefined behavior), and used an
 * unbounded sprintf() into the 10-byte node[] buffer.
 */
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					 struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file; best-effort, do not abort on failure */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");


	/* Create sysfs link to hive info folder on the first device */
	if (adev != hive->adev) {
		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	/* bounded: "node%d" cannot overflow node[10] for sane counts */
	snprintf(node, sizeof(node), "node%d", hive->number_devices);
	/* Create sysfs link from the hive folder to yourself */
	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;


remove_link:
	/* NOTE(review): removes a link named after ddev->unique, while the
	 * link created above is named "xgmi_hive_info" — verify this name
	 * matches what was actually created. */
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
#else
	/* No sysfs support: nothing to publish, report success. */
	return 0;
#endif
}
2274e390cabSriastradh 
/*
 * amdgpu_xgmi_sysfs_rem_dev_info - undo amdgpu_xgmi_sysfs_add_dev_info:
 * remove the xgmi_device_id attribute and the device<->hive links.
 * No-op when built without sysfs support.
 */
static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	/* NOTE(review): both links are removed by ddev->unique, but creation
	 * used the names "xgmi_hive_info" and "node%d" — confirm the names
	 * agree with the creation path. */
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
	sysfs_remove_link(hive->kobj, adev->ddev->unique);
#endif
}
2374e390cabSriastradh 
2384e390cabSriastradh 
2394e390cabSriastradh 
/*
 * amdgpu_get_xgmi_hive - look up, or lazily create, the hive entry for
 * this device's gmc.xgmi.hive_id in the static xgmi_hives[] table.
 *
 * @adev: device whose hive_id keys the lookup
 * @lock: when nonzero, return with hive->hive_lock held
 *
 * Returns NULL when the device has no hive id, the table is full, or
 * sysfs setup for a new hive fails; otherwise a pointer into
 * xgmi_hives[].  The global xgmi_mutex serializes table access.
 */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
	int i;
	struct amdgpu_hive_info *tmp;

	/* hive_id == 0 means the device is not part of any hive */
	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	mutex_lock(&xgmi_mutex);

	for (i = 0 ; i < hive_count; ++i) {
		tmp = &xgmi_hives[i];
		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
			/* Take the per-hive lock before dropping the table lock. */
			if (lock)
				mutex_lock(&tmp->hive_lock);
			mutex_unlock(&xgmi_mutex);
			return tmp;
		}
	}
	/* Table full: i == hive_count == AMDGPU_MAX_XGMI_HIVE */
	if (i >= AMDGPU_MAX_XGMI_HIVE) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* initialize new hive if not exist */
	tmp = &xgmi_hives[hive_count++];

	/* NOTE(review): on sysfs failure the hive_count slot just claimed is
	 * not released — a subsequent call will use the next slot. */
	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	tmp->adev = adev;
	tmp->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&tmp->device_list);
	mutex_init(&tmp->hive_lock);
	mutex_init(&tmp->reset_lock);
	task_barrier_init(&tmp->tb);

	if (lock)
		mutex_lock(&tmp->hive_lock);
	/* -1 marks the hive pstate as not yet programmed */
	tmp->pstate = -1;
	mutex_unlock(&xgmi_mutex);

	return tmp;
}
2864e390cabSriastradh 
/*
 * amdgpu_xgmi_set_pstate - request a new XGMI pstate for the device
 * and, once every hive member agrees, record it on the hive.
 *
 * Returns 0 when the device has no hive or on success; otherwise the
 * error from amdgpu_dpm_set_xgmi_pstate().
 */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	/* lock == 0: hive_lock is taken explicitly below */
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
	struct amdgpu_device *tmp_adev;
	bool update_hive_pstate = true;
	/* NOTE(review): nonzero pstate on Vega20 is treated as "high";
	 * presumably an ASIC-specific fast path — confirm. */
	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;

	if (!hive)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (hive->pstate == pstate) {
		/* Hive already at the target; just mirror it onto the
		 * device in the high-pstate case. */
		adev->pstate = is_high_pstate ? pstate : adev->pstate;
		goto out;
	}

	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
	if (ret) {
		dev_err(adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	/* Update device pstate */
	adev->pstate = pstate;

	/*
	 * Update the hive pstate only when all devices of the hive
	 * are in the same pstate
	 */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		if (tmp_adev->pstate != adev->pstate) {
			update_hive_pstate = false;
			break;
		}
	}
	if (update_hive_pstate || is_high_pstate)
		hive->pstate = pstate;

out:
	mutex_unlock(&hive->hive_lock);

	return ret;
}
3374e390cabSriastradh 
amdgpu_xgmi_update_topology(struct amdgpu_hive_info * hive,struct amdgpu_device * adev)3384e390cabSriastradh int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
3394e390cabSriastradh {
3404e390cabSriastradh 	int ret = -EINVAL;
3414e390cabSriastradh 
3424e390cabSriastradh 	/* Each psp need to set the latest topology */
3434e390cabSriastradh 	ret = psp_xgmi_set_topology_info(&adev->psp,
3444e390cabSriastradh 					 hive->number_devices,
3454e390cabSriastradh 					 &adev->psp.xgmi_context.top_info);
3464e390cabSriastradh 	if (ret)
3474e390cabSriastradh 		dev_err(adev->dev,
3484e390cabSriastradh 			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
3494e390cabSriastradh 			adev->gmc.xgmi.node_id,
3504e390cabSriastradh 			adev->gmc.xgmi.hive_id, ret);
3514e390cabSriastradh 
3524e390cabSriastradh 	return ret;
3534e390cabSriastradh }
3544e390cabSriastradh 
3554e390cabSriastradh 
amdgpu_xgmi_get_hops_count(struct amdgpu_device * adev,struct amdgpu_device * peer_adev)3564e390cabSriastradh int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
3574e390cabSriastradh 		struct amdgpu_device *peer_adev)
3584e390cabSriastradh {
3594e390cabSriastradh 	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
3604e390cabSriastradh 	int i;
3614e390cabSriastradh 
3624e390cabSriastradh 	for (i = 0 ; i < top->num_nodes; ++i)
3634e390cabSriastradh 		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
3644e390cabSriastradh 			return top->nodes[i].num_hops;
3654e390cabSriastradh 	return	-EINVAL;
3664e390cabSriastradh }
3674e390cabSriastradh 
/*
 * amdgpu_xgmi_add_device - join this device to its XGMI hive.
 *
 * Queries the hive and node ids from the PSP (or synthesizes them when
 * no PSP block is present), appends the device to the hive's list,
 * refreshes topology info on every hive member, and publishes the
 * device's sysfs entries.
 *
 * Returns 0 on success or when XGMI is unsupported; negative errno on
 * failure.
 */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi	*entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		/* No PSP: fabricate ids from the physical node number.
		 * NOTE(review): the value 16 looks arbitrary — presumably
		 * chosen to stay clear of PSP-assigned ids; confirm. */
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	/* lock == 1: returns with hive->hive_lock held */
	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}

	/* Set default device pstate */
	adev->pstate = -1;

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	hive->number_devices = count;

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other device in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			/* NOTE(review): failure gotos below skip the
			 * mutex_unlock and leave hive_lock held — confirm
			 * against callers' expectations. */
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit;
		}

		/* get latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* To do : continue with some node failed or disable the whole hive */
				goto exit;
			}
		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);


	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret)
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	else
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);

	return ret;
}
4664e390cabSriastradh 
amdgpu_xgmi_remove_device(struct amdgpu_device * adev)4674e390cabSriastradh void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
4684e390cabSriastradh {
4694e390cabSriastradh 	struct amdgpu_hive_info *hive;
4704e390cabSriastradh 
4714e390cabSriastradh 	if (!adev->gmc.xgmi.supported)
4724e390cabSriastradh 		return;
4734e390cabSriastradh 
4744e390cabSriastradh 	hive = amdgpu_get_xgmi_hive(adev, 1);
4754e390cabSriastradh 	if (!hive)
4764e390cabSriastradh 		return;
4774e390cabSriastradh 
4784e390cabSriastradh 	if (!(hive->number_devices--)) {
4794e390cabSriastradh 		amdgpu_xgmi_sysfs_destroy(adev, hive);
480*e4a580baSriastradh 		task_barrier_destroy(&tmp->tb);
4814e390cabSriastradh 		mutex_destroy(&hive->hive_lock);
4824e390cabSriastradh 		mutex_destroy(&hive->reset_lock);
4834e390cabSriastradh 	} else {
4844e390cabSriastradh 		task_barrier_rem_task(&hive->tb);
4854e390cabSriastradh 		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
4864e390cabSriastradh 		mutex_unlock(&hive->hive_lock);
4874e390cabSriastradh 	}
4884e390cabSriastradh }
4894e390cabSriastradh 
/*
 * amdgpu_xgmi_ras_late_init - register the XGMI/WAFL block with the
 * RAS framework.
 *
 * Allocates adev->gmc.xgmi.ras_if on first call and hands it to
 * amdgpu_ras_late_init(); the interface struct is freed again when
 * registration fails or the block turns out to be unsupported.
 *
 * Returns 0 on success or when XGMI is absent; -ENOMEM or the
 * amdgpu_ras_late_init() error otherwise.
 */
int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
		.debugfs_name = "xgmi_wafl_err_inject",
	};

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
	}
	/* Both fs and ih heads describe the same RAS block. */
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	/* Unwind the allocation on failure or when RAS rejects the block. */
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}
5244e390cabSriastradh 
amdgpu_xgmi_ras_fini(struct amdgpu_device * adev)5254e390cabSriastradh void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
5264e390cabSriastradh {
5274e390cabSriastradh 	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
5284e390cabSriastradh 			adev->gmc.xgmi.ras_if) {
5294e390cabSriastradh 		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
5304e390cabSriastradh 		struct ras_ih_if ih_info = {
5314e390cabSriastradh 			.cb = NULL,
5324e390cabSriastradh 		};
5334e390cabSriastradh 
5344e390cabSriastradh 		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
5354e390cabSriastradh 		kfree(ras_if);
5364e390cabSriastradh 	}
5374e390cabSriastradh }
538