1 /* $NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $ */
2
3 /*
4 * Copyright 2018 Advanced Micro Devices, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 *
25 */
26 #include <sys/cdefs.h>
27 __KERNEL_RCSID(0, "$NetBSD: amdgpu_xgmi.c,v 1.3 2021/12/19 10:59:01 riastradh Exp $");
28
29 #include <linux/list.h>
30 #include "amdgpu.h"
31 #include "amdgpu_xgmi.h"
32 #include "amdgpu_smu.h"
33 #include "amdgpu_ras.h"
34 #include "df/df_3_6_offset.h"
35
36 static DEFINE_MUTEX(xgmi_mutex);
37
38 #define AMDGPU_MAX_XGMI_HIVE 8
39 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
40
41 static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
42 static unsigned hive_count = 0;
43
amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info * hive)44 void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
45 {
46 return &hive->device_list;
47 }
48
49 /**
50 * DOC: AMDGPU XGMI Support
51 *
52 * XGMI is a high speed interconnect that joins multiple GPU cards
53 * into a homogeneous memory space that is organized by a collective
54 * hive ID and individual node IDs, both of which are 64-bit numbers.
55 *
56 * The file xgmi_device_id contains the unique per GPU device ID and
57 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
58 *
59 * Inside the device directory a sub-directory 'xgmi_hive_info' is
60 * created which contains the hive ID and the list of nodes.
61 *
62 * The hive ID is stored in:
63 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
64 *
65 * The node information is stored in numbered directories:
66 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
67 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
70 *
71 * The XGMI memory space is built by contiguously adding the power of
72 * two padded VRAM space from each node to each other.
73 *
74 */
75
76
amdgpu_xgmi_show_hive_id(struct device * dev,struct device_attribute * attr,char * buf)77 static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
78 struct device_attribute *attr, char *buf)
79 {
80 struct amdgpu_hive_info *hive =
81 container_of(attr, struct amdgpu_hive_info, dev_attr);
82
83 return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
84 }
85
/*
 * amdgpu_xgmi_sysfs_create - create the hive's "xgmi_hive_info" directory
 * @adev: device whose sysfs node becomes the parent of the directory
 * @hive: hive that owns the resulting kobject and attribute
 *
 * Creates the "xgmi_hive_info" kobject below @adev's device and adds the
 * read-only "xgmi_hive_id" attribute to it.
 *
 * Returns 0 on success, -EINVAL if @hive already has a kobject or the kobject
 * cannot be created, or the error from sysfs_create_file(). On failure
 * hive->kobj is left NULL. When CONFIG_SYSFS is not defined there is nothing
 * to create and 0 is returned so that hive setup can proceed.
 */
static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret;

	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,
		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		/* Undo the directory so a later attempt starts from scratch. */
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
#else
	/*
	 * The caller tests the return value, so falling off the end of this
	 * non-void function would be undefined behaviour.
	 */
	(void)adev;
	(void)hive;
	return 0;
#endif
}
121
/*
 * amdgpu_xgmi_sysfs_destroy - remove the hive's sysfs directory
 * @adev: not referenced by the body
 * @hive: hive whose kobject and "xgmi_hive_id" attribute are removed
 *
 * Compiles to an empty function when CONFIG_SYSFS is not defined.
 */
static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
				      struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	/* The attribute goes first, then the directory that held it. */
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);

	kobject_del(hive->kobj);
	kobject_put(hive->kobj);

	/* Clear the pointer now that our reference has been dropped. */
	hive->kobj = NULL;
#endif
}
132
amdgpu_xgmi_show_device_id(struct device * dev,struct device_attribute * attr,char * buf)133 static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
134 struct device_attribute *attr,
135 char *buf)
136 {
137 struct drm_device *ddev = dev_get_drvdata(dev);
138 struct amdgpu_device *adev = ddev->dev_private;
139
140 return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
141
142 }
143
144 #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
amdgpu_xgmi_show_error(struct device * dev,struct device_attribute * attr,char * buf)145 static ssize_t amdgpu_xgmi_show_error(struct device *dev,
146 struct device_attribute *attr,
147 char *buf)
148 {
149 struct drm_device *ddev = dev_get_drvdata(dev);
150 struct amdgpu_device *adev = ddev->dev_private;
151 uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
152 uint64_t fica_out;
153 unsigned int error_count = 0;
154
155 ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
156 ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
157
158 fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
159 if (fica_out != 0x1f)
160 pr_err("xGMI error counters not enabled!\n");
161
162 fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
163
164 if ((fica_out & 0xffff) == 2)
165 error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
166
167 adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
168
169 return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
170 }
171
172
173 static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
174 static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
175
/*
 * amdgpu_xgmi_sysfs_add_dev_info - publish a device's XGMI sysfs entries
 * @adev: device being added to the hive
 * @hive: hive the device belongs to
 *
 * Creates the "xgmi_device_id" and "xgmi_error" attributes on @adev, links
 * "xgmi_hive_info" from @adev to the hive directory (unless @adev is the
 * device that owns that directory), and links "node<N>" from the hive
 * directory back to @adev, where N is hive->number_devices at call time.
 *
 * A failure to create "xgmi_error" is logged but is not fatal. Any other
 * failure undoes everything created here and returns the error code.
 * Returns 0 on success, and always 0 when CONFIG_SYSFS is not defined.
 */
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	int ret;
	bool have_error_file;
	/* "node" + widest decimal int ("-2147483648") + NUL */
	char node[16];

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file; remember the outcome for the unwind path */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	have_error_file = !ret;
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create sysfs link to hive info folder on the first device */
	if (adev != hive->adev) {
		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_files;
		}
	}

	/* Bounded formatting: never write past the end of node[] */
	snprintf(node, sizeof(node), "node%d", hive->number_devices);

	/* Create sysfs link from the hive folder to yourself */
	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	return 0;

remove_link:
	/*
	 * Remove the link under the name it was created with, and only if
	 * this call created it. On the owning device "xgmi_hive_info" is the
	 * hive directory itself and must not be touched here.
	 */
	if (adev != hive->adev)
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

remove_files:
	if (have_error_file)
		device_remove_file(adev->dev, &dev_attr_xgmi_error);
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

	return ret;
#else
	/* Nothing to publish without sysfs; the caller checks the result. */
	(void)adev;
	(void)hive;
	return 0;
#endif
}
227
/*
 * amdgpu_xgmi_sysfs_rem_dev_info - remove a device's XGMI sysfs entries
 * @adev: device being removed from the hive
 * @hive: hive the device belongs to
 *
 * Counterpart of amdgpu_xgmi_sysfs_add_dev_info(). Compiles to an empty
 * function when CONFIG_SYSFS is not defined.
 */
static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
#ifdef CONFIG_SYSFS
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	/*
	 * add_dev_info also creates xgmi_error (best effort), so remove it
	 * too. NOTE(review): if its creation failed this removes a missing
	 * attribute, which is expected to be harmless - confirm.
	 */
	device_remove_file(adev->dev, &dev_attr_xgmi_error);

	/*
	 * The device-to-hive link is created as "xgmi_hive_info", and only on
	 * devices other than the one owning the hive directory. On the owner
	 * that name is the directory itself, so leave it alone.
	 */
	if (adev != hive->adev)
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

	/*
	 * NOTE(review): the hive-to-device link is created as "node<N>", not
	 * under ddev->unique, so this call likely never matches. The original
	 * name is not recorded anywhere visible here, so it is left unchanged.
	 */
	sysfs_remove_link(hive->kobj, adev->ddev->unique);
#endif
}
237
238
239
amdgpu_get_xgmi_hive(struct amdgpu_device * adev,int lock)240 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
241 {
242 int i;
243 struct amdgpu_hive_info *tmp;
244
245 if (!adev->gmc.xgmi.hive_id)
246 return NULL;
247
248 mutex_lock(&xgmi_mutex);
249
250 for (i = 0 ; i < hive_count; ++i) {
251 tmp = &xgmi_hives[i];
252 if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
253 if (lock)
254 mutex_lock(&tmp->hive_lock);
255 mutex_unlock(&xgmi_mutex);
256 return tmp;
257 }
258 }
259 if (i >= AMDGPU_MAX_XGMI_HIVE) {
260 mutex_unlock(&xgmi_mutex);
261 return NULL;
262 }
263
264 /* initialize new hive if not exist */
265 tmp = &xgmi_hives[hive_count++];
266
267 if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
268 mutex_unlock(&xgmi_mutex);
269 return NULL;
270 }
271
272 tmp->adev = adev;
273 tmp->hive_id = adev->gmc.xgmi.hive_id;
274 INIT_LIST_HEAD(&tmp->device_list);
275 mutex_init(&tmp->hive_lock);
276 mutex_init(&tmp->reset_lock);
277 task_barrier_init(&tmp->tb);
278
279 if (lock)
280 mutex_lock(&tmp->hive_lock);
281 tmp->pstate = -1;
282 mutex_unlock(&xgmi_mutex);
283
284 return tmp;
285 }
286
amdgpu_xgmi_set_pstate(struct amdgpu_device * adev,int pstate)287 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
288 {
289 int ret = 0;
290 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
291 struct amdgpu_device *tmp_adev;
292 bool update_hive_pstate = true;
293 bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;
294
295 if (!hive)
296 return 0;
297
298 mutex_lock(&hive->hive_lock);
299
300 if (hive->pstate == pstate) {
301 adev->pstate = is_high_pstate ? pstate : adev->pstate;
302 goto out;
303 }
304
305 dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
306
307 ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
308 if (ret) {
309 dev_err(adev->dev,
310 "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
311 adev->gmc.xgmi.node_id,
312 adev->gmc.xgmi.hive_id, ret);
313 goto out;
314 }
315
316 /* Update device pstate */
317 adev->pstate = pstate;
318
319 /*
320 * Update the hive pstate only all devices of the hive
321 * are in the same pstate
322 */
323 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
324 if (tmp_adev->pstate != adev->pstate) {
325 update_hive_pstate = false;
326 break;
327 }
328 }
329 if (update_hive_pstate || is_high_pstate)
330 hive->pstate = pstate;
331
332 out:
333 mutex_unlock(&hive->hive_lock);
334
335 return ret;
336 }
337
amdgpu_xgmi_update_topology(struct amdgpu_hive_info * hive,struct amdgpu_device * adev)338 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
339 {
340 int ret = -EINVAL;
341
342 /* Each psp need to set the latest topology */
343 ret = psp_xgmi_set_topology_info(&adev->psp,
344 hive->number_devices,
345 &adev->psp.xgmi_context.top_info);
346 if (ret)
347 dev_err(adev->dev,
348 "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
349 adev->gmc.xgmi.node_id,
350 adev->gmc.xgmi.hive_id, ret);
351
352 return ret;
353 }
354
355
amdgpu_xgmi_get_hops_count(struct amdgpu_device * adev,struct amdgpu_device * peer_adev)356 int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
357 struct amdgpu_device *peer_adev)
358 {
359 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
360 int i;
361
362 for (i = 0 ; i < top->num_nodes; ++i)
363 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
364 return top->nodes[i].num_hops;
365 return -EINVAL;
366 }
367
amdgpu_xgmi_add_device(struct amdgpu_device * adev)368 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
369 {
370 struct psp_xgmi_topology_info *top_info;
371 struct amdgpu_hive_info *hive;
372 struct amdgpu_xgmi *entry;
373 struct amdgpu_device *tmp_adev = NULL;
374
375 int count = 0, ret = 0;
376
377 if (!adev->gmc.xgmi.supported)
378 return 0;
379
380 if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
381 ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
382 if (ret) {
383 dev_err(adev->dev,
384 "XGMI: Failed to get hive id\n");
385 return ret;
386 }
387
388 ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
389 if (ret) {
390 dev_err(adev->dev,
391 "XGMI: Failed to get node id\n");
392 return ret;
393 }
394 } else {
395 adev->gmc.xgmi.hive_id = 16;
396 adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
397 }
398
399 hive = amdgpu_get_xgmi_hive(adev, 1);
400 if (!hive) {
401 ret = -EINVAL;
402 dev_err(adev->dev,
403 "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
404 adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
405 goto exit;
406 }
407
408 /* Set default device pstate */
409 adev->pstate = -1;
410
411 top_info = &adev->psp.xgmi_context.top_info;
412
413 list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
414 list_for_each_entry(entry, &hive->device_list, head)
415 top_info->nodes[count++].node_id = entry->node_id;
416 top_info->num_nodes = count;
417 hive->number_devices = count;
418
419 task_barrier_add_task(&hive->tb);
420
421 if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
422 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
423 /* update node list for other device in the hive */
424 if (tmp_adev != adev) {
425 top_info = &tmp_adev->psp.xgmi_context.top_info;
426 top_info->nodes[count - 1].node_id =
427 adev->gmc.xgmi.node_id;
428 top_info->num_nodes = count;
429 }
430 ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
431 if (ret)
432 goto exit;
433 }
434
435 /* get latest topology info for each device from psp */
436 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
437 ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
438 &tmp_adev->psp.xgmi_context.top_info);
439 if (ret) {
440 dev_err(tmp_adev->dev,
441 "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
442 tmp_adev->gmc.xgmi.node_id,
443 tmp_adev->gmc.xgmi.hive_id, ret);
444 /* To do : continue with some node failed or disable the whole hive */
445 goto exit;
446 }
447 }
448 }
449
450 if (!ret)
451 ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
452
453
454 mutex_unlock(&hive->hive_lock);
455 exit:
456 if (!ret)
457 dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
458 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
459 else
460 dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
461 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
462 ret);
463
464 return ret;
465 }
466
amdgpu_xgmi_remove_device(struct amdgpu_device * adev)467 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
468 {
469 struct amdgpu_hive_info *hive;
470
471 if (!adev->gmc.xgmi.supported)
472 return;
473
474 hive = amdgpu_get_xgmi_hive(adev, 1);
475 if (!hive)
476 return;
477
478 if (!(hive->number_devices--)) {
479 amdgpu_xgmi_sysfs_destroy(adev, hive);
480 task_barrier_destroy(&tmp->tb);
481 mutex_destroy(&hive->hive_lock);
482 mutex_destroy(&hive->reset_lock);
483 } else {
484 task_barrier_rem_task(&hive->tb);
485 amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
486 mutex_unlock(&hive->hive_lock);
487 }
488 }
489
amdgpu_xgmi_ras_late_init(struct amdgpu_device * adev)490 int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
491 {
492 int r;
493 struct ras_ih_if ih_info = {
494 .cb = NULL,
495 };
496 struct ras_fs_if fs_info = {
497 .sysfs_name = "xgmi_wafl_err_count",
498 .debugfs_name = "xgmi_wafl_err_inject",
499 };
500
501 if (!adev->gmc.xgmi.supported ||
502 adev->gmc.xgmi.num_physical_nodes == 0)
503 return 0;
504
505 if (!adev->gmc.xgmi.ras_if) {
506 adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
507 if (!adev->gmc.xgmi.ras_if)
508 return -ENOMEM;
509 adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
510 adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
511 adev->gmc.xgmi.ras_if->sub_block_index = 0;
512 strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
513 }
514 ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
515 r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
516 &fs_info, &ih_info);
517 if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
518 kfree(adev->gmc.xgmi.ras_if);
519 adev->gmc.xgmi.ras_if = NULL;
520 }
521
522 return r;
523 }
524
amdgpu_xgmi_ras_fini(struct amdgpu_device * adev)525 void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
526 {
527 if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
528 adev->gmc.xgmi.ras_if) {
529 struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
530 struct ras_ih_if ih_info = {
531 .cb = NULL,
532 };
533
534 amdgpu_ras_late_fini(adev, ras_if, &ih_info);
535 kfree(ras_if);
536 }
537 }
538