/* xref: /openbsd-src/sys/dev/pci/drm/amd/amdkfd/kfd_debug.c (revision ef9beff5ae8948e6653950871d7e3021802c23ad) */
/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES	4

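/* kfd_dbg_ev_query_debug_event: report the first pending, subscribed
 * exception for a debug enabled process.
 *
 * Queue exceptions are scanned first, then per-device exceptions, then
 * process-wide exceptions.  The first hit is returned through *queue_id,
 * *gpu_id and *event_status, and the bits in exception_clear_mask are
 * cleared from the reporting source's status.
 *
 * Returns 0 if an event was reported, -EAGAIN if nothing is pending and
 * -ENODATA if the process is not debug enabled.
 */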
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

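/* Wake the debugger by writing a single byte to the polled event file
 * descriptor.  Scheduled from kfd_dbg_ev_raise() when use_worker is set,
 * e.g. when raising events from interrupt context.
 */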
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status and write to the event
 * descriptor only if the raised exception is subscribed in the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
							process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
							exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

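/* Forward a debugger acknowledged exception to the HSA runtime.
 *
 * A memory violation replays the saved VM fault data and evicts the
 * offending pasid; EC_PROCESS_RUNTIME releases the semaphore the runtime
 * enable path is blocked on.  Any remaining reason bits are sent to the
 * runtime as an exception event.
 */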
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * Unblocking the runtime here should only happen after the
		 * debugger has received the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

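/* Toggle the CWSR workaround on a single queue via an MQD update.
 * Queues carrying a user CU mask cannot take the workaround, so enabling
 * it on such a queue fails with -EBUSY.
 */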
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

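/* Apply or remove the CWSR workaround on every queue of the target
 * process.  A failed enable unwinds the queues already updated and
 * records a busy/error runtime state on the target.
 */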
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

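/* Push the per-process debug state (SPI debug control, watch points and
 * debug flags) to the MES scheduler on devices with per-VMID debug
 * support.
 */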
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

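/* Watch point IDs are allocated from a small per-device bitmask shared by
 * all debugged processes.  Each process-device also tracks its own
 * allocations so it can only clear watch points it actually owns.
 */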
#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

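/* Clear a device address watch point owned by this process.  Without MES,
 * the queues are unmapped around the register write; GFX off is also
 * suspended so the write is not lost while the GFX block is powered down.
 */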
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

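/* Program a device address watch point on every XCC instance of the
 * device.  On failure the watch ID is released again; see the comment
 * below on the missing HW rollback.
 */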
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback, but release the watchpoint anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

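/* Update the debug trap flags for the target process and refresh the
 * debug state on every per-VMID capable device.  *flags returns the
 * previous flags; on failure the old flags are restored and the devices
 * already refreshed are rewound.
 */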
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		if unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if RLC restore is not supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

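/* Detach the debugger from the target process: deactivate trap handling
 * if the runtime is still enabled, drop the event file and debugger
 * bookkeeping, clear pending exception state and release the debug
 * session's process reference.
 */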
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime enable if the runtime is not enabled;
	 * otherwise reset an attached, running target's runtime state back to
	 * enabled so a debugger can re-attach later.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

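/* Activate trap handling on every device of the target process: reserve
 * a debug VMID where per-VMID debugging is unsupported, enable the trap
 * handler and refresh the runlist or MES debug state.  Activation is all
 * or nothing; any failure unwinds the devices enabled so far.
 */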
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

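/* Enable debug trapping for the target process.  fd is the event file
 * descriptor written on raised exceptions; activation itself is deferred
 * until the runtime reports enabled.  The runtime info is copied back to
 * the debugger and its full size returned in *runtime_size.
 */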
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime enable if the runtime is not enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

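/* Ask every device to validate the requested wave launch trap override
 * and narrow the supported trap mask; requests containing unsupported
 * mask bits are rejected with -EACCES.
 */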
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

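/* Apply a validated wave launch trap override on all devices of the
 * target process, returning the previous trap mask through
 * *trap_mask_prev.
 */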
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

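/* Set the wave launch mode (normal, halt or debug) on all devices of the
 * target process.
 */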
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

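/* Copy the info payload of a queue, device or process exception to the
 * debugger.  source_id names a queue or GPU depending on the exception
 * type; *info_size returns the actual payload size, and clear_exception
 * drops the exception bit (and any saved VM fault data) after the copy.
 */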
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

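/* Fill a snapshot of per-device debug info for the debugger.  At most
 * *number_of_device_infos entries are copied, each truncated to the
 * caller's *entry_size; both are updated to the sizes the kernel
 * supports so the caller can detect truncation.
 */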
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Walk all pdds of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

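/* Update the process's exception subscription mask.  If an already
 * pending queue, device or process exception matches the new mask, the
 * event file is written so the debugger polls the backlog immediately.
 */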
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}