/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES	4

int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
			unsigned int *queue_id,
			unsigned int *gpu_id,
			uint64_t exception_clear_mask,
			uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}
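
/*
 * Worker used when the notification cannot be written from the caller's
 * context: writes a single '.' byte to the debugger's event file descriptor
 * so a debugger polling that descriptor is woken up.
 */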
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
						  exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}
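
/*
 * Forward exception reasons to the HSA runtime: replay a saved memory
 * violation as a VM fault on the matching device, release the runtime-enable
 * semaphore for EC_PROCESS_RUNTIME, and hand any remaining reasons to
 * kfd_send_exception_to_runtime().
 */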
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				unsigned int dev_id,
				unsigned int queue_id,
				uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}
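
/*
 * Push the current per-process debug configuration (SPI debug override and
 * launch mode, watch points and debug flags) to the MES firmware.  Returns
 * immediately on devices without per-VMID debug support.
 */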
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}
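
/*
 * Allocate a free device watch point slot and program it on every XCC
 * instance.  Without MES the queues are unmapped around the register write;
 * with MES the updated watch point set is handed to the firmware instead.
 */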
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}
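
/*
 * Apply the requested debug trap flags to every device of the target process
 * and return the previous flags to the caller.  If any device fails to take
 * the update, the devices already updated are rolled back to the old flags.
 */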
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}
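
/*
 * Drain pending interrupts and clear all latched exception status on the
 * process, its devices and its queues; called when the debug session is
 * torn down.
 */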
static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}
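
/*
 * Detach the debugger from the target process: deactivate traps if the
 * runtime is still enabled, drop the event file descriptor and the extra
 * process reference taken at enable time, and clear leftover exceptions.
 */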
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime not enabled otherwise reset
	 * attached running target runtime state to enable for re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
				target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
							pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}
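
/*
 * Attach the debugger to the target process.  Trap activation is deferred
 * until the runtime reports itself enabled; here the event file descriptor
 * is taken, an extra process reference is held for the debug session and
 * the runtime info is copied back to the debugger.
 */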
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}
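
/*
 * Validate a requested wave launch trap override against every device in the
 * process and report back which trap mask bits are actually supported.
 */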
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}
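
/*
 * Set the wave launch mode (normal, halt or debug) on every device of the
 * target process and refresh the run list (or MES debug settings) so the new
 * mode takes effect.
 */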
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}