/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include <drm/drm_drv.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"

static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;

	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);

		/* Effectively the job is aborted as the device is gone */
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	memset(&ti, 0, sizeof(struct amdgpu_task_info));
	adev->job_hang = true;

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		goto exit;
	}

	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
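
	/*
	 * If GPU recovery is enabled for this device, build a minimal reset
	 * context and hand the hung job to the reset path; otherwise just
	 * pause the scheduler timeout and, for SR-IOV guests, flag TDR
	 * debugging.
	 */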
	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			DRM_ERROR("GPU Recovery Failed: %d\n", r);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	adev->job_hang = false;
	drm_dev_exit(idx);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}

int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		     struct drm_sched_entity *entity, void *owner,
		     unsigned int num_ibs, struct amdgpu_job **job)
{
	if (num_ibs == 0)
		return -EINVAL;

	*job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;

	amdgpu_sync_create(&(*job)->explicit_sync);
	(*job)->generation = amdgpu_vm_generation(adev, vm);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	if (!entity)
		return 0;

	return drm_sched_job_init(&(*job)->base, entity, owner);
}

int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
			     struct drm_sched_entity *entity, void *owner,
			     size_t size, enum amdgpu_ib_pool_type pool_type,
			     struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job);
	if (r)
		return r;

	(*job)->num_ibs = 1;
	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r) {
		if (entity)
			drm_sched_job_cleanup(&(*job)->base);
		kfree(*job);
	}

	return r;
}

void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds,
			      struct amdgpu_bo *gws, struct amdgpu_bo *oa)
{
	if (gds) {
		job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
		job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT;
	}
	if (gws) {
		job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT;
		job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT;
	}
	if (oa) {
		job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT;
		job->oa_size = amdgpu_bo_size(oa) >> PAGE_SHIFT;
	}
}
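
/*
 * Release the IBs attached to a job. The fence passed to amdgpu_ib_free()
 * (the scheduler's finished fence or the embedded hw fence, whichever was
 * initialized) keeps the IB memory reserved until the job has completed.
 */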
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct dma_fence *f;
	unsigned i;

	/* Check if any fences were initialized */
	if (job->base.s_fence && job->base.s_fence->finished.ops)
		f = &job->base.s_fence->finished;
	else if (job->hw_fence.ops)
		f = &job->hw_fence;
	else
		f = NULL;

	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(NULL, &job->ibs[i], f);
}

static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	amdgpu_sync_free(&job->explicit_sync);

	/* Only put the hw fence if the job has an embedded fence */
	if (!job->hw_fence.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
}

void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
				struct amdgpu_job *leader)
{
	struct dma_fence *fence = &leader->base.s_fence->scheduled;

	WARN_ON(job->gang_submit);

	/*
	 * Don't add a reference when we are the gang leader to avoid a
	 * circular dependency.
	 */
	if (job != leader)
		dma_fence_get(fence);
	job->gang_submit = fence;
}

void amdgpu_job_free(struct amdgpu_job *job)
{
	if (job->base.entity)
		drm_sched_job_cleanup(&job->base);

	amdgpu_job_free_resources(job);
	amdgpu_sync_free(&job->explicit_sync);
	if (job->gang_submit != &job->base.s_fence->scheduled)
		dma_fence_put(job->gang_submit);

	if (!job->hw_fence.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
}

struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
{
	struct dma_fence *f;

	drm_sched_job_arm(&job->base);
	f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base);

	return f;
}

int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence);

	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}
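
/*
 * Scheduler .prepare_job callback: returns a fence the scheduler must wait
 * on before running the job (a gang switch or a VMID grab), or NULL once
 * the job is ready; on error the job's finished fence carries the error.
 */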
static struct dma_fence *
amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
		       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct dma_fence *fence = NULL;
	int r;

	r = drm_sched_entity_error(s_entity);
	if (r)
		goto error;

	if (!fence && job->gang_submit)
		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);

	while (!fence && job->vm && !job->vmid) {
		r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
		if (r) {
			DRM_ERROR("Error getting VM ID (%d)\n", r);
			goto error;
		}
	}

	return fence;

error:
	dma_fence_set_error(&job->base.s_fence->finished, r);
	return NULL;
}

static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct amdgpu_device *adev = ring->adev;
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	trace_amdgpu_sched_run_job(job);

	/* Skip job if VRAM is lost and never resubmit gangs */
	if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
	    (job->job_run_counter && job->gang_submit))
		dma_fence_set_error(finished, -ECANCELED);

	if (finished->error < 0) {
		dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
			ring->name);
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			dev_err(adev->dev,
				"Error scheduling IBs (%d) in ring(%s)", r,
				ring->name);
	}

	job->job_run_counter++;
	amdgpu_job_free_resources(job);
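
	/* Propagate a scheduling error to the scheduler as an ERR_PTR fence */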
	fence = r ? ERR_PTR(r) : fence;
	return fence;
}

#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];
		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->pending_list, list) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.prepare_job = amdgpu_job_prepare_job,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};