/*	$NetBSD: amdgpu_cs.c,v 1.3 2018/08/27 14:04:50 riastradh Exp $	*/

/*
 * Copyright 2008 Jerome Glisse.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Jerome Glisse <glisse@freedesktop.org>
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_cs.c,v 1.3 2018/08/27 14:04:50 riastradh Exp $");

#include <linux/list_sort.h>
#include <drm/drmP.h>
#include <drm/amdgpu_drm.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"

#define AMDGPU_CS_MAX_PRIORITY		32u
#define AMDGPU_CS_NUM_BUCKETS		(AMDGPU_CS_MAX_PRIORITY + 1)

/* This is based on the bucket sort with O(n) time complexity.
 * An item with priority "i" is added to bucket[i]. The lists are then
 * concatenated in descending order.
 */
struct amdgpu_cs_buckets {
	struct list_head bucket[AMDGPU_CS_NUM_BUCKETS];
};

static void amdgpu_cs_buckets_init(struct amdgpu_cs_buckets *b)
{
	unsigned i;

	for (i = 0; i < AMDGPU_CS_NUM_BUCKETS; i++)
		INIT_LIST_HEAD(&b->bucket[i]);
}

static void amdgpu_cs_buckets_add(struct amdgpu_cs_buckets *b,
				  struct list_head *item, unsigned priority)
{
	/* Since buffers which appear sooner in the relocation list are
	 * likely to be used more often than buffers which appear later
	 * in the list, the sort mustn't change the ordering of buffers
	 * with the same priority, i.e. it must be stable.
	 */
	list_add_tail(item, &b->bucket[min(priority, AMDGPU_CS_MAX_PRIORITY)]);
}

static void amdgpu_cs_buckets_get_list(struct amdgpu_cs_buckets *b,
				       struct list_head *out_list)
{
	unsigned i;

	/* Connect the sorted buckets in the output list. */
	for (i = 0; i < AMDGPU_CS_NUM_BUCKETS; i++) {
		list_splice(&b->bucket[i], out_list);
	}
}

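/**
 * amdgpu_cs_get_ring - look up the ring for an IP block and ring index
 *
 * @adev: amdgpu device
 * @ip_type: hardware IP type (GFX, COMPUTE, DMA, UVD, VCE)
 * @ip_instance: IP instance, currently only instance 0 is supported
 * @ring: ring index within the IP block
 * @out_ring: resulting ring, only valid on success
 *
 * Validates the (ip_type, ip_instance, ring) triple supplied by userspace
 * and returns the matching initialized ring, or -EINVAL if the request
 * does not correspond to a usable ring.
 */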
int amdgpu_cs_get_ring(struct amdgpu_device *adev, u32 ip_type,
		       u32 ip_instance, u32 ring,
		       struct amdgpu_ring **out_ring)
{
	/* Right now all IPs have only one instance - multiple rings. */
	if (ip_instance != 0) {
		DRM_ERROR("invalid ip instance: %d\n", ip_instance);
		return -EINVAL;
	}

	switch (ip_type) {
	default:
		DRM_ERROR("unknown ip type: %d\n", ip_type);
		return -EINVAL;
	case AMDGPU_HW_IP_GFX:
		if (ring < adev->gfx.num_gfx_rings) {
			*out_ring = &adev->gfx.gfx_ring[ring];
		} else {
			DRM_ERROR("only %d gfx rings are supported now\n",
				  adev->gfx.num_gfx_rings);
			return -EINVAL;
		}
		break;
	case AMDGPU_HW_IP_COMPUTE:
		if (ring < adev->gfx.num_compute_rings) {
			*out_ring = &adev->gfx.compute_ring[ring];
		} else {
			DRM_ERROR("only %d compute rings are supported now\n",
				  adev->gfx.num_compute_rings);
			return -EINVAL;
		}
		break;
	case AMDGPU_HW_IP_DMA:
		if (ring < adev->sdma.num_instances) {
			*out_ring = &adev->sdma.instance[ring].ring;
		} else {
			DRM_ERROR("only %d SDMA rings are supported\n",
				  adev->sdma.num_instances);
			return -EINVAL;
		}
		break;
	case AMDGPU_HW_IP_UVD:
		*out_ring = &adev->uvd.ring;
		break;
	case AMDGPU_HW_IP_VCE:
		if (ring < 2) {
			*out_ring = &adev->vce.ring[ring];
		} else {
			DRM_ERROR("only two VCE rings are supported\n");
			return -EINVAL;
		}
		break;
	}

	if (!(*out_ring && (*out_ring)->adev)) {
		DRM_ERROR("Ring %d is not initialized on IP %d\n",
			  ring, ip_type);
		return -EINVAL;
	}

	return 0;
}

static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
				      struct drm_amdgpu_cs_chunk_fence *fence_data)
{
	struct drm_gem_object *gobj;
	uint32_t handle __unused;

	handle = fence_data->handle;
	gobj = drm_gem_object_lookup(p->adev->ddev, p->filp,
				     fence_data->handle);
	if (gobj == NULL)
		return -EINVAL;

	p->uf.bo = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
	p->uf.offset = fence_data->offset;

	if (amdgpu_ttm_tt_has_userptr(p->uf.bo->tbo.ttm)) {
		drm_gem_object_unreference_unlocked(gobj);
		return -EINVAL;
	}

	p->uf_entry.robj = amdgpu_bo_ref(p->uf.bo);
	p->uf_entry.prefered_domains = AMDGPU_GEM_DOMAIN_GTT;
	p->uf_entry.allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
	p->uf_entry.priority = 0;
	p->uf_entry.tv.bo = &p->uf_entry.robj->tbo;
	p->uf_entry.tv.shared = true;

	drm_gem_object_unreference_unlocked(gobj);
	return 0;
}

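/**
 * amdgpu_cs_parser_init - initialize a parser from a CS ioctl request
 *
 * @p: parser to initialize, adev and filp must already be set
 * @data: the union drm_amdgpu_cs passed in by userspace
 *
 * Looks up the context and BO list and copies in the chunk array and the
 * per-chunk data. On error everything allocated so far is freed again; on
 * success the caller is responsible for calling amdgpu_cs_parser_fini().
 */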
int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
{
	union drm_amdgpu_cs *cs = data;
	uint64_t *chunk_array_user;
	uint64_t *chunk_array;
	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
	unsigned size;
	int i;
	int ret;

	if (cs->in.num_chunks == 0)
		return 0;

	chunk_array = kmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
	if (!chunk_array)
		return -ENOMEM;

	p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
	if (!p->ctx) {
		ret = -EINVAL;
		goto free_chunk;
	}

	p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle);

	/* get chunks */
	INIT_LIST_HEAD(&p->validated);
	chunk_array_user = (uint64_t __user *)(unsigned long)(cs->in.chunks);
	if (copy_from_user(chunk_array, chunk_array_user,
			   sizeof(uint64_t)*cs->in.num_chunks)) {
		ret = -EFAULT;
		goto put_bo_list;
	}

	p->nchunks = cs->in.num_chunks;
	p->chunks = kmalloc_array(p->nchunks, sizeof(struct amdgpu_cs_chunk),
				  GFP_KERNEL);
	if (!p->chunks) {
		ret = -ENOMEM;
		goto put_bo_list;
	}

	for (i = 0; i < p->nchunks; i++) {
		struct drm_amdgpu_cs_chunk __user **chunk_ptr = NULL;
		struct drm_amdgpu_cs_chunk user_chunk;
		uint32_t __user *cdata;

		chunk_ptr = (void __user *)(unsigned long)chunk_array[i];
		if (copy_from_user(&user_chunk, chunk_ptr,
				   sizeof(struct drm_amdgpu_cs_chunk))) {
			ret = -EFAULT;
			i--;
			goto free_partial_kdata;
		}
		p->chunks[i].chunk_id = user_chunk.chunk_id;
		p->chunks[i].length_dw = user_chunk.length_dw;

		size = p->chunks[i].length_dw;
		cdata = (void __user *)(unsigned long)user_chunk.chunk_data;
		p->chunks[i].user_ptr = cdata;

		p->chunks[i].kdata = drm_malloc_ab(size, sizeof(uint32_t));
		if (p->chunks[i].kdata == NULL) {
			ret = -ENOMEM;
			i--;
			goto free_partial_kdata;
		}
		size *= sizeof(uint32_t);
		if (copy_from_user(p->chunks[i].kdata, cdata, size)) {
			ret = -EFAULT;
			goto free_partial_kdata;
		}

		switch (p->chunks[i].chunk_id) {
		case AMDGPU_CHUNK_ID_IB:
			p->num_ibs++;
			break;

		case AMDGPU_CHUNK_ID_FENCE:
			size = sizeof(struct drm_amdgpu_cs_chunk_fence);
			if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
				ret = -EINVAL;
				goto free_partial_kdata;
			}

			ret = amdgpu_cs_user_fence_chunk(p, (void *)p->chunks[i].kdata);
			if (ret)
				goto free_partial_kdata;

			break;

		case AMDGPU_CHUNK_ID_DEPENDENCIES:
			break;

		default:
			ret = -EINVAL;
			goto free_partial_kdata;
		}
	}


	p->ibs = kcalloc(p->num_ibs, sizeof(struct amdgpu_ib), GFP_KERNEL);
	if (!p->ibs) {
		ret = -ENOMEM;
		goto free_all_kdata;
	}

	kfree(chunk_array);
	return 0;

free_all_kdata:
	i = p->nchunks - 1;
free_partial_kdata:
	for (; i >= 0; i--)
		drm_free_large(p->chunks[i].kdata);
	kfree(p->chunks);
put_bo_list:
	if (p->bo_list)
		amdgpu_bo_list_put(p->bo_list);
	amdgpu_ctx_put(p->ctx);
free_chunk:
	kfree(chunk_array);

	return ret;
}

/* Returns how many bytes TTM can move per IB.
 */
static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
{
	u64 real_vram_size = adev->mc.real_vram_size;
	u64 vram_usage = atomic64_read(&adev->vram_usage);

	/* This function is based on the current VRAM usage.
	 *
	 * - If all of VRAM is free, allow relocating the number of bytes that
	 *   is equal to 1/4 of the size of VRAM for this IB.
	 *
	 * - If more than one half of VRAM is occupied, only allow relocating
	 *   1 MB of data for this IB.
	 *
	 * - From 0 to one half of used VRAM, the threshold decreases
	 *   linearly.
	 *         __________________
	 * 1/4 of -|\               |
	 * VRAM    | \              |
	 *         |  \             |
	 *         |   \            |
	 *         |    \           |
	 *         |     \          |
	 *         |      \         |
	 *         |       \________|1 MB
	 *         |----------------|
	 *    VRAM 0 %             100 %
	 *         used            used
	 *
	 * Note: It's a threshold, not a limit. The threshold must be crossed
	 * for buffer relocations to stop, so any buffer of an arbitrary size
	 * can be moved as long as the threshold isn't crossed before
	 * the relocation takes place. We don't want to disable buffer
	 * relocations completely.
	 *
	 * The idea is that buffers should be placed in VRAM at creation time
	 * and TTM should only do a minimum number of relocations during
	 * command submission. In practice, you need to submit at least
	 * a dozen IBs to move all buffers to VRAM if they are in GTT.
	 *
	 * Also, things can get pretty crazy under memory pressure and actual
	 * VRAM usage can change a lot, so playing safe even at 50% does
	 * consistently increase performance.
	 */

	u64 half_vram = real_vram_size >> 1;
	u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
	u64 bytes_moved_threshold = half_free_vram >> 1;
	return max(bytes_moved_threshold, 1024*1024ull);
}

static
int amdgpu_cs_list_validate(struct amdgpu_device *adev,
			    struct amdgpu_vm *vm,
			    struct list_head *validated)
{
	struct amdgpu_bo_list_entry *lobj;
	struct amdgpu_bo *bo;
	u64 bytes_moved = 0, initial_bytes_moved;
	u64 bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(adev);
	int r;

	list_for_each_entry(lobj, validated, tv.head) {
		bo = lobj->robj;
		if (!bo->pin_count) {
			u32 domain = lobj->prefered_domains;
			u32 current_domain =
				amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);

			/* Check if this buffer will be moved and don't move it
			 * if we have moved too many buffers for this IB already.
			 *
			 * Note that this allows moving at least one buffer of
			 * any size, because it doesn't take the current "bo"
			 * into account. We don't want to disallow buffer moves
			 * completely.
			 */
			if ((lobj->allowed_domains & current_domain) != 0 &&
			    (domain & current_domain) == 0 && /* will be moved */
			    bytes_moved > bytes_moved_threshold) {
				/* don't move it */
				domain = current_domain;
			}

		retry:
			amdgpu_ttm_placement_from_domain(bo, domain);
			initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
			r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
			bytes_moved += atomic64_read(&adev->num_bytes_moved) -
				       initial_bytes_moved;

			if (unlikely(r)) {
				if (r != -ERESTARTSYS && domain != lobj->allowed_domains) {
					domain = lobj->allowed_domains;
					goto retry;
				}
				return r;
			}
		}
		lobj->bo_va = amdgpu_vm_bo_find(vm, bo);
	}
	return 0;
}

static int amdgpu_cs_parser_relocs(struct amdgpu_cs_parser *p)
{
	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
	struct amdgpu_cs_buckets buckets;
	struct list_head duplicates;
	bool need_mmap_lock __diagused = false;
	int i, r;

	if (p->bo_list) {
		need_mmap_lock = p->bo_list->has_userptr;
		amdgpu_cs_buckets_init(&buckets);
		for (i = 0; i < p->bo_list->num_entries; i++)
			amdgpu_cs_buckets_add(&buckets, &p->bo_list->array[i].tv.head,
					      p->bo_list->array[i].priority);

		amdgpu_cs_buckets_get_list(&buckets, &p->validated);
	}

	p->vm_bos = amdgpu_vm_get_bos(p->adev, &fpriv->vm,
				      &p->validated);

	if (p->uf.bo)
		list_add(&p->uf_entry.tv.head, &p->validated);

#ifdef __NetBSD__
	KASSERTMSG(!need_mmap_lock,
	    "someone didn't finish adding support for userptr"
	    " and it wasn't me");
#else
	if (need_mmap_lock)
		down_read(&current->mm->mmap_sem);
#endif

	INIT_LIST_HEAD(&duplicates);
	r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true, &duplicates);
	if (unlikely(r != 0))
		goto error_reserve;

	r = amdgpu_cs_list_validate(p->adev, &fpriv->vm, &p->validated);
	if (r)
		goto error_validate;

	r = amdgpu_cs_list_validate(p->adev, &fpriv->vm, &duplicates);

error_validate:
	if (r)
		ttm_eu_backoff_reservation(&p->ticket, &p->validated);

error_reserve:
#ifndef __NetBSD__
	if (need_mmap_lock)
		up_read(&current->mm->mmap_sem);
#endif

	return r;
}

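/* Sync the first IB with the fences attached to all reserved buffers so the
 * submission is ordered against prior users of those BOs.
 */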
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
{
	struct amdgpu_bo_list_entry *e;
	int r;

	list_for_each_entry(e, &p->validated, tv.head) {
		struct reservation_object *resv = e->robj->tbo.resv;
		r = amdgpu_sync_resv(p->adev, &p->ibs[0].sync, resv, p->filp);

		if (r)
			return r;
	}
	return 0;
}

static int cmp_size_smaller_first(void *priv, struct list_head *a,
				  struct list_head *b)
{
	struct amdgpu_bo_list_entry *la = list_entry(a, struct amdgpu_bo_list_entry, tv.head);
	struct amdgpu_bo_list_entry *lb = list_entry(b, struct amdgpu_bo_list_entry, tv.head);

	/* Sort A before B if A is smaller. */
	return (int)la->robj->tbo.num_pages - (int)lb->robj->tbo.num_pages;
}

/**
 * amdgpu_cs_parser_fini() - clean parser states
 * @parser:	parser structure holding parsing context.
 * @error:	error number
 * @backoff:	indicator to backoff the reservation
 *
 * If error is set, then unreserve the buffers, otherwise just free the
 * memory used by the parsing context.
 **/
static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, bool backoff)
{
	unsigned i;

	if (!error) {
		/* Sort the buffer list from the smallest to largest buffer,
		 * which affects the order of buffers in the LRU list.
		 * This assures that the smallest buffers are added first
		 * to the LRU list, so they are likely to be later evicted
		 * first, instead of large buffers whose eviction is more
		 * expensive.
		 *
		 * This slightly lowers the number of bytes moved by TTM
		 * per frame under memory pressure.
		 */
		list_sort(NULL, &parser->validated, cmp_size_smaller_first);

		ttm_eu_fence_buffer_objects(&parser->ticket,
					    &parser->validated,
					    parser->fence);
	} else if (backoff) {
		ttm_eu_backoff_reservation(&parser->ticket,
					   &parser->validated);
	}
	fence_put(parser->fence);

	if (parser->ctx)
		amdgpu_ctx_put(parser->ctx);
	if (parser->bo_list)
		amdgpu_bo_list_put(parser->bo_list);

	drm_free_large(parser->vm_bos);
	for (i = 0; i < parser->nchunks; i++)
		drm_free_large(parser->chunks[i].kdata);
	kfree(parser->chunks);
	if (parser->ibs)
		for (i = 0; i < parser->num_ibs; i++)
			amdgpu_ib_free(parser->adev, &parser->ibs[i]);
	kfree(parser->ibs);
	amdgpu_bo_unref(&parser->uf.bo);
	amdgpu_bo_unref(&parser->uf_entry.robj);
}

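/* Bring the per-process VM up to date for this submission: update the page
 * directory, clear freed mappings and update the page tables of every BO in
 * the list, making the first IB wait for the resulting PT updates.
 */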
static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p,
				   struct amdgpu_vm *vm)
{
	struct amdgpu_device *adev = p->adev;
	struct amdgpu_bo_va *bo_va;
	struct amdgpu_bo *bo;
	int i, r;

	r = amdgpu_vm_update_page_directory(adev, vm);
	if (r)
		return r;

	r = amdgpu_sync_fence(adev, &p->ibs[0].sync, vm->page_directory_fence);
	if (r)
		return r;

	r = amdgpu_vm_clear_freed(adev, vm);
	if (r)
		return r;

	if (p->bo_list) {
		for (i = 0; i < p->bo_list->num_entries; i++) {
			struct fence *f;

			/* ignore duplicates */
			bo = p->bo_list->array[i].robj;
			if (!bo)
				continue;

			bo_va = p->bo_list->array[i].bo_va;
			if (bo_va == NULL)
				continue;

			r = amdgpu_vm_bo_update(adev, bo_va, &bo->tbo.mem);
			if (r)
				return r;

			f = bo_va->last_pt_update;
			r = amdgpu_sync_fence(adev, &p->ibs[0].sync, f);
			if (r)
				return r;
		}

	}

	r = amdgpu_vm_clear_invalids(adev, vm, &p->ibs[0].sync);

	if (amdgpu_vm_debug && p->bo_list) {
		/* Invalidate all BOs to test for userspace bugs */
		for (i = 0; i < p->bo_list->num_entries; i++) {
			/* ignore duplicates */
			bo = p->bo_list->array[i].robj;
			if (!bo)
				continue;

			amdgpu_vm_bo_invalidate(adev, bo);
		}
	}

	return r;
}

static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev,
				 struct amdgpu_cs_parser *parser)
{
	struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
	struct amdgpu_vm *vm = &fpriv->vm;
	struct amdgpu_ring *ring;
	int i, r;

	if (parser->num_ibs == 0)
		return 0;

	/* Only for UVD/VCE VM emulation */
	for (i = 0; i < parser->num_ibs; i++) {
		ring = parser->ibs[i].ring;
		if (ring->funcs->parse_cs) {
			r = amdgpu_ring_parse_cs(ring, parser, i);
			if (r)
				return r;
		}
	}

	r = amdgpu_bo_vm_update_pte(parser, vm);
	if (!r)
		amdgpu_cs_sync_rings(parser);

	return r;
}

static int amdgpu_cs_handle_lockup(struct amdgpu_device *adev, int r)
{
	if (r == -EDEADLK) {
		r = amdgpu_gpu_reset(adev);
		if (!r)
			r = -EAGAIN;
	}
	return r;
}

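/* Translate the IB chunks into amdgpu_ib structures.  For rings with a CS
 * parser (UVD/VCE VM emulation) the IB contents are copied so they can be
 * patched; for the other rings the IB is referenced by its GPU address.
 * The first IB also carries the GDS/GWS/OA resources and the last one the
 * optional user fence.
 */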
static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
			     struct amdgpu_cs_parser *parser)
{
	struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
	struct amdgpu_vm *vm = &fpriv->vm;
	int i, j;
	int r;

	for (i = 0, j = 0; i < parser->nchunks && j < parser->num_ibs; i++) {
		struct amdgpu_cs_chunk *chunk;
		struct amdgpu_ib *ib;
		struct drm_amdgpu_cs_chunk_ib *chunk_ib;
		struct amdgpu_ring *ring;

		chunk = &parser->chunks[i];
		ib = &parser->ibs[j];
		chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;

		if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
			continue;

		r = amdgpu_cs_get_ring(adev, chunk_ib->ip_type,
				       chunk_ib->ip_instance, chunk_ib->ring,
				       &ring);
		if (r)
			return r;

		if (ring->funcs->parse_cs) {
			struct amdgpu_bo_va_mapping *m;
			struct amdgpu_bo *aobj = NULL;
			uint64_t offset;
			uint8_t *kptr;

			m = amdgpu_cs_find_mapping(parser, chunk_ib->va_start,
						   &aobj);
			if (!aobj) {
				DRM_ERROR("IB va_start is invalid\n");
				return -EINVAL;
			}

			if ((chunk_ib->va_start + chunk_ib->ib_bytes) >
			    (m->it.last + 1) * AMDGPU_GPU_PAGE_SIZE) {
				DRM_ERROR("IB va_start+ib_bytes is invalid\n");
				return -EINVAL;
			}

			/* the IB should be reserved at this point */
			r = amdgpu_bo_kmap(aobj, (void **)&kptr);
			if (r) {
				return r;
			}

			offset = ((uint64_t)m->it.start) * AMDGPU_GPU_PAGE_SIZE;
			kptr += chunk_ib->va_start - offset;

			r = amdgpu_ib_get(ring, NULL, chunk_ib->ib_bytes, ib);
			if (r) {
				DRM_ERROR("Failed to get ib !\n");
				return r;
			}

			memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
			amdgpu_bo_kunmap(aobj);
		} else {
			r = amdgpu_ib_get(ring, vm, 0, ib);
			if (r) {
				DRM_ERROR("Failed to get ib !\n");
				return r;
			}

			ib->gpu_addr = chunk_ib->va_start;
		}

		ib->length_dw = chunk_ib->ib_bytes / 4;
		ib->flags = chunk_ib->flags;
		ib->ctx = parser->ctx;
		j++;
	}

	if (!parser->num_ibs)
		return 0;

	/* add GDS resources to first IB */
	if (parser->bo_list) {
		struct amdgpu_bo *gds = parser->bo_list->gds_obj;
		struct amdgpu_bo *gws = parser->bo_list->gws_obj;
		struct amdgpu_bo *oa = parser->bo_list->oa_obj;
		struct amdgpu_ib *ib = &parser->ibs[0];

		if (gds) {
			ib->gds_base = amdgpu_bo_gpu_offset(gds);
			ib->gds_size = amdgpu_bo_size(gds);
		}
		if (gws) {
			ib->gws_base = amdgpu_bo_gpu_offset(gws);
			ib->gws_size = amdgpu_bo_size(gws);
		}
		if (oa) {
			ib->oa_base = amdgpu_bo_gpu_offset(oa);
			ib->oa_size = amdgpu_bo_size(oa);
		}
	}
	/* wrap the last IB with user fence */
	if (parser->uf.bo) {
		struct amdgpu_ib *ib = &parser->ibs[parser->num_ibs - 1];

		/* UVD & VCE fw doesn't support user fences */
		if (ib->ring->type == AMDGPU_RING_TYPE_UVD ||
		    ib->ring->type == AMDGPU_RING_TYPE_VCE)
			return -EINVAL;

		ib->user = &parser->uf;
	}

	return 0;
}

static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
				  struct amdgpu_cs_parser *p)
{
	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
	struct amdgpu_ib *ib;
	int i, j, r;

	if (!p->num_ibs)
		return 0;

	/* Add dependencies to first IB */
	ib = &p->ibs[0];
	for (i = 0; i < p->nchunks; ++i) {
		struct drm_amdgpu_cs_chunk_dep *deps;
		struct amdgpu_cs_chunk *chunk;
		unsigned num_deps;

		chunk = &p->chunks[i];

		if (chunk->chunk_id != AMDGPU_CHUNK_ID_DEPENDENCIES)
			continue;

		deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata;
		num_deps = chunk->length_dw * 4 /
			sizeof(struct drm_amdgpu_cs_chunk_dep);

		for (j = 0; j < num_deps; ++j) {
			struct amdgpu_ring *ring;
			struct amdgpu_ctx *ctx;
			struct fence *fence;

			r = amdgpu_cs_get_ring(adev, deps[j].ip_type,
					       deps[j].ip_instance,
					       deps[j].ring, &ring);
			if (r)
				return r;

			ctx = amdgpu_ctx_get(fpriv, deps[j].ctx_id);
			if (ctx == NULL)
				return -EINVAL;

			fence = amdgpu_ctx_get_fence(ctx, ring,
						     deps[j].handle);
			if (IS_ERR(fence)) {
				r = PTR_ERR(fence);
				amdgpu_ctx_put(ctx);
				return r;

			} else if (fence) {
				r = amdgpu_sync_fence(adev, &ib->sync, fence);
				fence_put(fence);
				amdgpu_ctx_put(ctx);
				if (r)
					return r;
			}
		}
	}

	return 0;
}

static int amdgpu_cs_free_job(struct amdgpu_job *job)
{
	int i;
	if (job->ibs)
		for (i = 0; i < job->num_ibs; i++)
			amdgpu_ib_free(job->adev, &job->ibs[i]);
	kfree(job->ibs);
	if (job->uf.bo)
		amdgpu_bo_unref(&job->uf.bo);
	return 0;
}

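/**
 * amdgpu_cs_ioctl - submit command buffers
 *
 * @dev: drm device
 * @data: the union drm_amdgpu_cs from userspace
 * @filp: file private
 *
 * Validate the buffer list, fill in the IBs and their dependencies and hand
 * the job to the GPU scheduler, or schedule the IBs directly when the
 * scheduler is disabled.  Returns the submission handle in cs->out.handle.
 */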
int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
{
	struct amdgpu_device *adev = dev->dev_private;
	union drm_amdgpu_cs *cs = data;
	struct amdgpu_cs_parser parser = {};
	bool reserved_buffers = false;
	int i, r;

	if (!adev->accel_working)
		return -EBUSY;

	parser.adev = adev;
	parser.filp = filp;

	r = amdgpu_cs_parser_init(&parser, data);
	if (r) {
		DRM_ERROR("Failed to initialize parser !\n");
		amdgpu_cs_parser_fini(&parser, r, false);
		r = amdgpu_cs_handle_lockup(adev, r);
		return r;
	}
	r = amdgpu_cs_parser_relocs(&parser);
	if (r == -ENOMEM)
		DRM_ERROR("Not enough memory for command submission!\n");
	else if (r && r != -ERESTARTSYS)
		DRM_ERROR("Failed to process the buffer list %d!\n", r);
	else if (!r) {
		reserved_buffers = true;
		r = amdgpu_cs_ib_fill(adev, &parser);
	}

	if (!r) {
		r = amdgpu_cs_dependencies(adev, &parser);
		if (r)
			DRM_ERROR("Failed in the dependencies handling %d!\n", r);
	}

	if (r)
		goto out;

	for (i = 0; i < parser.num_ibs; i++)
		trace_amdgpu_cs(&parser, i);

	r = amdgpu_cs_ib_vm_chunk(adev, &parser);
	if (r)
		goto out;

	if (amdgpu_enable_scheduler && parser.num_ibs) {
		struct amdgpu_ring *ring = parser.ibs->ring;
		struct amd_sched_fence *fence;
		struct amdgpu_job *job;

		job = kzalloc(sizeof(struct amdgpu_job), GFP_KERNEL);
		if (!job) {
			r = -ENOMEM;
			goto out;
		}

		job->base.sched = &ring->sched;
		job->base.s_entity = &parser.ctx->rings[ring->idx].entity;
		job->adev = parser.adev;
		job->owner = parser.filp;
		job->free_job = amdgpu_cs_free_job;

		job->ibs = parser.ibs;
		job->num_ibs = parser.num_ibs;
		parser.ibs = NULL;
		parser.num_ibs = 0;

		if (job->ibs[job->num_ibs - 1].user) {
			job->uf = parser.uf;
			job->ibs[job->num_ibs - 1].user = &job->uf;
			parser.uf.bo = NULL;
		}

		fence = amd_sched_fence_create(job->base.s_entity,
					       parser.filp);
		if (!fence) {
			r = -ENOMEM;
			amdgpu_cs_free_job(job);
			kfree(job);
			goto out;
		}
		job->base.s_fence = fence;
		parser.fence = fence_get(&fence->base);

		cs->out.handle = amdgpu_ctx_add_fence(parser.ctx, ring,
						      &fence->base);
		job->ibs[job->num_ibs - 1].sequence = cs->out.handle;

		trace_amdgpu_cs_ioctl(job);
		amd_sched_entity_push_job(&job->base);

	} else {
		struct amdgpu_fence *fence;

		r = amdgpu_ib_schedule(adev, parser.num_ibs, parser.ibs,
				       parser.filp);
		fence = parser.ibs[parser.num_ibs - 1].fence;
		parser.fence = fence_get(&fence->base);
		cs->out.handle = parser.ibs[parser.num_ibs - 1].sequence;
	}

out:
	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
	r = amdgpu_cs_handle_lockup(adev, r);
	return r;
}

/**
 * amdgpu_cs_wait_ioctl - wait for a command submission to finish
 *
 * @dev: drm device
 * @data: data from userspace
 * @filp: file private
 *
 * Wait for the command submission identified by handle to finish.
 */
int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
			 struct drm_file *filp)
{
	union drm_amdgpu_wait_cs *wait = data;
	struct amdgpu_device *adev = dev->dev_private;
	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
	struct amdgpu_ring *ring = NULL;
	struct amdgpu_ctx *ctx;
	struct fence *fence;
	long r;

	r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance,
			       wait->in.ring, &ring);
	if (r)
		return r;

	ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id);
	if (ctx == NULL)
		return -EINVAL;

	fence = amdgpu_ctx_get_fence(ctx, ring, wait->in.handle);
	if (IS_ERR(fence))
		r = PTR_ERR(fence);
	else if (fence) {
		r = fence_wait_timeout(fence, true, timeout);
		fence_put(fence);
	} else
		r = 1;

	amdgpu_ctx_put(ctx);
	if (r < 0)
		return r;

	memset(wait, 0, sizeof(*wait));
	wait->out.status = (r == 0);

	return 0;
}

/**
 * amdgpu_cs_find_mapping - find the bo_va mapping for a VM address
 *
 * @parser: command submission parser context
 * @addr: VM address
 * @bo: resulting BO of the mapping found
 *
 * Search the buffer objects in the command submission context for a certain
 * virtual memory address. Returns the mapping structure when found, NULL
 * otherwise.
 */
struct amdgpu_bo_va_mapping *
amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
		       uint64_t addr, struct amdgpu_bo **bo)
{
	struct amdgpu_bo_list_entry *reloc;
	struct amdgpu_bo_va_mapping *mapping;

	addr /= AMDGPU_GPU_PAGE_SIZE;

	list_for_each_entry(reloc, &parser->validated, tv.head) {
		if (!reloc->bo_va)
			continue;

		list_for_each_entry(mapping, &reloc->bo_va->valids, list) {
			if (mapping->it.start > addr ||
			    addr > mapping->it.last)
				continue;

			*bo = reloc->bo_va->bo;
			return mapping;
		}

		list_for_each_entry(mapping, &reloc->bo_va->invalids, list) {
			if (mapping->it.start > addr ||
			    addr > mapping->it.last)
				continue;

			*bo = reloc->bo_va->bo;
			return mapping;
		}
	}

	return NULL;
}