1 /* 2 * Copyright © 2008-2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 *
 */

#include <linux/dma-fence-array.h>
#include <linux/irq_work.h>
#include <linux/prefetch.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_context.h"
#include "gt/intel_ring.h"
#include "gt/intel_rps.h"

#include "i915_active.h"
#include "i915_drv.h"
#include "i915_globals.h"
#include "i915_trace.h"
#include "intel_pm.h"

/*
 * Callback node hung off a request's execute_cb list. When the signaling
 * request begins execution, each node's irq_work is queued; the work
 * completes @fence and, for the hooked variant, invokes @hook with
 * @signal's fence before dropping the reference on @signal.
 */
struct execute_cb {
	struct list_head link;
	struct irq_work work;
	struct i915_sw_fence *fence;
	void (*hook)(struct i915_request *rq, struct dma_fence *signal);
	struct i915_request *signal;
};

/*
 * Global allocators for requests and execute callbacks: kmem_cache on
 * Linux, pool(9) on the non-Linux (OpenBSD) build.
 */
static struct i915_global_request {
	struct i915_global base;
#ifdef __linux__
	struct kmem_cache *slab_requests;
	struct kmem_cache *slab_execute_cbs;
#else
	struct pool slab_requests;
	struct pool slab_execute_cbs;
#endif
} global;

/* dma_fence_ops: report the owning DRM device's name. */
static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
	return dev_name(to_request(fence)->i915->drm.dev);
}

/* dma_fence_ops: best-effort timeline (context) name for debug output. */
static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
{
	const struct i915_gem_context *ctx;

	/*
	 * The timeline struct (as part of the ppgtt underneath a context)
	 * may be freed when the request is no longer in use by the GPU.
	 * We could extend the life of a context to beyond that of all
	 * fences, possibly keeping the hw resource around indefinitely,
	 * or we just give them a false name. Since
	 * dma_fence_ops.get_timeline_name is a debug feature, the occasional
	 * lie seems justifiable.
	 */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return "signaled";

	ctx = i915_request_gem_context(to_request(fence));
	if (!ctx)
		return "[" DRIVER_NAME "]";

	return ctx->name;
}

/* dma_fence_ops: a request's fence is signaled once the request completed. */
static bool i915_fence_signaled(struct dma_fence *fence)
{
	return i915_request_completed(to_request(fence));
}

/* dma_fence_ops: arm the breadcrumb interrupt for this request. */
static bool i915_fence_enable_signaling(struct dma_fence *fence)
{
	return i915_request_enable_breadcrumb(to_request(fence));
}

/*
 * dma_fence_ops: wait for the request, folding @interruptible into the
 * i915_request_wait() flags alongside I915_WAIT_PRIORITY.
 */
static signed long i915_fence_wait(struct dma_fence *fence,
				   bool interruptible,
				   signed long timeout)
{
	return i915_request_wait(to_request(fence),
				 interruptible | I915_WAIT_PRIORITY,
				 timeout);
}

/* dma_fence_ops: final release — tear down sw fences, return rq to slab. */
static void i915_fence_release(struct dma_fence *fence)
{
	struct i915_request *rq = to_request(fence);

	/*
	 * The request is put onto a RCU freelist (i.e. the address
	 * is immediately reused), mark the fences as being freed now.
	 * Otherwise the debugobjects for the fences are only marked as
	 * freed when the slab cache itself is freed, and so we would get
	 * caught trying to reuse dead objects.
	 */
	i915_sw_fence_fini(&rq->submit);
	i915_sw_fence_fini(&rq->semaphore);

#ifdef __linux__
	kmem_cache_free(global.slab_requests, rq);
#else
	pool_put(&global.slab_requests, rq);
#endif
}

const struct dma_fence_ops i915_fence_ops = {
	.get_driver_name = i915_fence_get_driver_name,
	.get_timeline_name = i915_fence_get_timeline_name,
	.enable_signaling = i915_fence_enable_signaling,
	.signaled = i915_fence_signaled,
	.wait = i915_fence_wait,
	.release = i915_fence_release,
};

/* irq_work handler: complete the awaiting sw fence and free the node. */
static void irq_execute_cb(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	i915_sw_fence_complete(cb->fence);
#ifdef __linux__
	kmem_cache_free(global.slab_execute_cbs, cb);
#else
	pool_put(&global.slab_execute_cbs, cb);
#endif
}

/*
 * irq_work handler (hooked variant): run the user hook with the signaling
 * request's fence, drop our reference on it, then do the common cleanup.
 */
static void irq_execute_cb_hook(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	cb->hook(container_of(cb->fence, struct i915_request, submit),
		 &cb->signal->fence);
	i915_request_put(cb->signal);

	irq_execute_cb(wrk);
}

/*
 * Fire all execute callbacks registered on @rq (deferred to irq_work) and
 * reset the list. Caller must hold rq->lock.
 */
static void __notify_execute_cb(struct i915_request *rq)
{
	struct execute_cb *cb;

	lockdep_assert_held(&rq->lock);

	if (list_empty(&rq->execute_cb))
		return;

	list_for_each_entry(cb, &rq->execute_cb, link)
		irq_work_queue(&cb->work);

	/*
	 * XXX Rollback on __i915_request_unsubmit()
	 *
	 * In the future, perhaps when we have an active time-slicing scheduler,
	 * it will be interesting to unsubmit parallel execution and remove
	 * busywaits from the GPU until their master is restarted. This is
	 * quite hairy, we have to carefully rollback the fence and do a
	 * preempt-to-idle cycle on the target engine, all the while the
	 * master execute_cb may refire.
	 */
	INIT_LIST_HEAD(&rq->execute_cb);
}

/*
 * Detach the request from its client's (file's) request list, racing
 * against concurrent detach via xchg() on file_priv.
 */
static inline void
remove_from_client(struct i915_request *request)
{
	struct drm_i915_file_private *file_priv;

	if (!READ_ONCE(request->file_priv))
		return;

	rcu_read_lock();
	file_priv = xchg(&request->file_priv, NULL);
	if (file_priv) {
		spin_lock(&file_priv->mm.lock);
		list_del(&request->client_link);
		spin_unlock(&file_priv->mm.lock);
	}
	rcu_read_unlock();
}

/* Free the singly-linked error-capture list attached to the request. */
static void free_capture_list(struct i915_request *request)
{
	struct i915_capture_list *capture;

	capture = fetch_and_zero(&request->capture_list);
	while (capture) {
		struct i915_capture_list *next = capture->next;

		kfree(capture);
		capture = next;
	}
}

/*
 * Fill the request's payload [infix, postfix) in the ringbuffer with @val,
 * handling wraparound past the end of the ring.
 */
static void __i915_request_fill(struct i915_request *rq, u8 val)
{
	void *vaddr = rq->ring->vaddr;
	u32 head;

	head = rq->infix;
	if (rq->postfix < head) {
		memset(vaddr + head, val, rq->ring->size - head);
		head = 0;
	}
	memset(vaddr + head, val, rq->postfix - head);
}

/*
 * Remove the request from its engine's scheduling lists, chasing a
 * possibly-changing rq->engine (virtual engines) until stable under lock.
 */
static void remove_from_engine(struct i915_request *rq)
{
	struct intel_engine_cs *engine, *locked;

	/*
	 * Virtual engines complicate acquiring the engine timeline lock,
	 * as their rq->engine pointer is not stable until under that
	 * engine lock. The simple ploy we use is to take the lock then
	 * check that the rq still belongs to the newly locked engine.
	 */
	locked = READ_ONCE(rq->engine);
	spin_lock_irq(&locked->active.lock);
	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
		spin_unlock(&locked->active.lock);
		spin_lock(&engine->active.lock);
		locked = engine;
	}
	list_del_init(&rq->sched.link);
	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
	clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
	spin_unlock_irq(&locked->active.lock);
}

/*
 * Retire a completed request: release its ring space, signal its fence,
 * unwind bookkeeping (engine lists, client list, timeline link, context
 * pin) and drop the retirement reference. Returns false if the request
 * has not yet completed. Must be called in completion order per timeline.
 */
bool i915_request_retire(struct i915_request *rq)
{
	if (!i915_request_completed(rq))
		return false;

	RQ_TRACE(rq, "\n");

	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
	trace_i915_request_retire(rq);

	/*
	 * We know the GPU must have read the request to have
	 * sent us the seqno + interrupt, so use the position
	 * of tail of the request to update the last known position
	 * of the GPU head.
	 *
	 * Note this requires that we are always called in request
	 * completion order.
	 */
	GEM_BUG_ON(!list_is_first(&rq->link,
				  &i915_request_timeline(rq)->requests));
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		/* Poison before we release our space in the ring */
		__i915_request_fill(rq, POISON_FREE);
	rq->ring->head = rq->postfix;

	/*
	 * We only loosely track inflight requests across preemption,
	 * and so we may find ourselves attempting to retire a _completed_
	 * request that we have removed from the HW and put back on a run
	 * queue.
	 */
	remove_from_engine(rq);

	spin_lock_irq(&rq->lock);
	i915_request_mark_complete(rq);
	if (!i915_request_signaled(rq))
		dma_fence_signal_locked(&rq->fence);
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq);
	if (i915_request_has_waitboost(rq)) {
		GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
		atomic_dec(&rq->engine->gt->rps.num_waiters);
	}
	/*
	 * If the request was never marked active (never submitted), mark it
	 * so now and flush any pending execute callbacks before freeing.
	 */
	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
		set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
		__notify_execute_cb(rq);
	}
	GEM_BUG_ON(!list_empty(&rq->execute_cb));
	spin_unlock_irq(&rq->lock);

	remove_from_client(rq);
	__list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */

	intel_context_exit(rq->context);
	intel_context_unpin(rq->context);

	free_capture_list(rq);
	i915_sched_node_fini(&rq->sched);
	i915_request_put(rq);

	return true;
}

/* Retire all requests on @rq's timeline up to and including @rq. */
void i915_request_retire_upto(struct i915_request *rq)
{
	struct intel_timeline * const tl = i915_request_timeline(rq);
	struct i915_request *tmp;

	RQ_TRACE(rq, "\n");

	GEM_BUG_ON(!i915_request_completed(rq));

	do {
		tmp = list_first_entry(&tl->requests, typeof(*tmp), link);
	} while (i915_request_retire(tmp) && tmp != rq);
}

/*
 * Arrange for @rq's submit fence to be held back until @signal begins
 * executing (with optional @hook invoked at that point). If @signal is
 * already active, the hook runs immediately and no callback is installed.
 * Returns 0 or -ENOMEM.
 */
static int
__await_execution(struct i915_request *rq,
		  struct i915_request *signal,
		  void (*hook)(struct i915_request *rq,
			       struct dma_fence *signal),
		  gfp_t gfp)
{
	struct execute_cb *cb;

	if (i915_request_is_active(signal)) {
		if (hook)
			hook(rq, &signal->fence);
		return 0;
	}

#ifdef __linux__
	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
#else
	cb = pool_get(&global.slab_execute_cbs,
	    (gfp & GFP_NOWAIT) ? PR_NOWAIT : PR_WAITOK);
#endif
	if (!cb)
		return -ENOMEM;

	cb->fence = &rq->submit;
	i915_sw_fence_await(cb->fence);
	init_irq_work(&cb->work, irq_execute_cb);

	if (hook) {
		cb->hook = hook;
		cb->signal = i915_request_get(signal);
#ifdef __linux__
		cb->work.func = irq_execute_cb_hook;
#else
		init_irq_work(&cb->work, irq_execute_cb_hook);
#endif
	}

	/* Recheck under signal->lock: it may have gone active meanwhile. */
	spin_lock_irq(&signal->lock);
	if (i915_request_is_active(signal)) {
		if (hook) {
			hook(rq, &signal->fence);
			i915_request_put(signal);
		}
		i915_sw_fence_complete(cb->fence);
#ifdef __linux__
		kmem_cache_free(global.slab_execute_cbs, cb);
#else
		pool_put(&global.slab_execute_cbs, cb);
#endif
	} else {
		list_add_tail(&cb->link, &signal->execute_cb);
	}
	spin_unlock_irq(&signal->lock);

	/* Copy across semaphore status as we need the same behaviour */
	rq->sched.flags |= signal->sched.flags;
	return 0;
}

/*
 * Errors that should stick to a request's fence; 0/-EAGAIN/-ETIMEDOUT are
 * benign/transient and may be overwritten later.
 */
static bool fatal_error(int error)
{
	switch (error) {
	case 0: /* not an error! */
	case -EAGAIN: /* innocent victim of a GT reset (__i915_request_reset) */
	case -ETIMEDOUT: /* waiting for Godot (timer_i915_sw_fence_wake) */
		return false;
	default:
		return true;
	}
}

/*
 * Cancel a request's payload: overwrite the user commands with zeroes,
 * keeping only the final breadcrumb so completion still signals.
 */
void __i915_request_skip(struct i915_request *rq)
{
	GEM_BUG_ON(!fatal_error(rq->fence.error));

	if (rq->infix == rq->postfix)
		return;

	/*
	 * As this request likely depends on state from the lost
	 * context, clear out all the user operations leaving the
	 * breadcrumb at the end (so we get the fence notifications).
	 */
	__i915_request_fill(rq, 0);
	rq->infix = rq->postfix;
}

/*
 * Set rq->fence.error unless the request already signaled or already
 * carries a fatal error; lock-free via try_cmpxchg loop.
 */
void i915_request_set_error_once(struct i915_request *rq, int error)
{
	int old;

	GEM_BUG_ON(!IS_ERR_VALUE((long)error));

	if (i915_request_signaled(rq))
		return;

	old = READ_ONCE(rq->fence.error);
	do {
		if (fatal_error(old))
			return;
	} while (!try_cmpxchg(&rq->fence.error, &old, error));
}

/*
 * Move a request from the priority queue onto the HW: emit its final
 * breadcrumb, transfer it to engine->active.requests, enable signaling
 * and fire execute callbacks. Caller holds engine->active.lock with irqs
 * off. Returns true if the payload was (re)emitted, false if the request
 * had already completed and only the bookkeeping transfer was needed.
 */
bool __i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	bool result = false;

	RQ_TRACE(request, "\n");

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * With the advent of preempt-to-busy, we frequently encounter
	 * requests that we have unsubmitted from HW, but left running
	 * until the next ack and so have completed in the meantime. On
	 * resubmission of that completed request, we can skip
	 * updating the payload, and execlists can even skip submitting
	 * the request.
	 *
	 * We must remove the request from the caller's priority queue,
	 * and the caller must only call us when the request is in their
	 * priority queue, under the active.lock. This ensures that the
	 * request has *not* yet been retired and we can safely move
	 * the request into the engine->active.list where it will be
	 * dropped upon retiring. (Otherwise if resubmit a *retired*
	 * request, this would be a horrible use-after-free.)
	 */
	if (i915_request_completed(request))
		goto xfer;

	if (unlikely(intel_context_is_banned(request->context)))
		i915_request_set_error_once(request, -EIO);
	if (unlikely(fatal_error(request->fence.error)))
		__i915_request_skip(request);

	/*
	 * Are we using semaphores when the gpu is already saturated?
	 *
	 * Using semaphores incurs a cost in having the GPU poll a
	 * memory location, busywaiting for it to change. The continual
	 * memory reads can have a noticeable impact on the rest of the
	 * system with the extra bus traffic, stalling the cpu as it too
	 * tries to access memory across the bus (perf stat -e bus-cycles).
	 *
	 * If we installed a semaphore on this request and we only submit
	 * the request after the signaler completed, that indicates the
	 * system is overloaded and using semaphores at this time only
	 * increases the amount of work we are doing. If so, we disable
	 * further use of semaphores until we are idle again, whence we
	 * optimistically try again.
	 */
	if (request->sched.semaphores &&
	    i915_sw_fence_signaled(&request->semaphore))
		engine->saturated |= request->sched.semaphores;

	engine->emit_fini_breadcrumb(request,
				     request->ring->vaddr + request->postfix);

	trace_i915_request_execute(request);
	engine->serial++;
	result = true;

xfer:	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
		list_move_tail(&request->sched.link, &engine->active.requests);
		clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
	}

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
	    !i915_request_enable_breadcrumb(request))
		intel_engine_signal_breadcrumbs(engine);

	__notify_execute_cb(request);

	spin_unlock(&request->lock);

	return result;
}

/* Locked wrapper around __i915_request_submit(). */
void i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_submit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

/*
 * Undo __i915_request_submit(): cancel breadcrumb signaling, clear the
 * ACTIVE flag, and forgo semaphore boosting on resubmit if the request
 * had already started spinning. Caller holds engine->active.lock.
 */
void __i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;

	RQ_TRACE(request, "\n");

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * Only unwind in reverse order, required so that the per-context list
	 * is kept in seqno/ring order.
	 */

	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
		i915_request_cancel_breadcrumb(request);

	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);

	spin_unlock(&request->lock);

	/* We've already spun, don't charge on resubmitting. */
	if (request->sched.semaphores && i915_request_started(request)) {
		request->sched.attr.priority |= I915_PRIORITY_NOSEMAPHORE;
		request->sched.semaphores = 0;
	}

	/*
	 * We don't need to wake_up any waiters on request->execute, they
	 * will get woken by any other event or us re-adding this request
	 * to the engine timeline (__i915_request_submit()). The waiters
	 * should be quite adapt at finding that the request now has a new
	 * global_seqno to the one they went to sleep on.
	 */
}

/* Locked wrapper around __i915_request_unsubmit(). */
void i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_unsubmit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

/*
 * sw_fence notify for rq->submit: on completion, hand the request to the
 * engine backend; on free, drop the fence-chain reference.
 */
static int __i915_sw_fence_call
submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), submit);

	switch (state) {
	case FENCE_COMPLETE:
		trace_i915_request_submit(request);

		if (unlikely(fence->error))
			i915_request_set_error_once(request, fence->error);

		/*
		 * We need to serialize use of the submit_request() callback
		 * with its hotplugging performed during an emergency
		 * i915_gem_set_wedged(). We use the RCU mechanism to mark the
		 * critical section in order to force i915_gem_set_wedged() to
		 * wait until the submit_request() is completed before
		 * proceeding.
		 */
		rcu_read_lock();
		request->engine->submit_request(request);
		rcu_read_unlock();
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

/* irq_work: bump priority once semaphore deps resolve, then drop our ref. */
static void irq_semaphore_cb(struct irq_work *wrk)
{
	struct i915_request *rq =
		container_of(wrk, typeof(*rq), semaphore_work);

	i915_schedule_bump_priority(rq, I915_PRIORITY_NOSEMAPHORE);
	i915_request_put(rq);
}

/*
 * sw_fence notify for rq->semaphore: when all semaphore waits are
 * resolved, schedule a priority bump (unless already NOSEMAPHORE).
 */
static int __i915_sw_fence_call
semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *rq = container_of(fence, typeof(*rq), semaphore);

	switch (state) {
	case FENCE_COMPLETE:
		if (!(READ_ONCE(rq->sched.attr.priority) & I915_PRIORITY_NOSEMAPHORE)) {
			i915_request_get(rq);
			init_irq_work(&rq->semaphore_work, irq_semaphore_cb);
			irq_work_queue(&rq->semaphore_work);
		}
		break;

	case FENCE_FREE:
		i915_request_put(rq);
		break;
	}

	return NOTIFY_DONE;
}

/* Retire completed requests at the head of the timeline, oldest first. */
static void retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
}

static void __i915_request_ctor(void *);

/*
 * Slow-path request allocation: try to retire old requests to replenish
 * the slab, rate-limit against RCU freeing, and finally allocate with
 * the caller's gfp flags (may still return NULL).
 */
static noinline struct i915_request *
request_alloc_slow(struct intel_timeline *tl, gfp_t gfp)
{
	struct i915_request *rq;

	if (list_empty(&tl->requests))
		goto out;

	if (!gfpflags_allow_blocking(gfp))
		goto out;

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	i915_request_retire(rq);

#ifdef __linux__
	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
#else
	/* pool(9) has no ctor hook, so run the constructor by hand */
	rq = pool_get(&global.slab_requests,
	    (gfp & GFP_NOWAIT) ? PR_NOWAIT : PR_WAITOK);
	if (rq)
		__i915_request_ctor(rq);
#endif
	if (rq)
		return rq;

	/* Ratelimit ourselves to prevent oom from malicious clients */
	rq = list_last_entry(&tl->requests, typeof(*rq), link);
	cond_synchronize_rcu(rq->rcustate);

	/* Retire our old requests in the hope that we free some */
	retire_requests(tl);

out:
#ifdef __linux__
	return kmem_cache_alloc(global.slab_requests, gfp);
#else
	rq = pool_get(&global.slab_requests,
	    (gfp & GFP_NOWAIT) ? PR_NOWAIT : PR_WAITOK);
	if (rq)
		__i915_request_ctor(rq);
	return rq;
#endif
}

/*
 * One-time initialisation of a freshly allocated request (slab ctor on
 * Linux; called manually after pool_get on OpenBSD).
 */
static void __i915_request_ctor(void *arg)
{
	struct i915_request *rq = arg;

	mtx_init(&rq->lock, IPL_TTY);
	i915_sched_node_init(&rq->sched);
	i915_sw_fence_init(&rq->submit, submit_notify);
	i915_sw_fence_init(&rq->semaphore, semaphore_notify);

	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0);

	rq->file_priv = NULL;
	rq->capture_list = NULL;

	INIT_LIST_HEAD(&rq->execute_cb);
}

/*
 * Allocate and initialise a request on @ce's timeline (which the caller
 * has pinned and locked), reserving ring space so i915_request_add()
 * cannot fail. Returns the request or an ERR_PTR.
 */
struct i915_request *
__i915_request_create(struct intel_context *ce, gfp_t gfp)
{
	struct intel_timeline *tl = ce->timeline;
	struct i915_request *rq;
	u32 seqno;
	int ret;

	might_sleep_if(gfpflags_allow_blocking(gfp));

	/* Check that the caller provided an already pinned context */
	__intel_context_pin(ce);

	/*
	 * Beware: Dragons be flying overhead.
	 *
	 * We use RCU to look up requests in flight. The lookups may
	 * race with the request being allocated from the slab freelist.
	 * That is the request we are writing to here, may be in the process
	 * of being read by __i915_active_request_get_rcu(). As such,
	 * we have to be very careful when overwriting the contents. During
	 * the RCU lookup, we change chase the request->engine pointer,
	 * read the request->global_seqno and increment the reference count.
	 *
	 * The reference count is incremented atomically. If it is zero,
	 * the lookup knows the request is unallocated and complete. Otherwise,
	 * it is either still in use, or has been reallocated and reset
	 * with dma_fence_init(). This increment is safe for release as we
	 * check that the request we have a reference to and matches the active
	 * request.
	 *
	 * Before we increment the refcount, we chase the request->engine
	 * pointer. We must not call kmem_cache_zalloc() or else we set
	 * that pointer to NULL and cause a crash during the lookup. If
	 * we see the request is completed (based on the value of the
	 * old engine and seqno), the lookup is complete and reports NULL.
	 * If we decide the request is not completed (new engine or seqno),
	 * then we grab a reference and double check that it is still the
	 * active request - which it won't be and restart the lookup.
	 *
	 * Do not use kmem_cache_zalloc() here!
	 */
#ifdef __linux__
	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
#else
	rq = pool_get(&global.slab_requests,
	    (gfp & GFP_NOWAIT) ? PR_NOWAIT : PR_WAITOK);
	if (rq)
		__i915_request_ctor(rq);
#endif
	if (unlikely(!rq)) {
		rq = request_alloc_slow(tl, gfp);
		if (!rq) {
			ret = -ENOMEM;
			goto err_unreserve;
		}
	}

	rq->i915 = ce->engine->i915;
	rq->context = ce;
	rq->engine = ce->engine;
	rq->ring = ce->ring;
	rq->execution_mask = ce->engine->mask;

	kref_init(&rq->fence.refcount);
	rq->fence.flags = 0;
	rq->fence.error = 0;
	INIT_LIST_HEAD(&rq->fence.cb_list);

	ret = intel_timeline_get_seqno(tl, rq, &seqno);
	if (ret)
		goto err_free;

	rq->fence.context = tl->fence_context;
	rq->fence.seqno = seqno;

	RCU_INIT_POINTER(rq->timeline, tl);
	RCU_INIT_POINTER(rq->hwsp_cacheline, tl->hwsp_cacheline);
	rq->hwsp_seqno = tl->hwsp_seqno;
	GEM_BUG_ON(i915_request_completed(rq));

	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */

	/* We bump the ref for the fence chain */
	i915_sw_fence_reinit(&i915_request_get(rq)->submit);
	i915_sw_fence_reinit(&i915_request_get(rq)->semaphore);

	i915_sched_node_reinit(&rq->sched);

	/* No zalloc, everything must be cleared after use */
	rq->batch = NULL;
	GEM_BUG_ON(rq->file_priv);
	GEM_BUG_ON(rq->capture_list);
	GEM_BUG_ON(!list_empty(&rq->execute_cb));

	/*
	 * Reserve space in the ring buffer for all the commands required to
	 * eventually emit this request. This is to guarantee that the
	 * i915_request_add() call can't fail. Note that the reserve may need
	 * to be redone if the request is not actually submitted straight
	 * away, e.g. because a GPU scheduler has deferred it.
	 *
	 * Note that due to how we add reserved_space to intel_ring_begin()
	 * we need to double our request to ensure that if we need to wrap
	 * around inside i915_request_add() there is sufficient space at
	 * the beginning of the ring as well.
	 */
	rq->reserved_space =
		2 * rq->engine->emit_fini_breadcrumb_dw * sizeof(u32);

	/*
	 * Record the position of the start of the request so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the head.
	 */
	rq->head = rq->ring->emit;

	ret = rq->engine->request_alloc(rq);
	if (ret)
		goto err_unwind;

	rq->infix = rq->ring->emit; /* end of header; start of user payload */

	intel_context_mark_active(ce);
	list_add_tail_rcu(&rq->link, &tl->requests);

	return rq;

err_unwind:
	ce->ring->emit = rq->head;

	/* Make sure we didn't add ourselves to external state before freeing */
	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));

err_free:
#ifdef __linux__
	kmem_cache_free(global.slab_requests, rq);
#else
	pool_put(&global.slab_requests, rq);
#endif
err_unreserve:
	intel_context_unpin(ce);
	return ERR_PTR(ret);
}

/*
 * Public request constructor: lock the context timeline, opportunistically
 * retire its oldest request, create a new one with GFP_KERNEL, and pin
 * the timeline mutex to the request. Returns the request or an ERR_PTR.
 */
struct i915_request *
i915_request_create(struct intel_context *ce)
{
	struct i915_request *rq;
	struct intel_timeline *tl;

	tl = intel_context_timeline_lock(ce);
	if (IS_ERR(tl))
		return ERR_CAST(tl);

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	if (!list_is_last(&rq->link, &tl->requests))
		i915_request_retire(rq);

	intel_context_enter(ce);
	rq = __i915_request_create(ce, GFP_KERNEL);
	intel_context_exit(ce); /* active reference transferred to request */
	if (IS_ERR(rq))
		goto err_unlock;

	/* Check that we do not interrupt ourselves with a new request */
	rq->cookie = lockdep_pin_lock(&tl->mutex);

	return rq;

err_unlock:
	intel_context_timeline_unlock(tl);
	return rq;
}

/*
 * Make @rq wait for the request preceding @signal on signal's timeline to
 * be submitted, so @rq is not submitted before @signal can start. No-op
 * if the timelines match or @signal has already started.
 */
static int
i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
{
	struct dma_fence *fence;
	int err;

	if (i915_request_timeline(rq) == rcu_access_pointer(signal->timeline))
		return 0;

	if (i915_request_started(signal))
		return 0;

	fence = NULL;
	rcu_read_lock();
	spin_lock_irq(&signal->lock);
	do {
		struct list_head *pos = READ_ONCE(signal->link.prev);
		struct i915_request *prev;

		/* Confirm signal has not been retired, the link is valid */
		if (unlikely(i915_request_started(signal)))
			break;

		/* Is signal the earliest request on its timeline? */
		if (pos == &rcu_dereference(signal->timeline)->requests)
			break;

		/*
		 * Peek at the request before us in the timeline. That
		 * request will only be valid before it is retired, so
		 * after acquiring a reference to it, confirm that it is
		 * still part of the signaler's timeline.
		 */
		prev = list_entry(pos, typeof(*prev), link);
		if (!i915_request_get_rcu(prev))
			break;

		/* After the strong barrier, confirm prev is still attached */
		if (unlikely(READ_ONCE(prev->link.next) != &signal->link)) {
			i915_request_put(prev);
			break;
		}

		fence = &prev->fence;
	} while (0);
	spin_unlock_irq(&signal->lock);
	rcu_read_unlock();
	if (!fence)
		return 0;

	err = 0;
	if (!intel_timeline_sync_is_later(i915_request_timeline(rq), fence))
		err = i915_sw_fence_await_dma_fence(&rq->submit,
						    fence, 0,
						    I915_FENCE_GFP);
	dma_fence_put(fence);

	return err;
}

/*
 * Engine mask of sources @rq is already busywaiting on (or whose engine
 * is saturated) — used to limit semaphore usage to one per source.
 */
static intel_engine_mask_t
already_busywaiting(struct i915_request *rq)
{
	/*
	 * Polling a semaphore causes bus traffic, delaying other users of
	 * both the GPU and CPU. We want to limit the impact on others,
	 * while taking advantage of early submission to reduce GPU
	 * latency. Therefore we restrict ourselves to not using more
	 * than one semaphore from each source, and not using a semaphore
	 * if we have detected the engine is saturated (i.e. would not be
	 * submitted early and cause bus traffic reading an already passed
	 * semaphore).
	 *
	 * See the are-we-too-late? check in __i915_request_submit().
	 */
	return rq->sched.semaphores | READ_ONCE(rq->engine->saturated);
}

/*
 * Emit MI_SEMAPHORE_WAIT (poll, SAD_GTE_SDD) into @to's ring, waiting on
 * @from's HWSP to reach @seqno. Gen8+ only; gen12 adds a token dword.
 */
static int
__emit_semaphore_wait(struct i915_request *to,
		      struct i915_request *from,
		      u32 seqno)
{
	const int has_token = INTEL_GEN(to->i915) >= 12;
	u32 hwsp_offset;
	int len, err;
	u32 *cs;

	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);

	/* We need to pin the signaler's HWSP until we are finished reading. */
	err = intel_timeline_read_hwsp(from, to, &hwsp_offset);
	if (err)
		return err;

	len = 4;
	if (has_token)
		len += 2;

	cs = intel_ring_begin(to, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Using greater-than-or-equal here means we have to worry
	 * about seqno wraparound. To side step that issue, we swap
	 * the timeline HWSP upon wrapping, so that everyone listening
	 * for the old (pre-wrap) values do not see the much smaller
	 * (post-wrap) values than they were expecting (and so wait
	 * forever).
	 */
	*cs++ = (MI_SEMAPHORE_WAIT |
		 MI_SEMAPHORE_GLOBAL_GTT |
		 MI_SEMAPHORE_POLL |
		 MI_SEMAPHORE_SAD_GTE_SDD) +
		has_token;
	*cs++ = seqno;
	*cs++ = hwsp_offset;
	*cs++ = 0;
	if (has_token) {
		*cs++ = 0;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(to, cs);
	return 0;
}

/*
 * Order @to after @from, preferring a GPU semaphore busywait when usable;
 * otherwise fall back to a CPU-side submit-fence await.
 */
static int
emit_semaphore_wait(struct i915_request *to,
		    struct i915_request *from,
		    gfp_t gfp)
{
	const intel_engine_mask_t mask = READ_ONCE(from->engine)->mask;

	if (!intel_context_use_semaphores(to->context))
		goto await_fence;

	if (!rcu_access_pointer(from->hwsp_cacheline))
		goto await_fence;

	/* Just emit the first semaphore we see as request space is limited. */
	if (already_busywaiting(to) & mask)
		goto await_fence;

	if (i915_request_await_start(to, from) < 0)
		goto await_fence;

	/* Only submit our spinner after the signaler is running! */
	if (__await_execution(to, from, NULL, gfp))
		goto await_fence;

	if (__emit_semaphore_wait(to, from, from->fence.seqno))
		goto await_fence;

	to->sched.semaphores |= mask;
	to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
	return 0;

await_fence:
	return i915_sw_fence_await_dma_fence(&to->submit,
					     &from->fence, 0,
					     I915_FENCE_GFP);
}

/*
 * Add an execution dependency: @to must not run before @from. Records a
 * scheduler dependency and chains the appropriate fence/semaphore waits
 * depending on whether the two requests share an engine.
 */
static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
	int ret;

	GEM_BUG_ON(to == from);
	GEM_BUG_ON(to->timeline == from->timeline);

	if (i915_request_completed(from)) {
		i915_sw_fence_set_error_once(&to->submit, from->fence.error);
		return 0;
	}

	if (to->engine->schedule) {
		ret = i915_sched_node_add_dependency(&to->sched,
						     &from->sched,
						     I915_DEPENDENCY_EXTERNAL);
		if (ret < 0)
			return ret;
	}

	if (to->engine == from->engine)
		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
						       &from->submit,
						       I915_FENCE_GFP);
	else
		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
	if (ret < 0)
		return ret;

	if (to->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN) {
		ret = i915_sw_fence_await_dma_fence(&to->semaphore,
						    &from->fence, 0,
						    I915_FENCE_GFP);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * Make @rq wait for an arbitrary dma_fence, decomposing fence arrays and
 * skipping already-signaled, same-context, and already-synced fences.
 * (Continues beyond this chunk.)
 */
int
i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	/*
	 * Note that if the fence-array was created in signal-on-any mode,
	 * we should *not* decompose it into its individual fences. However,
	 * we don't currently store which mode the fence-array is operating
	 * in. Fortunately, the only user of signal-on-any is private to
	 * amdgpu and we should not see any incoming fence-array from
	 * sync-file being in signal-on-any mode.
	 */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
			i915_sw_fence_set_error_once(&rq->submit, fence->error);
			continue;
		}

		/*
		 * Requests on the same timeline are explicitly ordered, along
		 * with their dependencies, by i915_request_add() which ensures
		 * that requests are submitted in-order through each ring.
		 */
		if (fence->context == rq->fence.context)
			continue;

		/* Squash repeated waits to the same timelines */
		if (fence->context &&
		    intel_timeline_sync_is_later(i915_request_timeline(rq),
						 fence))
			continue;

		if (dma_fence_is_i915(fence))
			ret = i915_request_await_request(rq, to_request(fence));
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    fence->context ?
I915_FENCE_TIMEOUT : 0, 1156 I915_FENCE_GFP); 1157 if (ret < 0) 1158 return ret; 1159 1160 /* Record the latest fence used against each timeline */ 1161 if (fence->context) 1162 intel_timeline_sync_set(i915_request_timeline(rq), 1163 fence); 1164 } while (--nchild); 1165 1166 return 0; 1167 } 1168 1169 static bool intel_timeline_sync_has_start(struct intel_timeline *tl, 1170 struct dma_fence *fence) 1171 { 1172 return __intel_timeline_sync_is_later(tl, 1173 fence->context, 1174 fence->seqno - 1); 1175 } 1176 1177 static int intel_timeline_sync_set_start(struct intel_timeline *tl, 1178 const struct dma_fence *fence) 1179 { 1180 return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1); 1181 } 1182 1183 static int 1184 __i915_request_await_execution(struct i915_request *to, 1185 struct i915_request *from, 1186 void (*hook)(struct i915_request *rq, 1187 struct dma_fence *signal)) 1188 { 1189 int err; 1190 1191 GEM_BUG_ON(intel_context_is_barrier(from->context)); 1192 1193 /* Submit both requests at the same time */ 1194 err = __await_execution(to, from, hook, I915_FENCE_GFP); 1195 if (err) 1196 return err; 1197 1198 /* Squash repeated depenendices to the same timelines */ 1199 if (intel_timeline_sync_has_start(i915_request_timeline(to), 1200 &from->fence)) 1201 return 0; 1202 1203 /* 1204 * Wait until the start of this request. 1205 * 1206 * The execution cb fires when we submit the request to HW. But in 1207 * many cases this may be long before the request itself is ready to 1208 * run (consider that we submit 2 requests for the same context, where 1209 * the request of interest is behind an indefinite spinner). So we hook 1210 * up to both to reduce our queues and keep the execution lag minimised 1211 * in the worst case, though we hope that the await_start is elided. 
1212 */ 1213 err = i915_request_await_start(to, from); 1214 if (err < 0) 1215 return err; 1216 1217 /* 1218 * Ensure both start together [after all semaphores in signal] 1219 * 1220 * Now that we are queued to the HW at roughly the same time (thanks 1221 * to the execute cb) and are ready to run at roughly the same time 1222 * (thanks to the await start), our signaler may still be indefinitely 1223 * delayed by waiting on a semaphore from a remote engine. If our 1224 * signaler depends on a semaphore, so indirectly do we, and we do not 1225 * want to start our payload until our signaler also starts theirs. 1226 * So we wait. 1227 * 1228 * However, there is also a second condition for which we need to wait 1229 * for the precise start of the signaler. Consider that the signaler 1230 * was submitted in a chain of requests following another context 1231 * (with just an ordinary intra-engine fence dependency between the 1232 * two). In this case the signaler is queued to HW, but not for 1233 * immediate execution, and so we must wait until it reaches the 1234 * active slot. 
1235 */ 1236 if (intel_engine_has_semaphores(to->engine)) { 1237 err = __emit_semaphore_wait(to, from, from->fence.seqno - 1); 1238 if (err < 0) 1239 return err; 1240 } 1241 1242 /* Couple the dependency tree for PI on this exposed to->fence */ 1243 if (to->engine->schedule) { 1244 err = i915_sched_node_add_dependency(&to->sched, 1245 &from->sched, 1246 I915_DEPENDENCY_WEAK); 1247 if (err < 0) 1248 return err; 1249 } 1250 1251 return intel_timeline_sync_set_start(i915_request_timeline(to), 1252 &from->fence); 1253 } 1254 1255 int 1256 i915_request_await_execution(struct i915_request *rq, 1257 struct dma_fence *fence, 1258 void (*hook)(struct i915_request *rq, 1259 struct dma_fence *signal)) 1260 { 1261 struct dma_fence **child = &fence; 1262 unsigned int nchild = 1; 1263 int ret; 1264 1265 if (dma_fence_is_array(fence)) { 1266 struct dma_fence_array *array = to_dma_fence_array(fence); 1267 1268 /* XXX Error for signal-on-any fence arrays */ 1269 1270 child = array->fences; 1271 nchild = array->num_fences; 1272 GEM_BUG_ON(!nchild); 1273 } 1274 1275 do { 1276 fence = *child++; 1277 if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { 1278 i915_sw_fence_set_error_once(&rq->submit, fence->error); 1279 continue; 1280 } 1281 1282 /* 1283 * We don't squash repeated fence dependencies here as we 1284 * want to run our callback in all cases. 1285 */ 1286 1287 if (dma_fence_is_i915(fence)) 1288 ret = __i915_request_await_execution(rq, 1289 to_request(fence), 1290 hook); 1291 else 1292 ret = i915_sw_fence_await_dma_fence(&rq->submit, fence, 1293 I915_FENCE_TIMEOUT, 1294 GFP_KERNEL); 1295 if (ret < 0) 1296 return ret; 1297 } while (--nchild); 1298 1299 return 0; 1300 } 1301 1302 /** 1303 * i915_request_await_object - set this request to (async) wait upon a bo 1304 * @to: request we are wishing to use 1305 * @obj: object which may be in use on another ring. 
1306 * @write: whether the wait is on behalf of a writer 1307 * 1308 * This code is meant to abstract object synchronization with the GPU. 1309 * Conceptually we serialise writes between engines inside the GPU. 1310 * We only allow one engine to write into a buffer at any time, but 1311 * multiple readers. To ensure each has a coherent view of memory, we must: 1312 * 1313 * - If there is an outstanding write request to the object, the new 1314 * request must wait for it to complete (either CPU or in hw, requests 1315 * on the same ring will be naturally ordered). 1316 * 1317 * - If we are a write request (pending_write_domain is set), the new 1318 * request must wait for outstanding read requests to complete. 1319 * 1320 * Returns 0 if successful, else propagates up the lower layer error. 1321 */ 1322 int 1323 i915_request_await_object(struct i915_request *to, 1324 struct drm_i915_gem_object *obj, 1325 bool write) 1326 { 1327 struct dma_fence *excl; 1328 int ret = 0; 1329 1330 if (write) { 1331 struct dma_fence **shared; 1332 unsigned int count, i; 1333 1334 ret = dma_resv_get_fences_rcu(obj->base.resv, 1335 &excl, &count, &shared); 1336 if (ret) 1337 return ret; 1338 1339 for (i = 0; i < count; i++) { 1340 ret = i915_request_await_dma_fence(to, shared[i]); 1341 if (ret) 1342 break; 1343 1344 dma_fence_put(shared[i]); 1345 } 1346 1347 for (; i < count; i++) 1348 dma_fence_put(shared[i]); 1349 kfree(shared); 1350 } else { 1351 excl = dma_resv_get_excl_rcu(obj->base.resv); 1352 } 1353 1354 if (excl) { 1355 if (ret == 0) 1356 ret = i915_request_await_dma_fence(to, excl); 1357 1358 dma_fence_put(excl); 1359 } 1360 1361 return ret; 1362 } 1363 1364 static struct i915_request * 1365 __i915_request_add_to_timeline(struct i915_request *rq) 1366 { 1367 struct intel_timeline *timeline = i915_request_timeline(rq); 1368 struct i915_request *prev; 1369 1370 /* 1371 * Dependency tracking and request ordering along the timeline 1372 * is special cased so that we can eliminate 
redundant ordering 1373 * operations while building the request (we know that the timeline 1374 * itself is ordered, and here we guarantee it). 1375 * 1376 * As we know we will need to emit tracking along the timeline, 1377 * we embed the hooks into our request struct -- at the cost of 1378 * having to have specialised no-allocation interfaces (which will 1379 * be beneficial elsewhere). 1380 * 1381 * A second benefit to open-coding i915_request_await_request is 1382 * that we can apply a slight variant of the rules specialised 1383 * for timelines that jump between engines (such as virtual engines). 1384 * If we consider the case of virtual engine, we must emit a dma-fence 1385 * to prevent scheduling of the second request until the first is 1386 * complete (to maximise our greedy late load balancing) and this 1387 * precludes optimising to use semaphores serialisation of a single 1388 * timeline across engines. 1389 */ 1390 prev = to_request(__i915_active_fence_set(&timeline->last_request, 1391 &rq->fence)); 1392 if (prev && !i915_request_completed(prev)) { 1393 /* 1394 * The requests are supposed to be kept in order. However, 1395 * we need to be wary in case the timeline->last_request 1396 * is used as a barrier for external modification to this 1397 * context. 
1398 */ 1399 GEM_BUG_ON(prev->context == rq->context && 1400 i915_seqno_passed(prev->fence.seqno, 1401 rq->fence.seqno)); 1402 1403 if (is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) 1404 i915_sw_fence_await_sw_fence(&rq->submit, 1405 &prev->submit, 1406 &rq->submitq); 1407 else 1408 __i915_sw_fence_await_dma_fence(&rq->submit, 1409 &prev->fence, 1410 &rq->dmaq); 1411 if (rq->engine->schedule) 1412 __i915_sched_node_add_dependency(&rq->sched, 1413 &prev->sched, 1414 &rq->dep, 1415 0); 1416 } 1417 1418 /* 1419 * Make sure that no request gazumped us - if it was allocated after 1420 * our i915_request_alloc() and called __i915_request_add() before 1421 * us, the timeline will hold its seqno which is later than ours. 1422 */ 1423 GEM_BUG_ON(timeline->seqno != rq->fence.seqno); 1424 1425 return prev; 1426 } 1427 1428 /* 1429 * NB: This function is not allowed to fail. Doing so would mean the the 1430 * request is not being tracked for completion but the work itself is 1431 * going to happen on the hardware. This would be a Bad Thing(tm). 1432 */ 1433 struct i915_request *__i915_request_commit(struct i915_request *rq) 1434 { 1435 struct intel_engine_cs *engine = rq->engine; 1436 struct intel_ring *ring = rq->ring; 1437 u32 *cs; 1438 1439 RQ_TRACE(rq, "\n"); 1440 1441 /* 1442 * To ensure that this call will not fail, space for its emissions 1443 * should already have been reserved in the ring buffer. Let the ring 1444 * know that it is time to use that space up. 1445 */ 1446 GEM_BUG_ON(rq->reserved_space > ring->space); 1447 rq->reserved_space = 0; 1448 rq->emitted_jiffies = jiffies; 1449 1450 /* 1451 * Record the position of the start of the breadcrumb so that 1452 * should we detect the updated seqno part-way through the 1453 * GPU processing the request, we never over-estimate the 1454 * position of the ring's HEAD. 
1455 */ 1456 cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw); 1457 GEM_BUG_ON(IS_ERR(cs)); 1458 rq->postfix = intel_ring_offset(rq, cs); 1459 1460 return __i915_request_add_to_timeline(rq); 1461 } 1462 1463 void __i915_request_queue(struct i915_request *rq, 1464 const struct i915_sched_attr *attr) 1465 { 1466 /* 1467 * Let the backend know a new request has arrived that may need 1468 * to adjust the existing execution schedule due to a high priority 1469 * request - i.e. we may want to preempt the current request in order 1470 * to run a high priority dependency chain *before* we can execute this 1471 * request. 1472 * 1473 * This is called before the request is ready to run so that we can 1474 * decide whether to preempt the entire chain so that it is ready to 1475 * run at the earliest possible convenience. 1476 */ 1477 if (attr && rq->engine->schedule) 1478 rq->engine->schedule(rq, attr); 1479 i915_sw_fence_commit(&rq->semaphore); 1480 i915_sw_fence_commit(&rq->submit); 1481 } 1482 1483 void i915_request_add(struct i915_request *rq) 1484 { 1485 struct intel_timeline * const tl = i915_request_timeline(rq); 1486 struct i915_sched_attr attr = {}; 1487 struct i915_gem_context *ctx; 1488 1489 lockdep_assert_held(&tl->mutex); 1490 lockdep_unpin_lock(&tl->mutex, rq->cookie); 1491 1492 trace_i915_request_add(rq); 1493 __i915_request_commit(rq); 1494 1495 /* XXX placeholder for selftests */ 1496 rcu_read_lock(); 1497 ctx = rcu_dereference(rq->context->gem_context); 1498 if (ctx) 1499 attr = ctx->sched; 1500 rcu_read_unlock(); 1501 1502 if (!(rq->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN)) 1503 attr.priority |= I915_PRIORITY_NOSEMAPHORE; 1504 if (list_empty(&rq->sched.signalers_list)) 1505 attr.priority |= I915_PRIORITY_WAIT; 1506 1507 local_bh_disable(); 1508 __i915_request_queue(rq, &attr); 1509 local_bh_enable(); /* Kick the execlists tasklet if just scheduled */ 1510 1511 mutex_unlock(&tl->mutex); 1512 } 1513 1514 static unsigned long 
local_clock_ns(unsigned int *cpu) 1515 { 1516 unsigned long t; 1517 1518 /* 1519 * Cheaply and approximately convert from nanoseconds to microseconds. 1520 * The result and subsequent calculations are also defined in the same 1521 * approximate microseconds units. The principal source of timing 1522 * error here is from the simple truncation. 1523 * 1524 * Note that local_clock() is only defined wrt to the current CPU; 1525 * the comparisons are no longer valid if we switch CPUs. Instead of 1526 * blocking preemption for the entire busywait, we can detect the CPU 1527 * switch and use that as indicator of system load and a reason to 1528 * stop busywaiting, see busywait_stop(). 1529 */ 1530 *cpu = get_cpu(); 1531 t = local_clock(); 1532 put_cpu(); 1533 1534 return t; 1535 } 1536 1537 static bool busywait_stop(unsigned long timeout, unsigned int cpu) 1538 { 1539 unsigned int this_cpu; 1540 1541 if (time_after(local_clock_ns(&this_cpu), timeout)) 1542 return true; 1543 1544 return this_cpu != cpu; 1545 } 1546 1547 static bool __i915_spin_request(const struct i915_request * const rq, int state) 1548 { 1549 unsigned long timeout_ns; 1550 unsigned int cpu; 1551 1552 /* 1553 * Only wait for the request if we know it is likely to complete. 1554 * 1555 * We don't track the timestamps around requests, nor the average 1556 * request length, so we do not have a good indicator that this 1557 * request will complete within the timeout. What we do know is the 1558 * order in which requests are executed by the context and so we can 1559 * tell if the request has been started. If the request is not even 1560 * running yet, it is a fair assumption that it will not complete 1561 * within our relatively short timeout. 1562 */ 1563 if (!i915_request_is_running(rq)) 1564 return false; 1565 1566 /* 1567 * When waiting for high frequency requests, e.g. 
during synchronous 1568 * rendering split between the CPU and GPU, the finite amount of time 1569 * required to set up the irq and wait upon it limits the response 1570 * rate. By busywaiting on the request completion for a short while we 1571 * can service the high frequency waits as quick as possible. However, 1572 * if it is a slow request, we want to sleep as quickly as possible. 1573 * The tradeoff between waiting and sleeping is roughly the time it 1574 * takes to sleep on a request, on the order of a microsecond. 1575 */ 1576 1577 timeout_ns = READ_ONCE(rq->engine->props.max_busywait_duration_ns); 1578 timeout_ns += local_clock_ns(&cpu); 1579 do { 1580 if (i915_request_completed(rq)) 1581 return true; 1582 1583 if (signal_pending_state(state, current)) 1584 break; 1585 1586 if (busywait_stop(timeout_ns, cpu)) 1587 break; 1588 1589 cpu_relax(); 1590 } while (!drm_need_resched()); 1591 1592 return false; 1593 } 1594 1595 struct request_wait { 1596 struct dma_fence_cb cb; 1597 #ifdef __linux__ 1598 struct task_struct *tsk; 1599 #else 1600 struct proc *tsk; 1601 #endif 1602 }; 1603 1604 static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb) 1605 { 1606 struct request_wait *wait = container_of(cb, typeof(*wait), cb); 1607 1608 wake_up_process(wait->tsk); 1609 } 1610 1611 /** 1612 * i915_request_wait - wait until execution of request has finished 1613 * @rq: the request to wait upon 1614 * @flags: how to wait 1615 * @timeout: how long to wait in jiffies 1616 * 1617 * i915_request_wait() waits for the request to be completed, for a 1618 * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an 1619 * unbounded wait). 1620 * 1621 * Returns the remaining time (in jiffies) if the request completed, which may 1622 * be zero or -ETIME if the request is unfinished after the timeout expires. 1623 * May return -EINTR is called with I915_WAIT_INTERRUPTIBLE and a signal is 1624 * pending before the request completes. 
1625 */ 1626 long i915_request_wait(struct i915_request *rq, 1627 unsigned int flags, 1628 long timeout) 1629 { 1630 const int state = flags & I915_WAIT_INTERRUPTIBLE ? 1631 TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; 1632 struct request_wait wait; 1633 1634 might_sleep(); 1635 GEM_BUG_ON(timeout < 0); 1636 1637 if (dma_fence_is_signaled(&rq->fence)) 1638 return timeout; 1639 1640 if (!timeout) 1641 return -ETIME; 1642 1643 trace_i915_request_wait_begin(rq, flags); 1644 1645 /* 1646 * We must never wait on the GPU while holding a lock as we 1647 * may need to perform a GPU reset. So while we don't need to 1648 * serialise wait/reset with an explicit lock, we do want 1649 * lockdep to detect potential dependency cycles. 1650 */ 1651 mutex_acquire(&rq->engine->gt->reset.mutex.dep_map, 0, 0, _THIS_IP_); 1652 1653 /* 1654 * Optimistic spin before touching IRQs. 1655 * 1656 * We may use a rather large value here to offset the penalty of 1657 * switching away from the active task. Frequently, the client will 1658 * wait upon an old swapbuffer to throttle itself to remain within a 1659 * frame of the gpu. If the client is running in lockstep with the gpu, 1660 * then it should not be waiting long at all, and a sleep now will incur 1661 * extra scheduler latency in producing the next frame. To try to 1662 * avoid adding the cost of enabling/disabling the interrupt to the 1663 * short wait, we first spin to see if the request would have completed 1664 * in the time taken to setup the interrupt. 1665 * 1666 * We need upto 5us to enable the irq, and upto 20us to hide the 1667 * scheduler latency of a context switch, ignoring the secondary 1668 * impacts from a context switch such as cache eviction. 1669 * 1670 * The scheme used for low-latency IO is called "hybrid interrupt 1671 * polling". The suggestion there is to sleep until just before you 1672 * expect to be woken by the device interrupt and then poll for its 1673 * completion. 
That requires having a good predictor for the request 1674 * duration, which we currently lack. 1675 */ 1676 if (IS_ACTIVE(CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT) && 1677 __i915_spin_request(rq, state)) { 1678 dma_fence_signal(&rq->fence); 1679 goto out; 1680 } 1681 1682 /* 1683 * This client is about to stall waiting for the GPU. In many cases 1684 * this is undesirable and limits the throughput of the system, as 1685 * many clients cannot continue processing user input/output whilst 1686 * blocked. RPS autotuning may take tens of milliseconds to respond 1687 * to the GPU load and thus incurs additional latency for the client. 1688 * We can circumvent that by promoting the GPU frequency to maximum 1689 * before we sleep. This makes the GPU throttle up much more quickly 1690 * (good for benchmarks and user experience, e.g. window animations), 1691 * but at a cost of spending more power processing the workload 1692 * (bad for battery). 1693 */ 1694 if (flags & I915_WAIT_PRIORITY) { 1695 if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6) 1696 intel_rps_boost(rq); 1697 i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT); 1698 } 1699 1700 #ifdef __linux__ 1701 wait.tsk = current; 1702 #else 1703 wait.tsk = curproc; 1704 #endif 1705 if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake)) 1706 goto out; 1707 1708 for (;;) { 1709 set_current_state(state); 1710 1711 if (i915_request_completed(rq)) { 1712 dma_fence_signal(&rq->fence); 1713 break; 1714 } 1715 1716 intel_engine_flush_submission(rq->engine); 1717 1718 if (signal_pending_state(state, current)) { 1719 timeout = -ERESTARTSYS; 1720 break; 1721 } 1722 1723 if (!timeout) { 1724 timeout = -ETIME; 1725 break; 1726 } 1727 1728 timeout = io_schedule_timeout(timeout); 1729 } 1730 __set_current_state(TASK_RUNNING); 1731 1732 dma_fence_remove_callback(&rq->fence, &wait.cb); 1733 1734 out: 1735 mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_); 1736 trace_i915_request_wait_end(rq); 1737 
return timeout; 1738 } 1739 1740 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1741 #include "selftests/mock_request.c" 1742 #include "selftests/i915_request.c" 1743 #endif 1744 1745 static void i915_global_request_shrink(void) 1746 { 1747 #ifdef notyet 1748 kmem_cache_shrink(global.slab_execute_cbs); 1749 kmem_cache_shrink(global.slab_requests); 1750 #endif 1751 } 1752 1753 static void i915_global_request_exit(void) 1754 { 1755 #ifdef __linux__ 1756 kmem_cache_destroy(global.slab_execute_cbs); 1757 kmem_cache_destroy(global.slab_requests); 1758 #else 1759 pool_destroy(&global.slab_execute_cbs); 1760 pool_destroy(&global.slab_requests); 1761 #endif 1762 } 1763 1764 static struct i915_global_request global = { { 1765 .shrink = i915_global_request_shrink, 1766 .exit = i915_global_request_exit, 1767 } }; 1768 1769 int __init i915_global_request_init(void) 1770 { 1771 #ifdef __linux__ 1772 global.slab_requests = 1773 kmem_cache_create("i915_request", 1774 sizeof(struct i915_request), 1775 __alignof__(struct i915_request), 1776 SLAB_HWCACHE_ALIGN | 1777 SLAB_RECLAIM_ACCOUNT | 1778 SLAB_TYPESAFE_BY_RCU, 1779 __i915_request_ctor); 1780 if (!global.slab_requests) 1781 return -ENOMEM; 1782 1783 global.slab_execute_cbs = KMEM_CACHE(execute_cb, 1784 SLAB_HWCACHE_ALIGN | 1785 SLAB_RECLAIM_ACCOUNT | 1786 SLAB_TYPESAFE_BY_RCU); 1787 if (!global.slab_execute_cbs) 1788 goto err_requests; 1789 #else 1790 pool_init(&global.slab_requests, sizeof(struct i915_request), 1791 CACHELINESIZE, IPL_TTY, 0, "i915_request", NULL); 1792 pool_init(&global.slab_execute_cbs, sizeof(struct execute_cb), 1793 CACHELINESIZE, IPL_TTY, 0, "i915_exec", NULL); 1794 #endif 1795 1796 i915_global_register(&global.base); 1797 return 0; 1798 1799 #ifdef __linux__ 1800 err_requests: 1801 kmem_cache_destroy(global.slab_requests); 1802 return -ENOMEM; 1803 #endif 1804 } 1805