1 /* $NetBSD: intel_lrc.c,v 1.8 2021/12/19 12:32:15 riastradh Exp $ */
2
3 /*
4 * Copyright © 2014 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 *
25 * Authors:
26 * Ben Widawsky <ben@bwidawsk.net>
27 * Michel Thierry <michel.thierry@intel.com>
28 * Thomas Daniel <thomas.daniel@intel.com>
29 * Oscar Mateo <oscar.mateo@intel.com>
30 *
31 */
32
33 /**
34 * DOC: Logical Rings, Logical Ring Contexts and Execlists
35 *
36 * Motivation:
37 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
38 * These expanded contexts enable a number of new abilities, especially
39 * "Execlists" (also implemented in this file).
40 *
41 * One of the main differences with the legacy HW contexts is that logical
42 * ring contexts incorporate many more things into the context's state, like
43 * PDPs or ringbuffer control registers:
44 *
45 * The reason why PDPs are included in the context is straightforward: as
46 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
47 * contained there means you don't need to do a ppgtt->switch_mm yourself;
48 * instead, the GPU will do it for you on the context switch.
49 *
50 * But, what about the ringbuffer control registers (head, tail, etc.)?
51 * Shouldn't we just need a set of those per engine command streamer? This is
52 * where the name "Logical Rings" starts to make sense: by virtualizing the
53 * rings, the engine cs shifts to a new "ring buffer" with every context
54 * switch. When you want to submit a workload to the GPU you: A) choose your
55 * context, B) find its appropriate virtualized ring, C) write commands to it
56 * and then, finally, D) tell the GPU to switch to that context.
57 *
58 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
59 * to a context is via a context execution list, ergo "Execlists".
60 *
61 * LRC implementation:
62 * Regarding the creation of contexts, we have:
63 *
64 * - One global default context.
65 * - One local default context for each opened fd.
66 * - One local extra context for each context create ioctl call.
67 *
68 * Now that ringbuffers belong per-context (and not per-engine, like before)
69 * and that contexts are uniquely tied to a given engine (and not reusable,
70 * like before) we need:
71 *
72 * - One ringbuffer per-engine inside each context.
73 * - One backing object per-engine inside each context.
74 *
75 * The global default context starts its life with these new objects fully
76 * allocated and populated. The local default context for each opened fd is
77 * more complex, because we don't know at creation time which engine is going
78 * to use them. To handle this, we have implemented a deferred creation of LR
79 * contexts:
80 *
81 * The local context starts its life as a hollow or blank holder, that only
82 * gets populated for a given engine once we receive an execbuffer. If later
83 * on we receive another execbuffer ioctl for the same context but a different
84 * engine, we allocate/populate a new ringbuffer and context backing object and
85 * so on.
86 *
87 * Finally, regarding local contexts created using the ioctl call: as they are
88 * only allowed with the render ring, we can allocate & populate them right
89 * away (no need to defer anything, at least for now).
90 *
91 * Execlists implementation:
92 * Execlists are the new method by which, on gen8+ hardware, workloads are
93 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
94 * This method works as follows:
95 *
96 * When a request is committed, its commands (the BB start and any leading or
97 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
98 * for the appropriate context. The tail pointer in the hardware context is not
99 * updated at this time, but instead, kept by the driver in the ringbuffer
100 * structure. A structure representing this request is added to a request queue
101 * for the appropriate engine: this structure contains a copy of the context's
102 * tail after the request was written to the ring buffer and a pointer to the
103 * context itself.
104 *
105 * If the engine's request queue was empty before the request was added, the
106 * queue is processed immediately. Otherwise the queue will be processed during
107 * a context switch interrupt. In any case, elements on the queue will get sent
108 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
109 * globally unique 20-bit submission ID.
110 *
111 * When execution of a request completes, the GPU updates the context status
112 * buffer with a context complete event and generates a context switch interrupt.
113 * During the interrupt handling, the driver examines the events in the buffer:
114 * for each context complete event, if the announced ID matches that on the head
115 * of the request queue, then that request is retired and removed from the queue.
116 *
117 * After processing, if any requests were retired and the queue is not empty
118 * then a new execution list can be submitted. The two requests at the front of
119 * the queue are next to be submitted but since a context may not occur twice in
120 * an execution list, if subsequent requests have the same ID as the first then
121 * the two requests must be combined. This is done simply by discarding requests
122 * at the head of the queue until either only one request is left (in which case
123 * we use a NULL second context) or the first two requests have unique IDs.
124 *
125 * By always executing the first two requests in the queue the driver ensures
126 * that the GPU is kept as busy as possible. In the case where a single context
127 * completes but a second context is still executing, the request for this second
128 * context will be at the head of the queue when we remove the first one. This
129 * request will then be resubmitted along with a new request for a different context,
130 * which will cause the hardware to continue executing the second request and queue
131 * the new request (the GPU detects the condition of a context getting preempted
132 * with the same context and optimizes the context switch flow by not doing
133 * preemption, but just sampling the new tail pointer).
134 *
135 */
136 #include <sys/cdefs.h>
137 __KERNEL_RCSID(0, "$NetBSD: intel_lrc.c,v 1.8 2021/12/19 12:32:15 riastradh Exp $");
138
139 #include <linux/interrupt.h>
140
141 #include "i915_drv.h"
142 #include "i915_perf.h"
143 #include "i915_trace.h"
144 #include "i915_vgpu.h"
145 #include "intel_context.h"
146 #include "intel_engine_pm.h"
147 #include "intel_gt.h"
148 #include "intel_gt_pm.h"
149 #include "intel_gt_requests.h"
150 #include "intel_lrc_reg.h"
151 #include "intel_mocs.h"
152 #include "intel_reset.h"
153 #include "intel_ring.h"
154 #include "intel_workarounds.h"
155
156 #include <linux/nbsd-namespace.h>
157
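/*
 * Editor's illustrative sketch (not part of the driver): the ELSP pairing
 * rule described in the DOC comment above, reduced to plain integers.  The
 * first queued context goes into ELSP[0]; leading entries that share that
 * context are folded into it, and the next distinct context (if any) goes
 * into ELSP[1], otherwise ELSP[1] is left empty (-1 here).  The helper name
 * is hypothetical.
 */
__maybe_unused static inline void
example_pick_elsp_pair(const int *queued_ctx, int count, int *elsp0, int *elsp1)
{
	int i;

	*elsp0 = count > 0 ? queued_ctx[0] : -1;
	*elsp1 = -1;

	/* Skip over leading entries with the same context as ELSP[0]. */
	for (i = 1; i < count; i++) {
		if (queued_ctx[i] != *elsp0) {
			*elsp1 = queued_ctx[i];
			break;
		}
	}
}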
158 #define RING_EXECLIST_QFULL (1 << 0x2)
159 #define RING_EXECLIST1_VALID (1 << 0x3)
160 #define RING_EXECLIST0_VALID (1 << 0x4)
161 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
162 #define RING_EXECLIST1_ACTIVE (1 << 0x11)
163 #define RING_EXECLIST0_ACTIVE (1 << 0x12)
164
165 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
166 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
167 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
168 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
169 #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
170 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
171
172 #define GEN8_CTX_STATUS_COMPLETED_MASK \
173 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
174
175 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
176
177 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
178 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
179 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
180 #define GEN12_IDLE_CTX_ID 0x7FF
181 #define GEN12_CSB_CTX_VALID(csb_dw) \
182 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
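/*
 * In other words, bits 25:15 of a CSB dword carry an 11-bit software context
 * ID, and the all-ones value 0x7FF is reserved to mean "no context" (idle);
 * a CSB entry therefore names a valid context only when that field holds any
 * other value.
 */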
183
184 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
185 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
186 #define WA_TAIL_DWORDS 2
187 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
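/*
 * The WA_TAIL_* values above reserve two NOOP dwords emitted after each
 * request (see the WaIdleLiteRestore note in execlists_update_context())
 * so that the RING_TAIL can always be advanced on a lite-restore
 * resubmission.
 */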
188
189 struct virtual_engine {
190 struct intel_engine_cs base;
191 struct intel_context context;
192
193 /*
194 * We allow only a single request through the virtual engine at a time
195 * (each request in the timeline waits for the completion fence of
196 * the previous before being submitted). By restricting ourselves to
197 * only submitting a single request, each request is placed on to a
198 * physical engine to maximise load spreading (by virtue of the late greedy
199 * scheduling -- each real engine takes the next available request
200 * upon idling).
201 */
202 struct i915_request *request;
203
204 /*
205 * We keep a rbtree of available virtual engines inside each physical
206 * engine, sorted by priority. Here we preallocate the nodes we need
207 * for the virtual engine, indexed by physical_engine->id.
208 */
209 struct ve_node {
210 struct rb_node rb;
211 int prio;
212 uint64_t order;
213 bool inserted;
214 } nodes[I915_NUM_ENGINES];
215 uint64_t order;
216
217 /*
218 * Keep track of bonded pairs -- restrictions upon our selection
219 * of physical engines any particular request may be submitted to.
220 * If we receive a submit-fence from a master engine, we will only
221 * use one of sibling_mask physical engines.
222 */
223 struct ve_bond {
224 const struct intel_engine_cs *master;
225 intel_engine_mask_t sibling_mask;
226 } *bonds;
227 unsigned int num_bonds;
228
229 /* And finally, which physical engines this virtual engine maps onto. */
230 unsigned int num_siblings;
231 struct intel_engine_cs *siblings[0];
232 };
233
234 #ifdef __NetBSD__
235 static int
236 compare_ve_nodes(void *cookie, const void *va, const void *vb)
237 {
238 const struct ve_node *na = va;
239 const struct ve_node *nb = vb;
240
241 if (na->prio < nb->prio)
242 return -1;
243 if (na->prio > nb->prio)
244 return +1;
245 if (na->order < nb->order)
246 return -1;
247 if (na->order > nb->order)
248 return +1;
249 return 0;
250 }
251
252 static int
253 compare_ve_node_key(void *cookie, const void *vn, const void *vk)
254 {
255 const struct ve_node *n = vn;
256 const int *k = vk;
257
258 if (n->prio < *k)
259 return -1;
260 if (n->prio > *k)
261 return +1;
262 return 0;
263 }
264
265 static const rb_tree_ops_t ve_tree_ops = {
266 .rbto_compare_nodes = compare_ve_nodes,
267 .rbto_compare_key = compare_ve_node_key,
268 .rbto_node_offset = offsetof(struct ve_node, rb),
269 };
270 #endif
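/*
 * Editor's note: ve_tree_ops describes, for the NetBSD rb_tree(3) API, how
 * struct ve_node entries are ordered (by priority, then insertion order) and
 * how to look one up by priority alone.  A minimal usage sketch, assuming a
 * bare rb_tree_t rather than the rb_root_cached compatibility type used in
 * the code below:
 *
 *	rb_tree_t tree;
 *	struct ve_node *node;
 *	int prio;
 *
 *	rb_tree_init(&tree, &ve_tree_ops);
 *	rb_tree_insert_node(&tree, node);
 *	node = rb_tree_find_node(&tree, &prio);
 */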
271
272 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
273 {
274 GEM_BUG_ON(!intel_engine_is_virtual(engine));
275 return container_of(engine, struct virtual_engine, base);
276 }
277
278 static int __execlists_context_alloc(struct intel_context *ce,
279 struct intel_engine_cs *engine);
280
281 static void execlists_init_reg_state(u32 *reg_state,
282 const struct intel_context *ce,
283 const struct intel_engine_cs *engine,
284 const struct intel_ring *ring,
285 bool close);
286 static void
287 __execlists_update_reg_state(const struct intel_context *ce,
288 const struct intel_engine_cs *engine,
289 u32 head);
290
291 static void mark_eio(struct i915_request *rq)
292 {
293 if (i915_request_completed(rq))
294 return;
295
296 GEM_BUG_ON(i915_request_signaled(rq));
297
298 dma_fence_set_error(&rq->fence, -EIO);
299 i915_request_mark_complete(rq);
300 }
301
302 static struct i915_request *
303 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
304 {
305 struct i915_request *active = rq;
306
307 rcu_read_lock();
308 list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
309 if (i915_request_completed(rq))
310 break;
311
312 active = rq;
313 }
314 rcu_read_unlock();
315
316 return active;
317 }
318
319 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
320 {
321 return (i915_ggtt_offset(engine->status_page.vma) +
322 I915_GEM_HWS_PREEMPT_ADDR);
323 }
324
325 static inline void
326 ring_set_paused(const struct intel_engine_cs *engine, int state)
327 {
328 /*
329 * We inspect HWS_PREEMPT with a semaphore inside
330 * engine->emit_fini_breadcrumb. If the dword is true,
331 * the ring is paused as the semaphore will busywait
332 * until the dword is false.
333 */
334 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
335 if (state)
336 wmb();
337 }
338
339 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
340 {
341 return rb_entry(rb, struct i915_priolist, node);
342 }
343
344 static inline int rq_prio(const struct i915_request *rq)
345 {
346 return rq->sched.attr.priority;
347 }
348
349 static int effective_prio(const struct i915_request *rq)
350 {
351 int prio = rq_prio(rq);
352
353 /*
354 * If this request is special and must not be interrupted at any
355 * cost, so be it. Note we are only checking the most recent request
356 * in the context and so may be masking an earlier vip request. It
357 * is hoped that under the conditions where nopreempt is used, this
358 * will not matter (i.e. all requests to that context will be
359 * nopreempt for as long as desired).
360 */
361 if (i915_request_has_nopreempt(rq))
362 prio = I915_PRIORITY_UNPREEMPTABLE;
363
364 /*
365 * On unwinding the active request, we give it a priority bump
366 * if it has completed waiting on any semaphore. If we know that
367 * the request has already started, we can prevent an unwanted
368 * preempt-to-idle cycle by taking that into account now.
369 */
370 if (__i915_request_has_started(rq))
371 prio |= I915_PRIORITY_NOSEMAPHORE;
372
373 /* Restrict mere WAIT boosts from triggering preemption */
374 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
375 return prio | __NO_PREEMPTION;
376 }
377
378 static int queue_prio(const struct intel_engine_execlists *execlists)
379 {
380 struct i915_priolist *p;
381 struct rb_node *rb;
382
383 rb = rb_first_cached(&execlists->queue);
384 if (!rb)
385 return INT_MIN;
386
387 /*
388 * As the priolist[] are inverted, with the highest priority in [0],
389 * we have to flip the index value to become priority.
390 */
391 p = to_priolist(rb);
392 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
393 }
394
395 static inline bool need_preempt(const struct intel_engine_cs *engine,
396 const struct i915_request *rq,
397 struct rb_node *rb)
398 {
399 int last_prio;
400
401 if (!intel_engine_has_semaphores(engine))
402 return false;
403
404 /*
405 * Check if the current priority hint merits a preemption attempt.
406 *
407 * We record the highest value priority we saw during rescheduling
408 * prior to this dequeue, therefore we know that if it is strictly
409 * less than the current tail of ELSP[0], we do not need to force
410 * a preempt-to-idle cycle.
411 *
412 * However, the priority hint is a mere hint that we may need to
413 * preempt. If that hint is stale or we may be trying to preempt
414 * ourselves, ignore the request.
415 *
416 * More naturally we would write
417 * prio >= max(0, last);
418 * except that we wish to prevent triggering preemption at the same
419 * priority level: the task that is running should remain running
420 * to preserve FIFO ordering of dependencies.
421 */
422 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
423 if (engine->execlists.queue_priority_hint <= last_prio)
424 return false;
425
426 /*
427 * Check against the first request in ELSP[1], it will, thanks to the
428 * power of PI, be the highest priority of that context.
429 */
430 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
431 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
432 return true;
433
434 if (rb) {
435 struct virtual_engine *ve =
436 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
437 bool preempt = false;
438
439 if (engine == ve->siblings[0]) { /* only preempt one sibling */
440 struct i915_request *next;
441
442 rcu_read_lock();
443 next = READ_ONCE(ve->request);
444 if (next)
445 preempt = rq_prio(next) > last_prio;
446 rcu_read_unlock();
447 }
448
449 if (preempt)
450 return preempt;
451 }
452
453 /*
454 * If the inflight context did not trigger the preemption, then maybe
455 * it was the set of queued requests? Pick the highest priority in
456 * the queue (the first active priolist) and see if it deserves to be
457 * running instead of ELSP[0].
458 *
459 * The highest priority request in the queue cannot be either
460 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
461 * context, its priority would not exceed ELSP[0] aka last_prio.
462 */
463 return queue_prio(&engine->execlists) > last_prio;
464 }
465
466 __maybe_unused static inline bool
467 assert_priority_queue(const struct i915_request *prev,
468 const struct i915_request *next)
469 {
470 /*
471 * Without preemption, the prev may refer to the still active element
472 * which we refuse to let go.
473 *
474 * Even with preemption, there are times when we think it is better not
475 * to preempt and leave an ostensibly lower priority request in flight.
476 */
477 if (i915_request_is_active(prev))
478 return true;
479
480 return rq_prio(prev) >= rq_prio(next);
481 }
482
483 /*
484 * The context descriptor encodes various attributes of a context,
485 * including its GTT address and some flags. Because it's fairly
486 * expensive to calculate, we'll just do it once and cache the result,
487 * which remains valid until the context is unpinned.
488 *
489 * This is what a descriptor looks like, from LSB to MSB::
490 *
491 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
492 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
493 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
494 * bits 53-54: mbz, reserved for use by hardware
495 * bits 55-63: group ID, currently unused and set to 0
496 *
497 * Starting from Gen11, the upper dword of the descriptor has a new format:
498 *
499 * bits 32-36: reserved
500 * bits 37-47: SW context ID
501 * bits 48-53: engine instance
502 * bit 54: mbz, reserved for use by hardware
503 * bits 55-60: SW counter
504 * bits 61-63: engine class
505 *
506 * engine info, SW context ID and SW counter need to form a unique number
507 * (Context ID) per lrc.
508 */
509 static u64
510 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
511 {
512 u64 desc;
513
514 desc = INTEL_LEGACY_32B_CONTEXT;
515 if (i915_vm_is_4lvl(ce->vm))
516 desc = INTEL_LEGACY_64B_CONTEXT;
517 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
518
519 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
520 if (IS_GEN(engine->i915, 8))
521 desc |= GEN8_CTX_L3LLC_COHERENT;
522
523 desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
524 /*
525 * The following 32bits are copied into the OA reports (dword 2).
526 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
527 * anything below.
528 */
529 if (INTEL_GEN(engine->i915) >= 11) {
530 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
531 /* bits 48-53 */
532
533 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
534 /* bits 61-63 */
535 }
536
537 return desc;
538 }
539
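/*
 * Editor's illustrative sketch (hypothetical helper, not used by the
 * driver): unpacking the Gen11+ descriptor fields laid out in the comment
 * above lrc_descriptor(), using the documented bit positions directly.
 */
__maybe_unused static inline void
example_decode_gen11_descriptor(u64 desc, u32 *lrca, u32 *sw_ctx_id,
				u32 *engine_instance, u32 *engine_class)
{
	*lrca = lower_32_bits(desc) & GENMASK(31, 12);	/* GTT address of the context */
	*sw_ctx_id = (desc >> 37) & 0x7ff;		/* bits 37-47 */
	*engine_instance = (desc >> 48) & 0x3f;		/* bits 48-53 */
	*engine_class = (desc >> 61) & 0x7;		/* bits 61-63 */
}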
540 static inline unsigned int dword_in_page(void *addr)
541 {
542 return offset_in_page(addr) / sizeof(u32);
543 }
544
545 static void set_offsets(u32 *regs,
546 const u8 *data,
547 const struct intel_engine_cs *engine,
548 bool clear)
549 #define NOP(x) (BIT(7) | (x))
550 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
551 #define POSTED BIT(0)
552 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
553 #define REG16(x) \
554 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
555 (((x) >> 2) & 0x7f)
556 #define END(x) 0, (x)
557 {
558 const u32 base = engine->mmio_base;
559
560 while (*data) {
561 u8 count, flags;
562
563 if (*data & BIT(7)) { /* skip */
564 count = *data++ & ~BIT(7);
565 if (clear)
566 memset32(regs, MI_NOOP, count);
567 regs += count;
568 continue;
569 }
570
571 count = *data & 0x3f;
572 flags = *data >> 6;
573 data++;
574
575 *regs = MI_LOAD_REGISTER_IMM(count);
576 if (flags & POSTED)
577 *regs |= MI_LRI_FORCE_POSTED;
578 if (INTEL_GEN(engine->i915) >= 11)
579 *regs |= MI_LRI_CS_MMIO;
580 regs++;
581
582 GEM_BUG_ON(!count);
583 do {
584 u32 offset = 0;
585 u8 v;
586
587 do {
588 v = *data++;
589 offset <<= 7;
590 offset |= v & ~BIT(7);
591 } while (v & BIT(7));
592
593 regs[0] = base + (offset << 2);
594 if (clear)
595 regs[1] = 0;
596 regs += 2;
597 } while (--count);
598 }
599
600 if (clear) {
601 u8 count = *++data;
602
603 /* Clear past the tail for HW access */
604 GEM_BUG_ON(dword_in_page(regs) > count);
605 memset32(regs, MI_NOOP, count - dword_in_page(regs));
606
607 /* Close the batch; used mainly by live_lrc_layout() */
608 *regs = MI_BATCH_BUFFER_END;
609 if (INTEL_GEN(engine->i915) >= 10)
610 *regs |= BIT(0);
611 }
612 }
613
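/*
 * The tables below are consumed by set_offsets() above.  Each one is a byte
 * stream: NOP(x) encodes "skip x dwords" (high bit set), LRI(count, flags)
 * opens an MI_LOAD_REGISTER_IMM block of 'count' registers (flags, e.g.
 * POSTED, in the top two bits), REG()/REG16() encode a register offset from
 * the engine's mmio base (in dwords) as a 7-bits-per-byte sequence, and
 * END(x) terminates the stream, with x giving the extent in dwords of the
 * register state that is padded out with MI_NOOP when building a blank
 * context image.
 */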
614 static const u8 gen8_xcs_offsets[] = {
615 NOP(1),
616 LRI(11, 0),
617 REG16(0x244),
618 REG(0x034),
619 REG(0x030),
620 REG(0x038),
621 REG(0x03c),
622 REG(0x168),
623 REG(0x140),
624 REG(0x110),
625 REG(0x11c),
626 REG(0x114),
627 REG(0x118),
628
629 NOP(9),
630 LRI(9, 0),
631 REG16(0x3a8),
632 REG16(0x28c),
633 REG16(0x288),
634 REG16(0x284),
635 REG16(0x280),
636 REG16(0x27c),
637 REG16(0x278),
638 REG16(0x274),
639 REG16(0x270),
640
641 NOP(13),
642 LRI(2, 0),
643 REG16(0x200),
644 REG(0x028),
645
646 END(80)
647 };
648
649 static const u8 gen9_xcs_offsets[] = {
650 NOP(1),
651 LRI(14, POSTED),
652 REG16(0x244),
653 REG(0x034),
654 REG(0x030),
655 REG(0x038),
656 REG(0x03c),
657 REG(0x168),
658 REG(0x140),
659 REG(0x110),
660 REG(0x11c),
661 REG(0x114),
662 REG(0x118),
663 REG(0x1c0),
664 REG(0x1c4),
665 REG(0x1c8),
666
667 NOP(3),
668 LRI(9, POSTED),
669 REG16(0x3a8),
670 REG16(0x28c),
671 REG16(0x288),
672 REG16(0x284),
673 REG16(0x280),
674 REG16(0x27c),
675 REG16(0x278),
676 REG16(0x274),
677 REG16(0x270),
678
679 NOP(13),
680 LRI(1, POSTED),
681 REG16(0x200),
682
683 NOP(13),
684 LRI(44, POSTED),
685 REG(0x028),
686 REG(0x09c),
687 REG(0x0c0),
688 REG(0x178),
689 REG(0x17c),
690 REG16(0x358),
691 REG(0x170),
692 REG(0x150),
693 REG(0x154),
694 REG(0x158),
695 REG16(0x41c),
696 REG16(0x600),
697 REG16(0x604),
698 REG16(0x608),
699 REG16(0x60c),
700 REG16(0x610),
701 REG16(0x614),
702 REG16(0x618),
703 REG16(0x61c),
704 REG16(0x620),
705 REG16(0x624),
706 REG16(0x628),
707 REG16(0x62c),
708 REG16(0x630),
709 REG16(0x634),
710 REG16(0x638),
711 REG16(0x63c),
712 REG16(0x640),
713 REG16(0x644),
714 REG16(0x648),
715 REG16(0x64c),
716 REG16(0x650),
717 REG16(0x654),
718 REG16(0x658),
719 REG16(0x65c),
720 REG16(0x660),
721 REG16(0x664),
722 REG16(0x668),
723 REG16(0x66c),
724 REG16(0x670),
725 REG16(0x674),
726 REG16(0x678),
727 REG16(0x67c),
728 REG(0x068),
729
730 END(176)
731 };
732
733 static const u8 gen12_xcs_offsets[] = {
734 NOP(1),
735 LRI(13, POSTED),
736 REG16(0x244),
737 REG(0x034),
738 REG(0x030),
739 REG(0x038),
740 REG(0x03c),
741 REG(0x168),
742 REG(0x140),
743 REG(0x110),
744 REG(0x1c0),
745 REG(0x1c4),
746 REG(0x1c8),
747 REG(0x180),
748 REG16(0x2b4),
749
750 NOP(5),
751 LRI(9, POSTED),
752 REG16(0x3a8),
753 REG16(0x28c),
754 REG16(0x288),
755 REG16(0x284),
756 REG16(0x280),
757 REG16(0x27c),
758 REG16(0x278),
759 REG16(0x274),
760 REG16(0x270),
761
762 END(80)
763 };
764
765 static const u8 gen8_rcs_offsets[] = {
766 NOP(1),
767 LRI(14, POSTED),
768 REG16(0x244),
769 REG(0x034),
770 REG(0x030),
771 REG(0x038),
772 REG(0x03c),
773 REG(0x168),
774 REG(0x140),
775 REG(0x110),
776 REG(0x11c),
777 REG(0x114),
778 REG(0x118),
779 REG(0x1c0),
780 REG(0x1c4),
781 REG(0x1c8),
782
783 NOP(3),
784 LRI(9, POSTED),
785 REG16(0x3a8),
786 REG16(0x28c),
787 REG16(0x288),
788 REG16(0x284),
789 REG16(0x280),
790 REG16(0x27c),
791 REG16(0x278),
792 REG16(0x274),
793 REG16(0x270),
794
795 NOP(13),
796 LRI(1, 0),
797 REG(0x0c8),
798
799 END(80)
800 };
801
802 static const u8 gen9_rcs_offsets[] = {
803 NOP(1),
804 LRI(14, POSTED),
805 REG16(0x244),
806 REG(0x34),
807 REG(0x30),
808 REG(0x38),
809 REG(0x3c),
810 REG(0x168),
811 REG(0x140),
812 REG(0x110),
813 REG(0x11c),
814 REG(0x114),
815 REG(0x118),
816 REG(0x1c0),
817 REG(0x1c4),
818 REG(0x1c8),
819
820 NOP(3),
821 LRI(9, POSTED),
822 REG16(0x3a8),
823 REG16(0x28c),
824 REG16(0x288),
825 REG16(0x284),
826 REG16(0x280),
827 REG16(0x27c),
828 REG16(0x278),
829 REG16(0x274),
830 REG16(0x270),
831
832 NOP(13),
833 LRI(1, 0),
834 REG(0xc8),
835
836 NOP(13),
837 LRI(44, POSTED),
838 REG(0x28),
839 REG(0x9c),
840 REG(0xc0),
841 REG(0x178),
842 REG(0x17c),
843 REG16(0x358),
844 REG(0x170),
845 REG(0x150),
846 REG(0x154),
847 REG(0x158),
848 REG16(0x41c),
849 REG16(0x600),
850 REG16(0x604),
851 REG16(0x608),
852 REG16(0x60c),
853 REG16(0x610),
854 REG16(0x614),
855 REG16(0x618),
856 REG16(0x61c),
857 REG16(0x620),
858 REG16(0x624),
859 REG16(0x628),
860 REG16(0x62c),
861 REG16(0x630),
862 REG16(0x634),
863 REG16(0x638),
864 REG16(0x63c),
865 REG16(0x640),
866 REG16(0x644),
867 REG16(0x648),
868 REG16(0x64c),
869 REG16(0x650),
870 REG16(0x654),
871 REG16(0x658),
872 REG16(0x65c),
873 REG16(0x660),
874 REG16(0x664),
875 REG16(0x668),
876 REG16(0x66c),
877 REG16(0x670),
878 REG16(0x674),
879 REG16(0x678),
880 REG16(0x67c),
881 REG(0x68),
882
883 END(176)
884 };
885
886 static const u8 gen11_rcs_offsets[] = {
887 NOP(1),
888 LRI(15, POSTED),
889 REG16(0x244),
890 REG(0x034),
891 REG(0x030),
892 REG(0x038),
893 REG(0x03c),
894 REG(0x168),
895 REG(0x140),
896 REG(0x110),
897 REG(0x11c),
898 REG(0x114),
899 REG(0x118),
900 REG(0x1c0),
901 REG(0x1c4),
902 REG(0x1c8),
903 REG(0x180),
904
905 NOP(1),
906 LRI(9, POSTED),
907 REG16(0x3a8),
908 REG16(0x28c),
909 REG16(0x288),
910 REG16(0x284),
911 REG16(0x280),
912 REG16(0x27c),
913 REG16(0x278),
914 REG16(0x274),
915 REG16(0x270),
916
917 LRI(1, POSTED),
918 REG(0x1b0),
919
920 NOP(10),
921 LRI(1, 0),
922 REG(0x0c8),
923
924 END(80)
925 };
926
927 static const u8 gen12_rcs_offsets[] = {
928 NOP(1),
929 LRI(13, POSTED),
930 REG16(0x244),
931 REG(0x034),
932 REG(0x030),
933 REG(0x038),
934 REG(0x03c),
935 REG(0x168),
936 REG(0x140),
937 REG(0x110),
938 REG(0x1c0),
939 REG(0x1c4),
940 REG(0x1c8),
941 REG(0x180),
942 REG16(0x2b4),
943
944 NOP(5),
945 LRI(9, POSTED),
946 REG16(0x3a8),
947 REG16(0x28c),
948 REG16(0x288),
949 REG16(0x284),
950 REG16(0x280),
951 REG16(0x27c),
952 REG16(0x278),
953 REG16(0x274),
954 REG16(0x270),
955
956 LRI(3, POSTED),
957 REG(0x1b0),
958 REG16(0x5a8),
959 REG16(0x5ac),
960
961 NOP(6),
962 LRI(1, 0),
963 REG(0x0c8),
964
965 END(80)
966 };
967
968 #undef END
969 #undef REG16
970 #undef REG
971 #undef LRI
972 #undef NOP
973
974 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
975 {
976 /*
977 * The gen12+ lists only have the registers we program in the basic
978 * default state. We rely on the context image using relative
979 * addressing to automatically fix up the register state between the
980 * physical engines for a virtual engine.
981 */
982 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
983 !intel_engine_has_relative_mmio(engine));
984
985 if (engine->class == RENDER_CLASS) {
986 if (INTEL_GEN(engine->i915) >= 12)
987 return gen12_rcs_offsets;
988 else if (INTEL_GEN(engine->i915) >= 11)
989 return gen11_rcs_offsets;
990 else if (INTEL_GEN(engine->i915) >= 9)
991 return gen9_rcs_offsets;
992 else
993 return gen8_rcs_offsets;
994 } else {
995 if (INTEL_GEN(engine->i915) >= 12)
996 return gen12_xcs_offsets;
997 else if (INTEL_GEN(engine->i915) >= 9)
998 return gen9_xcs_offsets;
999 else
1000 return gen8_xcs_offsets;
1001 }
1002 }
1003
1004 static struct i915_request *
1005 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1006 {
1007 struct i915_request *rq, *rn, *active = NULL;
1008 struct list_head *uninitialized_var(pl);
1009 int prio = I915_PRIORITY_INVALID;
1010
1011 lockdep_assert_held(&engine->active.lock);
1012
1013 list_for_each_entry_safe_reverse(rq, rn,
1014 &engine->active.requests,
1015 sched.link) {
1016 if (i915_request_completed(rq))
1017 continue; /* XXX */
1018
1019 __i915_request_unsubmit(rq);
1020
1021 /*
1022 * Push the request back into the queue for later resubmission.
1023 * If this request is not native to this physical engine (i.e.
1024 * it came from a virtual source), push it back onto the virtual
1025 * engine so that it can be moved across onto another physical
1026 * engine as load dictates.
1027 */
1028 if (likely(rq->execution_mask == engine->mask)) {
1029 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1030 if (rq_prio(rq) != prio) {
1031 prio = rq_prio(rq);
1032 pl = i915_sched_lookup_priolist(engine, prio);
1033 }
1034 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1035
1036 list_move(&rq->sched.link, pl);
1037 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1038
1039 active = rq;
1040 } else {
1041 struct intel_engine_cs *owner = rq->context->engine;
1042
1043 /*
1044 * Decouple the virtual breadcrumb before moving it
1045 * back to the virtual engine -- we don't want the
1046 * request to complete in the background and try
1047 * and cancel the breadcrumb on the virtual engine
1048 * (instead of the old engine where it is linked)!
1049 */
1050 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1051 &rq->fence.flags)) {
1052 spin_lock_nested(&rq->lock,
1053 SINGLE_DEPTH_NESTING);
1054 i915_request_cancel_breadcrumb(rq);
1055 spin_unlock(&rq->lock);
1056 }
1057 rq->engine = owner;
1058 owner->submit_request(rq);
1059 active = NULL;
1060 }
1061 }
1062
1063 return active;
1064 }
1065
1066 struct i915_request *
1067 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1068 {
1069 struct intel_engine_cs *engine =
1070 container_of(execlists, typeof(*engine), execlists);
1071
1072 return __unwind_incomplete_requests(engine);
1073 }
1074
1075 static inline void
1076 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1077 {
1078 /*
1079 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1080 * the compiler should eliminate this function as dead-code.
1081 */
1082 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1083 return;
1084
1085 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1086 status, rq);
1087 }
1088
1089 static void intel_engine_context_in(struct intel_engine_cs *engine)
1090 {
1091 unsigned long flags;
1092
1093 if (READ_ONCE(engine->stats.enabled) == 0)
1094 return;
1095
1096 write_seqlock_irqsave(&engine->stats.lock, flags);
1097
1098 if (engine->stats.enabled > 0) {
1099 if (engine->stats.active++ == 0)
1100 engine->stats.start = ktime_get();
1101 GEM_BUG_ON(engine->stats.active == 0);
1102 }
1103
1104 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1105 }
1106
1107 static void intel_engine_context_out(struct intel_engine_cs *engine)
1108 {
1109 unsigned long flags;
1110
1111 if (READ_ONCE(engine->stats.enabled) == 0)
1112 return;
1113
1114 write_seqlock_irqsave(&engine->stats.lock, flags);
1115
1116 if (engine->stats.enabled > 0) {
1117 ktime_t last;
1118
1119 if (engine->stats.active && --engine->stats.active == 0) {
1120 /*
1121 * Decrement the active context count and, in case the GPU
1122 * is now idle, add the elapsed time to the running total.
1123 */
1124 last = ktime_sub(ktime_get(), engine->stats.start);
1125
1126 engine->stats.total = ktime_add(engine->stats.total,
1127 last);
1128 } else if (engine->stats.active == 0) {
1129 /*
1130 * After turning on engine stats, context out might be
1131 * the first event in which case we account from the
1132 * time stats gathering was turned on.
1133 */
1134 last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1135
1136 engine->stats.total = ktime_add(engine->stats.total,
1137 last);
1138 }
1139 }
1140
1141 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1142 }
1143
1144 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1145 {
1146 if (INTEL_GEN(engine->i915) >= 12)
1147 return 0x60;
1148 else if (INTEL_GEN(engine->i915) >= 9)
1149 return 0x54;
1150 else if (engine->class == RENDER_CLASS)
1151 return 0x58;
1152 else
1153 return -1;
1154 }
1155
1156 static void
1157 execlists_check_context(const struct intel_context *ce,
1158 const struct intel_engine_cs *engine)
1159 {
1160 const struct intel_ring *ring = ce->ring;
1161 u32 *regs = ce->lrc_reg_state;
1162 bool valid = true;
1163 int x;
1164
1165 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1166 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1167 engine->name,
1168 regs[CTX_RING_START],
1169 i915_ggtt_offset(ring->vma));
1170 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1171 valid = false;
1172 }
1173
1174 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1175 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1176 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1177 engine->name,
1178 regs[CTX_RING_CTL],
1179 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1180 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1181 valid = false;
1182 }
1183
1184 x = lrc_ring_mi_mode(engine);
1185 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1186 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1187 engine->name, regs[x + 1]);
1188 regs[x + 1] &= ~STOP_RING;
1189 regs[x + 1] |= STOP_RING << 16;
1190 valid = false;
1191 }
1192
1193 WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1194 }
1195
1196 static void restore_default_state(struct intel_context *ce,
1197 struct intel_engine_cs *engine)
1198 {
1199 u32 *regs = ce->lrc_reg_state;
1200
1201 if (engine->pinned_default_state)
1202 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1203 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1204 engine->context_size - PAGE_SIZE);
1205
1206 execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1207 }
1208
1209 static void reset_active(struct i915_request *rq,
1210 struct intel_engine_cs *engine)
1211 {
1212 struct intel_context * const ce = rq->context;
1213 u32 head;
1214
1215 /*
1216 * The executing context has been cancelled. We want to prevent
1217 * further execution along this context and propagate the error on
1218 * to anything depending on its results.
1219 *
1220 * In __i915_request_submit(), we apply the -EIO and remove the
1221 * requests' payloads for any banned requests. But first, we must
1222 * rewind the context back to the start of the incomplete request so
1223 * that we do not jump back into the middle of the batch.
1224 *
1225 * We preserve the breadcrumbs and semaphores of the incomplete
1226 * requests so that inter-timeline dependencies (i.e other timelines)
1227 * remain correctly ordered. And we defer to __i915_request_submit()
1228 * so that all asynchronous waits are correctly handled.
1229 */
1230 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1231 rq->fence.context, rq->fence.seqno);
1232
1233 /* On resubmission of the active request, payload will be scrubbed */
1234 if (i915_request_completed(rq))
1235 head = rq->tail;
1236 else
1237 head = active_request(ce->timeline, rq)->head;
1238 head = intel_ring_wrap(ce->ring, head);
1239
1240 /* Scrub the context image to prevent replaying the previous batch */
1241 restore_default_state(ce, engine);
1242 __execlists_update_reg_state(ce, engine, head);
1243
1244 /* We've switched away, so this should be a no-op, but intent matters */
1245 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1246 }
1247
1248 static inline struct intel_engine_cs *
1249 __execlists_schedule_in(struct i915_request *rq)
1250 {
1251 struct intel_engine_cs * const engine = rq->engine;
1252 struct intel_context * const ce = rq->context;
1253
1254 intel_context_get(ce);
1255
1256 if (unlikely(intel_context_is_banned(ce)))
1257 reset_active(rq, engine);
1258
1259 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1260 execlists_check_context(ce, engine);
1261
1262 if (ce->tag) {
1263 /* Use a fixed tag for OA and friends */
1264 ce->lrc_desc |= (u64)ce->tag << 32;
1265 } else {
1266 /* We don't need a strict matching tag, just different values */
1267 ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1268 ce->lrc_desc |=
1269 (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1270 GEN11_SW_CTX_ID_SHIFT;
1271 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1272 }
1273
1274 __intel_gt_pm_get(engine->gt);
1275 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1276 intel_engine_context_in(engine);
1277
1278 return engine;
1279 }
1280
1281 static inline struct i915_request *
1282 execlists_schedule_in(struct i915_request *rq, int idx)
1283 {
1284 struct intel_context * const ce = rq->context;
1285 struct intel_engine_cs *old;
1286
1287 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1288 trace_i915_request_in(rq, idx);
1289
1290 old = READ_ONCE(ce->inflight);
1291 do {
1292 if (!old) {
1293 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1294 break;
1295 }
1296 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1297
1298 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1299 return i915_request_get(rq);
1300 }
1301
1302 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1303 {
1304 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1305 struct i915_request *next = READ_ONCE(ve->request);
1306
1307 if (next && next->execution_mask & ~rq->execution_mask)
1308 tasklet_schedule(&ve->base.execlists.tasklet);
1309 }
1310
1311 static inline void
1312 __execlists_schedule_out(struct i915_request *rq,
1313 struct intel_engine_cs * const engine)
1314 {
1315 struct intel_context * const ce = rq->context;
1316
1317 /*
1318 * NB process_csb() is not under the engine->active.lock and hence
1319 * schedule_out can race with schedule_in meaning that we should
1320 * refrain from doing non-trivial work here.
1321 */
1322
1323 /*
1324 * If we have just completed this context, the engine may now be
1325 * idle and we want to re-enter powersaving.
1326 */
1327 if (list_is_last(&rq->link, &ce->timeline->requests) &&
1328 i915_request_completed(rq))
1329 intel_engine_add_retire(engine, ce->timeline);
1330
1331 intel_engine_context_out(engine);
1332 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1333 intel_gt_pm_put_async(engine->gt);
1334
1335 /*
1336 * If this is part of a virtual engine, its next request may
1337 * have been blocked waiting for access to the active context.
1338 * We have to kick all the siblings again in case we need to
1339 * switch (e.g. the next request is not runnable on this
1340 * engine). Hopefully, we will already have submitted the next
1341 * request before the tasklet runs and do not need to rebuild
1342 * each virtual tree and kick everyone again.
1343 */
1344 if (ce->engine != engine)
1345 kick_siblings(rq, ce);
1346
1347 intel_context_put(ce);
1348 }
1349
1350 static inline void
1351 execlists_schedule_out(struct i915_request *rq)
1352 {
1353 struct intel_context * const ce = rq->context;
1354 struct intel_engine_cs *cur, *old;
1355
1356 trace_i915_request_out(rq);
1357
1358 old = READ_ONCE(ce->inflight);
1359 do
1360 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1361 while (!try_cmpxchg(&ce->inflight, &old, cur));
1362 if (!cur)
1363 __execlists_schedule_out(rq, old);
1364
1365 i915_request_put(rq);
1366 }
1367
1368 static u64 execlists_update_context(struct i915_request *rq)
1369 {
1370 struct intel_context *ce = rq->context;
1371 u64 desc = ce->lrc_desc;
1372 u32 tail, prev;
1373
1374 /*
1375 * WaIdleLiteRestore:bdw,skl
1376 *
1377 * We should never submit the context with the same RING_TAIL twice
1378 * just in case we submit an empty ring, which confuses the HW.
1379 *
1380 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1381 * the normal request to be able to always advance the RING_TAIL on
1382 * subsequent resubmissions (for lite restore). Should that fail us,
1383 * and we try and submit the same tail again, force the context
1384 * reload.
1385 *
1386 * If we need to return to a preempted context, we need to skip the
1387 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1388 * HW has a tendency to ignore us rewinding the TAIL to the end of
1389 * an earlier request.
1390 */
1391 tail = intel_ring_set_tail(rq->ring, rq->tail);
1392 prev = ce->lrc_reg_state[CTX_RING_TAIL];
1393 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1394 desc |= CTX_DESC_FORCE_RESTORE;
1395 ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1396 rq->tail = rq->wa_tail;
1397
1398 /*
1399 * Make sure the context image is complete before we submit it to HW.
1400 *
1401 * Ostensibly, writes (including the WCB) should be flushed prior to
1402 * an uncached write such as our mmio register access, the empirical
1403 * evidence (esp. on Braswell) suggests that the WC write into memory
1404 * may not be visible to the HW prior to the completion of the UC
1405 * register write and that we may begin execution from the context
1406 * before its image is complete leading to invalid PD chasing.
1407 */
1408 wmb();
1409
1410 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1411 return desc;
1412 }
1413
1414 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1415 {
1416 #ifdef __NetBSD__
1417 if (execlists->ctrl_reg) {
1418 bus_space_write_4(execlists->bst, execlists->bsh, execlists->submit_reg + port * 2, lower_32_bits(desc));
1419 bus_space_write_4(execlists->bst, execlists->bsh, execlists->submit_reg + port * 2 + 1, upper_32_bits(desc));
1420 } else {
1421 bus_space_write_4(execlists->bst, execlists->bsh, execlists->submit_reg, upper_32_bits(desc));
1422 bus_space_write_4(execlists->bst, execlists->bsh, execlists->submit_reg, lower_32_bits(desc));
1423 }
1424 #else
1425 if (execlists->ctrl_reg) {
1426 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1427 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1428 } else {
1429 writel(upper_32_bits(desc), execlists->submit_reg);
1430 writel(lower_32_bits(desc), execlists->submit_reg);
1431 }
1432 #endif
1433 }
1434
1435 static __maybe_unused void
1436 trace_ports(const struct intel_engine_execlists *execlists,
1437 const char *msg,
1438 struct i915_request * const *ports)
1439 {
1440 const struct intel_engine_cs *engine =
1441 const_container_of(execlists, typeof(*engine), execlists);
1442
1443 if (!ports[0])
1444 return;
1445
1446 ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1447 ports[0]->fence.context,
1448 ports[0]->fence.seqno,
1449 i915_request_completed(ports[0]) ? "!" :
1450 i915_request_started(ports[0]) ? "*" :
1451 "",
1452 ports[1] ? ports[1]->fence.context : 0,
1453 ports[1] ? ports[1]->fence.seqno : 0);
1454 }
1455
1456 static __maybe_unused bool
1457 assert_pending_valid(const struct intel_engine_execlists *execlists,
1458 const char *msg)
1459 {
1460 struct i915_request * const *port, *rq;
1461 struct intel_context *ce = NULL;
1462
1463 trace_ports(execlists, msg, execlists->pending);
1464
1465 if (!execlists->pending[0]) {
1466 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1467 return false;
1468 }
1469
1470 if (execlists->pending[execlists_num_ports(execlists)]) {
1471 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1472 execlists_num_ports(execlists));
1473 return false;
1474 }
1475
1476 for (port = execlists->pending; (rq = *port); port++) {
1477 unsigned long flags;
1478 bool ok = true;
1479
1480 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1481 GEM_BUG_ON(!i915_request_is_active(rq));
1482
1483 if (ce == rq->context) {
1484 GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1485 ce->timeline->fence_context,
1486 port - execlists->pending);
1487 return false;
1488 }
1489 ce = rq->context;
1490
1491 /* Hold tightly onto the lock to prevent concurrent retires! */
1492 if (!spin_trylock_irqsave(&rq->lock, flags))
1493 continue;
1494
1495 if (i915_request_completed(rq))
1496 goto unlock;
1497
1498 if (i915_active_is_idle(&ce->active) &&
1499 !intel_context_is_barrier(ce)) {
1500 GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1501 ce->timeline->fence_context,
1502 port - execlists->pending);
1503 ok = false;
1504 goto unlock;
1505 }
1506
1507 if (!i915_vma_is_pinned(ce->state)) {
1508 GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1509 ce->timeline->fence_context,
1510 port - execlists->pending);
1511 ok = false;
1512 goto unlock;
1513 }
1514
1515 if (!i915_vma_is_pinned(ce->ring->vma)) {
1516 GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1517 ce->timeline->fence_context,
1518 port - execlists->pending);
1519 ok = false;
1520 goto unlock;
1521 }
1522
1523 unlock:
1524 spin_unlock_irqrestore(&rq->lock, flags);
1525 if (!ok)
1526 return false;
1527 }
1528
1529 return ce;
1530 }
1531
1532 static void execlists_submit_ports(struct intel_engine_cs *engine)
1533 {
1534 struct intel_engine_execlists *execlists = &engine->execlists;
1535 unsigned int n;
1536
1537 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1538
1539 /*
1540 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1541 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1542 * not be relinquished until the device is idle (see
1543 * i915_gem_idle_work_handler()). As a precaution, we make sure
1544 * that all ELSP are drained i.e. we have processed the CSB,
1545 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1546 */
1547 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1548
1549 /*
1550 * ELSQ note: the submit queue is not cleared after being submitted
1551 * to the HW so we need to make sure we always clean it up. This is
1552 * currently ensured by the fact that we always write the same number
1553 * of elsq entries, keep this in mind before changing the loop below.
1554 */
1555 for (n = execlists_num_ports(execlists); n--; ) {
1556 struct i915_request *rq = execlists->pending[n];
1557
1558 write_desc(execlists,
1559 rq ? execlists_update_context(rq) : 0,
1560 n);
1561 }
1562
1563 /* we need to manually load the submit queue */
1564 if (execlists->ctrl_reg)
1565 #ifdef __NetBSD__
1566 bus_space_write_4(execlists->bst, execlists->bsh, execlists->ctrl_reg, EL_CTRL_LOAD);
1567 #else
1568 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1569 #endif
1570 }
1571
1572 static bool ctx_single_port_submission(const struct intel_context *ce)
1573 {
1574 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1575 intel_context_force_single_submission(ce));
1576 }
1577
1578 static bool can_merge_ctx(const struct intel_context *prev,
1579 const struct intel_context *next)
1580 {
1581 if (prev != next)
1582 return false;
1583
1584 if (ctx_single_port_submission(prev))
1585 return false;
1586
1587 return true;
1588 }
1589
1590 static bool can_merge_rq(const struct i915_request *prev,
1591 const struct i915_request *next)
1592 {
1593 GEM_BUG_ON(prev == next);
1594 GEM_BUG_ON(!assert_priority_queue(prev, next));
1595
1596 /*
1597 * We do not submit known completed requests. Therefore if the next
1598 * request is already completed, we can pretend to merge it in
1599 * with the previous context (and we will skip updating the ELSP
1600 * and tracking). Thus hopefully keeping the ELSP full with active
1601 * contexts, despite the best efforts of preempt-to-busy to confuse
1602 * us.
1603 */
1604 if (i915_request_completed(next))
1605 return true;
1606
1607 if (unlikely((prev->fence.flags ^ next->fence.flags) &
1608 (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1609 BIT(I915_FENCE_FLAG_SENTINEL))))
1610 return false;
1611
1612 if (!can_merge_ctx(prev->context, next->context))
1613 return false;
1614
1615 return true;
1616 }
1617
1618 static void virtual_update_register_offsets(u32 *regs,
1619 struct intel_engine_cs *engine)
1620 {
1621 set_offsets(regs, reg_offsets(engine), engine, false);
1622 }
1623
1624 static bool virtual_matches(const struct virtual_engine *ve,
1625 const struct i915_request *rq,
1626 const struct intel_engine_cs *engine)
1627 {
1628 const struct intel_engine_cs *inflight;
1629
1630 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1631 return false;
1632
1633 /*
1634 * We track when the HW has completed saving the context image
1635 * (i.e. when we have seen the final CS event switching out of
1636 * the context) and must not overwrite the context image before
1637 * then. This restricts us to only using the active engine
1638 * while the previous virtualized request is inflight (so
1639 * we reuse the register offsets). This is a very small
1640 * hysteresis on the greedy selection algorithm.
1641 */
1642 inflight = intel_context_inflight(&ve->context);
1643 if (inflight && inflight != engine)
1644 return false;
1645
1646 return true;
1647 }
1648
1649 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1650 struct intel_engine_cs *engine)
1651 {
1652 struct intel_engine_cs *old = ve->siblings[0];
1653
1654 /* All unattached (rq->engine == old) must already be completed */
1655
1656 spin_lock(&old->breadcrumbs.irq_lock);
1657 if (!list_empty(&ve->context.signal_link)) {
1658 list_move_tail(&ve->context.signal_link,
1659 &engine->breadcrumbs.signalers);
1660 intel_engine_signal_breadcrumbs(engine);
1661 }
1662 spin_unlock(&old->breadcrumbs.irq_lock);
1663 }
1664
1665 static struct i915_request *
1666 last_active(const struct intel_engine_execlists *execlists)
1667 {
1668 struct i915_request * const *last = READ_ONCE(execlists->active);
1669
1670 while (*last && i915_request_completed(*last))
1671 last++;
1672
1673 return *last;
1674 }
1675
1676 #define for_each_waiter(p__, rq__) \
1677 list_for_each_entry_lockless(p__, \
1678 &(rq__)->sched.waiters_list, \
1679 wait_link)
1680
1681 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1682 {
1683 LIST_HEAD(list);
1684
1685 /*
1686 * We want to move the interrupted request to the back of
1687 * the round-robin list (i.e. its priority level), but
1688 * in doing so, we must then move all requests that were in
1689 * flight and were waiting for the interrupted request to
1690 * be run after it again.
1691 */
1692 do {
1693 struct i915_dependency *p;
1694
1695 GEM_BUG_ON(i915_request_is_active(rq));
1696 list_move_tail(&rq->sched.link, pl);
1697
1698 for_each_waiter(p, rq) {
1699 struct i915_request *w =
1700 container_of(p->waiter, typeof(*w), sched);
1701
1702 /* Leave semaphores spinning on the other engines */
1703 if (w->engine != rq->engine)
1704 continue;
1705
1706 /* No waiter should start before its signaler */
1707 GEM_BUG_ON(i915_request_started(w) &&
1708 !i915_request_completed(rq));
1709
1710 GEM_BUG_ON(i915_request_is_active(w));
1711 if (!i915_request_is_ready(w))
1712 continue;
1713
1714 if (rq_prio(w) < rq_prio(rq))
1715 continue;
1716
1717 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1718 list_move_tail(&w->sched.link, &list);
1719 }
1720
1721 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1722 } while (rq);
1723 }
1724
1725 static void defer_active(struct intel_engine_cs *engine)
1726 {
1727 struct i915_request *rq;
1728
1729 rq = __unwind_incomplete_requests(engine);
1730 if (!rq)
1731 return;
1732
1733 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1734 }
1735
1736 static bool
1737 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1738 {
1739 int hint;
1740
1741 if (!intel_engine_has_timeslices(engine))
1742 return false;
1743
1744 if (list_is_last(&rq->sched.link, &engine->active.requests))
1745 return false;
1746
1747 hint = max(rq_prio(list_next_entry(rq, sched.link)),
1748 engine->execlists.queue_priority_hint);
1749
1750 return hint >= effective_prio(rq);
1751 }
1752
1753 static int
1754 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1755 {
1756 if (list_is_last(&rq->sched.link, &engine->active.requests))
1757 return INT_MIN;
1758
1759 return rq_prio(list_next_entry(rq, sched.link));
1760 }
1761
1762 static inline unsigned long
1763 timeslice(const struct intel_engine_cs *engine)
1764 {
1765 return READ_ONCE(engine->props.timeslice_duration_ms);
1766 }
1767
1768 static unsigned long
1769 active_timeslice(const struct intel_engine_cs *engine)
1770 {
1771 const struct i915_request *rq = *engine->execlists.active;
1772
1773 if (!rq || i915_request_completed(rq))
1774 return 0;
1775
1776 if (engine->execlists.switch_priority_hint < effective_prio(rq))
1777 return 0;
1778
1779 return timeslice(engine);
1780 }
1781
1782 static void set_timeslice(struct intel_engine_cs *engine)
1783 {
1784 if (!intel_engine_has_timeslices(engine))
1785 return;
1786
1787 set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1788 }
1789
1790 static void record_preemption(struct intel_engine_execlists *execlists)
1791 {
1792 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1793 }
1794
1795 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1796 {
1797 struct i915_request *rq;
1798
1799 rq = last_active(&engine->execlists);
1800 if (!rq)
1801 return 0;
1802
1803 /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1804 if (unlikely(intel_context_is_banned(rq->context)))
1805 return 1;
1806
1807 return READ_ONCE(engine->props.preempt_timeout_ms);
1808 }
1809
1810 static void set_preempt_timeout(struct intel_engine_cs *engine)
1811 {
1812 if (!intel_engine_has_preempt_reset(engine))
1813 return;
1814
1815 set_timer_ms(&engine->execlists.preempt,
1816 active_preempt_timeout(engine));
1817 }
1818
1819 static inline void clear_ports(struct i915_request **ports, int count)
1820 {
1821 memset_p((void **)ports, NULL, count);
1822 }
1823
1824 static void execlists_dequeue(struct intel_engine_cs *engine)
1825 {
1826 struct intel_engine_execlists * const execlists = &engine->execlists;
1827 struct i915_request **port = execlists->pending;
1828 struct i915_request ** const last_port = port + execlists->port_mask;
1829 struct i915_request *last;
1830 struct rb_node *rb;
1831 bool submit = false;
1832
1833 /*
1834 * Hardware submission is through 2 ports. Conceptually each port
1835 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1836 * static for a context, and unique to each, so we only execute
1837 * requests belonging to a single context from each ring. RING_HEAD
1838 * is maintained by the CS in the context image, it marks the place
1839 * where it got up to last time, and through RING_TAIL we tell the CS
1840 * where we want to execute up to this time.
1841 *
1842 * In this list the requests are in order of execution. Consecutive
1843 * requests from the same context are adjacent in the ringbuffer. We
1844 * can combine these requests into a single RING_TAIL update:
1845 *
1846 * RING_HEAD...req1...req2
1847 * ^- RING_TAIL
1848 * since to execute req2 the CS must first execute req1.
1849 *
1850 	 * Our goal then is to point each port at the end of a consecutive
1851 	 * sequence of requests, as that gives the optimal (fewest wakeups
1852 	 * and context switches) submission.
1853 */
1854
1855 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1856 struct virtual_engine *ve =
1857 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1858 struct i915_request *rq = READ_ONCE(ve->request);
1859
1860 if (!rq) { /* lazily cleanup after another engine handled rq */
1861 rb_erase_cached(rb, &execlists->virtual);
1862 container_of(rb, struct ve_node, rb)->inserted =
1863 false;
1864 rb = rb_first_cached(&execlists->virtual);
1865 continue;
1866 }
1867
1868 if (!virtual_matches(ve, rq, engine)) {
1869 rb = rb_next2(&execlists->virtual.rb_root, rb);
1870 continue;
1871 }
1872
1873 break;
1874 }
1875
1876 /*
1877 * If the queue is higher priority than the last
1878 * request in the currently active context, submit afresh.
1879 * We will resubmit again afterwards in case we need to split
1880 * the active context to interject the preemption request,
1881 * i.e. we will retrigger preemption following the ack in case
1882 * of trouble.
1883 */
1884 last = last_active(execlists);
1885 if (last) {
1886 if (need_preempt(engine, last, rb)) {
1887 ENGINE_TRACE(engine,
1888 "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1889 last->fence.context,
1890 last->fence.seqno,
1891 last->sched.attr.priority,
1892 execlists->queue_priority_hint);
1893 record_preemption(execlists);
1894
1895 /*
1896 * Don't let the RING_HEAD advance past the breadcrumb
1897 * as we unwind (and until we resubmit) so that we do
1898 * not accidentally tell it to go backwards.
1899 */
1900 ring_set_paused(engine, 1);
1901
1902 /*
1903 * Note that we have not stopped the GPU at this point,
1904 * so we are unwinding the incomplete requests as they
1905 * remain inflight and so by the time we do complete
1906 * the preemption, some of the unwound requests may
1907 * complete!
1908 */
1909 __unwind_incomplete_requests(engine);
1910
1911 last = NULL;
1912 } else if (need_timeslice(engine, last) &&
1913 timer_expired(&engine->execlists.timer)) {
1914 ENGINE_TRACE(engine,
1915 "expired last=%llx:%lld, prio=%d, hint=%d\n",
1916 last->fence.context,
1917 last->fence.seqno,
1918 last->sched.attr.priority,
1919 execlists->queue_priority_hint);
1920
1921 ring_set_paused(engine, 1);
1922 defer_active(engine);
1923
1924 /*
1925 * Unlike for preemption, if we rewind and continue
1926 * executing the same context as previously active,
1927 * the order of execution will remain the same and
1928 * the tail will only advance. We do not need to
1929 * force a full context restore, as a lite-restore
1930 * is sufficient to resample the monotonic TAIL.
1931 *
1932 * If we switch to any other context, similarly we
1933 * will not rewind TAIL of current context, and
1934 * normal save/restore will preserve state and allow
1935 * us to later continue executing the same request.
1936 */
1937 last = NULL;
1938 } else {
1939 /*
1940 * Otherwise if we already have a request pending
1941 * for execution after the current one, we can
1942 * just wait until the next CS event before
1943 * queuing more. In either case we will force a
1944 * lite-restore preemption event, but if we wait
1945 * we hopefully coalesce several updates into a single
1946 * submission.
1947 */
1948 if (!list_is_last(&last->sched.link,
1949 &engine->active.requests)) {
1950 /*
1951 * Even if ELSP[1] is occupied and not worthy
1952 * of timeslices, our queue might be.
1953 */
1954 if (!timer_pending(&execlists->timer) &&
1955 need_timeslice(engine, last))
1956 set_timer_ms(&execlists->timer,
1957 timeslice(engine));
1958
1959 return;
1960 }
1961 }
1962 }
1963
1964 while (rb) { /* XXX virtual is always taking precedence */
1965 struct virtual_engine *ve =
1966 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1967 struct i915_request *rq;
1968
1969 spin_lock(&ve->base.active.lock);
1970
1971 rq = ve->request;
1972 if (unlikely(!rq)) { /* lost the race to a sibling */
1973 spin_unlock(&ve->base.active.lock);
1974 rb_erase_cached(rb, &execlists->virtual);
1975 container_of(rb, struct ve_node, rb)->inserted =
1976 false;
1977 rb = rb_first_cached(&execlists->virtual);
1978 continue;
1979 }
1980
1981 GEM_BUG_ON(rq != ve->request);
1982 GEM_BUG_ON(rq->engine != &ve->base);
1983 GEM_BUG_ON(rq->context != &ve->context);
1984
1985 if (rq_prio(rq) >= queue_prio(execlists)) {
1986 if (!virtual_matches(ve, rq, engine)) {
1987 spin_unlock(&ve->base.active.lock);
1988 rb = rb_next2(&execlists->virtual.rb_root,
1989 rb);
1990 continue;
1991 }
1992
1993 if (last && !can_merge_rq(last, rq)) {
1994 spin_unlock(&ve->base.active.lock);
1995 return; /* leave this for another */
1996 }
1997
1998 ENGINE_TRACE(engine,
1999 "virtual rq=%llx:%lld%s, new engine? %s\n",
2000 rq->fence.context,
2001 rq->fence.seqno,
2002 i915_request_completed(rq) ? "!" :
2003 i915_request_started(rq) ? "*" :
2004 "",
2005 yesno(engine != ve->siblings[0]));
2006
2007 ve->request = NULL;
2008 ve->base.execlists.queue_priority_hint = INT_MIN;
2009 rb_erase_cached(rb, &execlists->virtual);
2010 container_of(rb, struct ve_node, rb)->inserted =
2011 false;
2012
2013 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2014 rq->engine = engine;
2015
2016 if (engine != ve->siblings[0]) {
2017 u32 *regs = ve->context.lrc_reg_state;
2018 unsigned int n;
2019
2020 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2021
2022 if (!intel_engine_has_relative_mmio(engine))
2023 virtual_update_register_offsets(regs,
2024 engine);
2025
2026 if (!list_empty(&ve->context.signals))
2027 virtual_xfer_breadcrumbs(ve, engine);
2028
2029 /*
2030 * Move the bound engine to the top of the list
2031 * for future execution. We then kick this
2032 * tasklet first before checking others, so that
2033 * we preferentially reuse this set of bound
2034 * registers.
2035 */
2036 for (n = 1; n < ve->num_siblings; n++) {
2037 if (ve->siblings[n] == engine) {
2038 swap(ve->siblings[n],
2039 ve->siblings[0]);
2040 break;
2041 }
2042 }
2043
2044 GEM_BUG_ON(ve->siblings[0] != engine);
2045 }
2046
2047 if (__i915_request_submit(rq)) {
2048 submit = true;
2049 last = rq;
2050 }
2051 i915_request_put(rq);
2052
2053 /*
2054 * Hmm, we have a bunch of virtual engine requests,
2055 * but the first one was already completed (thanks
2056 * preempt-to-busy!). Keep looking at the veng queue
2057 * until we have no more relevant requests (i.e.
2058 * the normal submit queue has higher priority).
2059 */
2060 if (!submit) {
2061 spin_unlock(&ve->base.active.lock);
2062 rb = rb_first_cached(&execlists->virtual);
2063 continue;
2064 }
2065 }
2066
2067 spin_unlock(&ve->base.active.lock);
2068 break;
2069 }
2070
2071 while ((rb = rb_first_cached(&execlists->queue))) {
2072 struct i915_priolist *p = to_priolist(rb);
2073 struct i915_request *rq, *rn;
2074 int i;
2075
2076 priolist_for_each_request_consume(rq, rn, p, i) {
2077 bool merge = true;
2078
2079 /*
2080 * Can we combine this request with the current port?
2081 * It has to be the same context/ringbuffer and not
2082 * have any exceptions (e.g. GVT saying never to
2083 * combine contexts).
2084 *
2085 * If we can combine the requests, we can execute both
2086 * by updating the RING_TAIL to point to the end of the
2087 * second request, and so we never need to tell the
2088 * hardware about the first.
2089 */
2090 if (last && !can_merge_rq(last, rq)) {
2091 /*
2092 * If we are on the second port and cannot
2093 * combine this request with the last, then we
2094 * are done.
2095 */
2096 if (port == last_port)
2097 goto done;
2098
2099 /*
2100 * We must not populate both ELSP[] with the
2101 * same LRCA, i.e. we must submit 2 different
2102 * contexts if we submit 2 ELSP.
2103 */
2104 if (last->context == rq->context)
2105 goto done;
2106
2107 if (i915_request_has_sentinel(last))
2108 goto done;
2109
2110 /*
2111 * If GVT overrides us we only ever submit
2112 * port[0], leaving port[1] empty. Note that we
2113 * also have to be careful that we don't queue
2114 * the same context (even though a different
2115 * request) to the second port.
2116 */
2117 if (ctx_single_port_submission(last->context) ||
2118 ctx_single_port_submission(rq->context))
2119 goto done;
2120
2121 merge = false;
2122 }
2123
2124 if (__i915_request_submit(rq)) {
2125 if (!merge) {
2126 *port = execlists_schedule_in(last, port - execlists->pending);
2127 port++;
2128 last = NULL;
2129 }
2130
2131 GEM_BUG_ON(last &&
2132 !can_merge_ctx(last->context,
2133 rq->context));
2134
2135 submit = true;
2136 last = rq;
2137 }
2138 }
2139
2140 rb_erase_cached(&p->node, &execlists->queue);
2141 i915_priolist_free(p);
2142 }
2143
2144 done:
2145 /*
2146 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2147 *
2148 * We choose the priority hint such that if we add a request of greater
2149 * priority than this, we kick the submission tasklet to decide on
2150 * the right order of submitting the requests to hardware. We must
2151 * also be prepared to reorder requests as they are in-flight on the
2152 * HW. We derive the priority hint then as the first "hole" in
2153 * the HW submission ports and if there are no available slots,
2154 * the priority of the lowest executing request, i.e. last.
2155 *
2156 * When we do receive a higher priority request ready to run from the
2157 * user, see queue_request(), the priority hint is bumped to that
2158 * request triggering preemption on the next dequeue (or subsequent
2159 * interrupt for secondary ports).
2160 */
2161 execlists->queue_priority_hint = queue_prio(execlists);
2162
2163 if (submit) {
2164 *port = execlists_schedule_in(last, port - execlists->pending);
2165 execlists->switch_priority_hint =
2166 switch_prio(engine, *execlists->pending);
2167
2168 /*
2169 * Skip if we ended up with exactly the same set of requests,
2170 * e.g. trying to timeslice a pair of ordered contexts
2171 */
2172 if (!memcmp(execlists->active, execlists->pending,
2173 (port - execlists->pending + 1) * sizeof(*port))) {
2174 do
2175 execlists_schedule_out(fetch_and_zero(port));
2176 while (port-- != execlists->pending);
2177
2178 goto skip_submit;
2179 }
2180 clear_ports(port + 1, last_port - port);
2181
2182 execlists_submit_ports(engine);
2183 set_preempt_timeout(engine);
2184 } else {
2185 skip_submit:
2186 ring_set_paused(engine, 0);
2187 }
2188 }
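
/*
 * Illustrative sketch (not part of the driver): how the dequeue loop above
 * coalesces an ordered stream of requests into at most two ELSP ports.
 * Consecutive requests from the same context merge by simply taking the
 * later RING_TAIL; a change of context claims the next port; once both
 * ports are claimed we stop and wait for a CS event. All names and types
 * below are hypothetical.
 */
struct elsp_example_slot {
	unsigned int ctx;	/* context occupying the port */
	u32 tail;		/* RING_TAIL to execute up to */
};

static inline unsigned int
elsp_example_fill(const unsigned int *ctx, const u32 *tail, unsigned int count,
		  struct elsp_example_slot ports[2])
{
	unsigned int used = 0, i;

	for (i = 0; i < count; i++) {
		if (used && ports[used - 1].ctx == ctx[i]) {
			/* Same context: merge by advancing RING_TAIL. */
			ports[used - 1].tail = tail[i];
			continue;
		}

		if (used == 2)
			break;	/* both ports busy */

		ports[used].ctx = ctx[i];
		ports[used].tail = tail[i];
		used++;
	}

	return used;
}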
2189
2190 static void
2191 cancel_port_requests(struct intel_engine_execlists * const execlists)
2192 {
2193 struct i915_request * const *port;
2194
2195 for (port = execlists->pending; *port; port++)
2196 execlists_schedule_out(*port);
2197 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2198
2199 /* Mark the end of active before we overwrite *active */
2200 for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2201 execlists_schedule_out(*port);
2202 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2203
2204 WRITE_ONCE(execlists->active, execlists->inflight);
2205 }
2206
2207 static inline void
2208 invalidate_csb_entries(const u32 *first, const u32 *last)
2209 {
2210 clflush(__UNCONST(first));
2211 clflush(__UNCONST(last));
2212 }
2213
2214 static inline bool
2215 reset_in_progress(const struct intel_engine_execlists *execlists)
2216 {
2217 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2218 }
2219
2220 /*
2221 * Starting with Gen12, the status has a new format:
2222 *
2223 * bit 0: switched to new queue
2224 * bit 1: reserved
2225 * bit 2: semaphore wait mode (poll or signal), only valid when
2226 * switch detail is set to "wait on semaphore"
2227 * bits 3-5: engine class
2228 * bits 6-11: engine instance
2229 * bits 12-14: reserved
2230 * bits 15-25: sw context id of the lrc the GT switched to
2231 * bits 26-31: sw counter of the lrc the GT switched to
2232 * bits 32-35: context switch detail
2233 * - 0: ctx complete
2234 * - 1: wait on sync flip
2235 * - 2: wait on vblank
2236 * - 3: wait on scanline
2237 * - 4: wait on semaphore
2238 * - 5: context preempted (not on SEMAPHORE_WAIT or
2239 * WAIT_FOR_EVENT)
2240 * bit 36: reserved
2241 * bits 37-43: wait detail (for switch detail 1 to 4)
2242 * bits 44-46: reserved
2243 * bits 47-57: sw context id of the lrc the GT switched away from
2244 * bits 58-63: sw counter of the lrc the GT switched away from
2245 */
2246 static inline bool
2247 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2248 {
2249 u32 lower_dw = csb[0];
2250 u32 upper_dw = csb[1];
2251 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2252 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2253 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2254
2255 /*
2256 * The context switch detail is not guaranteed to be 5 when a preemption
2257 * occurs, so we can't just check for that. The check below works for
2258 * all the cases we care about, including preemptions of WAIT
2259 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2260 * would require some extra handling, but we don't support that.
2261 */
2262 if (!ctx_away_valid || new_queue) {
2263 GEM_BUG_ON(!ctx_to_valid);
2264 return true;
2265 }
2266
2267 /*
2268 * switch detail = 5 is covered by the case above and we do not expect a
2269 * context switch on an unsuccessful wait instruction since we always
2270 * use polling mode.
2271 */
2272 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2273 return false;
2274 }
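
/*
 * Illustrative sketch (not part of the driver): unpacking a Gen12 CSB event
 * according to the bit layout documented above. The shifts and masks are
 * derived directly from that comment and the field names are hypothetical;
 * the driver itself relies on the GEN12_CSB_CTX_VALID() and
 * GEN12_CTX_SWITCH_DETAIL() macros.
 */
struct gen12_csb_example {
	bool switched_to_new_queue;	/* bit 0 */
	u32 to_context_id;		/* bits 15-25 */
	u32 away_context_id;		/* bits 47-57 */
	u32 switch_detail;		/* bits 32-35 */
};

static inline struct gen12_csb_example
gen12_csb_example_decode(u32 lower_dw, u32 upper_dw)
{
	struct gen12_csb_example ev;

	ev.switched_to_new_queue = lower_dw & 1;
	ev.to_context_id = (lower_dw >> 15) & 0x7ff;	/* 11 bits */
	ev.away_context_id = (upper_dw >> 15) & 0x7ff;	/* bits 47-57 of the qword */
	ev.switch_detail = upper_dw & 0xf;		/* bits 32-35 of the qword */

	return ev;
}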
2275
2276 static inline bool
2277 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2278 {
2279 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2280 }
2281
2282 static void process_csb(struct intel_engine_cs *engine)
2283 {
2284 struct intel_engine_execlists * const execlists = &engine->execlists;
2285 const u32 * const buf = execlists->csb_status;
2286 const u8 num_entries = execlists->csb_size;
2287 u8 head, tail;
2288
2289 /*
2290 * As we modify our execlists state tracking we require exclusive
2291 * access. Either we are inside the tasklet, or the tasklet is disabled
2292 * and we assume that is only inside the reset paths and so serialised.
2293 */
2294 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2295 !reset_in_progress(execlists));
2296 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2297
2298 /*
2299 * Note that csb_write, csb_status may be either in HWSP or mmio.
2300 * When reading from the csb_write mmio register, we have to be
2301 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2302 * the low 4bits. As it happens we know the next 4bits are always
2303 	 * zero and so we can simply mask off the low u8 of the register
2304 * and treat it identically to reading from the HWSP (without having
2305 * to use explicit shifting and masking, and probably bifurcating
2306 * the code to handle the legacy mmio read).
2307 */
2308 head = execlists->csb_head;
2309 tail = READ_ONCE(*execlists->csb_write);
2310 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2311 if (unlikely(head == tail))
2312 return;
2313
2314 /*
2315 * Hopefully paired with a wmb() in HW!
2316 *
2317 * We must complete the read of the write pointer before any reads
2318 * from the CSB, so that we do not see stale values. Without an rmb
2319 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2320 * we perform the READ_ONCE(*csb_write).
2321 */
2322 rmb();
2323
2324 do {
2325 bool promote;
2326
2327 if (++head == num_entries)
2328 head = 0;
2329
2330 /*
2331 * We are flying near dragons again.
2332 *
2333 * We hold a reference to the request in execlist_port[]
2334 * but no more than that. We are operating in softirq
2335 * context and so cannot hold any mutex or sleep. That
2336 * prevents us stopping the requests we are processing
2337 * in port[] from being retired simultaneously (the
2338 * breadcrumb will be complete before we see the
2339 * context-switch). As we only hold the reference to the
2340 * request, any pointer chasing underneath the request
2341 * is subject to a potential use-after-free. Thus we
2342 * store all of the bookkeeping within port[] as
2343 * required, and avoid using unguarded pointers beneath
2344 * request itself. The same applies to the atomic
2345 * status notifier.
2346 */
2347
2348 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2349 head, buf[2 * head + 0], buf[2 * head + 1]);
2350
2351 if (INTEL_GEN(engine->i915) >= 12)
2352 promote = gen12_csb_parse(execlists, buf + 2 * head);
2353 else
2354 promote = gen8_csb_parse(execlists, buf + 2 * head);
2355 if (promote) {
2356 struct i915_request * const *old = execlists->active;
2357
2358 /* Point active to the new ELSP; prevent overwriting */
2359 WRITE_ONCE(execlists->active, execlists->pending);
2360
2361 if (!inject_preempt_hang(execlists))
2362 ring_set_paused(engine, 0);
2363
2364 /* cancel old inflight, prepare for switch */
2365 trace_ports(execlists, "preempted", old);
2366 while (*old)
2367 execlists_schedule_out(*old++);
2368
2369 /* switch pending to inflight */
2370 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2371 WRITE_ONCE(execlists->active,
2372 memcpy(execlists->inflight,
2373 execlists->pending,
2374 execlists_num_ports(execlists) *
2375 sizeof(*execlists->pending)));
2376
2377 WRITE_ONCE(execlists->pending[0], NULL);
2378 } else {
2379 GEM_BUG_ON(!*execlists->active);
2380
2381 /* port0 completed, advanced to port1 */
2382 trace_ports(execlists, "completed", execlists->active);
2383
2384 /*
2385 * We rely on the hardware being strongly
2386 * ordered, that the breadcrumb write is
2387 * coherent (visible from the CPU) before the
2388 * user interrupt and CSB is processed.
2389 */
2390 GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2391 !reset_in_progress(execlists));
2392 execlists_schedule_out(*execlists->active++);
2393
2394 GEM_BUG_ON(execlists->active - execlists->inflight >
2395 execlists_num_ports(execlists));
2396 }
2397 } while (head != tail);
2398
2399 execlists->csb_head = head;
2400 set_timeslice(engine);
2401
2402 /*
2403 * Gen11 has proven to fail wrt global observation point between
2404 * entry and tail update, failing on the ordering and thus
2405 * we see an old entry in the context status buffer.
2406 *
2407 	 * Forcibly evict the entries before the next GPU CSB update, to
2408 	 * increase the odds that we get fresh entries even from non-working
2409 	 * hardware. The cost of doing so comes out mostly in the wash, as
2410 	 * the hardware, working or not, will need to do the invalidation
2411 	 * beforehand anyway.
2412 */
2413 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2414 }
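
/*
 * Illustrative sketch (not part of the driver): the consumption loop in
 * process_csb() above is a single-producer (CS) / single-consumer (tasklet)
 * ring. This hypothetical helper shows just the index arithmetic:
 * pre-increment with wrap-around, stopping once we catch up with the
 * producer's write pointer.
 */
static inline unsigned int
csb_example_consume(u8 head, const u8 tail, const u8 num_entries)
{
	unsigned int processed = 0;

	while (head != tail) {
		if (++head == num_entries)
			head = 0;	/* wrap, mirroring the HW write pointer */

		/* ...parse buf[2 * head + 0] and buf[2 * head + 1] here... */
		processed++;
	}

	return processed;
}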
2415
2416 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2417 {
2418 lockdep_assert_held(&engine->active.lock);
2419 if (!engine->execlists.pending[0]) {
2420 rcu_read_lock(); /* protect peeking at execlists->active */
2421 execlists_dequeue(engine);
2422 rcu_read_unlock();
2423 }
2424 }
2425
2426 static void __execlists_hold(struct i915_request *rq)
2427 {
2428 LIST_HEAD(list);
2429
2430 do {
2431 struct i915_dependency *p;
2432
2433 if (i915_request_is_active(rq))
2434 __i915_request_unsubmit(rq);
2435
2436 RQ_TRACE(rq, "on hold\n");
2437 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2438 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2439 i915_request_set_hold(rq);
2440
2441 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2442 struct i915_request *w =
2443 container_of(p->waiter, typeof(*w), sched);
2444
2445 /* Leave semaphores spinning on the other engines */
2446 if (w->engine != rq->engine)
2447 continue;
2448
2449 if (!i915_request_is_ready(w))
2450 continue;
2451
2452 if (i915_request_completed(w))
2453 continue;
2454
2455 			if (i915_request_on_hold(w))
2456 continue;
2457
2458 list_move_tail(&w->sched.link, &list);
2459 }
2460
2461 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2462 } while (rq);
2463 }
2464
2465 static bool execlists_hold(struct intel_engine_cs *engine,
2466 struct i915_request *rq)
2467 {
2468 spin_lock_irq(&engine->active.lock);
2469
2470 if (i915_request_completed(rq)) { /* too late! */
2471 rq = NULL;
2472 goto unlock;
2473 }
2474
2475 if (rq->engine != engine) { /* preempted virtual engine */
2476 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2477
2478 /*
2479 * intel_context_inflight() is only protected by virtue
2480 * of process_csb() being called only by the tasklet (or
2481 * directly from inside reset while the tasklet is suspended).
2482 * Assert that neither of those are allowed to run while we
2483 * poke at the request queues.
2484 */
2485 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2486
2487 /*
2488 * An unsubmitted request along a virtual engine will
2489 * remain on the active (this) engine until we are able
2490 * to process the context switch away (and so mark the
2491 * context as no longer in flight). That cannot have happened
2492 * yet, otherwise we would not be hanging!
2493 */
2494 spin_lock(&ve->base.active.lock);
2495 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2496 GEM_BUG_ON(ve->request != rq);
2497 ve->request = NULL;
2498 spin_unlock(&ve->base.active.lock);
2499 i915_request_put(rq);
2500
2501 rq->engine = engine;
2502 }
2503
2504 /*
2505 * Transfer this request onto the hold queue to prevent it
2506 	 * being resubmitted to HW (and potentially completed) before we have
2507 * released it. Since we may have already submitted following
2508 * requests, we need to remove those as well.
2509 */
2510 GEM_BUG_ON(i915_request_on_hold(rq));
2511 GEM_BUG_ON(rq->engine != engine);
2512 __execlists_hold(rq);
2513
2514 unlock:
2515 spin_unlock_irq(&engine->active.lock);
2516 return rq;
2517 }
2518
2519 static bool hold_request(const struct i915_request *rq)
2520 {
2521 struct i915_dependency *p;
2522
2523 /*
2524 * If one of our ancestors is on hold, we must also be on hold,
2525 * otherwise we will bypass it and execute before it.
2526 */
2527 list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2528 const struct i915_request *s =
2529 container_of(p->signaler, typeof(*s), sched);
2530
2531 if (s->engine != rq->engine)
2532 continue;
2533
2534 if (i915_request_on_hold(s))
2535 return true;
2536 }
2537
2538 return false;
2539 }
2540
2541 static void __execlists_unhold(struct i915_request *rq)
2542 {
2543 LIST_HEAD(list);
2544
2545 do {
2546 struct i915_dependency *p;
2547
2548 GEM_BUG_ON(!i915_request_on_hold(rq));
2549 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2550
2551 i915_request_clear_hold(rq);
2552 list_move_tail(&rq->sched.link,
2553 i915_sched_lookup_priolist(rq->engine,
2554 rq_prio(rq)));
2555 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2556 RQ_TRACE(rq, "hold release\n");
2557
2558 /* Also release any children on this engine that are ready */
2559 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2560 struct i915_request *w =
2561 container_of(p->waiter, typeof(*w), sched);
2562
2563 if (w->engine != rq->engine)
2564 continue;
2565
2566 			if (!i915_request_on_hold(w))
2567 continue;
2568
2569 /* Check that no other parents are also on hold */
2570 			if (hold_request(w))
2571 continue;
2572
2573 list_move_tail(&w->sched.link, &list);
2574 }
2575
2576 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2577 } while (rq);
2578 }
2579
2580 static void execlists_unhold(struct intel_engine_cs *engine,
2581 struct i915_request *rq)
2582 {
2583 spin_lock_irq(&engine->active.lock);
2584
2585 /*
2586 * Move this request back to the priority queue, and all of its
2587 * children and grandchildren that were suspended along with it.
2588 */
2589 __execlists_unhold(rq);
2590
2591 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2592 engine->execlists.queue_priority_hint = rq_prio(rq);
2593 tasklet_hi_schedule(&engine->execlists.tasklet);
2594 }
2595
2596 spin_unlock_irq(&engine->active.lock);
2597 }
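
/*
 * Illustrative sketch (not part of the driver): the rule enforced by
 * hold_request() and __execlists_unhold() above. A request may leave the
 * hold list only once none of its same-engine signalers remains on hold,
 * otherwise it could execute ahead of a parent we are still holding for
 * error capture. The helper name is hypothetical.
 */
static inline bool
unhold_example_allowed(const bool *parent_on_hold, unsigned int nr_parents)
{
	unsigned int i;

	for (i = 0; i < nr_parents; i++) {
		if (parent_on_hold[i])
			return false;	/* an ancestor is still held back */
	}

	return true;	/* safe to return to the priority queue */
}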
2598
2599 struct execlists_capture {
2600 struct work_struct work;
2601 struct i915_request *rq;
2602 struct i915_gpu_coredump *error;
2603 };
2604
2605 static void execlists_capture_work(struct work_struct *work)
2606 {
2607 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2608 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2609 struct intel_engine_cs *engine = cap->rq->engine;
2610 struct intel_gt_coredump *gt = cap->error->gt;
2611 struct intel_engine_capture_vma *vma;
2612
2613 /* Compress all the objects attached to the request, slow! */
2614 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2615 if (vma) {
2616 struct i915_vma_compress *compress =
2617 i915_vma_capture_prepare(gt);
2618
2619 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2620 i915_vma_capture_finish(gt, compress);
2621 }
2622
2623 gt->simulated = gt->engine->simulated;
2624 cap->error->simulated = gt->simulated;
2625
2626 /* Publish the error state, and announce it to the world */
2627 i915_error_state_store(cap->error);
2628 i915_gpu_coredump_put(cap->error);
2629
2630 /* Return this request and all that depend upon it for signaling */
2631 execlists_unhold(engine, cap->rq);
2632 i915_request_put(cap->rq);
2633
2634 kfree(cap);
2635 }
2636
2637 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2638 {
2639 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2640 struct execlists_capture *cap;
2641
2642 cap = kmalloc(sizeof(*cap), gfp);
2643 if (!cap)
2644 return NULL;
2645
2646 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2647 if (!cap->error)
2648 goto err_cap;
2649
2650 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2651 if (!cap->error->gt)
2652 goto err_gpu;
2653
2654 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2655 if (!cap->error->gt->engine)
2656 goto err_gt;
2657
2658 return cap;
2659
2660 err_gt:
2661 kfree(cap->error->gt);
2662 err_gpu:
2663 kfree(cap->error);
2664 err_cap:
2665 kfree(cap);
2666 return NULL;
2667 }
2668
2669 static bool execlists_capture(struct intel_engine_cs *engine)
2670 {
2671 struct execlists_capture *cap;
2672
2673 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2674 return true;
2675
2676 /*
2677 * We need to _quickly_ capture the engine state before we reset.
2678 * We are inside an atomic section (softirq) here and we are delaying
2679 * the forced preemption event.
2680 */
2681 cap = capture_regs(engine);
2682 if (!cap)
2683 return true;
2684
2685 cap->rq = execlists_active(&engine->execlists);
2686 GEM_BUG_ON(!cap->rq);
2687
2688 rcu_read_lock();
2689 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2690 cap->rq = i915_request_get_rcu(cap->rq);
2691 rcu_read_unlock();
2692 if (!cap->rq)
2693 goto err_free;
2694
2695 /*
2696 * Remove the request from the execlists queue, and take ownership
2697 * of the request. We pass it to our worker who will _slowly_ compress
2698 * all the pages the _user_ requested for debugging their batch, after
2699 * which we return it to the queue for signaling.
2700 *
2701 * By removing them from the execlists queue, we also remove the
2702 * requests from being processed by __unwind_incomplete_requests()
2703 * during the intel_engine_reset(), and so they will *not* be replayed
2704 * afterwards.
2705 *
2706 * Note that because we have not yet reset the engine at this point,
2707 	 * it is possible that the request we have identified as guilty
2708 	 * did in fact complete, and that we will then hit an arbitration
2709 	 * point allowing the outstanding preemption to succeed. The likelihood
2710 * of that is very low (as capturing of the engine registers should be
2711 * fast enough to run inside an irq-off atomic section!), so we will
2712 * simply hold that request accountable for being non-preemptible
2713 * long enough to force the reset.
2714 */
2715 if (!execlists_hold(engine, cap->rq))
2716 goto err_rq;
2717
2718 INIT_WORK(&cap->work, execlists_capture_work);
2719 schedule_work(&cap->work);
2720 return true;
2721
2722 err_rq:
2723 i915_request_put(cap->rq);
2724 err_free:
2725 i915_gpu_coredump_put(cap->error);
2726 kfree(cap);
2727 return false;
2728 }
2729
2730 static noinline void preempt_reset(struct intel_engine_cs *engine)
2731 {
2732 const unsigned int bit = I915_RESET_ENGINE + engine->id;
2733 unsigned long *lock = &engine->gt->reset.flags;
2734
2735 if (i915_modparams.reset < 3)
2736 return;
2737
2738 if (test_and_set_bit(bit, lock))
2739 return;
2740
2741 /* Mark this tasklet as disabled to avoid waiting for it to complete */
2742 tasklet_disable_nosync(&engine->execlists.tasklet);
2743
2744 ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2745 READ_ONCE(engine->props.preempt_timeout_ms),
2746 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2747
2748 ring_set_paused(engine, 1); /* Freeze the current request in place */
2749 if (execlists_capture(engine))
2750 intel_engine_reset(engine, "preemption time out");
2751 else
2752 ring_set_paused(engine, 0);
2753
2754 tasklet_enable(&engine->execlists.tasklet);
2755 clear_and_wake_up_bit(bit, lock);
2756 }
2757
2758 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2759 {
2760 const struct timer_list *t = &engine->execlists.preempt;
2761
2762 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2763 return false;
2764
2765 if (!timer_expired(t))
2766 return false;
2767
2768 return READ_ONCE(engine->execlists.pending[0]);
2769 }
2770
2771 /*
2772 * Check the unread Context Status Buffers and manage the submission of new
2773 * contexts to the ELSP accordingly.
2774 */
2775 static void execlists_submission_tasklet(unsigned long data)
2776 {
2777 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2778 bool timeout = preempt_timeout(engine);
2779
2780 process_csb(engine);
2781 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2782 unsigned long flags;
2783
2784 spin_lock_irqsave(&engine->active.lock, flags);
2785 __execlists_submission_tasklet(engine);
2786 spin_unlock_irqrestore(&engine->active.lock, flags);
2787
2788 /* Recheck after serialising with direct-submission */
2789 if (timeout && preempt_timeout(engine))
2790 preempt_reset(engine);
2791 }
2792 }
2793
2794 static void __execlists_kick(struct intel_engine_execlists *execlists)
2795 {
2796 /* Kick the tasklet for some interrupt coalescing and reset handling */
2797 tasklet_hi_schedule(&execlists->tasklet);
2798 }
2799
2800 #define execlists_kick(t, member) \
2801 __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2802
2803 static void execlists_timeslice(struct timer_list *timer)
2804 {
2805 execlists_kick(timer, timer);
2806 }
2807
2808 static void execlists_preempt(struct timer_list *timer)
2809 {
2810 execlists_kick(timer, preempt);
2811 }
2812
2813 static void queue_request(struct intel_engine_cs *engine,
2814 struct i915_request *rq)
2815 {
2816 GEM_BUG_ON(!list_empty(&rq->sched.link));
2817 list_add_tail(&rq->sched.link,
2818 i915_sched_lookup_priolist(engine, rq_prio(rq)));
2819 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2820 }
2821
2822 static void __submit_queue_imm(struct intel_engine_cs *engine)
2823 {
2824 struct intel_engine_execlists * const execlists = &engine->execlists;
2825
2826 if (reset_in_progress(execlists))
2827 return; /* defer until we restart the engine following reset */
2828
2829 if (execlists->tasklet.func == execlists_submission_tasklet)
2830 __execlists_submission_tasklet(engine);
2831 else
2832 tasklet_hi_schedule(&execlists->tasklet);
2833 }
2834
2835 static void submit_queue(struct intel_engine_cs *engine,
2836 const struct i915_request *rq)
2837 {
2838 struct intel_engine_execlists *execlists = &engine->execlists;
2839
2840 if (rq_prio(rq) <= execlists->queue_priority_hint)
2841 return;
2842
2843 execlists->queue_priority_hint = rq_prio(rq);
2844 __submit_queue_imm(engine);
2845 }
2846
2847 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2848 const struct i915_request *rq)
2849 {
2850 GEM_BUG_ON(i915_request_on_hold(rq));
2851 return !list_empty(&engine->active.hold) && hold_request(rq);
2852 }
2853
2854 static void execlists_submit_request(struct i915_request *request)
2855 {
2856 struct intel_engine_cs *engine = request->engine;
2857 unsigned long flags;
2858
2859 /* Will be called from irq-context when using foreign fences. */
2860 spin_lock_irqsave(&engine->active.lock, flags);
2861
2862 if (unlikely(ancestor_on_hold(engine, request))) {
2863 list_add_tail(&request->sched.link, &engine->active.hold);
2864 i915_request_set_hold(request);
2865 } else {
2866 queue_request(engine, request);
2867
2868 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2869 GEM_BUG_ON(list_empty(&request->sched.link));
2870
2871 submit_queue(engine, request);
2872 }
2873
2874 spin_unlock_irqrestore(&engine->active.lock, flags);
2875 }
2876
2877 static void __execlists_context_fini(struct intel_context *ce)
2878 {
2879 intel_ring_put(ce->ring);
2880 i915_vma_put(ce->state);
2881 }
2882
2883 static void execlists_context_destroy(struct kref *kref)
2884 {
2885 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2886
2887 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2888 GEM_BUG_ON(intel_context_is_pinned(ce));
2889
2890 if (ce->state)
2891 __execlists_context_fini(ce);
2892
2893 intel_context_fini(ce);
2894 intel_context_free(ce);
2895 }
2896
2897 static void
2898 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2899 {
2900 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2901 return;
2902
2903 vaddr += engine->context_size;
2904
2905 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2906 }
2907
2908 static void
2909 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2910 {
2911 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2912 return;
2913
2914 vaddr += engine->context_size;
2915
2916 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2917 dev_err_once(engine->i915->drm.dev,
2918 "%s context redzone overwritten!\n",
2919 engine->name);
2920 }
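
/*
 * Illustrative sketch (not part of the driver): the redzone scheme above
 * reduced to its essentials. A guard pattern is written immediately after
 * the object; any byte that no longer matches means something wrote past
 * engine->context_size. This hypothetical helper scans the guard by hand
 * instead of using memchr_inv().
 */
static inline bool
redzone_example_intact(const u8 *guard, size_t len, u8 pattern)
{
	size_t i;

	for (i = 0; i < len; i++) {
		if (guard[i] != pattern)
			return false;	/* overrun detected */
	}

	return true;
}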
2921
2922 static void execlists_context_unpin(struct intel_context *ce)
2923 {
2924 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2925 ce->engine);
2926
2927 i915_gem_object_unpin_map(ce->state->obj);
2928 }
2929
2930 static void
2931 __execlists_update_reg_state(const struct intel_context *ce,
2932 const struct intel_engine_cs *engine,
2933 u32 head)
2934 {
2935 struct intel_ring *ring = ce->ring;
2936 u32 *regs = ce->lrc_reg_state;
2937
2938 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
2939 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2940
2941 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2942 regs[CTX_RING_HEAD] = head;
2943 regs[CTX_RING_TAIL] = ring->tail;
2944
2945 /* RPCS */
2946 if (engine->class == RENDER_CLASS) {
2947 regs[CTX_R_PWR_CLK_STATE] =
2948 intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2949
2950 i915_oa_init_reg_state(ce, engine);
2951 }
2952 }
2953
2954 static int
2955 __execlists_context_pin(struct intel_context *ce,
2956 struct intel_engine_cs *engine)
2957 {
2958 void *vaddr;
2959
2960 GEM_BUG_ON(!ce->state);
2961 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2962
2963 vaddr = i915_gem_object_pin_map(ce->state->obj,
2964 i915_coherent_map_type(engine->i915) |
2965 I915_MAP_OVERRIDE);
2966 if (IS_ERR(vaddr))
2967 return PTR_ERR(vaddr);
2968
2969 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2970 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2971 __execlists_update_reg_state(ce, engine, ce->ring->tail);
2972
2973 return 0;
2974 }
2975
2976 static int execlists_context_pin(struct intel_context *ce)
2977 {
2978 return __execlists_context_pin(ce, ce->engine);
2979 }
2980
2981 static int execlists_context_alloc(struct intel_context *ce)
2982 {
2983 return __execlists_context_alloc(ce, ce->engine);
2984 }
2985
2986 static void execlists_context_reset(struct intel_context *ce)
2987 {
2988 CE_TRACE(ce, "reset\n");
2989 GEM_BUG_ON(!intel_context_is_pinned(ce));
2990
2991 /*
2992 * Because we emit WA_TAIL_DWORDS there may be a disparity
2993 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2994 * that stored in context. As we only write new commands from
2995 * ce->ring->tail onwards, everything before that is junk. If the GPU
2996 * starts reading from its RING_HEAD from the context, it may try to
2997 * execute that junk and die.
2998 *
2999 	 * The contexts that are still pinned on resume belong to the
3000 * kernel, and are local to each engine. All other contexts will
3001 * have their head/tail sanitized upon pinning before use, so they
3002 	 * will never see garbage.
3003 *
3004 * So to avoid that we reset the context images upon resume. For
3005 * simplicity, we just zero everything out.
3006 */
3007 intel_ring_reset(ce->ring, ce->ring->emit);
3008
3009 /* Scrub away the garbage */
3010 execlists_init_reg_state(ce->lrc_reg_state,
3011 ce, ce->engine, ce->ring, true);
3012 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3013
3014 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3015 }
3016
3017 static const struct intel_context_ops execlists_context_ops = {
3018 .alloc = execlists_context_alloc,
3019
3020 .pin = execlists_context_pin,
3021 .unpin = execlists_context_unpin,
3022
3023 .enter = intel_context_enter_engine,
3024 .exit = intel_context_exit_engine,
3025
3026 .reset = execlists_context_reset,
3027 .destroy = execlists_context_destroy,
3028 };
3029
3030 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3031 {
3032 u32 *cs;
3033
3034 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
3035
3036 cs = intel_ring_begin(rq, 6);
3037 if (IS_ERR(cs))
3038 return PTR_ERR(cs);
3039
3040 /*
3041 * Check if we have been preempted before we even get started.
3042 *
3043 * After this point i915_request_started() reports true, even if
3044 * we get preempted and so are no longer running.
3045 */
3046 *cs++ = MI_ARB_CHECK;
3047 *cs++ = MI_NOOP;
3048
3049 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3050 *cs++ = i915_request_timeline(rq)->hwsp_offset;
3051 *cs++ = 0;
3052 *cs++ = rq->fence.seqno - 1;
3053
3054 intel_ring_advance(rq, cs);
3055
3056 /* Record the updated position of the request's payload */
3057 rq->infix = intel_ring_offset(rq, cs);
3058
3059 return 0;
3060 }
3061
3062 static int execlists_request_alloc(struct i915_request *request)
3063 {
3064 int ret;
3065
3066 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3067
3068 /*
3069 * Flush enough space to reduce the likelihood of waiting after
3070 * we start building the request - in which case we will just
3071 * have to repeat work.
3072 */
3073 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3074
3075 /*
3076 * Note that after this point, we have committed to using
3077 * this request as it is being used to both track the
3078 * state of engine initialisation and liveness of the
3079 * golden renderstate above. Think twice before you try
3080 * to cancel/unwind this request now.
3081 */
3082
3083 /* Unconditionally invalidate GPU caches and TLBs. */
3084 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3085 if (ret)
3086 return ret;
3087
3088 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3089 return 0;
3090 }
3091
3092 /*
3093 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3094 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3095 * but there is a slight complication as this is applied in WA batch where the
3096 * values are only initialized once so we cannot take register value at the
3097 * beginning and reuse it further; hence we save its value to memory, upload a
3098 * constant value with bit21 set and then we restore it back with the saved value.
3099 * To simplify the WA, a constant value is formed by using the default value
3100 * of this register. This shouldn't be a problem because we are only modifying
3101  * it for a short period and this batch is non-preemptible. We can of course
3102 * use additional instructions that read the actual value of the register
3103 * at that time and set our bit of interest but it makes the WA complicated.
3104 *
3105 * This WA is also required for Gen9 so extracting as a function avoids
3106 * code duplication.
3107 */
3108 static u32 *
3109 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3110 {
3111 /* NB no one else is allowed to scribble over scratch + 256! */
3112 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3113 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3114 *batch++ = intel_gt_scratch_offset(engine->gt,
3115 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3116 *batch++ = 0;
3117
3118 *batch++ = MI_LOAD_REGISTER_IMM(1);
3119 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3120 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3121
3122 batch = gen8_emit_pipe_control(batch,
3123 PIPE_CONTROL_CS_STALL |
3124 PIPE_CONTROL_DC_FLUSH_ENABLE,
3125 0);
3126
3127 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3128 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3129 *batch++ = intel_gt_scratch_offset(engine->gt,
3130 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3131 *batch++ = 0;
3132
3133 return batch;
3134 }
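
/*
 * Illustrative sketch (not part of the driver): the save/modify/restore
 * shape of the workaround above, written against ordinary memory instead of
 * an MMIO register and the scratch page. The names are hypothetical; the
 * real sequence uses SRM/LRI/LRM commands so the GPU performs the three
 * steps itself from within the WA batch.
 */
static inline void
l3sqc_example_save_modify_restore(u32 *reg, u32 *scratch, u32 flush_bits)
{
	*scratch = *reg;			/* MI_STORE_REGISTER_MEM: save */
	*reg = 0x40400000 | flush_bits;		/* MI_LOAD_REGISTER_IMM: set bit21 */
	/* ...PIPE_CONTROL with CS stall + DC flush happens here... */
	*reg = *scratch;			/* MI_LOAD_REGISTER_MEM: restore */
}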
3135
3136 /*
3137 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3138 * initialized at the beginning and shared across all contexts but this field
3139 * helps us to have multiple batches at different offsets and select them based
3140  * on a criterion. At the moment this batch always starts at the beginning of the page
3141 * and at this point we don't have multiple wa_ctx batch buffers.
3142 *
3143  * The number of WAs applied is not known at the beginning; we use this field
3144  * to return the number of DWORDs written.
3145 *
3146 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3147 * so it adds NOOPs as padding to make it cacheline aligned.
3148  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
3149  * make a complete batch buffer.
3150 */
3151 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3152 {
3153 /* WaDisableCtxRestoreArbitration:bdw,chv */
3154 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3155
3156 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3157 if (IS_BROADWELL(engine->i915))
3158 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3159
3160 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3161 /* Actual scratch location is at 128 bytes offset */
3162 batch = gen8_emit_pipe_control(batch,
3163 PIPE_CONTROL_FLUSH_L3 |
3164 PIPE_CONTROL_STORE_DATA_INDEX |
3165 PIPE_CONTROL_CS_STALL |
3166 PIPE_CONTROL_QW_WRITE,
3167 LRC_PPHWSP_SCRATCH_ADDR);
3168
3169 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3170
3171 /* Pad to end of cacheline */
3172 while ((unsigned long)batch % CACHELINE_BYTES)
3173 *batch++ = MI_NOOP;
3174
3175 /*
3176 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3177 * execution depends on the length specified in terms of cache lines
3178 * in the register CTX_RCS_INDIRECT_CTX
3179 */
3180
3181 return batch;
3182 }
3183
3184 struct lri {
3185 i915_reg_t reg;
3186 u32 value;
3187 };
3188
3189 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3190 {
3191 GEM_BUG_ON(!count || count > 63);
3192
3193 *batch++ = MI_LOAD_REGISTER_IMM(count);
3194 do {
3195 *batch++ = i915_mmio_reg_offset(lri->reg);
3196 *batch++ = lri->value;
3197 } while (lri++, --count);
3198 *batch++ = MI_NOOP;
3199
3200 return batch;
3201 }
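
/*
 * Illustrative sketch (not part of the driver): emitting a small LRI block
 * through the emit_lri() helper above. The register/value pair below is a
 * placeholder chosen only because GEN8_L3SQCREG4 already appears in this
 * file; a real caller passes a table such as the gen9 one that follows.
 */
static inline u32 *
emit_lri_example(u32 *batch)
{
	static const struct lri example[] = {
		{ GEN8_L3SQCREG4, 0 },	/* placeholder value, for illustration only */
	};

	return emit_lri(batch, example, ARRAY_SIZE(example));
}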
3202
3203 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3204 {
3205 static const struct lri lri[] = {
3206 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3207 {
3208 COMMON_SLICE_CHICKEN2,
3209 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3210 0),
3211 },
3212
3213 /* BSpec: 11391 */
3214 {
3215 FF_SLICE_CHICKEN,
3216 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3217 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3218 },
3219
3220 /* BSpec: 11299 */
3221 {
3222 _3D_CHICKEN3,
3223 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3224 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3225 }
3226 };
3227
3228 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3229
3230 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3231 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3232
3233 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3234 batch = gen8_emit_pipe_control(batch,
3235 PIPE_CONTROL_FLUSH_L3 |
3236 PIPE_CONTROL_STORE_DATA_INDEX |
3237 PIPE_CONTROL_CS_STALL |
3238 PIPE_CONTROL_QW_WRITE,
3239 LRC_PPHWSP_SCRATCH_ADDR);
3240
3241 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3242
3243 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3244 if (HAS_POOLED_EU(engine->i915)) {
3245 /*
3246 * EU pool configuration is setup along with golden context
3247 * during context initialization. This value depends on
3248 * device type (2x6 or 3x6) and needs to be updated based
3249 * on which subslice is disabled especially for 2x6
3250 * devices, however it is safe to load default
3251 * configuration of 3x6 device instead of masking off
3252 * corresponding bits because HW ignores bits of a disabled
3253 * subslice and drops down to appropriate config. Please
3254 * see render_state_setup() in i915_gem_render_state.c for
3255 * possible configurations, to avoid duplication they are
3256 * not shown here again.
3257 */
3258 *batch++ = GEN9_MEDIA_POOL_STATE;
3259 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3260 *batch++ = 0x00777000;
3261 *batch++ = 0;
3262 *batch++ = 0;
3263 *batch++ = 0;
3264 }
3265
3266 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3267
3268 /* Pad to end of cacheline */
3269 while ((unsigned long)batch % CACHELINE_BYTES)
3270 *batch++ = MI_NOOP;
3271
3272 return batch;
3273 }
3274
3275 static u32 *
3276 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3277 {
3278 int i;
3279
3280 /*
3281 * WaPipeControlBefore3DStateSamplePattern: cnl
3282 *
3283 * Ensure the engine is idle prior to programming a
3284 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3285 */
3286 batch = gen8_emit_pipe_control(batch,
3287 PIPE_CONTROL_CS_STALL,
3288 0);
3289 /*
3290 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3291 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3292 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3293 * confusing. Since gen8_emit_pipe_control() already advances the
3294 * batch by 6 dwords, we advance the other 10 here, completing a
3295 * cacheline. It's not clear if the workaround requires this padding
3296 * before other commands, or if it's just the regular padding we would
3297 * already have for the workaround bb, so leave it here for now.
3298 */
3299 for (i = 0; i < 10; i++)
3300 *batch++ = MI_NOOP;
3301
3302 /* Pad to end of cacheline */
3303 while ((unsigned long)batch % CACHELINE_BYTES)
3304 *batch++ = MI_NOOP;
3305
3306 return batch;
3307 }
3308
3309 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3310
3311 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3312 {
3313 struct drm_i915_gem_object *obj;
3314 struct i915_vma *vma;
3315 int err;
3316
3317 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3318 if (IS_ERR(obj))
3319 return PTR_ERR(obj);
3320
3321 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3322 if (IS_ERR(vma)) {
3323 err = PTR_ERR(vma);
3324 goto err;
3325 }
3326
3327 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
3328 if (err)
3329 goto err;
3330
3331 engine->wa_ctx.vma = vma;
3332 return 0;
3333
3334 err:
3335 i915_gem_object_put(obj);
3336 return err;
3337 }
3338
3339 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3340 {
3341 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3342 }
3343
3344 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3345
3346 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3347 {
3348 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3349 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3350 &wa_ctx->per_ctx };
3351 wa_bb_func_t wa_bb_fn[2];
3352 struct page *page;
3353 void *batch, *batch_ptr;
3354 unsigned int i;
3355 int ret;
3356
3357 if (engine->class != RENDER_CLASS)
3358 return 0;
3359
3360 switch (INTEL_GEN(engine->i915)) {
3361 case 12:
3362 case 11:
3363 return 0;
3364 case 10:
3365 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3366 wa_bb_fn[1] = NULL;
3367 break;
3368 case 9:
3369 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3370 wa_bb_fn[1] = NULL;
3371 break;
3372 case 8:
3373 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3374 wa_bb_fn[1] = NULL;
3375 break;
3376 default:
3377 MISSING_CASE(INTEL_GEN(engine->i915));
3378 return 0;
3379 }
3380
3381 ret = lrc_setup_wa_ctx(engine);
3382 if (ret) {
3383 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3384 return ret;
3385 }
3386
3387 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3388 batch = batch_ptr = kmap_atomic(page);
3389
3390 /*
3391 * Emit the two workaround batch buffers, recording the offset from the
3392 * start of the workaround batch buffer object for each and their
3393 * respective sizes.
3394 */
3395 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3396 wa_bb[i]->offset = batch_ptr - batch;
3397 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3398 CACHELINE_BYTES))) {
3399 ret = -EINVAL;
3400 break;
3401 }
3402 if (wa_bb_fn[i])
3403 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3404 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3405 }
3406
3407 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3408
3409 kunmap_atomic(batch);
3410 if (ret)
3411 lrc_destroy_wa_ctx(engine);
3412
3413 return ret;
3414 }
3415
3416 static void enable_execlists(struct intel_engine_cs *engine)
3417 {
3418 u32 mode;
3419
3420 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3421
3422 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3423
3424 if (INTEL_GEN(engine->i915) >= 11)
3425 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3426 else
3427 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3428 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3429
3430 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3431
3432 ENGINE_WRITE_FW(engine,
3433 RING_HWS_PGA,
3434 i915_ggtt_offset(engine->status_page.vma));
3435 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3436
3437 engine->context_tag = 0;
3438 }
3439
3440 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3441 {
3442 bool unexpected = false;
3443
3444 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3445 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3446 unexpected = true;
3447 }
3448
3449 return unexpected;
3450 }
3451
3452 static int execlists_resume(struct intel_engine_cs *engine)
3453 {
3454 intel_engine_apply_workarounds(engine);
3455 intel_engine_apply_whitelist(engine);
3456
3457 intel_mocs_init_engine(engine);
3458
3459 intel_engine_reset_breadcrumbs(engine);
3460
3461 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3462 struct drm_printer p = drm_debug_printer(__func__);
3463
3464 intel_engine_dump(engine, &p, NULL);
3465 }
3466
3467 enable_execlists(engine);
3468
3469 return 0;
3470 }
3471
3472 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3473 {
3474 struct intel_engine_execlists * const execlists = &engine->execlists;
3475 unsigned long flags;
3476
3477 ENGINE_TRACE(engine, "depth<-%d\n",
3478 atomic_read(&execlists->tasklet.count));
3479
3480 /*
3481 * Prevent request submission to the hardware until we have
3482 * completed the reset in i915_gem_reset_finish(). If a request
3483 * is completed by one engine, it may then queue a request
3484 * to a second via its execlists->tasklet *just* as we are
3485 * calling engine->resume() and also writing the ELSP.
3486 * Turning off the execlists->tasklet until the reset is over
3487 * prevents the race.
3488 */
3489 __tasklet_disable_sync_once(&execlists->tasklet);
3490 GEM_BUG_ON(!reset_in_progress(execlists));
3491
3492 /* And flush any current direct submission. */
3493 spin_lock_irqsave(&engine->active.lock, flags);
3494 spin_unlock_irqrestore(&engine->active.lock, flags);
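	/*
	 * Taking and releasing active.lock back to back acts as a barrier:
	 * any direct submission that raced in under the lock has finished
	 * before we continue, so no further ELSP write can still be in
	 * flight from that path.
	 */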
3495
3496 /*
3497 * We stop the engines, otherwise we might get a failed reset and a
3498 * dead gpu (on elk). Even a gpu as modern as kbl can suffer a
3499 * system hang if a batchbuffer is still executing when
3500 * the reset is issued, regardless of the READY_TO_RESET ack.
3501 * Thus we assume it is best to stop the engines on all gens
3502 * where we have a gpu reset.
3503 *
3504 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3505 *
3506 * FIXME: the w/a for more modern gens still needs to be validated
3507 */
3508 intel_engine_stop_cs(engine);
3509 }
3510
3511 static void reset_csb_pointers(struct intel_engine_cs *engine)
3512 {
3513 struct intel_engine_execlists * const execlists = &engine->execlists;
3514 const unsigned int reset_value = execlists->csb_size - 1;
3515
3516 ring_set_paused(engine, 0);
3517
3518 /*
3519 * After a reset, the HW starts writing into CSB entry [0]. We
3520 * therefore have to set our HEAD pointer back one entry so that
3521 * the *first* entry we check is entry 0. To complicate this further,
3522 * as we don't wait for the first interrupt after reset, we have to
3523 * fake the HW write to point back to the last entry so that our
3524 * inline comparison of our cached head position against the last HW
3525 * write works even before the first interrupt.
3526 */
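	/*
	 * Worked example (assuming process_csb() advances the head modulo
	 * csb_size): with csb_size == N we park csb_head at N - 1, so the
	 * first entry examined after the reset is (N - 1 + 1) % N == 0,
	 * which is exactly the slot the HW writes first.
	 */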
3527 execlists->csb_head = reset_value;
3528 WRITE_ONCE(*execlists->csb_write, reset_value);
3529 wmb(); /* Make sure this is visible to HW (paranoia?) */
3530
3531 /*
3532 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3533 * Bludgeon them with a mmio update to be sure.
3534 */
3535 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3536 reset_value << 8 | reset_value);
3537 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3538
3539 invalidate_csb_entries(&execlists->csb_status[0],
3540 &execlists->csb_status[reset_value]);
3541 }
3542
3543 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3544 {
3545 int x;
3546
3547 x = lrc_ring_mi_mode(engine);
3548 if (x != -1) {
3549 regs[x + 1] &= ~STOP_RING;
3550 regs[x + 1] |= STOP_RING << 16;
3551 }
3552 }
3553
3554 static void __execlists_reset_reg_state(const struct intel_context *ce,
3555 const struct intel_engine_cs *engine)
3556 {
3557 u32 *regs = ce->lrc_reg_state;
3558
3559 __reset_stop_ring(regs, engine);
3560 }
3561
3562 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3563 {
3564 struct intel_engine_execlists * const execlists = &engine->execlists;
3565 struct intel_context *ce;
3566 struct i915_request *rq;
3567 u32 head;
3568
3569 mb(); /* paranoia: read the CSB pointers from after the reset */
3570 clflush(execlists->csb_write);
3571 mb();
3572
3573 process_csb(engine); /* drain preemption events */
3574
3575 /* Following the reset, we need to reload the CSB read/write pointers */
3576 reset_csb_pointers(engine);
3577
3578 /*
3579 * Save the currently executing context; even if we completed
3580 * its request, it was still running at the time of the
3581 * reset and will have been clobbered.
3582 */
3583 rq = execlists_active(execlists);
3584 if (!rq)
3585 goto unwind;
3586
3587 /* We still have requests in-flight; the engine should be active */
3588 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3589
3590 ce = rq->context;
3591 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3592
3593 if (i915_request_completed(rq)) {
3594 /* Idle context; tidy up the ring so we can restart afresh */
3595 head = intel_ring_wrap(ce->ring, rq->tail);
3596 goto out_replay;
3597 }
3598
3599 /* Context has requests still in-flight; it should not be idle! */
3600 GEM_BUG_ON(i915_active_is_idle(&ce->active));
3601 rq = active_request(ce->timeline, rq);
3602 head = intel_ring_wrap(ce->ring, rq->head);
3603 GEM_BUG_ON(head == ce->ring->tail);
3604
3605 /*
3606 * If this request hasn't started yet, e.g. it is waiting on a
3607 * semaphore, we need to avoid skipping the request or else we
3608 * break the signaling chain. However, if the context is corrupt
3609 * the request will not restart and we will be stuck with a wedged
3610 * device. It is quite often the case that if we issue a reset
3611 * while the GPU is loading the context image, the context
3612 * image becomes corrupt.
3613 *
3614 * Otherwise, if we have not started yet, the request should replay
3615 * perfectly and we do not need to flag the result as being erroneous.
3616 */
3617 if (!i915_request_started(rq))
3618 goto out_replay;
3619
3620 /*
3621 * If the request was innocent, we leave the request in the ELSP
3622 * and will try to replay it on restarting. The context image may
3623 * have been corrupted by the reset, in which case we may have
3624 * to service a new GPU hang, but more likely we can continue on
3625 * without impact.
3626 *
3627 * If the request was guilty, we presume the context is corrupt
3628 * and have to at least restore the RING register in the context
3629 * image back to the expected values to skip over the guilty request.
3630 */
3631 __i915_request_reset(rq, stalled);
3632 if (!stalled)
3633 goto out_replay;
3634
3635 /*
3636 * We want a simple context + ring to execute the breadcrumb update.
3637 * We cannot rely on the context being intact across the GPU hang,
3638 * so clear it and rebuild just what we need for the breadcrumb.
3639 * All pending requests for this context will be zapped, and any
3640 * future request will be after userspace has had the opportunity
3641 * to recreate its own state.
3642 */
3643 GEM_BUG_ON(!intel_context_is_pinned(ce));
3644 restore_default_state(ce, engine);
3645
3646 out_replay:
3647 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3648 head, ce->ring->tail);
3649 __execlists_reset_reg_state(ce, engine);
3650 __execlists_update_reg_state(ce, engine, head);
3651 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3652
3653 unwind:
3654 /* Push back any incomplete requests for replay after the reset. */
3655 cancel_port_requests(execlists);
3656 __unwind_incomplete_requests(engine);
3657 }
3658
3659 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3660 {
3661 unsigned long flags;
3662
3663 ENGINE_TRACE(engine, "\n");
3664
3665 spin_lock_irqsave(&engine->active.lock, flags);
3666
3667 __execlists_reset(engine, stalled);
3668
3669 spin_unlock_irqrestore(&engine->active.lock, flags);
3670 }
3671
3672 static void nop_submission_tasklet(unsigned long data)
3673 {
3674 /* The driver is wedged; don't process any more events. */
3675 }
3676
3677 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3678 {
3679 struct intel_engine_execlists * const execlists = &engine->execlists;
3680 struct i915_request *rq, *rn;
3681 struct rb_node *rb;
3682 unsigned long flags;
3683
3684 ENGINE_TRACE(engine, "\n");
3685
3686 /*
3687 * Before we call engine->cancel_requests(), we should have exclusive
3688 * access to the submission state. This is arranged for us by the
3689 * caller disabling the interrupt generation, the tasklet and other
3690 * threads that may then access the same state, giving us a free hand
3691 * to reset state. However, we still need to let lockdep be aware that
3692 * we know this state may be accessed in hardirq context, so we
3693 * disable the irq around this manipulation and we want to keep
3694 * the spinlock focused on its duties and not accidentally extend its
3695 * coverage to the submission's irq state. (Similarly, although we
3696 * shouldn't need to disable irq around the manipulation of the
3697 * submission's irq state, we also wish to remind ourselves that
3698 * it is irq state.)
3699 */
3700 spin_lock_irqsave(&engine->active.lock, flags);
3701
3702 __execlists_reset(engine, true);
3703
3704 /* Mark all executing requests as skipped. */
3705 list_for_each_entry(rq, &engine->active.requests, sched.link)
3706 mark_eio(rq);
3707
3708 /* Flush the queued requests to the timeline list (for retiring). */
3709 while ((rb = rb_first_cached(&execlists->queue))) {
3710 struct i915_priolist *p = to_priolist(rb);
3711 int i;
3712
3713 priolist_for_each_request_consume(rq, rn, p, i) {
3714 mark_eio(rq);
3715 __i915_request_submit(rq);
3716 }
3717
3718 rb_erase_cached(&p->node, &execlists->queue);
3719 i915_priolist_free(p);
3720 }
3721
3722 /* On-hold requests will be flushed to timeline upon their release */
3723 list_for_each_entry(rq, &engine->active.hold, sched.link)
3724 mark_eio(rq);
3725
3726 /* Cancel all attached virtual engines */
3727 while ((rb = rb_first_cached(&execlists->virtual))) {
3728 struct virtual_engine *ve =
3729 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3730
3731 rb_erase_cached(rb, &execlists->virtual);
3732 container_of(rb, struct ve_node, rb)->inserted = false;
3733
3734 spin_lock(&ve->base.active.lock);
3735 rq = fetch_and_zero(&ve->request);
3736 if (rq) {
3737 mark_eio(rq);
3738
3739 rq->engine = engine;
3740 __i915_request_submit(rq);
3741 i915_request_put(rq);
3742
3743 ve->base.execlists.queue_priority_hint = INT_MIN;
3744 }
3745 spin_unlock(&ve->base.active.lock);
3746 }
3747
3748 /* Remaining _unready_ requests will be nop'ed when submitted */
3749
3750 execlists->queue_priority_hint = INT_MIN;
3751 #ifdef __NetBSD__
3752 i915_sched_init(execlists);
3753 rb_tree_init(&execlists->virtual.rb_root.rbr_tree, &ve_tree_ops);
3754 #else
3755 execlists->queue = RB_ROOT_CACHED;
3756 #endif
3757
3758 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3759 execlists->tasklet.func = nop_submission_tasklet;
3760
3761 spin_unlock_irqrestore(&engine->active.lock, flags);
3762 }
3763
3764 static void execlists_reset_finish(struct intel_engine_cs *engine)
3765 {
3766 struct intel_engine_execlists * const execlists = &engine->execlists;
3767
3768 /*
3769 * After a GPU reset, we may have requests to replay. Do so now while
3770 * we still have the forcewake to be sure that the GPU is not allowed
3771 * to sleep before we restart and reload a context.
3772 */
3773 GEM_BUG_ON(!reset_in_progress(execlists));
3774 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3775 execlists->tasklet.func(execlists->tasklet.data);
3776
3777 if (__tasklet_enable(&execlists->tasklet))
3778 /* And kick in case we missed a new request submission. */
3779 tasklet_hi_schedule(&execlists->tasklet);
3780 ENGINE_TRACE(engine, "depth->%d\n",
3781 atomic_read(&execlists->tasklet.count));
3782 }
3783
3784 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3785 u64 offset, u32 len,
3786 const unsigned int flags)
3787 {
3788 u32 *cs;
3789
3790 cs = intel_ring_begin(rq, 4);
3791 if (IS_ERR(cs))
3792 return PTR_ERR(cs);
3793
3794 /*
3795 * WaDisableCtxRestoreArbitration:bdw,chv
3796 *
3797 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3798 * particular on all the gens that do not need the w/a at all!); if we
3799 * took care to make sure that on every switch into this context
3800 * (both ordinary and for preemption) arbitration was enabled,
3801 * we would be fine. However, for gen8 there is another w/a that
3802 * requires us to not preempt inside GPGPU execution, so we keep
3803 * arbitration disabled for gen8 batches. Arbitration will be
3804 * re-enabled before we close the request
3805 * (engine->emit_fini_breadcrumb).
3806 */
3807 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3808
3809 /* FIXME(BDW+): Address space and security selectors. */
3810 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3811 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3812 *cs++ = lower_32_bits(offset);
3813 *cs++ = upper_32_bits(offset);
3814
3815 intel_ring_advance(rq, cs);
3816
3817 return 0;
3818 }
3819
3820 static int gen8_emit_bb_start(struct i915_request *rq,
3821 u64 offset, u32 len,
3822 const unsigned int flags)
3823 {
3824 u32 *cs;
3825
3826 cs = intel_ring_begin(rq, 6);
3827 if (IS_ERR(cs))
3828 return PTR_ERR(cs);
3829
3830 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3831
3832 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3833 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3834 *cs++ = lower_32_bits(offset);
3835 *cs++ = upper_32_bits(offset);
3836
3837 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3838 *cs++ = MI_NOOP;
3839
3840 intel_ring_advance(rq, cs);
3841
3842 return 0;
3843 }
3844
3845 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3846 {
3847 ENGINE_WRITE(engine, RING_IMR,
3848 ~(engine->irq_enable_mask | engine->irq_keep_mask));
3849 ENGINE_POSTING_READ(engine, RING_IMR);
3850 }
3851
3852 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3853 {
3854 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3855 }
3856
3857 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3858 {
3859 u32 cmd, *cs;
3860
3861 cs = intel_ring_begin(request, 4);
3862 if (IS_ERR(cs))
3863 return PTR_ERR(cs);
3864
3865 cmd = MI_FLUSH_DW + 1;
3866
3867 /* We always require a command barrier so that subsequent
3868 * commands, such as breadcrumb interrupts, are strictly ordered
3869 * wrt the contents of the write cache being flushed to memory
3870 * (and thus being coherent from the CPU).
3871 */
3872 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3873
3874 if (mode & EMIT_INVALIDATE) {
3875 cmd |= MI_INVALIDATE_TLB;
3876 if (request->engine->class == VIDEO_DECODE_CLASS)
3877 cmd |= MI_INVALIDATE_BSD;
3878 }
3879
3880 *cs++ = cmd;
3881 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3882 *cs++ = 0; /* upper addr */
3883 *cs++ = 0; /* value */
3884 intel_ring_advance(request, cs);
3885
3886 return 0;
3887 }
3888
3889 static int gen8_emit_flush_render(struct i915_request *request,
3890 u32 mode)
3891 {
3892 bool vf_flush_wa = false, dc_flush_wa = false;
3893 u32 *cs, flags = 0;
3894 int len;
3895
3896 flags |= PIPE_CONTROL_CS_STALL;
3897
3898 if (mode & EMIT_FLUSH) {
3899 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3900 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3901 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3902 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3903 }
3904
3905 if (mode & EMIT_INVALIDATE) {
3906 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3907 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3908 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3909 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3910 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3911 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3912 flags |= PIPE_CONTROL_QW_WRITE;
3913 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3914
3915 /*
3916 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3917 * pipe control.
3918 */
3919 if (IS_GEN(request->i915, 9))
3920 vf_flush_wa = true;
3921
3922 /* WaForGAMHang:kbl */
3923 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3924 dc_flush_wa = true;
3925 }
3926
3927 len = 6;
3928
3929 if (vf_flush_wa)
3930 len += 6;
3931
3932 if (dc_flush_wa)
3933 len += 12;
3934
3935 cs = intel_ring_begin(request, len);
3936 if (IS_ERR(cs))
3937 return PTR_ERR(cs);
3938
3939 if (vf_flush_wa)
3940 cs = gen8_emit_pipe_control(cs, 0, 0);
3941
3942 if (dc_flush_wa)
3943 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3944 0);
3945
3946 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3947
3948 if (dc_flush_wa)
3949 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3950
3951 intel_ring_advance(request, cs);
3952
3953 return 0;
3954 }
3955
3956 static int gen11_emit_flush_render(struct i915_request *request,
3957 u32 mode)
3958 {
3959 if (mode & EMIT_FLUSH) {
3960 u32 *cs;
3961 u32 flags = 0;
3962
3963 flags |= PIPE_CONTROL_CS_STALL;
3964
3965 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3966 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3967 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3968 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3969 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3970 flags |= PIPE_CONTROL_QW_WRITE;
3971 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3972
3973 cs = intel_ring_begin(request, 6);
3974 if (IS_ERR(cs))
3975 return PTR_ERR(cs);
3976
3977 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3978 intel_ring_advance(request, cs);
3979 }
3980
3981 if (mode & EMIT_INVALIDATE) {
3982 u32 *cs;
3983 u32 flags = 0;
3984
3985 flags |= PIPE_CONTROL_CS_STALL;
3986
3987 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3988 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3989 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3990 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3991 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3992 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3993 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3994 flags |= PIPE_CONTROL_QW_WRITE;
3995 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3996
3997 cs = intel_ring_begin(request, 6);
3998 if (IS_ERR(cs))
3999 return PTR_ERR(cs);
4000
4001 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4002 intel_ring_advance(request, cs);
4003 }
4004
4005 return 0;
4006 }
4007
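/*
 * Hedged reading of the encoding below: on Gen12 the MI_ARB_CHECK dword
 * carries a pre-fetch disable field, with bit 8 acting as the write-enable
 * for the control bit and bit 0 as its value, so preparser_disable(true)/
 * preparser_disable(false) bracket the TLB invalidation emitted in
 * gen12_emit_flush_render().
 */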
4008 static u32 preparser_disable(bool state)
4009 {
4010 return MI_ARB_CHECK | 1 << 8 | state;
4011 }
4012
4013 static int gen12_emit_flush_render(struct i915_request *request,
4014 u32 mode)
4015 {
4016 if (mode & EMIT_FLUSH) {
4017 u32 flags = 0;
4018 u32 *cs;
4019
4020 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4021 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4022 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4023 /* Wa_1409600907:tgl */
4024 flags |= PIPE_CONTROL_DEPTH_STALL;
4025 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4026 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4027 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4028
4029 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4030 flags |= PIPE_CONTROL_QW_WRITE;
4031
4032 flags |= PIPE_CONTROL_CS_STALL;
4033
4034 cs = intel_ring_begin(request, 6);
4035 if (IS_ERR(cs))
4036 return PTR_ERR(cs);
4037
4038 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4039 intel_ring_advance(request, cs);
4040 }
4041
4042 if (mode & EMIT_INVALIDATE) {
4043 u32 flags = 0;
4044 u32 *cs;
4045
4046 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4047 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4048 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4049 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4050 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4051 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4052 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4053 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4054
4055 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4056 flags |= PIPE_CONTROL_QW_WRITE;
4057
4058 flags |= PIPE_CONTROL_CS_STALL;
4059
4060 cs = intel_ring_begin(request, 8);
4061 if (IS_ERR(cs))
4062 return PTR_ERR(cs);
4063
4064 /*
4065 * Prevent the pre-parser from skipping past the TLB
4066 * invalidate and loading a stale page for the batch
4067 * buffer / request payload.
4068 */
4069 *cs++ = preparser_disable(true);
4070
4071 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4072
4073 *cs++ = preparser_disable(false);
4074 intel_ring_advance(request, cs);
4075
4076 /*
4077 * Wa_1604544889:tgl
4078 */
4079 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
4080 flags = 0;
4081 flags |= PIPE_CONTROL_CS_STALL;
4082 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4083
4084 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4085 flags |= PIPE_CONTROL_QW_WRITE;
4086
4087 cs = intel_ring_begin(request, 6);
4088 if (IS_ERR(cs))
4089 return PTR_ERR(cs);
4090
4091 cs = gen8_emit_pipe_control(cs, flags,
4092 LRC_PPHWSP_SCRATCH_ADDR);
4093 intel_ring_advance(request, cs);
4094 }
4095 }
4096
4097 return 0;
4098 }
4099
4100 /*
4101 * Reserve space for 2 NOOPs at the end of each request to be
4102 * used as a workaround for not being allowed to do lite
4103 * restore with HEAD==TAIL (WaIdleLiteRestore).
4104 */
4105 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4106 {
4107 /* Ensure there's always at least one preemption point per-request. */
4108 *cs++ = MI_ARB_CHECK;
4109 *cs++ = MI_NOOP;
4110 request->wa_tail = intel_ring_offset(request, cs);
4111
4112 return cs;
4113 }
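/*
 * Hedged usage note: wa_tail points just past the padding emitted above,
 * while request->tail stays before it. On a lite restore resubmission of
 * the same context the RING_TAIL can therefore always be advanced, keeping
 * HEAD != TAIL as WaIdleLiteRestore requires.
 */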
4114
4115 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4116 {
4117 *cs++ = MI_SEMAPHORE_WAIT |
4118 MI_SEMAPHORE_GLOBAL_GTT |
4119 MI_SEMAPHORE_POLL |
4120 MI_SEMAPHORE_SAD_EQ_SDD;
4121 *cs++ = 0;
4122 *cs++ = intel_hws_preempt_address(request->engine);
4123 *cs++ = 0;
4124
4125 return cs;
4126 }
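/*
 * Hedged reading of the semaphore above: MI_SEMAPHORE_POLL with SAD_EQ_SDD
 * makes the command streamer spin until the dword at
 * intel_hws_preempt_address() equals the inline semaphore data (0 here),
 * i.e. until the preemption marker in the HWSP has been cleared again.
 */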
4127
4128 static __always_inline u32*
4129 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4130 u32 *cs)
4131 {
4132 *cs++ = MI_USER_INTERRUPT;
4133
4134 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4135 if (intel_engine_has_semaphores(request->engine))
4136 cs = emit_preempt_busywait(request, cs);
4137
4138 request->tail = intel_ring_offset(request, cs);
4139 assert_ring_tail_valid(request->ring, request->tail);
4140
4141 return gen8_emit_wa_tail(request, cs);
4142 }
4143
4144 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4145 {
4146 cs = gen8_emit_ggtt_write(cs,
4147 request->fence.seqno,
4148 i915_request_active_timeline(request)->hwsp_offset,
4149 0);
4150
4151 return gen8_emit_fini_breadcrumb_footer(request, cs);
4152 }
4153
4154 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4155 {
4156 cs = gen8_emit_pipe_control(cs,
4157 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4158 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4159 PIPE_CONTROL_DC_FLUSH_ENABLE,
4160 0);
4161
4162 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4163 cs = gen8_emit_ggtt_write_rcs(cs,
4164 request->fence.seqno,
4165 i915_request_active_timeline(request)->hwsp_offset,
4166 PIPE_CONTROL_FLUSH_ENABLE |
4167 PIPE_CONTROL_CS_STALL);
4168
4169 return gen8_emit_fini_breadcrumb_footer(request, cs);
4170 }
4171
4172 static u32 *
4173 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4174 {
4175 cs = gen8_emit_ggtt_write_rcs(cs,
4176 request->fence.seqno,
4177 i915_request_active_timeline(request)->hwsp_offset,
4178 PIPE_CONTROL_CS_STALL |
4179 PIPE_CONTROL_TILE_CACHE_FLUSH |
4180 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4181 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4182 PIPE_CONTROL_DC_FLUSH_ENABLE |
4183 PIPE_CONTROL_FLUSH_ENABLE);
4184
4185 return gen8_emit_fini_breadcrumb_footer(request, cs);
4186 }
4187
4188 /*
4189 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4190 * flush and will continue pre-fetching the instructions after it before the
4191 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4192 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4193 * of the next request before the memory has been flushed, we're guaranteed that
4194 * we won't access the batch itself too early.
4195 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4196 * so, if the current request is modifying an instruction in the next request on
4197 * the same intel_context, we might pre-fetch and then execute the pre-update
4198 * instruction. To avoid this, the users of self-modifying code should either
4199 * disable the parser around the code emitting the memory writes, via a new flag
4200 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4201 * the in-kernel use-cases we've opted to use a separate context, see
4202 * reloc_gpu() as an example.
4203 * All the above applies only to the instructions themselves. Non-inline data
4204 * used by the instructions is not pre-fetched.
4205 */
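/*
 * A minimal sketch of the pairing described above for gen12+, reusing the
 * preparser_disable() encoding from gen12_emit_flush_render() (assumed,
 * not mandated, for out-of-tree users):
 *
 *	*cs++ = preparser_disable(true);
 *	... emit the writes that patch the following instructions ...
 *	*cs++ = preparser_disable(false);
 *
 * The in-kernel users instead emit such writes from a separate context,
 * see reloc_gpu().
 */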
4206
4207 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4208 {
4209 *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4210 MI_SEMAPHORE_GLOBAL_GTT |
4211 MI_SEMAPHORE_POLL |
4212 MI_SEMAPHORE_SAD_EQ_SDD;
4213 *cs++ = 0;
4214 *cs++ = intel_hws_preempt_address(request->engine);
4215 *cs++ = 0;
4216 *cs++ = 0;
4217 *cs++ = MI_NOOP;
4218
4219 return cs;
4220 }
4221
4222 static __always_inline u32*
4223 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4224 {
4225 *cs++ = MI_USER_INTERRUPT;
4226
4227 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4228 if (intel_engine_has_semaphores(request->engine))
4229 cs = gen12_emit_preempt_busywait(request, cs);
4230
4231 request->tail = intel_ring_offset(request, cs);
4232 assert_ring_tail_valid(request->ring, request->tail);
4233
4234 return gen8_emit_wa_tail(request, cs);
4235 }
4236
4237 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4238 {
4239 cs = gen8_emit_ggtt_write(cs,
4240 request->fence.seqno,
4241 i915_request_active_timeline(request)->hwsp_offset,
4242 0);
4243
4244 return gen12_emit_fini_breadcrumb_footer(request, cs);
4245 }
4246
4247 static u32 *
4248 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4249 {
4250 cs = gen8_emit_ggtt_write_rcs(cs,
4251 request->fence.seqno,
4252 i915_request_active_timeline(request)->hwsp_offset,
4253 PIPE_CONTROL_CS_STALL |
4254 PIPE_CONTROL_TILE_CACHE_FLUSH |
4255 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4256 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4257 /* Wa_1409600907:tgl */
4258 PIPE_CONTROL_DEPTH_STALL |
4259 PIPE_CONTROL_DC_FLUSH_ENABLE |
4260 PIPE_CONTROL_FLUSH_ENABLE |
4261 PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4262
4263 return gen12_emit_fini_breadcrumb_footer(request, cs);
4264 }
4265
4266 static void execlists_park(struct intel_engine_cs *engine)
4267 {
4268 cancel_timer(&engine->execlists.timer);
4269 cancel_timer(&engine->execlists.preempt);
4270 }
4271
4272 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4273 {
4274 engine->submit_request = execlists_submit_request;
4275 engine->schedule = i915_schedule;
4276 engine->execlists.tasklet.func = execlists_submission_tasklet;
4277
4278 engine->reset.prepare = execlists_reset_prepare;
4279 engine->reset.rewind = execlists_reset_rewind;
4280 engine->reset.cancel = execlists_reset_cancel;
4281 engine->reset.finish = execlists_reset_finish;
4282
4283 engine->park = execlists_park;
4284 engine->unpark = NULL;
4285
4286 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4287 if (!intel_vgpu_active(engine->i915)) {
4288 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4289 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4290 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4291 }
4292
4293 if (INTEL_GEN(engine->i915) >= 12)
4294 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4295
4296 if (intel_engine_has_preemption(engine))
4297 engine->emit_bb_start = gen8_emit_bb_start;
4298 else
4299 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4300 }
4301
4302 static void execlists_shutdown(struct intel_engine_cs *engine)
4303 {
4304 /* Synchronise with residual timers and any softirq they raise */
4305 del_timer_sync(&engine->execlists.timer);
4306 del_timer_sync(&engine->execlists.preempt);
4307 tasklet_kill(&engine->execlists.tasklet);
4308 }
4309
4310 static void execlists_release(struct intel_engine_cs *engine)
4311 {
4312 execlists_shutdown(engine);
4313
4314 intel_engine_cleanup_common(engine);
4315 lrc_destroy_wa_ctx(engine);
4316 }
4317
4318 static void
4319 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4320 {
4321 /* Default vfuncs which can be overriden by each engine. */
4322
4323 engine->resume = execlists_resume;
4324
4325 engine->cops = &execlists_context_ops;
4326 engine->request_alloc = execlists_request_alloc;
4327
4328 engine->emit_flush = gen8_emit_flush;
4329 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4330 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4331 if (INTEL_GEN(engine->i915) >= 12)
4332 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4333
4334 engine->set_default_submission = intel_execlists_set_default_submission;
4335
4336 if (INTEL_GEN(engine->i915) < 11) {
4337 engine->irq_enable = gen8_logical_ring_enable_irq;
4338 engine->irq_disable = gen8_logical_ring_disable_irq;
4339 } else {
4340 /*
4341 * TODO: On Gen11 interrupt masks need to be clear
4342 * to allow C6 entry. Keep interrupts enabled
4343 * and take the hit of generating extra interrupts
4344 * until a more refined solution exists.
4345 */
4346 }
4347 }
4348
4349 static inline void
4350 logical_ring_default_irqs(struct intel_engine_cs *engine)
4351 {
4352 unsigned int shift = 0;
4353
4354 if (INTEL_GEN(engine->i915) < 11) {
4355 const u8 irq_shifts[] = {
4356 [RCS0] = GEN8_RCS_IRQ_SHIFT,
4357 [BCS0] = GEN8_BCS_IRQ_SHIFT,
4358 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
4359 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
4360 [VECS0] = GEN8_VECS_IRQ_SHIFT,
4361 };
4362
4363 shift = irq_shifts[engine->id];
4364 }
4365
4366 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4367 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4368 }
4369
4370 static void rcs_submission_override(struct intel_engine_cs *engine)
4371 {
4372 switch (INTEL_GEN(engine->i915)) {
4373 case 12:
4374 engine->emit_flush = gen12_emit_flush_render;
4375 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4376 break;
4377 case 11:
4378 engine->emit_flush = gen11_emit_flush_render;
4379 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4380 break;
4381 default:
4382 engine->emit_flush = gen8_emit_flush_render;
4383 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4384 break;
4385 }
4386 }
4387
4388 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4389 {
4390 struct intel_engine_execlists * const execlists = &engine->execlists;
4391 struct drm_i915_private *i915 = engine->i915;
4392 struct intel_uncore *uncore = engine->uncore;
4393 u32 base = engine->mmio_base;
4394
4395 i915_sched_init(&engine->execlists);
4396
4397 tasklet_init(&engine->execlists.tasklet,
4398 execlists_submission_tasklet, (unsigned long)engine);
4399 timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4400 timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4401
4402 logical_ring_default_vfuncs(engine);
4403 logical_ring_default_irqs(engine);
4404
4405 if (engine->class == RENDER_CLASS)
4406 rcs_submission_override(engine);
4407
4408 if (intel_init_workaround_bb(engine))
4409 /*
4410 * We continue even if we fail to initialize the WA batch
4411 * because we only expect rare glitches, nothing critical
4412 * enough to prevent us from using the GPU.
4413 */
4414 DRM_ERROR("WA batch buffer initialization failed\n");
4415
4416 if (HAS_LOGICAL_RING_ELSQ(i915)) {
4417 #ifdef __NetBSD__
4418 execlists->submit_reg = i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4419 execlists->ctrl_reg = i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4420 execlists->bsh = uncore->regs_bsh;
4421 execlists->bst = uncore->regs_bst;
4422 #else
4423 execlists->submit_reg = uncore->regs +
4424 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4425 execlists->ctrl_reg = uncore->regs +
4426 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4427 #endif
4428 } else {
4429 #ifdef __NetBSD__
4430 execlists->submit_reg = i915_mmio_reg_offset(RING_ELSP(base));
4431 execlists->bsh = uncore->regs_bsh;
4432 execlists->bst = uncore->regs_bst;
4433 #else
4434 execlists->submit_reg = uncore->regs +
4435 i915_mmio_reg_offset(RING_ELSP(base));
4436 #endif
4437 }
4438
4439 execlists->csb_status =
4440 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4441
4442 execlists->csb_write =
4443 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4444
4445 if (INTEL_GEN(i915) < 11)
4446 execlists->csb_size = GEN8_CSB_ENTRIES;
4447 else
4448 execlists->csb_size = GEN11_CSB_ENTRIES;
4449
4450 reset_csb_pointers(engine);
4451
4452 /* Finally, take ownership and responsibility for cleanup! */
4453 engine->release = execlists_release;
4454
4455 return 0;
4456 }
4457
4458 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4459 {
4460 u32 indirect_ctx_offset;
4461
4462 switch (INTEL_GEN(engine->i915)) {
4463 default:
4464 MISSING_CASE(INTEL_GEN(engine->i915));
4465 /* fall through */
4466 case 12:
4467 indirect_ctx_offset =
4468 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4469 break;
4470 case 11:
4471 indirect_ctx_offset =
4472 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4473 break;
4474 case 10:
4475 indirect_ctx_offset =
4476 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4477 break;
4478 case 9:
4479 indirect_ctx_offset =
4480 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4481 break;
4482 case 8:
4483 indirect_ctx_offset =
4484 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4485 break;
4486 }
4487
4488 return indirect_ctx_offset;
4489 }
4490
4491
4492 static void init_common_reg_state(u32 * const regs,
4493 const struct intel_engine_cs *engine,
4494 const struct intel_ring *ring,
4495 bool inhibit)
4496 {
4497 u32 ctl;
4498
4499 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4500 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4501 if (inhibit)
4502 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4503 if (INTEL_GEN(engine->i915) < 11)
4504 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4505 CTX_CTRL_RS_CTX_ENABLE);
4506 regs[CTX_CONTEXT_CONTROL] = ctl;
4507
4508 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4509 }
4510
4511 static void init_wa_bb_reg_state(u32 * const regs,
4512 const struct intel_engine_cs *engine,
4513 u32 pos_bb_per_ctx)
4514 {
4515 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4516
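	/*
	 * Hedged note on the encodings below: bit 0 of the per-context
	 * pointer is treated as a "valid" flag, while the indirect-context
	 * entry packs its size in cachelines into the low bits next to the
	 * GGTT address, with the restore offset shifted into bits 6 and up.
	 */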
4517 if (wa_ctx->per_ctx.size) {
4518 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4519
4520 regs[pos_bb_per_ctx] =
4521 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4522 }
4523
4524 if (wa_ctx->indirect_ctx.size) {
4525 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4526
4527 regs[pos_bb_per_ctx + 2] =
4528 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4529 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4530
4531 regs[pos_bb_per_ctx + 4] =
4532 intel_lr_indirect_ctx_offset(engine) << 6;
4533 }
4534 }
4535
4536 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4537 {
4538 if (i915_vm_is_4lvl(&ppgtt->vm)) {
4539 /* 64b PPGTT (48bit canonical)
4540 * PDP0_DESCRIPTOR contains the base address to PML4 and
4541 * other PDP Descriptors are ignored.
4542 */
4543 ASSIGN_CTX_PML4(ppgtt, regs);
4544 } else {
4545 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4546 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4547 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4548 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4549 }
4550 }
4551
4552 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4553 {
4554 if (i915_is_ggtt(vm))
4555 return i915_vm_to_ggtt(vm)->alias;
4556 else
4557 return i915_vm_to_ppgtt(vm);
4558 }
4559
4560 static void execlists_init_reg_state(u32 *regs,
4561 const struct intel_context *ce,
4562 const struct intel_engine_cs *engine,
4563 const struct intel_ring *ring,
4564 bool inhibit)
4565 {
4566 /*
4567 * A context is actually a big batch buffer with several
4568 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4569 * values we are setting here are only for the first context restore:
4570 * on a subsequent save, the GPU will recreate this batchbuffer with new
4571 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4572 * we are not initializing here).
4573 *
4574 * Must keep consistent with virtual_update_register_offsets().
4575 */
4576 set_offsets(regs, reg_offsets(engine), engine, inhibit);
4577
4578 init_common_reg_state(regs, engine, ring, inhibit);
4579 init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4580
4581 init_wa_bb_reg_state(regs, engine,
4582 INTEL_GEN(engine->i915) >= 12 ?
4583 GEN12_CTX_BB_PER_CTX_PTR :
4584 CTX_BB_PER_CTX_PTR);
4585
4586 __reset_stop_ring(regs, engine);
4587 }
4588
4589 static int
4590 populate_lr_context(struct intel_context *ce,
4591 struct drm_i915_gem_object *ctx_obj,
4592 struct intel_engine_cs *engine,
4593 struct intel_ring *ring)
4594 {
4595 bool inhibit = true;
4596 void *vaddr;
4597 int ret;
4598
4599 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4600 if (IS_ERR(vaddr)) {
4601 ret = PTR_ERR(vaddr);
4602 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4603 return ret;
4604 }
4605
4606 set_redzone(vaddr, engine);
4607
4608 if (engine->default_state) {
4609 void *defaults;
4610
4611 defaults = i915_gem_object_pin_map(engine->default_state,
4612 I915_MAP_WB);
4613 if (IS_ERR(defaults)) {
4614 ret = PTR_ERR(defaults);
4615 goto err_unpin_ctx;
4616 }
4617
4618 memcpy(vaddr, defaults, engine->context_size);
4619 i915_gem_object_unpin_map(engine->default_state);
4620 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4621 inhibit = false;
4622 }
4623
4624 /* The second page of the context object contains some fields which must
4625 * be set up prior to the first execution. */
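	/*
	 * Hedged layout note: page 0 of the object holds the per-process
	 * HWSP (PPHWSP), so LRC_STATE_PN skips past it and the register
	 * state written below begins on the following page.
	 */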
4626 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4627 ce, engine, ring, inhibit);
4628
4629 ret = 0;
4630 err_unpin_ctx:
4631 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4632 i915_gem_object_unpin_map(ctx_obj);
4633 return ret;
4634 }
4635
4636 static int __execlists_context_alloc(struct intel_context *ce,
4637 struct intel_engine_cs *engine)
4638 {
4639 struct drm_i915_gem_object *ctx_obj;
4640 struct intel_ring *ring;
4641 struct i915_vma *vma;
4642 u32 context_size;
4643 int ret;
4644
4645 GEM_BUG_ON(ce->state);
4646 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4647
4648 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4649 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4650
4651 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4652 if (IS_ERR(ctx_obj))
4653 return PTR_ERR(ctx_obj);
4654
4655 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4656 if (IS_ERR(vma)) {
4657 ret = PTR_ERR(vma);
4658 goto error_deref_obj;
4659 }
4660
4661 if (!ce->timeline) {
4662 struct intel_timeline *tl;
4663
4664 tl = intel_timeline_create(engine->gt, NULL);
4665 if (IS_ERR(tl)) {
4666 ret = PTR_ERR(tl);
4667 goto error_deref_obj;
4668 }
4669
4670 ce->timeline = tl;
4671 }
4672
4673 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4674 if (IS_ERR(ring)) {
4675 ret = PTR_ERR(ring);
4676 goto error_deref_obj;
4677 }
4678
4679 ret = populate_lr_context(ce, ctx_obj, engine, ring);
4680 if (ret) {
4681 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4682 goto error_ring_free;
4683 }
4684
4685 ce->ring = ring;
4686 ce->state = vma;
4687
4688 return 0;
4689
4690 error_ring_free:
4691 intel_ring_put(ring);
4692 error_deref_obj:
4693 i915_gem_object_put(ctx_obj);
4694 return ret;
4695 }
4696
4697 static struct list_head *virtual_queue(struct virtual_engine *ve)
4698 {
4699 return &ve->base.execlists.default_priolist.requests[0];
4700 }
4701
4702 static void virtual_context_destroy(struct kref *kref)
4703 {
4704 struct virtual_engine *ve =
4705 container_of(kref, typeof(*ve), context.ref);
4706 unsigned int n;
4707
4708 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4709 GEM_BUG_ON(ve->request);
4710 GEM_BUG_ON(ve->context.inflight);
4711
4712 for (n = 0; n < ve->num_siblings; n++) {
4713 struct intel_engine_cs *sibling = ve->siblings[n];
4714 struct rb_node *node = &ve->nodes[sibling->id].rb;
4715 unsigned long flags;
4716
4717 if (!ve->nodes[sibling->id].inserted)
4718 continue;
4719
4720 spin_lock_irqsave(&sibling->active.lock, flags);
4721
4722 /* Detachment is lazily performed in the execlists tasklet */
4723 if (ve->nodes[sibling->id].inserted) {
4724 rb_erase_cached(node, &sibling->execlists.virtual);
4725 ve->nodes[sibling->id].inserted = false;
4726 }
4727
4728 spin_unlock_irqrestore(&sibling->active.lock, flags);
4729 }
4730 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4731
4732 if (ve->context.state)
4733 __execlists_context_fini(&ve->context);
4734 intel_context_fini(&ve->context);
4735
4736 intel_engine_fini_breadcrumbs(&ve->base);
4737 spin_lock_destroy(&ve->base.active.lock);
4738
4739 kfree(ve->bonds);
4740 kfree(ve);
4741 }
4742
4743 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4744 {
4745 int swp;
4746
4747 /*
4748 * Pick a random sibling on starting to help spread the load around.
4749 *
4750 * New contexts are typically created with exactly the same order
4751 * of siblings, and often started in batches. Due to the way we iterate
4752 * the array of siblings when submitting requests, sibling[0] is
4753 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4754 * randomised across the system, we also help spread the load because the
4755 * first engine we inspect is different each time.
4756 *
4757 * NB This does not force us to execute on this engine, it will just
4758 * typically be the first we inspect for submission.
4759 */
4760 swp = prandom_u32_max(ve->num_siblings);
4761 if (!swp)
4762 return;
4763
4764 swap(ve->siblings[swp], ve->siblings[0]);
4765 if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4766 virtual_update_register_offsets(ve->context.lrc_reg_state,
4767 ve->siblings[0]);
4768 }
4769
4770 static int virtual_context_alloc(struct intel_context *ce)
4771 {
4772 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4773
4774 return __execlists_context_alloc(ce, ve->siblings[0]);
4775 }
4776
4777 static int virtual_context_pin(struct intel_context *ce)
4778 {
4779 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4780 int err;
4781
4782 /* Note: we must use a real engine class for setting up reg state */
4783 err = __execlists_context_pin(ce, ve->siblings[0]);
4784 if (err)
4785 return err;
4786
4787 virtual_engine_initial_hint(ve);
4788 return 0;
4789 }
4790
4791 static void virtual_context_enter(struct intel_context *ce)
4792 {
4793 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4794 unsigned int n;
4795
4796 for (n = 0; n < ve->num_siblings; n++)
4797 intel_engine_pm_get(ve->siblings[n]);
4798
4799 intel_timeline_enter(ce->timeline);
4800 }
4801
4802 static void virtual_context_exit(struct intel_context *ce)
4803 {
4804 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4805 unsigned int n;
4806
4807 intel_timeline_exit(ce->timeline);
4808
4809 for (n = 0; n < ve->num_siblings; n++)
4810 intel_engine_pm_put(ve->siblings[n]);
4811 }
4812
4813 static const struct intel_context_ops virtual_context_ops = {
4814 .alloc = virtual_context_alloc,
4815
4816 .pin = virtual_context_pin,
4817 .unpin = execlists_context_unpin,
4818
4819 .enter = virtual_context_enter,
4820 .exit = virtual_context_exit,
4821
4822 .destroy = virtual_context_destroy,
4823 };
4824
4825 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4826 {
4827 struct i915_request *rq;
4828 intel_engine_mask_t mask;
4829
4830 rq = READ_ONCE(ve->request);
4831 if (!rq)
4832 return 0;
4833
4834 /* The rq is ready for submission; rq->execution_mask is now stable. */
4835 mask = rq->execution_mask;
4836 if (unlikely(!mask)) {
4837 /* Invalid selection, submit to a random engine in error */
4838 i915_request_skip(rq, -ENODEV);
4839 mask = ve->siblings[0]->mask;
4840 }
4841
4842 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4843 rq->fence.context, rq->fence.seqno,
4844 mask, ve->base.execlists.queue_priority_hint);
4845
4846 return mask;
4847 }
4848
4849 static void virtual_submission_tasklet(unsigned long data)
4850 {
4851 struct virtual_engine * const ve = (struct virtual_engine *)data;
4852 const int prio = ve->base.execlists.queue_priority_hint;
4853 intel_engine_mask_t mask;
4854 unsigned int n;
4855
4856 rcu_read_lock();
4857 mask = virtual_submission_mask(ve);
4858 rcu_read_unlock();
4859 if (unlikely(!mask))
4860 return;
4861
4862 #ifdef __NetBSD__
4863 int s = splsoftserial(); /* block tasklets=softints */
4864 #else
4865 local_irq_disable();
4866 #endif
4867 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4868 struct intel_engine_cs *sibling = ve->siblings[n];
4869 struct ve_node * const node = &ve->nodes[sibling->id];
4870 struct rb_node **parent, *rb;
4871 bool first;
4872
4873 if (unlikely(!(mask & sibling->mask))) {
4874 if (node->inserted) {
4875 spin_lock(&sibling->active.lock);
4876 rb_erase_cached(&node->rb,
4877 &sibling->execlists.virtual);
4878 node->inserted = false;
4879 spin_unlock(&sibling->active.lock);
4880 }
4881 continue;
4882 }
4883
4884 spin_lock(&sibling->active.lock);
4885
4886 if (node->inserted) {
4887 /*
4888 * Cheat and avoid rebalancing the tree if we can
4889 * reuse this node in situ.
4890 */
4891 first = rb_first_cached(&sibling->execlists.virtual) ==
4892 &node->rb;
4893 if (prio == node->prio || (prio > node->prio && first))
4894 goto submit_engine;
4895
4896 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4897 node->inserted = false;
4898 }
4899
4900 #ifdef __NetBSD__
4901 __USE(parent);
4902 __USE(rb);
4903 struct ve_node *collision __diagused;
4904 /* XXX kludge to get insertion order */
4905 node->order = ve->order++;
4906 collision = rb_tree_insert_node(
4907 &sibling->execlists.virtual.rb_root.rbr_tree,
4908 node);
4909 KASSERT(collision == node);
4910 node->inserted = true;
4911 first = rb_tree_find_node_geq(
4912 &sibling->execlists.virtual.rb_root.rbr_tree,
4913 &node->prio) == node;
4914 #else
4915 rb = NULL;
4916 first = true;
4917 parent = &sibling->execlists.virtual.rb_root.rb_node;
4918 while (*parent) {
4919 struct ve_node *other;
4920
4921 rb = *parent;
4922 other = rb_entry(rb, typeof(*other), rb);
4923 if (prio > other->prio) {
4924 parent = &rb->rb_left;
4925 } else {
4926 parent = &rb->rb_right;
4927 first = false;
4928 }
4929 }
4930
4931 rb_link_node(&node->rb, rb, parent);
4932 rb_insert_color_cached(&node->rb,
4933 &sibling->execlists.virtual,
4934 first);
4935 #endif
4936
4937 submit_engine:
4938 GEM_BUG_ON(!node->inserted);
4939 node->prio = prio;
4940 if (first && prio > sibling->execlists.queue_priority_hint) {
4941 sibling->execlists.queue_priority_hint = prio;
4942 tasklet_hi_schedule(&sibling->execlists.tasklet);
4943 }
4944
4945 spin_unlock(&sibling->active.lock);
4946 }
4947 #ifdef __NetBSD__
4948 splx(s);
4949 #else
4950 local_irq_enable();
4951 #endif
4952 }
4953
4954 static void virtual_submit_request(struct i915_request *rq)
4955 {
4956 struct virtual_engine *ve = to_virtual_engine(rq->engine);
4957 struct i915_request *old;
4958 unsigned long flags;
4959
4960 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4961 rq->fence.context,
4962 rq->fence.seqno);
4963
4964 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4965
4966 spin_lock_irqsave(&ve->base.active.lock, flags);
4967
4968 old = ve->request;
4969 if (old) { /* background completion event from preempt-to-busy */
4970 GEM_BUG_ON(!i915_request_completed(old));
4971 __i915_request_submit(old);
4972 i915_request_put(old);
4973 }
4974
4975 if (i915_request_completed(rq)) {
4976 __i915_request_submit(rq);
4977
4978 ve->base.execlists.queue_priority_hint = INT_MIN;
4979 ve->request = NULL;
4980 } else {
4981 ve->base.execlists.queue_priority_hint = rq_prio(rq);
4982 ve->request = i915_request_get(rq);
4983
4984 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4985 list_move_tail(&rq->sched.link, virtual_queue(ve));
4986
4987 tasklet_schedule(&ve->base.execlists.tasklet);
4988 }
4989
4990 spin_unlock_irqrestore(&ve->base.active.lock, flags);
4991 }
4992
4993 static struct ve_bond *
4994 virtual_find_bond(struct virtual_engine *ve,
4995 const struct intel_engine_cs *master)
4996 {
4997 int i;
4998
4999 for (i = 0; i < ve->num_bonds; i++) {
5000 if (ve->bonds[i].master == master)
5001 return &ve->bonds[i];
5002 }
5003
5004 return NULL;
5005 }
5006
5007 static void
5008 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5009 {
5010 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5011 intel_engine_mask_t allowed, exec;
5012 struct ve_bond *bond;
5013
5014 allowed = ~to_request(signal)->engine->mask;
5015
5016 bond = virtual_find_bond(ve, to_request(signal)->engine);
5017 if (bond)
5018 allowed &= bond->sibling_mask;
5019
5020 /* Restrict the bonded request to run on only the available engines */
5021 exec = READ_ONCE(rq->execution_mask);
5022 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5023 ;
5024
5025 /* Prevent the master from being re-run on the bonded engines */
5026 to_request(signal)->execution_mask &= ~allowed;
5027 }
5028
5029 struct intel_context *
5030 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5031 unsigned int count)
5032 {
5033 struct virtual_engine *ve;
5034 unsigned int n;
5035 int err;
5036
5037 if (count == 0)
5038 return ERR_PTR(-EINVAL);
5039
5040 if (count == 1)
5041 return intel_context_create(siblings[0]);
5042
5043 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5044 if (!ve)
5045 return ERR_PTR(-ENOMEM);
5046
5047 ve->base.i915 = siblings[0]->i915;
5048 ve->base.gt = siblings[0]->gt;
5049 ve->base.uncore = siblings[0]->uncore;
5050 ve->base.id = -1;
5051
5052 ve->base.class = OTHER_CLASS;
5053 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5054 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5055 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5056
5057 /*
5058 * The decision on whether to submit a request using semaphores
5059 * depends on the saturated state of the engine. We only compute
5060 * this during HW submission of the request, and we need for this
5061 * state to be globally applied to all requests being submitted
5062 * to this engine. Virtual engines encompass more than one physical
5063 * engine and so we cannot accurately tell in advance if one of those
5064 * engines is already saturated and so cannot afford to use a semaphore
5065 * and be pessimized in priority for doing so -- if we are the only
5066 * context using semaphores after all other clients have stopped, we
5067 * will be starved on the saturated system. Such a global switch for
5068 * semaphores is less than ideal, but alas is the current compromise.
5069 */
5070 ve->base.saturated = ALL_ENGINES;
5071
5072 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5073
5074 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5075 intel_engine_init_breadcrumbs(&ve->base);
5076 intel_engine_init_execlists(&ve->base);
5077
5078 ve->base.cops = &virtual_context_ops;
5079 ve->base.request_alloc = execlists_request_alloc;
5080
5081 ve->base.schedule = i915_schedule;
5082 ve->base.submit_request = virtual_submit_request;
5083 ve->base.bond_execute = virtual_bond_execute;
5084
5085 INIT_LIST_HEAD(virtual_queue(ve));
5086 ve->base.execlists.queue_priority_hint = INT_MIN;
5087 tasklet_init(&ve->base.execlists.tasklet,
5088 virtual_submission_tasklet,
5089 (unsigned long)ve);
5090
5091 intel_context_init(&ve->context, &ve->base);
5092
5093 for (n = 0; n < count; n++) {
5094 struct intel_engine_cs *sibling = siblings[n];
5095
5096 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5097 if (sibling->mask & ve->base.mask) {
5098 DRM_DEBUG("duplicate %s entry in load balancer\n",
5099 sibling->name);
5100 err = -EINVAL;
5101 goto err_put;
5102 }
5103
5104 /*
5105 * The virtual engine implementation is tightly coupled to
5106 * the execlists backend -- we push requests directly
5107 * into a tree inside each physical engine. We could support
5108 * layering if we handle cloning of the requests and
5109 * submitting a copy into each backend.
5110 */
5111 if (sibling->execlists.tasklet.func !=
5112 execlists_submission_tasklet) {
5113 err = -ENODEV;
5114 goto err_put;
5115 }
5116
5117 GEM_BUG_ON(!ve->nodes[sibling->id].inserted);
5118 ve->nodes[sibling->id].inserted = false;
5119
5120 ve->siblings[ve->num_siblings++] = sibling;
5121 ve->base.mask |= sibling->mask;
5122
5123 /*
5124 * All physical engines must be compatible for their emission
5125 * functions (as we build the instructions during request
5126 * construction and do not alter them before submission
5127 * on the physical engine). We use the engine class as a guide
5128 * here, although that could be refined.
5129 */
5130 if (ve->base.class != OTHER_CLASS) {
5131 if (ve->base.class != sibling->class) {
5132 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5133 sibling->class, ve->base.class);
5134 err = -EINVAL;
5135 goto err_put;
5136 }
5137 continue;
5138 }
5139
5140 ve->base.class = sibling->class;
5141 ve->base.uabi_class = sibling->uabi_class;
5142 snprintf(ve->base.name, sizeof(ve->base.name),
5143 "v%dx%d", ve->base.class, count);
5144 ve->base.context_size = sibling->context_size;
5145
5146 ve->base.emit_bb_start = sibling->emit_bb_start;
5147 ve->base.emit_flush = sibling->emit_flush;
5148 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5149 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5150 ve->base.emit_fini_breadcrumb_dw =
5151 sibling->emit_fini_breadcrumb_dw;
5152
5153 ve->base.flags = sibling->flags;
5154 }

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

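/*
 * Clone an existing virtual engine: build a new virtual context spanning
 * the same physical siblings as @src and duplicate its bond table, if any.
 * Returns the new intel_context or an ERR_PTR on failure.
 */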
struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

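/*
 * Record a bond on the virtual engine: add @sibling to the mask of physical
 * engines associated with @master, allocating a new ve_bond entry if @master
 * has none yet. @sibling must already be one of the virtual engine's
 * siblings, otherwise -EINVAL is returned.
 */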
int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

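/*
 * Look up the @sibling'th physical engine backing the virtual @engine;
 * returns NULL if @sibling is out of range.
 */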
struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}

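/*
 * Debug helper: print up to @max of the engine's executing (E), queued (Q)
 * and waiting virtual (V) requests through @show_request, eliding the
 * middle of over-long lists. The engine's active.lock is taken internally
 * while walking the lists.
 */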
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	last = NULL;
	count = 0;
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   execlists->queue_priority_hint);
	for (rb = rb_first_cached(&execlists->queue);
	     rb;
	     rb = rb_next2(&execlists->queue.rb_root, rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual);
	     rb;
	     rb = rb_next2(&execlists->virtual.rb_root, rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

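/*
 * Reset a pinned context after a GPU hang: optionally scrub its image back
 * to the default state and rewind its ring registers to @head before the
 * request is rerun.
 */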
void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

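/*
 * Report whether @engine is using the execlists submission backend, judged
 * by its set_default_submission callback.
 */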
bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif