1 /* $NetBSD: intel_ring_submission.c,v 1.3 2021/12/19 11:49:11 riastradh Exp $ */
2
3 /*
4 * Copyright © 2008-2010 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 *
25 * Authors:
26 * Eric Anholt <eric@anholt.net>
27 * Zou Nan hai <nanhai.zou@intel.com>
28 * Xiang Hai hao<haihao.xiang@intel.com>
29 *
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: intel_ring_submission.c,v 1.3 2021/12/19 11:49:11 riastradh Exp $");
34
35 #include <linux/log2.h>
36
37 #include <drm/i915_drm.h>
38
39 #include "gem/i915_gem_context.h"
40
41 #include "gen6_ppgtt.h"
42 #include "i915_drv.h"
43 #include "i915_trace.h"
44 #include "intel_context.h"
45 #include "intel_gt.h"
46 #include "intel_gt_irq.h"
47 #include "intel_gt_pm_irq.h"
48 #include "intel_reset.h"
49 #include "intel_ring.h"
50 #include "intel_workarounds.h"
51
52 /* Rough estimate of the typical request size, performing a flush,
53 * set-context and then emitting the batch.
54 */
55 #define LEGACY_REQUEST_SIZE 200
56
57 static int
58 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
59 {
60 unsigned int num_store_dw;
61 u32 cmd, *cs;
62
63 cmd = MI_FLUSH;
64 num_store_dw = 0;
65 if (mode & EMIT_INVALIDATE)
66 cmd |= MI_READ_FLUSH;
67 if (mode & EMIT_FLUSH)
68 num_store_dw = 4;
69
70 cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
71 if (IS_ERR(cs))
72 return PTR_ERR(cs);
73
74 *cs++ = cmd;
75 while (num_store_dw--) {
76 *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
77 *cs++ = intel_gt_scratch_offset(rq->engine->gt,
78 INTEL_GT_SCRATCH_FIELD_DEFAULT);
79 *cs++ = 0;
80 }
81 *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
82
83 intel_ring_advance(rq, cs);
84
85 return 0;
86 }
87
88 static int
89 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
90 {
91 u32 cmd, *cs;
92 int i;
93
94 /*
95 * read/write caches:
96 *
97 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
98 * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is
99 * also flushed at 2d versus 3d pipeline switches.
100 *
101 * read-only caches:
102 *
103 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
104 * MI_READ_FLUSH is set, and is always flushed on 965.
105 *
106 * I915_GEM_DOMAIN_COMMAND may not exist?
107 *
108 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
109 * invalidated when MI_EXE_FLUSH is set.
110 *
111 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
112 * invalidated with every MI_FLUSH.
113 *
114 * TLBs:
115 *
116 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
117 * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
118 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
119 * are flushed at any MI_FLUSH.
120 */
121
122 cmd = MI_FLUSH;
123 if (mode & EMIT_INVALIDATE) {
124 cmd |= MI_EXE_FLUSH;
125 if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
126 cmd |= MI_INVALIDATE_ISP;
127 }
128
129 i = 2;
130 if (mode & EMIT_INVALIDATE)
131 i += 20;
132
133 cs = intel_ring_begin(rq, i);
134 if (IS_ERR(cs))
135 return PTR_ERR(cs);
136
137 *cs++ = cmd;
138
139 /*
140 * A random delay to let the CS invalidate take effect? Without this
141 * delay, the GPU relocation path fails as the CS does not see
142 * the updated contents. Just as important, if we apply the flushes
143 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
144 * write and before the invalidate on the next batch), the relocations
145 * still fail. This implies that there is a delay following invalidation
146 * that is required to reset the caches as opposed to a delay to
147 * ensure the memory is written.
148 */
149 if (mode & EMIT_INVALIDATE) {
150 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
151 *cs++ = intel_gt_scratch_offset(rq->engine->gt,
152 INTEL_GT_SCRATCH_FIELD_DEFAULT) |
153 PIPE_CONTROL_GLOBAL_GTT;
154 *cs++ = 0;
155 *cs++ = 0;
156
157 for (i = 0; i < 12; i++)
158 *cs++ = MI_FLUSH;
159
160 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
161 *cs++ = intel_gt_scratch_offset(rq->engine->gt,
162 INTEL_GT_SCRATCH_FIELD_DEFAULT) |
163 PIPE_CONTROL_GLOBAL_GTT;
164 *cs++ = 0;
165 *cs++ = 0;
166 }
167
168 *cs++ = cmd;
169
170 intel_ring_advance(rq, cs);
171
172 return 0;
173 }
174
175 /*
176 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
177 * implementing two workarounds on gen6. From section 1.4.7.1
178 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
179 *
180 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
181 * produced by non-pipelined state commands), software needs to first
182 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
183 * 0.
184 *
185 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
186 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
187 *
188 * And the workaround for these two requires this workaround first:
189 *
190 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
191 * BEFORE the pipe-control with a post-sync op and no write-cache
192 * flushes.
193 *
194 * And this last workaround is tricky because of the requirements on
195 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
196 * volume 2 part 1:
197 *
198 * "1 of the following must also be set:
199 * - Render Target Cache Flush Enable ([12] of DW1)
200 * - Depth Cache Flush Enable ([0] of DW1)
201 * - Stall at Pixel Scoreboard ([1] of DW1)
202 * - Depth Stall ([13] of DW1)
203 * - Post-Sync Operation ([13] of DW1)
204 * - Notify Enable ([8] of DW1)"
205 *
206 * The cache flushes require the workaround flush that triggered this
207 * one, so we can't use it. Depth stall would trigger the same.
208 * Post-sync nonzero is what triggered this second workaround, so we
209 * can't use that one either. Notify enable is IRQs, which aren't
210 * really our business. That leaves only stall at scoreboard.
211 */
212 static int
213 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
214 {
215 u32 scratch_addr =
216 intel_gt_scratch_offset(rq->engine->gt,
217 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
218 u32 *cs;
219
220 cs = intel_ring_begin(rq, 6);
221 if (IS_ERR(cs))
222 return PTR_ERR(cs);
223
224 *cs++ = GFX_OP_PIPE_CONTROL(5);
225 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
226 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
227 *cs++ = 0; /* low dword */
228 *cs++ = 0; /* high dword */
229 *cs++ = MI_NOOP;
230 intel_ring_advance(rq, cs);
231
232 cs = intel_ring_begin(rq, 6);
233 if (IS_ERR(cs))
234 return PTR_ERR(cs);
235
236 *cs++ = GFX_OP_PIPE_CONTROL(5);
237 *cs++ = PIPE_CONTROL_QW_WRITE;
238 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
239 *cs++ = 0;
240 *cs++ = 0;
241 *cs++ = MI_NOOP;
242 intel_ring_advance(rq, cs);
243
244 return 0;
245 }
246
247 static int
248 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
249 {
250 u32 scratch_addr =
251 intel_gt_scratch_offset(rq->engine->gt,
252 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
253 u32 *cs, flags = 0;
254 int ret;
255
256 /* Force SNB workarounds for PIPE_CONTROL flushes */
257 ret = gen6_emit_post_sync_nonzero_flush(rq);
258 if (ret)
259 return ret;
260
261 /* Just flush everything. Experiments have shown that reducing the
262 * number of bits based on the write domains has little performance
263 * impact.
264 */
265 if (mode & EMIT_FLUSH) {
266 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
267 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
268 /*
269 * Ensure that any following seqno writes only happen
270 * when the render cache is indeed flushed.
271 */
272 flags |= PIPE_CONTROL_CS_STALL;
273 }
274 if (mode & EMIT_INVALIDATE) {
275 flags |= PIPE_CONTROL_TLB_INVALIDATE;
276 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
277 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
278 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
279 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
280 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
281 /*
282 * TLB invalidate requires a post-sync write.
283 */
284 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
285 }
286
287 cs = intel_ring_begin(rq, 4);
288 if (IS_ERR(cs))
289 return PTR_ERR(cs);
290
291 *cs++ = GFX_OP_PIPE_CONTROL(4);
292 *cs++ = flags;
293 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
294 *cs++ = 0;
295 intel_ring_advance(rq, cs);
296
297 return 0;
298 }
299
300 static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
301 {
302 /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
303 *cs++ = GFX_OP_PIPE_CONTROL(4);
304 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
305 *cs++ = 0;
306 *cs++ = 0;
307
308 *cs++ = GFX_OP_PIPE_CONTROL(4);
309 *cs++ = PIPE_CONTROL_QW_WRITE;
310 *cs++ = intel_gt_scratch_offset(rq->engine->gt,
311 INTEL_GT_SCRATCH_FIELD_DEFAULT) |
312 PIPE_CONTROL_GLOBAL_GTT;
313 *cs++ = 0;
314
315 /* Finally we can flush and with it emit the breadcrumb */
316 *cs++ = GFX_OP_PIPE_CONTROL(4);
317 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
318 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
319 PIPE_CONTROL_DC_FLUSH_ENABLE |
320 PIPE_CONTROL_QW_WRITE |
321 PIPE_CONTROL_CS_STALL);
322 *cs++ = i915_request_active_timeline(rq)->hwsp_offset |
323 PIPE_CONTROL_GLOBAL_GTT;
324 *cs++ = rq->fence.seqno;
325
326 *cs++ = MI_USER_INTERRUPT;
327 *cs++ = MI_NOOP;
328
329 rq->tail = intel_ring_offset(rq, cs);
330 assert_ring_tail_valid(rq->ring, rq->tail);
331
332 return cs;
333 }
334
335 static int
336 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
337 {
338 u32 *cs;
339
340 cs = intel_ring_begin(rq, 4);
341 if (IS_ERR(cs))
342 return PTR_ERR(cs);
343
344 *cs++ = GFX_OP_PIPE_CONTROL(4);
345 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
346 *cs++ = 0;
347 *cs++ = 0;
348 intel_ring_advance(rq, cs);
349
350 return 0;
351 }
352
353 static int
354 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
355 {
356 u32 scratch_addr =
357 intel_gt_scratch_offset(rq->engine->gt,
358 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
359 u32 *cs, flags = 0;
360
361 /*
362 * Ensure that any following seqno writes only happen when the render
363 * cache is indeed flushed.
364 *
365 * Workaround: 4th PIPE_CONTROL command (except the ones with only
366 * read-cache invalidate bits set) must have the CS_STALL bit set. We
367 * don't try to be clever and just set it unconditionally.
368 */
369 flags |= PIPE_CONTROL_CS_STALL;
370
371 /*
372 * CS_STALL suggests at least a post-sync write.
373 */
374 flags |= PIPE_CONTROL_QW_WRITE;
375 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
376
377 /* Just flush everything. Experiments have shown that reducing the
378 * number of bits based on the write domains has little performance
379 * impact.
380 */
381 if (mode & EMIT_FLUSH) {
382 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
383 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
384 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
385 flags |= PIPE_CONTROL_FLUSH_ENABLE;
386 }
387 if (mode & EMIT_INVALIDATE) {
388 flags |= PIPE_CONTROL_TLB_INVALIDATE;
389 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
390 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
391 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
392 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
393 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
394 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
395
396 /* Workaround: we must issue a pipe_control with CS-stall bit
397 * set before a pipe_control command that has the state cache
398 * invalidate bit set. */
399 gen7_render_ring_cs_stall_wa(rq);
400 }
401
402 cs = intel_ring_begin(rq, 4);
403 if (IS_ERR(cs))
404 return PTR_ERR(cs);
405
406 *cs++ = GFX_OP_PIPE_CONTROL(4);
407 *cs++ = flags;
408 *cs++ = scratch_addr;
409 *cs++ = 0;
410 intel_ring_advance(rq, cs);
411
412 return 0;
413 }
414
415 static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
416 {
417 *cs++ = GFX_OP_PIPE_CONTROL(4);
418 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
419 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
420 PIPE_CONTROL_DC_FLUSH_ENABLE |
421 PIPE_CONTROL_FLUSH_ENABLE |
422 PIPE_CONTROL_QW_WRITE |
423 PIPE_CONTROL_GLOBAL_GTT_IVB |
424 PIPE_CONTROL_CS_STALL);
425 *cs++ = i915_request_active_timeline(rq)->hwsp_offset;
426 *cs++ = rq->fence.seqno;
427
428 *cs++ = MI_USER_INTERRUPT;
429 *cs++ = MI_NOOP;
430
431 rq->tail = intel_ring_offset(rq, cs);
432 assert_ring_tail_valid(rq->ring, rq->tail);
433
434 return cs;
435 }
436
437 static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
438 {
439 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
440 GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
441
442 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
443 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
444 *cs++ = rq->fence.seqno;
445
446 *cs++ = MI_USER_INTERRUPT;
447
448 rq->tail = intel_ring_offset(rq, cs);
449 assert_ring_tail_valid(rq->ring, rq->tail);
450
451 return cs;
452 }
453
454 #define GEN7_XCS_WA 32
455 static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
456 {
457 int i;
458
459 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
460 GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
461
462 *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
463 MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
464 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
465 *cs++ = rq->fence.seqno;
466
467 for (i = 0; i < GEN7_XCS_WA; i++) {
468 *cs++ = MI_STORE_DWORD_INDEX;
469 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
470 *cs++ = rq->fence.seqno;
471 }
472
473 *cs++ = MI_FLUSH_DW;
474 *cs++ = 0;
475 *cs++ = 0;
476
477 *cs++ = MI_USER_INTERRUPT;
478 *cs++ = MI_NOOP;
479
480 rq->tail = intel_ring_offset(rq, cs);
481 assert_ring_tail_valid(rq->ring, rq->tail);
482
483 return cs;
484 }
485 #undef GEN7_XCS_WA
486
487 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
488 {
489 /*
490 * Keep the render interrupt unmasked as this papers over
491 * lost interrupts following a reset.
492 */
493 if (engine->class == RENDER_CLASS) {
494 if (INTEL_GEN(engine->i915) >= 6)
495 mask &= ~BIT(0);
496 else
497 mask &= ~I915_USER_INTERRUPT;
498 }
499
500 intel_engine_set_hwsp_writemask(engine, mask);
501 }
502
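/*
 * Program the physical address of the hardware status page into HWS_PGA;
 * on gen4+ address bits 35:32 are packed into bits 7:4 of the register.
 */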
503 static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
504 {
505 u32 addr;
506
507 addr = lower_32_bits(phys);
508 if (INTEL_GEN(engine->i915) >= 4)
509 addr |= (phys >> 28) & 0xf0;
510
511 intel_uncore_write(engine->uncore, HWS_PGA, addr);
512 }
513
514 #ifdef __NetBSD__
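/*
 * NetBSD: the physically-addressed status page is programmed with the DMA
 * address of the first bus_dma segment backing the pinned status page object.
 */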
515 static void
516 ring_setup_phys_status_page(struct intel_engine_cs *engine)
517 {
518 struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
519 bus_addr_t addr;
520
521 GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
522 addr = obj->mm.pages->sgl[0].sg_dmamap->dm_segs[0].ds_addr;
523 set_hws_pga(engine, addr);
524 set_hwstam(engine, ~0u);
525 }
526 #else
527 static struct page *status_page(struct intel_engine_cs *engine)
528 {
529 struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
530
531 GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
532 return sg_page(obj->mm.pages->sgl);
533 }
534
535 static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
536 {
537 set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
538 set_hwstam(engine, ~0u);
539 }
540 #endif
541
542 static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
543 {
544 i915_reg_t hwsp;
545
546 /*
547 * The ring status page addresses are no longer next to the rest of
548 * the ring registers as of gen7.
549 */
550 if (IS_GEN(engine->i915, 7)) {
551 switch (engine->id) {
552 /*
553 * No more rings exist on Gen7. Default case is only to shut up
554 * gcc switch check warning.
555 */
556 default:
557 GEM_BUG_ON(engine->id);
558 /* fallthrough */
559 case RCS0:
560 hwsp = RENDER_HWS_PGA_GEN7;
561 break;
562 case BCS0:
563 hwsp = BLT_HWS_PGA_GEN7;
564 break;
565 case VCS0:
566 hwsp = BSD_HWS_PGA_GEN7;
567 break;
568 case VECS0:
569 hwsp = VEBOX_HWS_PGA_GEN7;
570 break;
571 }
572 } else if (IS_GEN(engine->i915, 6)) {
573 hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
574 } else {
575 hwsp = RING_HWS_PGA(engine->mmio_base);
576 }
577
578 intel_uncore_write(engine->uncore, hwsp, offset);
579 intel_uncore_posting_read(engine->uncore, hwsp);
580 }
581
582 static void flush_cs_tlb(struct intel_engine_cs *engine)
583 {
584 struct drm_i915_private *dev_priv = engine->i915;
585
586 if (!IS_GEN_RANGE(dev_priv, 6, 7))
587 return;
588
589 /* ring should be idle before issuing a sync flush */
590 WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
591
592 ENGINE_WRITE(engine, RING_INSTPM,
593 _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
594 INSTPM_SYNC_FLUSH));
595 if (intel_wait_for_register(engine->uncore,
596 RING_INSTPM(engine->mmio_base),
597 INSTPM_SYNC_FLUSH, 0,
598 1000))
599 DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
600 engine->name);
601 }
602
603 static void ring_setup_status_page(struct intel_engine_cs *engine)
604 {
605 set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
606 set_hwstam(engine, ~0u);
607
608 flush_cs_tlb(engine);
609 }
610
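/*
 * Park the ring: on gen3+ request STOP_RING and wait for MODE_IDLE, then
 * clear HEAD, TAIL and CTL. Returns true if the ring ended up empty at
 * head zero.
 */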
611 static bool stop_ring(struct intel_engine_cs *engine)
612 {
613 struct drm_i915_private *dev_priv = engine->i915;
614
615 if (INTEL_GEN(dev_priv) > 2) {
616 ENGINE_WRITE(engine,
617 RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
618 if (intel_wait_for_register(engine->uncore,
619 RING_MI_MODE(engine->mmio_base),
620 MODE_IDLE,
621 MODE_IDLE,
622 1000)) {
623 DRM_ERROR("%s : timed out trying to stop ring\n",
624 engine->name);
625
626 /*
627 * Sometimes we observe that the idle flag is not
628 * set even though the ring is empty. So double
629 * check before giving up.
630 */
631 if (ENGINE_READ(engine, RING_HEAD) !=
632 ENGINE_READ(engine, RING_TAIL))
633 return false;
634 }
635 }
636
637 ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
638
639 ENGINE_WRITE(engine, RING_HEAD, 0);
640 ENGINE_WRITE(engine, RING_TAIL, 0);
641
642 /* The ring must be empty before it is disabled */
643 ENGINE_WRITE(engine, RING_CTL, 0);
644
645 return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
646 }
647
648 static int xcs_resume(struct intel_engine_cs *engine)
649 {
650 struct drm_i915_private *dev_priv = engine->i915;
651 struct intel_ring *ring = engine->legacy.ring;
652 int ret = 0;
653
654 ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
655 ring->head, ring->tail);
656
657 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
658
659 /* WaClearRingBufHeadRegAtInit:ctg,elk */
660 if (!stop_ring(engine)) {
661 /* G45 ring initialization often fails to reset head to zero */
662 DRM_DEBUG_DRIVER("%s head not reset to zero "
663 "ctl %08x head %08x tail %08x start %08x\n",
664 engine->name,
665 ENGINE_READ(engine, RING_CTL),
666 ENGINE_READ(engine, RING_HEAD),
667 ENGINE_READ(engine, RING_TAIL),
668 ENGINE_READ(engine, RING_START));
669
670 if (!stop_ring(engine)) {
671 DRM_ERROR("failed to set %s head to zero "
672 "ctl %08x head %08x tail %08x start %08x\n",
673 engine->name,
674 ENGINE_READ(engine, RING_CTL),
675 ENGINE_READ(engine, RING_HEAD),
676 ENGINE_READ(engine, RING_TAIL),
677 ENGINE_READ(engine, RING_START));
678 ret = -EIO;
679 goto out;
680 }
681 }
682
683 if (HWS_NEEDS_PHYSICAL(dev_priv))
684 ring_setup_phys_status_page(engine);
685 else
686 ring_setup_status_page(engine);
687
688 intel_engine_reset_breadcrumbs(engine);
689
690 /* Enforce ordering by reading HEAD register back */
691 ENGINE_POSTING_READ(engine, RING_HEAD);
692
693 /*
694 * Initialize the ring. This must happen _after_ we've cleared the ring
695 * registers with the above sequence (the readback of the HEAD registers
696 * also enforces ordering), otherwise the hw might lose the new ring
697 * register values.
698 */
699 ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
700
701 /* Check that the ring offsets point within the ring! */
702 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
703 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
704 intel_ring_update_space(ring);
705
706 /* First wake the ring up to an empty/idle ring */
707 ENGINE_WRITE(engine, RING_HEAD, ring->head);
708 ENGINE_WRITE(engine, RING_TAIL, ring->head);
709 ENGINE_POSTING_READ(engine, RING_TAIL);
710
711 ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
712
713 /* If the head is still not zero, the ring is dead */
714 if (intel_wait_for_register(engine->uncore,
715 RING_CTL(engine->mmio_base),
716 RING_VALID, RING_VALID,
717 50)) {
718 DRM_ERROR("%s initialization failed "
719 "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
720 engine->name,
721 ENGINE_READ(engine, RING_CTL),
722 ENGINE_READ(engine, RING_CTL) & RING_VALID,
723 ENGINE_READ(engine, RING_HEAD), ring->head,
724 ENGINE_READ(engine, RING_TAIL), ring->tail,
725 ENGINE_READ(engine, RING_START),
726 i915_ggtt_offset(ring->vma));
727 ret = -EIO;
728 goto out;
729 }
730
731 if (INTEL_GEN(dev_priv) > 2)
732 ENGINE_WRITE(engine,
733 RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
734
735 /* Now awake, let it get started */
736 if (ring->tail != ring->head) {
737 ENGINE_WRITE(engine, RING_TAIL, ring->tail);
738 ENGINE_POSTING_READ(engine, RING_TAIL);
739 }
740
741 /* Papering over lost _interrupts_ immediately following the restart */
742 intel_engine_signal_breadcrumbs(engine);
743 out:
744 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
745
746 return ret;
747 }
748
749 static void reset_prepare(struct intel_engine_cs *engine)
750 {
751 struct intel_uncore *uncore = engine->uncore;
752 const u32 base = engine->mmio_base;
753
754 /*
755 * We stop engines, otherwise we might get failed reset and a
756 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
757 * from system hang if batchbuffer is progressing when
758 * the reset is issued, regardless of READY_TO_RESET ack.
759 * Thus assume it is best to stop engines on all gens
760 * where we have a gpu reset.
761 *
762 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
763 *
764 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
765 *
766 * FIXME: Wa for more modern gens needs to be validated
767 */
768 ENGINE_TRACE(engine, "\n");
769
770 if (intel_engine_stop_cs(engine))
771 ENGINE_TRACE(engine, "timed out on STOP_RING\n");
772
773 intel_uncore_write_fw(uncore,
774 RING_HEAD(base),
775 intel_uncore_read_fw(uncore, RING_TAIL(base)));
776 intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
777
778 intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
779 intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
780 intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
781
782 /* The ring must be empty before it is disabled */
783 intel_uncore_write_fw(uncore, RING_CTL(base), 0);
784
785 /* Check acts as a post */
786 if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
787 ENGINE_TRACE(engine, "ring head [%x] not parked\n",
788 intel_uncore_read_fw(uncore, RING_HEAD(base)));
789 }
790
791 static void reset_rewind(struct intel_engine_cs *engine, bool stalled)
792 {
793 struct i915_request *pos, *rq;
794 unsigned long flags;
795 u32 head;
796
797 rq = NULL;
798 spin_lock_irqsave(&engine->active.lock, flags);
799 list_for_each_entry(pos, &engine->active.requests, sched.link) {
800 if (!i915_request_completed(pos)) {
801 rq = pos;
802 break;
803 }
804 }
805
806 /*
807 * The guilty request will get skipped on a hung engine.
808 *
809 * Users of client default contexts do not rely on logical
810 * state preserved between batches so it is safe to execute
811 * queued requests following the hang. Non default contexts
812 * rely on preserved state, so skipping a batch loses the
813 * evolution of the state and it needs to be considered corrupted.
814 * Executing more queued batches on top of corrupted state is
815 * risky. But we take the risk by trying to advance through
816 * the queued requests in order to make the client behaviour
817 * more predictable around resets, by not throwing away a random
818 * number of batches it has prepared for execution. Sophisticated
819 * clients can use gem_reset_stats_ioctl and dma fence status
820 * (exported via sync_file info ioctl on explicit fences) to observe
821 * when it loses the context state and should rebuild accordingly.
822 *
823 * The context ban, and ultimately the client ban, mechanism are safety
824 * valves if client submission ends up resulting in nothing more than
825 * subsequent hangs.
826 */
827
828 if (rq) {
829 /*
830 * Try to restore the logical GPU state to match the
831 * continuation of the request queue. If we skip the
832 * context/PD restore, then the next request may try to execute
833 * assuming that its context is valid and loaded on the GPU and
834 * so may try to access invalid memory, prompting repeated GPU
835 * hangs.
836 *
837 * If the request was guilty, we still restore the logical
838 * state in case the next request requires it (e.g. the
839 * aliasing ppgtt), but skip over the hung batch.
840 *
841 * If the request was innocent, we try to replay the request
842 * with the restored context.
843 */
844 __i915_request_reset(rq, stalled);
845
846 GEM_BUG_ON(rq->ring != engine->legacy.ring);
847 head = rq->head;
848 } else {
849 head = engine->legacy.ring->tail;
850 }
851 engine->legacy.ring->head = intel_ring_wrap(engine->legacy.ring, head);
852
853 spin_unlock_irqrestore(&engine->active.lock, flags);
854 }
855
856 static void reset_finish(struct intel_engine_cs *engine)
857 {
858 }
859
860 static int rcs_resume(struct intel_engine_cs *engine)
861 {
862 struct drm_i915_private *i915 = engine->i915;
863 struct intel_uncore *uncore = engine->uncore;
864
865 /*
866 * Disable CONSTANT_BUFFER before it is loaded from the context
867 * image. As soon as it is loaded, it is executed and the stored
868 * address may no longer be valid, leading to a GPU hang.
869 *
870 * This imposes the requirement that userspace reload their
871 * CONSTANT_BUFFER on every batch, fortunately a requirement
872 * they are already accustomed to from before contexts were
873 * enabled.
874 */
875 if (IS_GEN(i915, 4))
876 intel_uncore_write(uncore, ECOSKPD,
877 _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE));
878
879 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
880 if (IS_GEN_RANGE(i915, 4, 6))
881 intel_uncore_write(uncore, MI_MODE,
882 _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
883
884 /* We need to disable the AsyncFlip performance optimisations in order
885 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
886 * programmed to '1' on all products.
887 *
888 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
889 */
890 if (IS_GEN_RANGE(i915, 6, 7))
891 intel_uncore_write(uncore, MI_MODE,
892 _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
893
894 /* Required for the hardware to program scanline values for waiting */
895 /* WaEnableFlushTlbInvalidationMode:snb */
896 if (IS_GEN(i915, 6))
897 intel_uncore_write(uncore, GFX_MODE,
898 _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
899
900 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
901 if (IS_GEN(i915, 7))
902 intel_uncore_write(uncore, GFX_MODE_GEN7,
903 _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
904 _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
905
906 if (IS_GEN(i915, 6)) {
907 /* From the Sandybridge PRM, volume 1 part 3, page 24:
908 * "If this bit is set, STCunit will have LRA as replacement
909 * policy. [...] This bit must be reset. LRA replacement
910 * policy is not supported."
911 */
912 intel_uncore_write(uncore, CACHE_MODE_0,
913 _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
914 }
915
916 if (IS_GEN_RANGE(i915, 6, 7))
917 intel_uncore_write(uncore, INSTPM,
918 _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
919
920 return xcs_resume(engine);
921 }
922
923 static void reset_cancel(struct intel_engine_cs *engine)
924 {
925 struct i915_request *request;
926 unsigned long flags;
927
928 spin_lock_irqsave(&engine->active.lock, flags);
929
930 /* Mark all submitted requests as skipped. */
931 list_for_each_entry(request, &engine->active.requests, sched.link) {
932 if (!i915_request_signaled(request))
933 dma_fence_set_error(&request->fence, -EIO);
934
935 i915_request_mark_complete(request);
936 }
937
938 /* Remaining _unready_ requests will be nop'ed when submitted */
939
940 spin_unlock_irqrestore(&engine->active.lock, flags);
941 }
942
943 static void i9xx_submit_request(struct i915_request *request)
944 {
945 i915_request_submit(request);
946 wmb(); /* paranoid flush writes out of the WCB before mmio */
947
948 ENGINE_WRITE(request->engine, RING_TAIL,
949 intel_ring_set_tail(request->ring, request->tail));
950 }
951
952 static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
953 {
954 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
955 GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
956
957 *cs++ = MI_FLUSH;
958
959 *cs++ = MI_STORE_DWORD_INDEX;
960 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
961 *cs++ = rq->fence.seqno;
962
963 *cs++ = MI_USER_INTERRUPT;
964 *cs++ = MI_NOOP;
965
966 rq->tail = intel_ring_offset(rq, cs);
967 assert_ring_tail_valid(rq->ring, rq->tail);
968
969 return cs;
970 }
971
972 #define GEN5_WA_STORES 8 /* must be at least 1! */
973 static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
974 {
975 int i;
976
977 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
978 GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
979
980 *cs++ = MI_FLUSH;
981
982 BUILD_BUG_ON(GEN5_WA_STORES < 1);
983 for (i = 0; i < GEN5_WA_STORES; i++) {
984 *cs++ = MI_STORE_DWORD_INDEX;
985 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
986 *cs++ = rq->fence.seqno;
987 }
988
989 *cs++ = MI_USER_INTERRUPT;
990
991 rq->tail = intel_ring_offset(rq, cs);
992 assert_ring_tail_valid(rq->ring, rq->tail);
993
994 return cs;
995 }
996 #undef GEN5_WA_STORES
997
998 static void
999 gen5_irq_enable(struct intel_engine_cs *engine)
1000 {
1001 gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
1002 }
1003
1004 static void
1005 gen5_irq_disable(struct intel_engine_cs *engine)
1006 {
1007 gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
1008 }
1009
1010 static void
1011 i9xx_irq_enable(struct intel_engine_cs *engine)
1012 {
1013 engine->i915->irq_mask &= ~engine->irq_enable_mask;
1014 intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
1015 intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
1016 }
1017
1018 static void
1019 i9xx_irq_disable(struct intel_engine_cs *engine)
1020 {
1021 engine->i915->irq_mask |= engine->irq_enable_mask;
1022 intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
1023 }
1024
1025 static void
1026 i8xx_irq_enable(struct intel_engine_cs *engine)
1027 {
1028 struct drm_i915_private *i915 = engine->i915;
1029
1030 i915->irq_mask &= ~engine->irq_enable_mask;
1031 intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
1032 ENGINE_POSTING_READ16(engine, RING_IMR);
1033 }
1034
1035 static void
1036 i8xx_irq_disable(struct intel_engine_cs *engine)
1037 {
1038 struct drm_i915_private *i915 = engine->i915;
1039
1040 i915->irq_mask |= engine->irq_enable_mask;
1041 intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
1042 }
1043
1044 static int
1045 bsd_ring_flush(struct i915_request *rq, u32 mode)
1046 {
1047 u32 *cs;
1048
1049 cs = intel_ring_begin(rq, 2);
1050 if (IS_ERR(cs))
1051 return PTR_ERR(cs);
1052
1053 *cs++ = MI_FLUSH;
1054 *cs++ = MI_NOOP;
1055 intel_ring_advance(rq, cs);
1056 return 0;
1057 }
1058
1059 static void
1060 gen6_irq_enable(struct intel_engine_cs *engine)
1061 {
1062 ENGINE_WRITE(engine, RING_IMR,
1063 ~(engine->irq_enable_mask | engine->irq_keep_mask));
1064
1065 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1066 ENGINE_POSTING_READ(engine, RING_IMR);
1067
1068 gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
1069 }
1070
1071 static void
1072 gen6_irq_disable(struct intel_engine_cs *engine)
1073 {
1074 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
1075 gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
1076 }
1077
1078 static void
1079 hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1080 {
1081 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
1082
1083 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1084 ENGINE_POSTING_READ(engine, RING_IMR);
1085
1086 gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
1087 }
1088
1089 static void
1090 hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1091 {
1092 ENGINE_WRITE(engine, RING_IMR, ~0);
1093 gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
1094 }
1095
1096 static int
1097 i965_emit_bb_start(struct i915_request *rq,
1098 u64 offset, u32 length,
1099 unsigned int dispatch_flags)
1100 {
1101 u32 *cs;
1102
1103 cs = intel_ring_begin(rq, 2);
1104 if (IS_ERR(cs))
1105 return PTR_ERR(cs);
1106
1107 *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1108 I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1109 *cs++ = offset;
1110 intel_ring_advance(rq, cs);
1111
1112 return 0;
1113 }
1114
1115 /* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
1116 #define I830_BATCH_LIMIT SZ_256K
1117 #define I830_TLB_ENTRIES (2)
1118 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
1119 static int
1120 i830_emit_bb_start(struct i915_request *rq,
1121 u64 offset, u32 len,
1122 unsigned int dispatch_flags)
1123 {
1124 u32 *cs, cs_offset =
1125 intel_gt_scratch_offset(rq->engine->gt,
1126 INTEL_GT_SCRATCH_FIELD_DEFAULT);
1127
1128 GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);
1129
1130 cs = intel_ring_begin(rq, 6);
1131 if (IS_ERR(cs))
1132 return PTR_ERR(cs);
1133
1134 /* Evict the invalid PTE TLBs */
1135 *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1136 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1137 *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1138 *cs++ = cs_offset;
1139 *cs++ = 0xdeadbeef;
1140 *cs++ = MI_NOOP;
1141 intel_ring_advance(rq, cs);
1142
1143 if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1144 if (len > I830_BATCH_LIMIT)
1145 return -ENOSPC;
1146
1147 cs = intel_ring_begin(rq, 6 + 2);
1148 if (IS_ERR(cs))
1149 return PTR_ERR(cs);
1150
1151 /* Blit the batch (which now has all relocs applied) to the
1152 * stable batch scratch bo area (so that the CS never
1153 * stumbles over its tlb invalidation bug) ...
1154 */
1155 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
1156 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1157 *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1158 *cs++ = cs_offset;
1159 *cs++ = 4096;
1160 *cs++ = offset;
1161
1162 *cs++ = MI_FLUSH;
1163 *cs++ = MI_NOOP;
1164 intel_ring_advance(rq, cs);
1165
1166 /* ... and execute it. */
1167 offset = cs_offset;
1168 }
1169
1170 cs = intel_ring_begin(rq, 2);
1171 if (IS_ERR(cs))
1172 return PTR_ERR(cs);
1173
1174 *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1175 *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1176 MI_BATCH_NON_SECURE);
1177 intel_ring_advance(rq, cs);
1178
1179 return 0;
1180 }
1181
1182 static int
1183 i915_emit_bb_start(struct i915_request *rq,
1184 u64 offset, u32 len,
1185 unsigned int dispatch_flags)
1186 {
1187 u32 *cs;
1188
1189 cs = intel_ring_begin(rq, 2);
1190 if (IS_ERR(cs))
1191 return PTR_ERR(cs);
1192
1193 *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1194 *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1195 MI_BATCH_NON_SECURE);
1196 intel_ring_advance(rq, cs);
1197
1198 return 0;
1199 }
1200
1201 static void __ring_context_fini(struct intel_context *ce)
1202 {
1203 i915_vma_put(ce->state);
1204 }
1205
1206 static void ring_context_destroy(struct kref *ref)
1207 {
1208 struct intel_context *ce = container_of(ref, typeof(*ce), ref);
1209
1210 GEM_BUG_ON(intel_context_is_pinned(ce));
1211
1212 if (ce->state)
1213 __ring_context_fini(ce);
1214
1215 intel_context_fini(ce);
1216 intel_context_free(ce);
1217 }
1218
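/*
 * Contexts bound to the global GTT actually run on top of the aliasing
 * ppgtt, so hand back that address space; full-ppgtt contexts are
 * returned unchanged.
 */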
1219 static struct i915_address_space *vm_alias(struct intel_context *ce)
1220 {
1221 struct i915_address_space *vm;
1222
1223 vm = ce->vm;
1224 if (i915_is_ggtt(vm))
1225 vm = &i915_vm_to_ggtt(vm)->alias->vm;
1226
1227 return vm;
1228 }
1229
1230 static int __context_pin_ppgtt(struct intel_context *ce)
1231 {
1232 struct i915_address_space *vm;
1233 int err = 0;
1234
1235 vm = vm_alias(ce);
1236 if (vm)
1237 err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)));
1238
1239 return err;
1240 }
1241
1242 static void __context_unpin_ppgtt(struct intel_context *ce)
1243 {
1244 struct i915_address_space *vm;
1245
1246 vm = vm_alias(ce);
1247 if (vm)
1248 gen6_ppgtt_unpin(i915_vm_to_ppgtt(vm));
1249 }
1250
1251 static void ring_context_unpin(struct intel_context *ce)
1252 {
1253 __context_unpin_ppgtt(ce);
1254 }
1255
1256 static struct i915_vma *
1257 alloc_context_vma(struct intel_engine_cs *engine)
1258 {
1259 struct drm_i915_private *i915 = engine->i915;
1260 struct drm_i915_gem_object *obj;
1261 struct i915_vma *vma;
1262 int err;
1263
1264 obj = i915_gem_object_create_shmem(i915, engine->context_size);
1265 if (IS_ERR(obj))
1266 return ERR_CAST(obj);
1267
1268 /*
1269 * Try to make the context utilize L3 as well as LLC.
1270 *
1271 * On VLV we don't have L3 controls in the PTEs so we
1272 * shouldn't touch the cache level, especially as that
1273 * would make the object snooped which might have a
1274 * negative performance impact.
1275 *
1276 * Snooping is required on non-llc platforms in execlist
1277 * mode, but since all GGTT accesses use PAT entry 0 we
1278 * get snooping anyway regardless of cache_level.
1279 *
1280 * This is only applicable for Ivy Bridge devices since
1281 * later platforms don't have L3 control bits in the PTE.
1282 */
1283 if (IS_IVYBRIDGE(i915))
1284 i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
1285
1286 if (engine->default_state) {
1287 void *defaults, *vaddr;
1288
1289 vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1290 if (IS_ERR(vaddr)) {
1291 err = PTR_ERR(vaddr);
1292 goto err_obj;
1293 }
1294
1295 defaults = i915_gem_object_pin_map(engine->default_state,
1296 I915_MAP_WB);
1297 if (IS_ERR(defaults)) {
1298 err = PTR_ERR(defaults);
1299 goto err_map;
1300 }
1301
1302 memcpy(vaddr, defaults, engine->context_size);
1303 i915_gem_object_unpin_map(engine->default_state);
1304
1305 i915_gem_object_flush_map(obj);
1306 i915_gem_object_unpin_map(obj);
1307 }
1308
1309 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1310 if (IS_ERR(vma)) {
1311 err = PTR_ERR(vma);
1312 goto err_obj;
1313 }
1314
1315 return vma;
1316
1317 err_map:
1318 i915_gem_object_unpin_map(obj);
1319 err_obj:
1320 i915_gem_object_put(obj);
1321 return ERR_PTR(err);
1322 }
1323
1324 static int ring_context_alloc(struct intel_context *ce)
1325 {
1326 struct intel_engine_cs *engine = ce->engine;
1327
1328 /* One ringbuffer to rule them all */
1329 GEM_BUG_ON(!engine->legacy.ring);
1330 ce->ring = engine->legacy.ring;
1331 ce->timeline = intel_timeline_get(engine->legacy.timeline);
1332
1333 GEM_BUG_ON(ce->state);
1334 if (engine->context_size) {
1335 struct i915_vma *vma;
1336
1337 vma = alloc_context_vma(engine);
1338 if (IS_ERR(vma))
1339 return PTR_ERR(vma);
1340
1341 ce->state = vma;
1342 if (engine->default_state)
1343 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1344 }
1345
1346 return 0;
1347 }
1348
1349 static int ring_context_pin(struct intel_context *ce)
1350 {
1351 return __context_pin_ppgtt(ce);
1352 }
1353
1354 static void ring_context_reset(struct intel_context *ce)
1355 {
1356 intel_ring_reset(ce->ring, ce->ring->emit);
1357 }
1358
1359 static const struct intel_context_ops ring_context_ops = {
1360 .alloc = ring_context_alloc,
1361
1362 .pin = ring_context_pin,
1363 .unpin = ring_context_unpin,
1364
1365 .enter = intel_context_enter_engine,
1366 .exit = intel_context_exit_engine,
1367
1368 .reset = ring_context_reset,
1369 .destroy = ring_context_destroy,
1370 };
1371
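/*
 * Load the ring's PP_DIR_DCLV/PP_DIR_BASE registers with the ppgtt page
 * directory, read PP_DIR_BASE back into scratch to post the write, then
 * request a TLB invalidate and flush.
 */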
1372 static int load_pd_dir(struct i915_request *rq,
1373 const struct i915_ppgtt *ppgtt,
1374 u32 valid)
1375 {
1376 const struct intel_engine_cs * const engine = rq->engine;
1377 u32 *cs;
1378
1379 cs = intel_ring_begin(rq, 12);
1380 if (IS_ERR(cs))
1381 return PTR_ERR(cs);
1382
1383 *cs++ = MI_LOAD_REGISTER_IMM(1);
1384 *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
1385 *cs++ = valid;
1386
1387 *cs++ = MI_LOAD_REGISTER_IMM(1);
1388 *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1389 *cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
1390
1391 /* Stall until the page table load is complete? */
1392 *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1393 *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1394 *cs++ = intel_gt_scratch_offset(engine->gt,
1395 INTEL_GT_SCRATCH_FIELD_DEFAULT);
1396
1397 *cs++ = MI_LOAD_REGISTER_IMM(1);
1398 *cs++ = i915_mmio_reg_offset(RING_INSTPM(engine->mmio_base));
1399 *cs++ = _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE);
1400
1401 intel_ring_advance(rq, cs);
1402
1403 return rq->engine->emit_flush(rq, EMIT_FLUSH);
1404 }
1405
1406 static inline int mi_set_context(struct i915_request *rq, u32 flags)
1407 {
1408 struct drm_i915_private *i915 = rq->i915;
1409 struct intel_engine_cs *engine = rq->engine;
1410 enum intel_engine_id id;
1411 const int num_engines =
1412 IS_HASWELL(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
1413 bool force_restore = false;
1414 int len;
1415 u32 *cs;
1416
1417 len = 4;
1418 if (IS_GEN(i915, 7))
1419 len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
1420 else if (IS_GEN(i915, 5))
1421 len += 2;
1422 if (flags & MI_FORCE_RESTORE) {
1423 GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
1424 flags &= ~MI_FORCE_RESTORE;
1425 force_restore = true;
1426 len += 2;
1427 }
1428
1429 cs = intel_ring_begin(rq, len);
1430 if (IS_ERR(cs))
1431 return PTR_ERR(cs);
1432
1433 /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
1434 if (IS_GEN(i915, 7)) {
1435 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1436 if (num_engines) {
1437 struct intel_engine_cs *signaller;
1438
1439 *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1440 for_each_engine(signaller, engine->gt, id) {
1441 if (signaller == engine)
1442 continue;
1443
1444 *cs++ = i915_mmio_reg_offset(
1445 RING_PSMI_CTL(signaller->mmio_base));
1446 *cs++ = _MASKED_BIT_ENABLE(
1447 GEN6_PSMI_SLEEP_MSG_DISABLE);
1448 }
1449 }
1450 } else if (IS_GEN(i915, 5)) {
1451 /*
1452 * This w/a is only listed for pre-production ilk a/b steppings,
1453 * but is also mentioned for programming the powerctx. To be
1454 * safe, just apply the workaround; we do not use SyncFlush so
1455 * this should never take effect and so be a no-op!
1456 */
1457 *cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN;
1458 }
1459
1460 if (force_restore) {
1461 /*
1462 * The HW doesn't handle being told to restore the current
1463 * context very well. Quite often it likes to go off and
1464 * sulk, especially when it is meant to be reloading PP_DIR.
1465 * A very simple fix to force the reload is to simply switch
1466 * away from the current context and back again.
1467 *
1468 * Note that the kernel_context will contain random state
1469 * following the INHIBIT_RESTORE. We accept this since we
1470 * never use the kernel_context state; it is merely a
1471 * placeholder we use to flush other contexts.
1472 */
1473 *cs++ = MI_SET_CONTEXT;
1474 *cs++ = i915_ggtt_offset(engine->kernel_context->state) |
1475 MI_MM_SPACE_GTT |
1476 MI_RESTORE_INHIBIT;
1477 }
1478
1479 *cs++ = MI_NOOP;
1480 *cs++ = MI_SET_CONTEXT;
1481 *cs++ = i915_ggtt_offset(rq->context->state) | flags;
1482 /*
1483 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
1484 * WaMiSetContext_Hang:snb,ivb,vlv
1485 */
1486 *cs++ = MI_NOOP;
1487
1488 if (IS_GEN(i915, 7)) {
1489 if (num_engines) {
1490 struct intel_engine_cs *signaller;
1491 i915_reg_t last_reg = {}; /* keep gcc quiet */
1492
1493 *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1494 for_each_engine(signaller, engine->gt, id) {
1495 if (signaller == engine)
1496 continue;
1497
1498 last_reg = RING_PSMI_CTL(signaller->mmio_base);
1499 *cs++ = i915_mmio_reg_offset(last_reg);
1500 *cs++ = _MASKED_BIT_DISABLE(
1501 GEN6_PSMI_SLEEP_MSG_DISABLE);
1502 }
1503
1504 /* Insert a delay before the next switch! */
1505 *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1506 *cs++ = i915_mmio_reg_offset(last_reg);
1507 *cs++ = intel_gt_scratch_offset(engine->gt,
1508 INTEL_GT_SCRATCH_FIELD_DEFAULT);
1509 *cs++ = MI_NOOP;
1510 }
1511 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1512 } else if (IS_GEN(i915, 5)) {
1513 *cs++ = MI_SUSPEND_FLUSH;
1514 }
1515
1516 intel_ring_advance(rq, cs);
1517
1518 return 0;
1519 }
1520
1521 static int remap_l3_slice(struct i915_request *rq, int slice)
1522 {
1523 u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
1524 int i;
1525
1526 if (!remap_info)
1527 return 0;
1528
1529 cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
1530 if (IS_ERR(cs))
1531 return PTR_ERR(cs);
1532
1533 /*
1534 * Note: We do not worry about the concurrent register cacheline hang
1535 * here because no other code should access these registers other than
1536 * at initialization time.
1537 */
1538 *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
1539 for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
1540 *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
1541 *cs++ = remap_info[i];
1542 }
1543 *cs++ = MI_NOOP;
1544 intel_ring_advance(rq, cs);
1545
1546 return 0;
1547 }
1548
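/* Re-emit the L3 remapping tables for every slice flagged in the context. */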
1549 static int remap_l3(struct i915_request *rq)
1550 {
1551 struct i915_gem_context *ctx = i915_request_gem_context(rq);
1552 int i, err;
1553
1554 if (!ctx || !ctx->remap_slice)
1555 return 0;
1556
1557 for (i = 0; i < MAX_L3_SLICES; i++) {
1558 if (!(ctx->remap_slice & BIT(i)))
1559 continue;
1560
1561 err = remap_l3_slice(rq, i);
1562 if (err)
1563 return err;
1564 }
1565
1566 ctx->remap_slice = 0;
1567 return 0;
1568 }
1569
1570 static int switch_mm(struct i915_request *rq, struct i915_address_space *vm)
1571 {
1572 int ret;
1573
1574 if (!vm)
1575 return 0;
1576
1577 ret = rq->engine->emit_flush(rq, EMIT_FLUSH);
1578 if (ret)
1579 return ret;
1580
1581 /*
1582 * Not only do we need a full barrier (post-sync write) after
1583 * invalidating the TLBs, but we need to wait a little bit
1584 * longer. Whether this is merely delaying us, or the
1585 * subsequent flush is a key part of serialising with the
1586 * post-sync op, this extra pass appears vital before a
1587 * mm switch!
1588 */
1589 ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G);
1590 if (ret)
1591 return ret;
1592
1593 return rq->engine->emit_flush(rq, EMIT_INVALIDATE);
1594 }
1595
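/*
 * Emit the ppgtt switch, MI_SET_CONTEXT and any pending L3 remapping for
 * this request's context before its payload runs.
 */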
1596 static int switch_context(struct i915_request *rq)
1597 {
1598 struct intel_context *ce = rq->context;
1599 int ret;
1600
1601 GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
1602
1603 ret = switch_mm(rq, vm_alias(ce));
1604 if (ret)
1605 return ret;
1606
1607 if (ce->state) {
1608 u32 flags;
1609
1610 GEM_BUG_ON(rq->engine->id != RCS0);
1611
1612 /* For resource streamer on HSW+ and power context elsewhere */
1613 BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN);
1614 BUILD_BUG_ON(HSW_MI_RS_RESTORE_STATE_EN != MI_RESTORE_EXT_STATE_EN);
1615
1616 flags = MI_SAVE_EXT_STATE_EN | MI_MM_SPACE_GTT;
1617 if (test_bit(CONTEXT_VALID_BIT, &ce->flags))
1618 flags |= MI_RESTORE_EXT_STATE_EN;
1619 else
1620 flags |= MI_RESTORE_INHIBIT;
1621
1622 ret = mi_set_context(rq, flags);
1623 if (ret)
1624 return ret;
1625 }
1626
1627 ret = remap_l3(rq);
1628 if (ret)
1629 return ret;
1630
1631 return 0;
1632 }
1633
1634 static int ring_request_alloc(struct i915_request *request)
1635 {
1636 int ret;
1637
1638 GEM_BUG_ON(!intel_context_is_pinned(request->context));
1639 GEM_BUG_ON(i915_request_timeline(request)->has_initial_breadcrumb);
1640
1641 /*
1642 * Flush enough space to reduce the likelihood of waiting after
1643 * we start building the request - in which case we will just
1644 * have to repeat work.
1645 */
1646 request->reserved_space += LEGACY_REQUEST_SIZE;
1647
1648 /* Unconditionally invalidate GPU caches and TLBs. */
1649 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1650 if (ret)
1651 return ret;
1652
1653 ret = switch_context(request);
1654 if (ret)
1655 return ret;
1656
1657 request->reserved_space -= LEGACY_REQUEST_SIZE;
1658 return 0;
1659 }
1660
1661 static void gen6_bsd_submit_request(struct i915_request *request)
1662 {
1663 struct intel_uncore *uncore = request->engine->uncore;
1664
1665 intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
1666
1667 /* Every tail move must follow the sequence below */
1668
1669 /* Disable notification that the ring is IDLE. The GT
1670 * will then assume that it is busy and bring it out of rc6.
1671 */
1672 intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1673 _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1674
1675 /* Clear the context id. Here be magic! */
1676 intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
1677
1678 /* Wait for the ring not to be idle, i.e. for it to wake up. */
1679 if (__intel_wait_for_register_fw(uncore,
1680 GEN6_BSD_SLEEP_PSMI_CONTROL,
1681 GEN6_BSD_SLEEP_INDICATOR,
1682 0,
1683 1000, 0, NULL))
1684 DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
1685
1686 /* Now that the ring is fully powered up, update the tail */
1687 i9xx_submit_request(request);
1688
1689 /* Let the ring send IDLE messages to the GT again,
1690 * and so let it sleep to conserve power when idle.
1691 */
1692 intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1693 _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1694
1695 intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
1696 }
1697
1698 static int mi_flush_dw(struct i915_request *rq, u32 flags)
1699 {
1700 u32 cmd, *cs;
1701
1702 cs = intel_ring_begin(rq, 4);
1703 if (IS_ERR(cs))
1704 return PTR_ERR(cs);
1705
1706 cmd = MI_FLUSH_DW;
1707
1708 /*
1709 * We always require a command barrier so that subsequent
1710 * commands, such as breadcrumb interrupts, are strictly ordered
1711 * wrt the contents of the write cache being flushed to memory
1712 * (and thus being coherent from the CPU).
1713 */
1714 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
1715
1716 /*
1717 * Bspec vol 1c.3 - blitter engine command streamer:
1718 * "If ENABLED, all TLBs will be invalidated once the flush
1719 * operation is complete. This bit is only valid when the
1720 * Post-Sync Operation field is a value of 1h or 3h."
1721 */
1722 cmd |= flags;
1723
1724 *cs++ = cmd;
1725 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
1726 *cs++ = 0;
1727 *cs++ = MI_NOOP;
1728
1729 intel_ring_advance(rq, cs);
1730
1731 return 0;
1732 }
1733
1734 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
1735 {
1736 return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
1737 }
1738
1739 static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
1740 {
1741 return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
1742 }
1743
1744 static int
1745 hsw_emit_bb_start(struct i915_request *rq,
1746 u64 offset, u32 len,
1747 unsigned int dispatch_flags)
1748 {
1749 u32 *cs;
1750
1751 cs = intel_ring_begin(rq, 2);
1752 if (IS_ERR(cs))
1753 return PTR_ERR(cs);
1754
1755 *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
1756 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
1757 /* bit0-7 is the length on GEN6+ */
1758 *cs++ = offset;
1759 intel_ring_advance(rq, cs);
1760
1761 return 0;
1762 }
1763
1764 static int
1765 gen6_emit_bb_start(struct i915_request *rq,
1766 u64 offset, u32 len,
1767 unsigned int dispatch_flags)
1768 {
1769 u32 *cs;
1770
1771 cs = intel_ring_begin(rq, 2);
1772 if (IS_ERR(cs))
1773 return PTR_ERR(cs);
1774
1775 *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
1776 0 : MI_BATCH_NON_SECURE_I965);
1777 /* bit0-7 is the length on GEN6+ */
1778 *cs++ = offset;
1779 intel_ring_advance(rq, cs);
1780
1781 return 0;
1782 }
1783
1784 /* Blitter support (SandyBridge+) */
1785
1786 static int gen6_ring_flush(struct i915_request *rq, u32 mode)
1787 {
1788 return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
1789 }
1790
1791 static void i9xx_set_default_submission(struct intel_engine_cs *engine)
1792 {
1793 engine->submit_request = i9xx_submit_request;
1794
1795 engine->park = NULL;
1796 engine->unpark = NULL;
1797 }
1798
1799 static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
1800 {
1801 i9xx_set_default_submission(engine);
1802 engine->submit_request = gen6_bsd_submit_request;
1803 }
1804
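/*
 * Engine teardown for the legacy ringbuffer backend: warn if the engine
 * has not stopped (MODE_IDLE clear in RING_MI_MODE; the check is skipped
 * on gen2), release the common engine state, then drop the pins and
 * references taken on the legacy ring and timeline during setup.
 */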
1805 static void ring_release(struct intel_engine_cs *engine)
1806 {
1807 struct drm_i915_private *dev_priv = engine->i915;
1808
1809 WARN_ON(INTEL_GEN(dev_priv) > 2 &&
1810 (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
1811
1812 intel_engine_cleanup_common(engine);
1813
1814 intel_ring_unpin(engine->legacy.ring);
1815 intel_ring_put(engine->legacy.ring);
1816
1817 intel_timeline_unpin(engine->legacy.timeline);
1818 intel_timeline_put(engine->legacy.timeline);
1819 }
1820
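/* Select the interrupt enable/disable hooks for the engine's generation. */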
1821 static void setup_irq(struct intel_engine_cs *engine)
1822 {
1823 struct drm_i915_private *i915 = engine->i915;
1824
1825 if (INTEL_GEN(i915) >= 6) {
1826 engine->irq_enable = gen6_irq_enable;
1827 engine->irq_disable = gen6_irq_disable;
1828 } else if (INTEL_GEN(i915) >= 5) {
1829 engine->irq_enable = gen5_irq_enable;
1830 engine->irq_disable = gen5_irq_disable;
1831 } else if (INTEL_GEN(i915) >= 3) {
1832 engine->irq_enable = i9xx_irq_enable;
1833 engine->irq_disable = i9xx_irq_disable;
1834 } else {
1835 engine->irq_enable = i8xx_irq_enable;
1836 engine->irq_disable = i8xx_irq_disable;
1837 }
1838 }
1839
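/*
 * Install the vfuncs shared by all legacy ringbuffer engines: reset
 * handling, context ops, request allocation, and the breadcrumb and
 * batchbuffer-start emitters appropriate for the generation. The
 * per-class setup routines below may override individual hooks.
 */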
1840 static void setup_common(struct intel_engine_cs *engine)
1841 {
1842 struct drm_i915_private *i915 = engine->i915;
1843
1844 /* gen8+ are only supported with execlists */
1845 GEM_BUG_ON(INTEL_GEN(i915) >= 8);
1846
1847 setup_irq(engine);
1848
1849 engine->resume = xcs_resume;
1850 engine->reset.prepare = reset_prepare;
1851 engine->reset.rewind = reset_rewind;
1852 engine->reset.cancel = reset_cancel;
1853 engine->reset.finish = reset_finish;
1854
1855 engine->cops = &ring_context_ops;
1856 engine->request_alloc = ring_request_alloc;
1857
1858 /*
1859 * Using a global execution timeline; the previous final breadcrumb is
1860 * equivalent to our next initial breadcrumb so we can elide
1861 * engine->emit_init_breadcrumb().
1862 */
1863 engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
1864 if (IS_GEN(i915, 5))
1865 engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
1866
1867 engine->set_default_submission = i9xx_set_default_submission;
1868
1869 if (INTEL_GEN(i915) >= 6)
1870 engine->emit_bb_start = gen6_emit_bb_start;
1871 else if (INTEL_GEN(i915) >= 4)
1872 engine->emit_bb_start = i965_emit_bb_start;
1873 else if (IS_I830(i915) || IS_I845G(i915))
1874 engine->emit_bb_start = i830_emit_bb_start;
1875 else
1876 engine->emit_bb_start = i915_emit_bb_start;
1877 }
1878
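/*
 * Render engine specifics: generation-dependent flush and breadcrumb
 * emitters, the L3 parity interrupt on parts with L3 DPF, the Haswell
 * batchbuffer-start variant, and the render resume hook.
 */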
1879 static void setup_rcs(struct intel_engine_cs *engine)
1880 {
1881 struct drm_i915_private *i915 = engine->i915;
1882
1883 if (HAS_L3_DPF(i915))
1884 engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
1885
1886 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
1887
1888 if (INTEL_GEN(i915) >= 7) {
1889 engine->emit_flush = gen7_render_ring_flush;
1890 engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
1891 } else if (IS_GEN(i915, 6)) {
1892 engine->emit_flush = gen6_render_ring_flush;
1893 engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
1894 } else if (IS_GEN(i915, 5)) {
1895 engine->emit_flush = gen4_render_ring_flush;
1896 } else {
1897 if (INTEL_GEN(i915) < 4)
1898 engine->emit_flush = gen2_render_ring_flush;
1899 else
1900 engine->emit_flush = gen4_render_ring_flush;
1901 engine->irq_enable_mask = I915_USER_INTERRUPT;
1902 }
1903
1904 if (IS_HASWELL(i915))
1905 engine->emit_bb_start = hsw_emit_bb_start;
1906
1907 engine->resume = rcs_resume;
1908 }
1909
1910 static void setup_vcs(struct intel_engine_cs *engine)
1911 {
1912 struct drm_i915_private *i915 = engine->i915;
1913
1914 if (INTEL_GEN(i915) >= 6) {
1915 /* gen6 bsd needs a special wa for tail updates */
1916 if (IS_GEN(i915, 6))
1917 engine->set_default_submission = gen6_bsd_set_default_submission;
1918 engine->emit_flush = gen6_bsd_ring_flush;
1919 engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
1920
1921 if (IS_GEN(i915, 6))
1922 engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
1923 else
1924 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
1925 } else {
1926 engine->emit_flush = bsd_ring_flush;
1927 if (IS_GEN(i915, 5))
1928 engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
1929 else
1930 engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
1931 }
1932 }
1933
1934 static void setup_bcs(struct intel_engine_cs *engine)
1935 {
1936 struct drm_i915_private *i915 = engine->i915;
1937
1938 engine->emit_flush = gen6_ring_flush;
1939 engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
1940
1941 if (IS_GEN(i915, 6))
1942 engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
1943 else
1944 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
1945 }
1946
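/*
 * Video enhancement engine (gen7+ only): its user interrupt is routed
 * via the PM interrupt registers, so it needs the dedicated Haswell
 * VEBOX irq hooks rather than the common gen6 ones.
 */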
1947 static void setup_vecs(struct intel_engine_cs *engine)
1948 {
1949 struct drm_i915_private *i915 = engine->i915;
1950
1951 GEM_BUG_ON(INTEL_GEN(i915) < 7);
1952
1953 engine->emit_flush = gen6_ring_flush;
1954 engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
1955 engine->irq_enable = hsw_vebox_irq_enable;
1956 engine->irq_disable = hsw_vebox_irq_disable;
1957
1958 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
1959 }
1960
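/*
 * Construct the legacy ringbuffer submission backend for an engine:
 * install the common and per-class vfuncs, create and pin a global
 * timeline backed by the engine's status page and a 16KiB ring, then
 * hand cleanup over to ring_release() via engine->release.
 */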
1961 int intel_ring_submission_setup(struct intel_engine_cs *engine)
1962 {
1963 struct intel_timeline *timeline;
1964 struct intel_ring *ring;
1965 int err;
1966
1967 setup_common(engine);
1968
1969 switch (engine->class) {
1970 case RENDER_CLASS:
1971 setup_rcs(engine);
1972 break;
1973 case VIDEO_DECODE_CLASS:
1974 setup_vcs(engine);
1975 break;
1976 case COPY_ENGINE_CLASS:
1977 setup_bcs(engine);
1978 break;
1979 case VIDEO_ENHANCEMENT_CLASS:
1980 setup_vecs(engine);
1981 break;
1982 default:
1983 MISSING_CASE(engine->class);
1984 return -ENODEV;
1985 }
1986
1987 timeline = intel_timeline_create(engine->gt, engine->status_page.vma);
1988 if (IS_ERR(timeline)) {
1989 err = PTR_ERR(timeline);
1990 goto err;
1991 }
1992 GEM_BUG_ON(timeline->has_initial_breadcrumb);
1993
1994 err = intel_timeline_pin(timeline);
1995 if (err)
1996 goto err_timeline;
1997
1998 ring = intel_engine_create_ring(engine, SZ_16K);
1999 if (IS_ERR(ring)) {
2000 err = PTR_ERR(ring);
2001 goto err_timeline_unpin;
2002 }
2003
2004 err = intel_ring_pin(ring);
2005 if (err)
2006 goto err_ring;
2007
2008 GEM_BUG_ON(engine->legacy.ring);
2009 engine->legacy.ring = ring;
2010 engine->legacy.timeline = timeline;
2011
2012 GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma);
2013
2014 /* Finally, take ownership and responsibility for cleanup! */
2015 engine->release = ring_release;
2016
2017 return 0;
2018
2019 err_ring:
2020 intel_ring_put(ring);
2021 err_timeline_unpin:
2022 intel_timeline_unpin(timeline);
2023 err_timeline:
2024 intel_timeline_put(timeline);
2025 err:
2026 intel_engine_cleanup_common(engine);
2027 return err;
2028 }
2029