// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

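/*
 * Byte offset of the scratch dword in the hardware status page
 * (I915_GEM_HWS_SCRATCH is a dword index into the HWSP).
 */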
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

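	/*
	 * Second PIPE_CONTROL: the actual non-zero post-sync operation, a
	 * qword write to the scratch page, satisfying the two workarounds
	 * quoted above.
	 */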
	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

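	/*
	 * Batches run non-privileged by default; only a secure dispatch
	 * (I915_DISPATCH_SECURE) clears the non-secure bit so the batch
	 * executes with elevated privileges.
	 */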
	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

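	/*
	 * Same as gen6 above, but Haswell batches also carry the PPGTT
	 * address-space selection bit unless dispatched as secure.
	 */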
	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

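/*
 * Emit a CS-stall-only PIPE_CONTROL (stall at pixel scoreboard, no flushes),
 * used as the workaround spacer before an invalidating PIPE_CONTROL in
 * gen7_emit_flush_rcs() below.
 */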
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

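/*
 * GEN7_XCS_WA: number of repeated MI_STORE_DWORD_INDEX seqno writes padded
 * into the gen7 non-render breadcrumb below; a workaround, presumably to
 * make sure the seqno write has landed before the user interrupt fires.
 */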
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

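/*
 * Engine interrupts on gen6+ are masked at two levels: per engine via
 * RING_IMR and again at the GT level. Enabling unmasks the engine's bit in
 * both, while the irq_keep_mask bits are always left unmasked in RING_IMR.
 */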
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

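/*
 * On Haswell the VECS (video enhancement) engine's interrupts are handled
 * via the GT PM interrupt mask rather than the GT IMR, hence the
 * gen6_gt_pm_* helpers below.
 */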
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}