xref: /openbsd-src/sys/dev/pci/drm/i915/gt/gen8_engine_cs.c (revision 69bddb605650d792f8c155011bb76c24601163cb)
15ca02815Sjsg // SPDX-License-Identifier: MIT
25ca02815Sjsg /*
35ca02815Sjsg  * Copyright © 2014 Intel Corporation
45ca02815Sjsg  */
55ca02815Sjsg 
65ca02815Sjsg #include "gen8_engine_cs.h"
71bb76ff1Sjsg #include "intel_engine_regs.h"
85ca02815Sjsg #include "intel_gpu_commands.h"
9596b6869Sjsg #include "intel_gt.h"
101bb76ff1Sjsg #include "intel_lrc.h"
115ca02815Sjsg #include "intel_ring.h"
125ca02815Sjsg 
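/*
 * Command-stream emission helpers for Gen8+ logical ring contexts: cache
 * flush/invalidate sequences (PIPE_CONTROL on the render/compute engines,
 * MI_FLUSH_DW on the others), batch buffer start commands, and the
 * breadcrumbs emitted at the start and end of each request.
 */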
135ca02815Sjsg int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
145ca02815Sjsg {
155ca02815Sjsg 	bool vf_flush_wa = false, dc_flush_wa = false;
165ca02815Sjsg 	u32 *cs, flags = 0;
175ca02815Sjsg 	int len;
185ca02815Sjsg 
195ca02815Sjsg 	flags |= PIPE_CONTROL_CS_STALL;
205ca02815Sjsg 
215ca02815Sjsg 	if (mode & EMIT_FLUSH) {
225ca02815Sjsg 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
235ca02815Sjsg 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
245ca02815Sjsg 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
255ca02815Sjsg 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
265ca02815Sjsg 	}
275ca02815Sjsg 
285ca02815Sjsg 	if (mode & EMIT_INVALIDATE) {
295ca02815Sjsg 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
305ca02815Sjsg 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
315ca02815Sjsg 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
325ca02815Sjsg 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
335ca02815Sjsg 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
345ca02815Sjsg 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
355ca02815Sjsg 		flags |= PIPE_CONTROL_QW_WRITE;
365ca02815Sjsg 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
375ca02815Sjsg 
385ca02815Sjsg 		/*
395ca02815Sjsg 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
405ca02815Sjsg 		 * pipe control.
415ca02815Sjsg 		 */
42f005ef32Sjsg 		if (GRAPHICS_VER(rq->i915) == 9)
435ca02815Sjsg 			vf_flush_wa = true;
445ca02815Sjsg 
455ca02815Sjsg 		/* WaForGAMHang:kbl */
46f005ef32Sjsg 		if (IS_KABYLAKE(rq->i915) && IS_GRAPHICS_STEP(rq->i915, 0, STEP_C0))
475ca02815Sjsg 			dc_flush_wa = true;
485ca02815Sjsg 	}
495ca02815Sjsg 
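	/*
	 * Ring space accounting: each gen8_emit_pipe_control() below emits a
	 * 6-dword PIPE_CONTROL, so the base flush needs 6 dwords, the GEN9 VF
	 * workaround adds a null PIPE_CONTROL (6 more) and the KBL DC
	 * workaround brackets the flush with two further PIPE_CONTROLs (12).
	 */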
505ca02815Sjsg 	len = 6;
515ca02815Sjsg 
525ca02815Sjsg 	if (vf_flush_wa)
535ca02815Sjsg 		len += 6;
545ca02815Sjsg 
555ca02815Sjsg 	if (dc_flush_wa)
565ca02815Sjsg 		len += 12;
575ca02815Sjsg 
585ca02815Sjsg 	cs = intel_ring_begin(rq, len);
595ca02815Sjsg 	if (IS_ERR(cs))
605ca02815Sjsg 		return PTR_ERR(cs);
615ca02815Sjsg 
625ca02815Sjsg 	if (vf_flush_wa)
635ca02815Sjsg 		cs = gen8_emit_pipe_control(cs, 0, 0);
645ca02815Sjsg 
655ca02815Sjsg 	if (dc_flush_wa)
665ca02815Sjsg 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
675ca02815Sjsg 					    0);
685ca02815Sjsg 
695ca02815Sjsg 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
705ca02815Sjsg 
715ca02815Sjsg 	if (dc_flush_wa)
725ca02815Sjsg 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
735ca02815Sjsg 
745ca02815Sjsg 	intel_ring_advance(rq, cs);
755ca02815Sjsg 
765ca02815Sjsg 	return 0;
775ca02815Sjsg }
785ca02815Sjsg 
795ca02815Sjsg int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
805ca02815Sjsg {
815ca02815Sjsg 	u32 cmd, *cs;
825ca02815Sjsg 
835ca02815Sjsg 	cs = intel_ring_begin(rq, 4);
845ca02815Sjsg 	if (IS_ERR(cs))
855ca02815Sjsg 		return PTR_ERR(cs);
865ca02815Sjsg 
875ca02815Sjsg 	cmd = MI_FLUSH_DW + 1;
885ca02815Sjsg 
895ca02815Sjsg 	/*
905ca02815Sjsg 	 * We always require a command barrier so that subsequent
915ca02815Sjsg 	 * commands, such as breadcrumb interrupts, are strictly ordered
925ca02815Sjsg 	 * wrt the contents of the write cache being flushed to memory
935ca02815Sjsg 	 * (and thus being coherent from the CPU).
945ca02815Sjsg 	 */
955ca02815Sjsg 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
965ca02815Sjsg 
975ca02815Sjsg 	if (mode & EMIT_INVALIDATE) {
985ca02815Sjsg 		cmd |= MI_INVALIDATE_TLB;
995ca02815Sjsg 		if (rq->engine->class == VIDEO_DECODE_CLASS)
1005ca02815Sjsg 			cmd |= MI_INVALIDATE_BSD;
1015ca02815Sjsg 	}
1025ca02815Sjsg 
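	/*
	 * MI_FLUSH_DW_STORE_INDEX makes the post-sync address an offset into
	 * the per-context PPHWSP, so the dummy zero written below lands in
	 * the scratch slot rather than at a global GGTT address.
	 */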
1035ca02815Sjsg 	*cs++ = cmd;
1045ca02815Sjsg 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
1055ca02815Sjsg 	*cs++ = 0; /* upper addr */
1065ca02815Sjsg 	*cs++ = 0; /* value */
1075ca02815Sjsg 	intel_ring_advance(rq, cs);
1085ca02815Sjsg 
1095ca02815Sjsg 	return 0;
1105ca02815Sjsg }
1115ca02815Sjsg 
1125ca02815Sjsg int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
1135ca02815Sjsg {
1145ca02815Sjsg 	if (mode & EMIT_FLUSH) {
1155ca02815Sjsg 		u32 *cs;
1165ca02815Sjsg 		u32 flags = 0;
1175ca02815Sjsg 
1185ca02815Sjsg 		flags |= PIPE_CONTROL_CS_STALL;
1195ca02815Sjsg 
1205ca02815Sjsg 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
1215ca02815Sjsg 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
1225ca02815Sjsg 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
1235ca02815Sjsg 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
1245ca02815Sjsg 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
1255ca02815Sjsg 		flags |= PIPE_CONTROL_QW_WRITE;
1265ca02815Sjsg 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
1275ca02815Sjsg 
1285ca02815Sjsg 		cs = intel_ring_begin(rq, 6);
1295ca02815Sjsg 		if (IS_ERR(cs))
1305ca02815Sjsg 			return PTR_ERR(cs);
1315ca02815Sjsg 
1325ca02815Sjsg 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
1335ca02815Sjsg 		intel_ring_advance(rq, cs);
1345ca02815Sjsg 	}
1355ca02815Sjsg 
1365ca02815Sjsg 	if (mode & EMIT_INVALIDATE) {
1375ca02815Sjsg 		u32 *cs;
1385ca02815Sjsg 		u32 flags = 0;
1395ca02815Sjsg 
1405ca02815Sjsg 		flags |= PIPE_CONTROL_CS_STALL;
1415ca02815Sjsg 
1425ca02815Sjsg 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
1435ca02815Sjsg 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
1445ca02815Sjsg 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
1455ca02815Sjsg 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
1465ca02815Sjsg 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
1475ca02815Sjsg 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
1485ca02815Sjsg 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
1495ca02815Sjsg 		flags |= PIPE_CONTROL_QW_WRITE;
1505ca02815Sjsg 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
1515ca02815Sjsg 
1525ca02815Sjsg 		cs = intel_ring_begin(rq, 6);
1535ca02815Sjsg 		if (IS_ERR(cs))
1545ca02815Sjsg 			return PTR_ERR(cs);
1555ca02815Sjsg 
1565ca02815Sjsg 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
1575ca02815Sjsg 		intel_ring_advance(rq, cs);
1585ca02815Sjsg 	}
1595ca02815Sjsg 
1605ca02815Sjsg 	return 0;
1615ca02815Sjsg }
1625ca02815Sjsg 
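/*
 * Gen12 extended MI_ARB_CHECK with a pre-parser control field: as used
 * here, bit 8 acts as the write enable for that field and bit 0 carries
 * the requested disable state, so a single dword toggles command
 * pre-fetching on or off.
 */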
1635ca02815Sjsg static u32 preparser_disable(bool state)
1645ca02815Sjsg {
1655ca02815Sjsg 	return MI_ARB_CHECK | 1 << 8 | state;
1665ca02815Sjsg }
1675ca02815Sjsg 
168df2f834eSjsg static i915_reg_t gen12_get_aux_inv_reg(struct intel_engine_cs *engine)
169df2f834eSjsg {
170df2f834eSjsg 	switch (engine->id) {
171df2f834eSjsg 	case RCS0:
172df2f834eSjsg 		return GEN12_CCS_AUX_INV;
173df2f834eSjsg 	case BCS0:
174df2f834eSjsg 		return GEN12_BCS0_AUX_INV;
175df2f834eSjsg 	case VCS0:
176df2f834eSjsg 		return GEN12_VD0_AUX_INV;
177df2f834eSjsg 	case VCS2:
178df2f834eSjsg 		return GEN12_VD2_AUX_INV;
179df2f834eSjsg 	case VECS0:
180df2f834eSjsg 		return GEN12_VE0_AUX_INV;
181df2f834eSjsg 	case CCS0:
182df2f834eSjsg 		return GEN12_CCS0_AUX_INV;
183df2f834eSjsg 	default:
184df2f834eSjsg 		return INVALID_MMIO_REG;
185df2f834eSjsg 	}
186df2f834eSjsg }
187df2f834eSjsg 
1888028d8eaSjsg static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine)
1898028d8eaSjsg {
190df2f834eSjsg 	i915_reg_t reg = gen12_get_aux_inv_reg(engine);
191df2f834eSjsg 
1928028d8eaSjsg 	if (IS_PONTEVECCHIO(engine->i915))
1938028d8eaSjsg 		return false;
1948028d8eaSjsg 
1958028d8eaSjsg 	/*
196df2f834eSjsg 	 * So far, platforms supported by i915 that have flat CCS do not require
197df2f834eSjsg 	 * AUX invalidation. Also check whether the engine requires it.
1988028d8eaSjsg 	 */
199df2f834eSjsg 	return i915_mmio_reg_valid(reg) && !HAS_FLAT_CCS(engine->i915);
2008028d8eaSjsg }
2018028d8eaSjsg 
202df2f834eSjsg u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs)
2035ca02815Sjsg {
204df2f834eSjsg 	i915_reg_t inv_reg = gen12_get_aux_inv_reg(engine);
205df2f834eSjsg 	u32 gsi_offset = engine->gt->uncore->gsi_offset;
206df2f834eSjsg 
207df2f834eSjsg 	if (!gen12_needs_ccs_aux_inv(engine))
208df2f834eSjsg 		return cs;
2095ca02815Sjsg 
2101bb76ff1Sjsg 	*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
2111bb76ff1Sjsg 	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
2125ca02815Sjsg 	*cs++ = AUX_INV;
213f6aeda7dSjsg 
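	/*
	 * Poll the same invalidation register until it reads back as zero,
	 * i.e. the hardware has consumed the AUX_INV write and completed the
	 * invalidation, before any later commands are executed.
	 */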
214f6aeda7dSjsg 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
215f6aeda7dSjsg 		MI_SEMAPHORE_REGISTER_POLL |
216f6aeda7dSjsg 		MI_SEMAPHORE_POLL |
217f6aeda7dSjsg 		MI_SEMAPHORE_SAD_EQ_SDD;
218f6aeda7dSjsg 	*cs++ = 0;
219f6aeda7dSjsg 	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
220f6aeda7dSjsg 	*cs++ = 0;
221f6aeda7dSjsg 	*cs++ = 0;
2225ca02815Sjsg 
2235ca02815Sjsg 	return cs;
2245ca02815Sjsg }
2255ca02815Sjsg 
226f005ef32Sjsg static int mtl_dummy_pipe_control(struct i915_request *rq)
227f005ef32Sjsg {
228f005ef32Sjsg 	/* Wa_14016712196 */
2296e5fdd49Sjsg 	if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
23093533976Sjsg 	    IS_DG2(rq->i915)) {
231f005ef32Sjsg 		u32 *cs;
232f005ef32Sjsg 
233f005ef32Sjsg 		/* dummy PIPE_CONTROL + depth flush */
234f005ef32Sjsg 		cs = intel_ring_begin(rq, 6);
235f005ef32Sjsg 		if (IS_ERR(cs))
236f005ef32Sjsg 			return PTR_ERR(cs);
237f005ef32Sjsg 		cs = gen12_emit_pipe_control(cs,
238f005ef32Sjsg 					     0,
239f005ef32Sjsg 					     PIPE_CONTROL_DEPTH_CACHE_FLUSH,
240f005ef32Sjsg 					     LRC_PPHWSP_SCRATCH_ADDR);
241f005ef32Sjsg 		intel_ring_advance(rq, cs);
242f005ef32Sjsg 	}
243f005ef32Sjsg 
244f005ef32Sjsg 	return 0;
245f005ef32Sjsg }
246f005ef32Sjsg 
2475ca02815Sjsg int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
2485ca02815Sjsg {
2491bb76ff1Sjsg 	struct intel_engine_cs *engine = rq->engine;
2501bb76ff1Sjsg 
2519ce1c53cSjsg 	/*
2529ce1c53cSjsg 	 * On Aux CCS platforms the invalidation of the Aux
2539ce1c53cSjsg 	 * table requires quiescing memory traffic beforehand
2549ce1c53cSjsg 	 */
2559ce1c53cSjsg 	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
256f005ef32Sjsg 		u32 bit_group_0 = 0;
257f005ef32Sjsg 		u32 bit_group_1 = 0;
258f005ef32Sjsg 		int err;
2595ca02815Sjsg 		u32 *cs;
2605ca02815Sjsg 
261f005ef32Sjsg 		err = mtl_dummy_pipe_control(rq);
262f005ef32Sjsg 		if (err)
263f005ef32Sjsg 			return err;
264f005ef32Sjsg 
265f005ef32Sjsg 		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
266f005ef32Sjsg 
267f005ef32Sjsg 		/*
268f005ef32Sjsg 		 * When required on MTL and beyond platforms, we need
269f005ef32Sjsg 		 * to set the CCS_FLUSH bit in the pipe control.
270f005ef32Sjsg 		 */
271f005ef32Sjsg 		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
272f005ef32Sjsg 			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;
273f005ef32Sjsg 
27442b1e46fSjsg 		/*
27542b1e46fSjsg 		 * The L3 fabric flush needed for AUX CCS invalidation already
27642b1e46fSjsg 		 * happens as part of the pipe-control, so we can ignore
27742b1e46fSjsg 		 * PIPE_CONTROL_FLUSH_L3 here. Moreover, PIPE_CONTROL_FLUSH_L3
27842b1e46fSjsg 		 * deals with Protected Memory, which is not needed for
27942b1e46fSjsg 		 * AUX CCS invalidation and leads to unwanted side effects.
28042b1e46fSjsg 		 */
281*69bddb60Sjsg 		if ((mode & EMIT_FLUSH) &&
282*69bddb60Sjsg 		    GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
283f005ef32Sjsg 			bit_group_1 |= PIPE_CONTROL_FLUSH_L3;
28442b1e46fSjsg 
285f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;
286f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
287f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2885ca02815Sjsg 		/* Wa_1409600907:tgl,adl-p */
289f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_DEPTH_STALL;
290f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_DC_FLUSH_ENABLE;
291f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_FLUSH_ENABLE;
2925ca02815Sjsg 
293f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_STORE_DATA_INDEX;
294f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_QW_WRITE;
2955ca02815Sjsg 
296f005ef32Sjsg 		bit_group_1 |= PIPE_CONTROL_CS_STALL;
2975ca02815Sjsg 
2981bb76ff1Sjsg 		if (!HAS_3D_PIPELINE(engine->i915))
299f005ef32Sjsg 			bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
3001bb76ff1Sjsg 		else if (engine->class == COMPUTE_CLASS)
301f005ef32Sjsg 			bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
3021bb76ff1Sjsg 
3035ca02815Sjsg 		cs = intel_ring_begin(rq, 6);
3045ca02815Sjsg 		if (IS_ERR(cs))
3055ca02815Sjsg 			return PTR_ERR(cs);
3065ca02815Sjsg 
307f005ef32Sjsg 		cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1,
308f005ef32Sjsg 					     LRC_PPHWSP_SCRATCH_ADDR);
3095ca02815Sjsg 		intel_ring_advance(rq, cs);
3105ca02815Sjsg 	}
3115ca02815Sjsg 
3125ca02815Sjsg 	if (mode & EMIT_INVALIDATE) {
3135ca02815Sjsg 		u32 flags = 0;
3141bb76ff1Sjsg 		u32 *cs, count;
315f005ef32Sjsg 		int err;
316f005ef32Sjsg 
317f005ef32Sjsg 		err = mtl_dummy_pipe_control(rq);
318f005ef32Sjsg 		if (err)
319f005ef32Sjsg 			return err;
3205ca02815Sjsg 
3215ca02815Sjsg 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3225ca02815Sjsg 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3235ca02815Sjsg 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3245ca02815Sjsg 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3255ca02815Sjsg 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3265ca02815Sjsg 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3275ca02815Sjsg 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3285ca02815Sjsg 
3295ca02815Sjsg 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3305ca02815Sjsg 		flags |= PIPE_CONTROL_QW_WRITE;
3315ca02815Sjsg 
3325ca02815Sjsg 		flags |= PIPE_CONTROL_CS_STALL;
3335ca02815Sjsg 
3341bb76ff1Sjsg 		if (!HAS_3D_PIPELINE(engine->i915))
3351bb76ff1Sjsg 			flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
3361bb76ff1Sjsg 		else if (engine->class == COMPUTE_CLASS)
3371bb76ff1Sjsg 			flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
3381bb76ff1Sjsg 
3391bb76ff1Sjsg 		count = 8;
340f6aeda7dSjsg 		if (gen12_needs_ccs_aux_inv(rq->engine))
341f6aeda7dSjsg 			count += 8;
3421bb76ff1Sjsg 
3431bb76ff1Sjsg 		cs = intel_ring_begin(rq, count);
3445ca02815Sjsg 		if (IS_ERR(cs))
3455ca02815Sjsg 			return PTR_ERR(cs);
3465ca02815Sjsg 
3475ca02815Sjsg 		/*
3485ca02815Sjsg 		 * Prevent the pre-parser from skipping past the TLB
3495ca02815Sjsg 		 * invalidate and loading a stale page for the batch
3505ca02815Sjsg 		 * buffer / request payload.
3515ca02815Sjsg 		 */
3525ca02815Sjsg 		*cs++ = preparser_disable(true);
3535ca02815Sjsg 
3545ca02815Sjsg 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3555ca02815Sjsg 
356df2f834eSjsg 		cs = gen12_emit_aux_table_inv(engine, cs);
3575ca02815Sjsg 
3585ca02815Sjsg 		*cs++ = preparser_disable(false);
3595ca02815Sjsg 		intel_ring_advance(rq, cs);
3605ca02815Sjsg 	}
3615ca02815Sjsg 
3625ca02815Sjsg 	return 0;
3635ca02815Sjsg }
3645ca02815Sjsg 
3655ca02815Sjsg int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
3665ca02815Sjsg {
367df2f834eSjsg 	u32 cmd = 4;
368df2f834eSjsg 	u32 *cs;
3695ca02815Sjsg 
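	/*
	 * cmd is first used as the dword count for intel_ring_begin() and
	 * only afterwards reused as the MI_FLUSH_DW command word.
	 */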
3701bb76ff1Sjsg 	if (mode & EMIT_INVALIDATE) {
3715ca02815Sjsg 		cmd += 2;
3721bb76ff1Sjsg 
373df2f834eSjsg 		if (gen12_needs_ccs_aux_inv(rq->engine))
374f6aeda7dSjsg 			cmd += 8;
3751bb76ff1Sjsg 	}
3765ca02815Sjsg 
3775ca02815Sjsg 	cs = intel_ring_begin(rq, cmd);
3785ca02815Sjsg 	if (IS_ERR(cs))
3795ca02815Sjsg 		return PTR_ERR(cs);
3805ca02815Sjsg 
3815ca02815Sjsg 	if (mode & EMIT_INVALIDATE)
3825ca02815Sjsg 		*cs++ = preparser_disable(true);
3835ca02815Sjsg 
3845ca02815Sjsg 	cmd = MI_FLUSH_DW + 1;
3855ca02815Sjsg 
3865ca02815Sjsg 	/*
3875ca02815Sjsg 	 * We always require a command barrier so that subsequent
3885ca02815Sjsg 	 * commands, such as breadcrumb interrupts, are strictly ordered
3895ca02815Sjsg 	 * wrt the contents of the write cache being flushed to memory
3905ca02815Sjsg 	 * (and thus being coherent from the CPU).
3915ca02815Sjsg 	 */
3925ca02815Sjsg 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3935ca02815Sjsg 
3945ca02815Sjsg 	if (mode & EMIT_INVALIDATE) {
3955ca02815Sjsg 		cmd |= MI_INVALIDATE_TLB;
3965ca02815Sjsg 		if (rq->engine->class == VIDEO_DECODE_CLASS)
3975ca02815Sjsg 			cmd |= MI_INVALIDATE_BSD;
398f005ef32Sjsg 
399f005ef32Sjsg 		if (gen12_needs_ccs_aux_inv(rq->engine) &&
400f005ef32Sjsg 		    rq->engine->class == COPY_ENGINE_CLASS)
401f005ef32Sjsg 			cmd |= MI_FLUSH_DW_CCS;
4025ca02815Sjsg 	}
4035ca02815Sjsg 
4045ca02815Sjsg 	*cs++ = cmd;
4055ca02815Sjsg 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4065ca02815Sjsg 	*cs++ = 0; /* upper addr */
4075ca02815Sjsg 	*cs++ = 0; /* value */
4085ca02815Sjsg 
409df2f834eSjsg 	cs = gen12_emit_aux_table_inv(rq->engine, cs);
4105ca02815Sjsg 
4115ca02815Sjsg 	if (mode & EMIT_INVALIDATE)
4125ca02815Sjsg 		*cs++ = preparser_disable(false);
4135ca02815Sjsg 
4145ca02815Sjsg 	intel_ring_advance(rq, cs);
4155ca02815Sjsg 
4165ca02815Sjsg 	return 0;
4175ca02815Sjsg }
4185ca02815Sjsg 
4195ca02815Sjsg static u32 preempt_address(struct intel_engine_cs *engine)
4205ca02815Sjsg {
4215ca02815Sjsg 	return (i915_ggtt_offset(engine->status_page.vma) +
4225ca02815Sjsg 		I915_GEM_HWS_PREEMPT_ADDR);
4235ca02815Sjsg }
4245ca02815Sjsg 
4255ca02815Sjsg static u32 hwsp_offset(const struct i915_request *rq)
4265ca02815Sjsg {
4275ca02815Sjsg 	const struct intel_timeline *tl;
4285ca02815Sjsg 
4295ca02815Sjsg 	/* Before the request is executed, the timeline is fixed */
4305ca02815Sjsg 	tl = rcu_dereference_protected(rq->timeline,
4315ca02815Sjsg 				       !i915_request_signaled(rq));
4325ca02815Sjsg 
4335ca02815Sjsg 	/* See the comment in i915_request_active_seqno(). */
4345ca02815Sjsg 	return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
4355ca02815Sjsg }
4365ca02815Sjsg 
4375ca02815Sjsg int gen8_emit_init_breadcrumb(struct i915_request *rq)
4385ca02815Sjsg {
4395ca02815Sjsg 	u32 *cs;
4405ca02815Sjsg 
4415ca02815Sjsg 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
4425ca02815Sjsg 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
4435ca02815Sjsg 		return 0;
4445ca02815Sjsg 
4455ca02815Sjsg 	cs = intel_ring_begin(rq, 6);
4465ca02815Sjsg 	if (IS_ERR(cs))
4475ca02815Sjsg 		return PTR_ERR(cs);
4485ca02815Sjsg 
4495ca02815Sjsg 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
4505ca02815Sjsg 	*cs++ = hwsp_offset(rq);
4515ca02815Sjsg 	*cs++ = 0;
4525ca02815Sjsg 	*cs++ = rq->fence.seqno - 1;
4535ca02815Sjsg 
4545ca02815Sjsg 	/*
4555ca02815Sjsg 	 * Check if we have been preempted before we even get started.
4565ca02815Sjsg 	 *
4575ca02815Sjsg 	 * After this point i915_request_started() reports true, even if
4585ca02815Sjsg 	 * we get preempted and so are no longer running.
4595ca02815Sjsg 	 *
4605ca02815Sjsg 	 * i915_request_started() is used during preemption processing
4615ca02815Sjsg 	 * to decide if the request is currently inside the user payload
4625ca02815Sjsg 	 * or spinning on a kernel semaphore (or earlier). For no-preemption
4635ca02815Sjsg 	 * requests, we do allow preemption on the semaphore before the user
4645ca02815Sjsg 	 * payload, but do not allow preemption once the request is started.
4655ca02815Sjsg 	 *
4665ca02815Sjsg 	 * i915_request_started() is similarly used during GPU hangs to
4675ca02815Sjsg 	 * determine if the user's payload was guilty, and if so, the
4685ca02815Sjsg 	 * request is banned. Before the request is started, it is assumed
4695ca02815Sjsg 	 * to be unharmed and an innocent victim of another's hang.
4705ca02815Sjsg 	 */
4715ca02815Sjsg 	*cs++ = MI_NOOP;
4725ca02815Sjsg 	*cs++ = MI_ARB_CHECK;
4735ca02815Sjsg 
4745ca02815Sjsg 	intel_ring_advance(rq, cs);
4755ca02815Sjsg 
4765ca02815Sjsg 	/* Record the updated position of the request's payload */
4775ca02815Sjsg 	rq->infix = intel_ring_offset(rq, cs);
4785ca02815Sjsg 
4795ca02815Sjsg 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
4805ca02815Sjsg 
4815ca02815Sjsg 	return 0;
4825ca02815Sjsg }
4835ca02815Sjsg 
484f005ef32Sjsg static int __xehp_emit_bb_start(struct i915_request *rq,
4851bb76ff1Sjsg 				u64 offset, u32 len,
4861bb76ff1Sjsg 				const unsigned int flags,
4871bb76ff1Sjsg 				u32 arb)
4881bb76ff1Sjsg {
4891bb76ff1Sjsg 	struct intel_context *ce = rq->context;
4901bb76ff1Sjsg 	u32 wa_offset = lrc_indirect_bb(ce);
4911bb76ff1Sjsg 	u32 *cs;
4921bb76ff1Sjsg 
493f005ef32Sjsg 	GEM_BUG_ON(!ce->wa_bb_page);
494f005ef32Sjsg 
4951bb76ff1Sjsg 	cs = intel_ring_begin(rq, 12);
4961bb76ff1Sjsg 	if (IS_ERR(cs))
4971bb76ff1Sjsg 		return PTR_ERR(cs);
4981bb76ff1Sjsg 
4991bb76ff1Sjsg 	*cs++ = MI_ARB_ON_OFF | arb;
5001bb76ff1Sjsg 
5011bb76ff1Sjsg 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
5021bb76ff1Sjsg 		MI_SRM_LRM_GLOBAL_GTT |
5031bb76ff1Sjsg 		MI_LRI_LRM_CS_MMIO;
5041bb76ff1Sjsg 	*cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
5051bb76ff1Sjsg 	*cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
5061bb76ff1Sjsg 	*cs++ = 0;
5071bb76ff1Sjsg 
5081bb76ff1Sjsg 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
5091bb76ff1Sjsg 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
5101bb76ff1Sjsg 	*cs++ = lower_32_bits(offset);
5111bb76ff1Sjsg 	*cs++ = upper_32_bits(offset);
5121bb76ff1Sjsg 
5131bb76ff1Sjsg 	/* Fixup stray MI_SET_PREDICATE as it prevents us from executing the ring */
5141bb76ff1Sjsg 	*cs++ = MI_BATCH_BUFFER_START_GEN8;
5151bb76ff1Sjsg 	*cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
5161bb76ff1Sjsg 	*cs++ = 0;
5171bb76ff1Sjsg 
5181bb76ff1Sjsg 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
5191bb76ff1Sjsg 
5201bb76ff1Sjsg 	intel_ring_advance(rq, cs);
5211bb76ff1Sjsg 
5221bb76ff1Sjsg 	return 0;
5231bb76ff1Sjsg }
5241bb76ff1Sjsg 
525f005ef32Sjsg int xehp_emit_bb_start_noarb(struct i915_request *rq,
5261bb76ff1Sjsg 			     u64 offset, u32 len,
5271bb76ff1Sjsg 			     const unsigned int flags)
5281bb76ff1Sjsg {
529f005ef32Sjsg 	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
5301bb76ff1Sjsg }
5311bb76ff1Sjsg 
532f005ef32Sjsg int xehp_emit_bb_start(struct i915_request *rq,
5331bb76ff1Sjsg 		       u64 offset, u32 len,
5341bb76ff1Sjsg 		       const unsigned int flags)
5351bb76ff1Sjsg {
536f005ef32Sjsg 	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
5371bb76ff1Sjsg }
5381bb76ff1Sjsg 
5395ca02815Sjsg int gen8_emit_bb_start_noarb(struct i915_request *rq,
5405ca02815Sjsg 			     u64 offset, u32 len,
5415ca02815Sjsg 			     const unsigned int flags)
5425ca02815Sjsg {
5435ca02815Sjsg 	u32 *cs;
5445ca02815Sjsg 
5455ca02815Sjsg 	cs = intel_ring_begin(rq, 4);
5465ca02815Sjsg 	if (IS_ERR(cs))
5475ca02815Sjsg 		return PTR_ERR(cs);
5485ca02815Sjsg 
5495ca02815Sjsg 	/*
5505ca02815Sjsg 	 * WaDisableCtxRestoreArbitration:bdw,chv
5515ca02815Sjsg 	 *
5525ca02815Sjsg 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
5535ca02815Sjsg 	 * particular all the gen that do not need the w/a at all!), if we
5545ca02815Sjsg 	 * took care to make sure that on every switch into this context
5555ca02815Sjsg 	 * (both ordinary and for preemption) arbitration was enabled
5565ca02815Sjsg 	 * we would be fine.  However, for gen8 there is another w/a that
5575ca02815Sjsg 	 * requires us to not preempt inside GPGPU execution, so we keep
5585ca02815Sjsg 	 * arbitration disabled for gen8 batches. Arbitration will be
5595ca02815Sjsg 	 * re-enabled before we close the request
5605ca02815Sjsg 	 * (engine->emit_fini_breadcrumb).
5615ca02815Sjsg 	 */
5625ca02815Sjsg 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
5635ca02815Sjsg 
5645ca02815Sjsg 	/* FIXME(BDW+): Address space and security selectors. */
5655ca02815Sjsg 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
5665ca02815Sjsg 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
5675ca02815Sjsg 	*cs++ = lower_32_bits(offset);
5685ca02815Sjsg 	*cs++ = upper_32_bits(offset);
5695ca02815Sjsg 
5705ca02815Sjsg 	intel_ring_advance(rq, cs);
5715ca02815Sjsg 
5725ca02815Sjsg 	return 0;
5735ca02815Sjsg }
5745ca02815Sjsg 
5755ca02815Sjsg int gen8_emit_bb_start(struct i915_request *rq,
5765ca02815Sjsg 		       u64 offset, u32 len,
5775ca02815Sjsg 		       const unsigned int flags)
5785ca02815Sjsg {
5795ca02815Sjsg 	u32 *cs;
5805ca02815Sjsg 
5815ca02815Sjsg 	if (unlikely(i915_request_has_nopreempt(rq)))
5825ca02815Sjsg 		return gen8_emit_bb_start_noarb(rq, offset, len, flags);
5835ca02815Sjsg 
5845ca02815Sjsg 	cs = intel_ring_begin(rq, 6);
5855ca02815Sjsg 	if (IS_ERR(cs))
5865ca02815Sjsg 		return PTR_ERR(cs);
5875ca02815Sjsg 
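	/*
	 * Enable arbitration only for the duration of the batch itself:
	 * preemption may land inside the user payload, but not in the
	 * per-request commands emitted around it (see MI_ARB_DISABLE below).
	 */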
5885ca02815Sjsg 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
5895ca02815Sjsg 
5905ca02815Sjsg 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
5915ca02815Sjsg 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
5925ca02815Sjsg 	*cs++ = lower_32_bits(offset);
5935ca02815Sjsg 	*cs++ = upper_32_bits(offset);
5945ca02815Sjsg 
5955ca02815Sjsg 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
5965ca02815Sjsg 	*cs++ = MI_NOOP;
5975ca02815Sjsg 
5985ca02815Sjsg 	intel_ring_advance(rq, cs);
5995ca02815Sjsg 
6005ca02815Sjsg 	return 0;
6015ca02815Sjsg }
6025ca02815Sjsg 
6035ca02815Sjsg static void assert_request_valid(struct i915_request *rq)
6045ca02815Sjsg {
6055ca02815Sjsg 	struct intel_ring *ring __maybe_unused = rq->ring;
6065ca02815Sjsg 
6075ca02815Sjsg 	/* Can we unwind this request without appearing to go forwards? */
6085ca02815Sjsg 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
6095ca02815Sjsg }
6105ca02815Sjsg 
6115ca02815Sjsg /*
6125ca02815Sjsg  * Reserve space for 2 NOOPs at the end of each request to be
6135ca02815Sjsg  * used as a workaround for not being allowed to do lite
6145ca02815Sjsg  * restore with HEAD==TAIL (WaIdleLiteRestore).
6155ca02815Sjsg  */
6165ca02815Sjsg static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
6175ca02815Sjsg {
6185ca02815Sjsg 	/* Ensure there's always at least one preemption point per-request. */
6195ca02815Sjsg 	*cs++ = MI_ARB_CHECK;
6205ca02815Sjsg 	*cs++ = MI_NOOP;
6215ca02815Sjsg 	rq->wa_tail = intel_ring_offset(rq, cs);
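	/*
	 * rq->wa_tail includes the padding above while rq->tail (set by the
	 * fini breadcrumb) stops short of it, so a resubmission of the same
	 * context can advance RING_TAIL past the padding and avoid the
	 * HEAD == TAIL lite restore restriction.
	 */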
6225ca02815Sjsg 
6235ca02815Sjsg 	/* Check that entire request is less than half the ring */
6245ca02815Sjsg 	assert_request_valid(rq);
6255ca02815Sjsg 
6265ca02815Sjsg 	return cs;
6275ca02815Sjsg }
6285ca02815Sjsg 
6295ca02815Sjsg static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
6305ca02815Sjsg {
6315ca02815Sjsg 	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
6325ca02815Sjsg 	*cs++ = MI_SEMAPHORE_WAIT |
6335ca02815Sjsg 		MI_SEMAPHORE_GLOBAL_GTT |
6345ca02815Sjsg 		MI_SEMAPHORE_POLL |
6355ca02815Sjsg 		MI_SEMAPHORE_SAD_EQ_SDD;
6365ca02815Sjsg 	*cs++ = 0;
6375ca02815Sjsg 	*cs++ = preempt_address(rq->engine);
6385ca02815Sjsg 	*cs++ = 0;
6395ca02815Sjsg 	*cs++ = MI_NOOP;
6405ca02815Sjsg 
6415ca02815Sjsg 	return cs;
6425ca02815Sjsg }
6435ca02815Sjsg 
6445ca02815Sjsg static __always_inline u32*
6455ca02815Sjsg gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
6465ca02815Sjsg {
6475ca02815Sjsg 	*cs++ = MI_USER_INTERRUPT;
6485ca02815Sjsg 
6495ca02815Sjsg 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
6505ca02815Sjsg 	if (intel_engine_has_semaphores(rq->engine) &&
6515ca02815Sjsg 	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
6525ca02815Sjsg 		cs = emit_preempt_busywait(rq, cs);
6535ca02815Sjsg 
6545ca02815Sjsg 	rq->tail = intel_ring_offset(rq, cs);
6555ca02815Sjsg 	assert_ring_tail_valid(rq->ring, rq->tail);
6565ca02815Sjsg 
6575ca02815Sjsg 	return gen8_emit_wa_tail(rq, cs);
6585ca02815Sjsg }
6595ca02815Sjsg 
6605ca02815Sjsg static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
6615ca02815Sjsg {
6625ca02815Sjsg 	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
6635ca02815Sjsg }
6645ca02815Sjsg 
6655ca02815Sjsg u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
6665ca02815Sjsg {
6675ca02815Sjsg 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
6685ca02815Sjsg }
6695ca02815Sjsg 
6705ca02815Sjsg u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
6715ca02815Sjsg {
6725ca02815Sjsg 	cs = gen8_emit_pipe_control(cs,
673f005ef32Sjsg 				    PIPE_CONTROL_CS_STALL |
674f005ef32Sjsg 				    PIPE_CONTROL_TLB_INVALIDATE |
6755ca02815Sjsg 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
6765ca02815Sjsg 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
6775ca02815Sjsg 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
6785ca02815Sjsg 				    0);
6795ca02815Sjsg 
6805ca02815Sjsg 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
6815ca02815Sjsg 	cs = gen8_emit_ggtt_write_rcs(cs,
6825ca02815Sjsg 				      rq->fence.seqno,
6835ca02815Sjsg 				      hwsp_offset(rq),
6845ca02815Sjsg 				      PIPE_CONTROL_FLUSH_ENABLE |
6855ca02815Sjsg 				      PIPE_CONTROL_CS_STALL);
6865ca02815Sjsg 
6875ca02815Sjsg 	return gen8_emit_fini_breadcrumb_tail(rq, cs);
6885ca02815Sjsg }
6895ca02815Sjsg 
6905ca02815Sjsg u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
6915ca02815Sjsg {
692f005ef32Sjsg 	cs = gen8_emit_pipe_control(cs,
6935ca02815Sjsg 				    PIPE_CONTROL_CS_STALL |
694f005ef32Sjsg 				    PIPE_CONTROL_TLB_INVALIDATE |
6955ca02815Sjsg 				    PIPE_CONTROL_TILE_CACHE_FLUSH |
6965ca02815Sjsg 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
6975ca02815Sjsg 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
698f005ef32Sjsg 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
699f005ef32Sjsg 				    0);
700f005ef32Sjsg 
701f005ef32Sjsg 	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
702f005ef32Sjsg 	cs = gen8_emit_ggtt_write_rcs(cs,
703f005ef32Sjsg 				      rq->fence.seqno,
704f005ef32Sjsg 				      hwsp_offset(rq),
705f005ef32Sjsg 				      PIPE_CONTROL_FLUSH_ENABLE |
706f005ef32Sjsg 				      PIPE_CONTROL_CS_STALL);
7075ca02815Sjsg 
7085ca02815Sjsg 	return gen8_emit_fini_breadcrumb_tail(rq, cs);
7095ca02815Sjsg }
7105ca02815Sjsg 
7115ca02815Sjsg /*
7125ca02815Sjsg  * Note that the CS instruction pre-parser will not stall on the breadcrumb
7135ca02815Sjsg  * flush and will continue pre-fetching the instructions after it before the
7145ca02815Sjsg  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
7155ca02815Sjsg  * BB_START/END instructions, so, even though we might pre-fetch the preamble
7165ca02815Sjsg  * of the next request before the memory has been flushed, we're guaranteed that
7175ca02815Sjsg  * we won't access the batch itself too early.
7185ca02815Sjsg  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
7195ca02815Sjsg  * so, if the current request is modifying an instruction in the next request on
7205ca02815Sjsg  * the same intel_context, we might pre-fetch and then execute the pre-update
7215ca02815Sjsg  * instruction. To avoid this, the users of self-modifying code should either
7225ca02815Sjsg  * disable the parser around the code emitting the memory writes, via a new flag
7235ca02815Sjsg  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
7245ca02815Sjsg  * the in-kernel use-cases we've opted to use a separate context, see
7255ca02815Sjsg  * reloc_gpu() as an example.
7265ca02815Sjsg  * All the above applies only to the instructions themselves. Non-inline data
7275ca02815Sjsg  * used by the instructions is not pre-fetched.
7285ca02815Sjsg  */
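/*
 * Illustrative sketch only (there is no such call site in this file): a
 * request patching instructions consumed by a later request on the same
 * context would bracket its writes with the pre-parser toggle used above,
 * e.g.
 *
 *	*cs++ = preparser_disable(true);
 *	... MI_STORE_DWORD_IMM writes into the target batch ...
 *	*cs++ = preparser_disable(false);
 *
 * whereas the in-kernel users emit such writes from a separate context
 * instead, see reloc_gpu().
 */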
7295ca02815Sjsg 
7305ca02815Sjsg static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
7315ca02815Sjsg {
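	/*
	 * Same busywait as emit_preempt_busywait() above, but using the
	 * 5-dword MI_SEMAPHORE_WAIT_TOKEN encoding, so the trailing zero
	 * dword replaces the alignment MI_NOOP of the Gen8 version.
	 */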
7325ca02815Sjsg 	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
7335ca02815Sjsg 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
7345ca02815Sjsg 		MI_SEMAPHORE_GLOBAL_GTT |
7355ca02815Sjsg 		MI_SEMAPHORE_POLL |
7365ca02815Sjsg 		MI_SEMAPHORE_SAD_EQ_SDD;
7375ca02815Sjsg 	*cs++ = 0;
7385ca02815Sjsg 	*cs++ = preempt_address(rq->engine);
7395ca02815Sjsg 	*cs++ = 0;
7405ca02815Sjsg 	*cs++ = 0;
7415ca02815Sjsg 
7425ca02815Sjsg 	return cs;
7435ca02815Sjsg }
7445ca02815Sjsg 
7451bb76ff1Sjsg /* Wa_14014475959:dg2 */
7461bb76ff1Sjsg #define CCS_SEMAPHORE_PPHWSP_OFFSET	0x540
7471bb76ff1Sjsg static u32 ccs_semaphore_offset(struct i915_request *rq)
7481bb76ff1Sjsg {
7491bb76ff1Sjsg 	return i915_ggtt_offset(rq->context->state) +
7501bb76ff1Sjsg 		(LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
7511bb76ff1Sjsg }
7521bb76ff1Sjsg 
7531bb76ff1Sjsg /* Wa_14014475959:dg2 */
7541bb76ff1Sjsg static u32 *ccs_emit_wa_busywait(struct i915_request *rq, u32 *cs)
7551bb76ff1Sjsg {
7561bb76ff1Sjsg 	int i;
7571bb76ff1Sjsg 
7581bb76ff1Sjsg 	*cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | MI_ATOMIC_CS_STALL |
7591bb76ff1Sjsg 		MI_ATOMIC_MOVE;
7601bb76ff1Sjsg 	*cs++ = ccs_semaphore_offset(rq);
7611bb76ff1Sjsg 	*cs++ = 0;
7621bb76ff1Sjsg 	*cs++ = 1;
7631bb76ff1Sjsg 
7641bb76ff1Sjsg 	/*
7651bb76ff1Sjsg 	 * When MI_ATOMIC_INLINE_DATA is set, this command must be 11 DW + (1 NOP)
7661bb76ff1Sjsg 	 * to align: 4 DWs above + 8 filler DWs here.
7671bb76ff1Sjsg 	 */
7681bb76ff1Sjsg 	for (i = 0; i < 8; ++i)
7691bb76ff1Sjsg 		*cs++ = 0;
7701bb76ff1Sjsg 
7711bb76ff1Sjsg 	*cs++ = MI_SEMAPHORE_WAIT |
7721bb76ff1Sjsg 		MI_SEMAPHORE_GLOBAL_GTT |
7731bb76ff1Sjsg 		MI_SEMAPHORE_POLL |
7741bb76ff1Sjsg 		MI_SEMAPHORE_SAD_EQ_SDD;
7751bb76ff1Sjsg 	*cs++ = 0;
7761bb76ff1Sjsg 	*cs++ = ccs_semaphore_offset(rq);
7771bb76ff1Sjsg 	*cs++ = 0;
7781bb76ff1Sjsg 
7791bb76ff1Sjsg 	return cs;
7801bb76ff1Sjsg }
7811bb76ff1Sjsg 
7825ca02815Sjsg static __always_inline u32*
7835ca02815Sjsg gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
7845ca02815Sjsg {
7855ca02815Sjsg 	*cs++ = MI_USER_INTERRUPT;
7865ca02815Sjsg 
7875ca02815Sjsg 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
7885ca02815Sjsg 	if (intel_engine_has_semaphores(rq->engine) &&
7895ca02815Sjsg 	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
7905ca02815Sjsg 		cs = gen12_emit_preempt_busywait(rq, cs);
7915ca02815Sjsg 
7921bb76ff1Sjsg 	/* Wa_14014475959:dg2 */
7931bb76ff1Sjsg 	if (intel_engine_uses_wa_hold_ccs_switchout(rq->engine))
7941bb76ff1Sjsg 		cs = ccs_emit_wa_busywait(rq, cs);
7951bb76ff1Sjsg 
7965ca02815Sjsg 	rq->tail = intel_ring_offset(rq, cs);
7975ca02815Sjsg 	assert_ring_tail_valid(rq->ring, rq->tail);
7985ca02815Sjsg 
7995ca02815Sjsg 	return gen8_emit_wa_tail(rq, cs);
8005ca02815Sjsg }
8015ca02815Sjsg 
8025ca02815Sjsg u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
8035ca02815Sjsg {
8045ca02815Sjsg 	/* XXX Stalling flush before seqno write; post-sync not */
8055ca02815Sjsg 	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
8065ca02815Sjsg 	return gen12_emit_fini_breadcrumb_tail(rq, cs);
8075ca02815Sjsg }
8085ca02815Sjsg 
8095ca02815Sjsg u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
8105ca02815Sjsg {
811f005ef32Sjsg 	struct drm_i915_private *i915 = rq->i915;
812596b6869Sjsg 	struct intel_gt *gt = rq->engine->gt;
8131bb76ff1Sjsg 	u32 flags = (PIPE_CONTROL_CS_STALL |
814f005ef32Sjsg 		     PIPE_CONTROL_TLB_INVALIDATE |
8155ca02815Sjsg 		     PIPE_CONTROL_TILE_CACHE_FLUSH |
8165ca02815Sjsg 		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
8175ca02815Sjsg 		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8185ca02815Sjsg 		     PIPE_CONTROL_DC_FLUSH_ENABLE |
8195ca02815Sjsg 		     PIPE_CONTROL_FLUSH_ENABLE);
8205ca02815Sjsg 
821*69bddb60Sjsg 	if (GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
822*69bddb60Sjsg 		flags |= PIPE_CONTROL_FLUSH_L3;
823*69bddb60Sjsg 
824f005ef32Sjsg 	/* Wa_14016712196 */
8256e5fdd49Sjsg 	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
826f005ef32Sjsg 		/* dummy PIPE_CONTROL + depth flush */
827f005ef32Sjsg 		cs = gen12_emit_pipe_control(cs, 0,
828f005ef32Sjsg 					     PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0);
829f005ef32Sjsg 
8301bb76ff1Sjsg 	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
8311bb76ff1Sjsg 		/* Wa_1409600907 */
8321bb76ff1Sjsg 		flags |= PIPE_CONTROL_DEPTH_STALL;
8331bb76ff1Sjsg 
834f005ef32Sjsg 	if (!HAS_3D_PIPELINE(rq->i915))
8351bb76ff1Sjsg 		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
8361bb76ff1Sjsg 	else if (rq->engine->class == COMPUTE_CLASS)
8371bb76ff1Sjsg 		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
8381bb76ff1Sjsg 
839f005ef32Sjsg 	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);
840f005ef32Sjsg 
841f005ef32Sjsg 	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
8421bb76ff1Sjsg 	cs = gen12_emit_ggtt_write_rcs(cs,
8431bb76ff1Sjsg 				       rq->fence.seqno,
8441bb76ff1Sjsg 				       hwsp_offset(rq),
845f005ef32Sjsg 				       0,
846f005ef32Sjsg 				       PIPE_CONTROL_FLUSH_ENABLE |
847f005ef32Sjsg 				       PIPE_CONTROL_CS_STALL);
8481bb76ff1Sjsg 
8495ca02815Sjsg 	return gen12_emit_fini_breadcrumb_tail(rq, cs);
8505ca02815Sjsg }
851