// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

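/*
 * Byte offset of the scratch dword in the hardware status page
 * (I915_GEM_HWS_SCRATCH is a dword index into the HWSP).
 */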
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

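	/*
	 * Second PIPE_CONTROL: the actual non-zero post-sync operation, a
	 * qword write to the scratch page, satisfying the two workarounds
	 * quoted above.
	 */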
	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

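	/*
	 * Batches run non-privileged by default; only a secure dispatch
	 * (I915_DISPATCH_SECURE) clears the non-secure bit so the batch
	 * executes with elevated privileges.
	 */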
	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

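	/*
	 * Same as gen6 above, but Haswell batches also carry the PPGTT
	 * address-space selection bit unless dispatched as secure.
	 */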
	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

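/*
 * Emit a CS-stall-only PIPE_CONTROL (stall at pixel scoreboard, no flushes),
 * used as the workaround spacer before an invalidating PIPE_CONTROL in
 * gen7_emit_flush_rcs() below.
 */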
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

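/*
 * GEN7_XCS_WA: number of repeated MI_STORE_DWORD_INDEX seqno writes padded
 * into the gen7 non-render breadcrumb below; a workaround, presumably to
 * make sure the seqno write has landed before the user interrupt fires.
 */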
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

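/*
 * Engine interrupts on gen6+ are masked at two levels: per engine via
 * RING_IMR and again at the GT level. Enabling unmasks the engine's bit in
 * both, while the irq_keep_mask bits are always left unmasked in RING_IMR.
 */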
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

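/*
 * On Haswell the VECS (video enhancement) engine's interrupts are handled
 * via the GT PM interrupt mask rather than the GT IMR, hence the
 * gen6_gt_pm_* helpers below.
 */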
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}