xref: /openbsd-src/sys/dev/pci/drm/i915/gt/intel_reset.c (revision ddf58b8f7628d6c9b1edf87465234638976d017b)
15ca02815Sjsg // SPDX-License-Identifier: MIT
2c349dbc7Sjsg /*
3c349dbc7Sjsg  * Copyright © 2008-2018 Intel Corporation
4c349dbc7Sjsg  */
5c349dbc7Sjsg 
6c349dbc7Sjsg #include <linux/sched/mm.h>
7c349dbc7Sjsg #include <linux/stop_machine.h>
81bb76ff1Sjsg #include <linux/string_helpers.h>
9c349dbc7Sjsg 
10f005ef32Sjsg #include "display/intel_display_reset.h"
11c349dbc7Sjsg #include "display/intel_overlay.h"
12c349dbc7Sjsg 
13c349dbc7Sjsg #include "gem/i915_gem_context.h"
14c349dbc7Sjsg 
151bb76ff1Sjsg #include "gt/intel_gt_regs.h"
161bb76ff1Sjsg 
17f005ef32Sjsg #include "gt/uc/intel_gsc_fw.h"
18f005ef32Sjsg 
19c349dbc7Sjsg #include "i915_drv.h"
201bb76ff1Sjsg #include "i915_file_private.h"
21c349dbc7Sjsg #include "i915_gpu_error.h"
22c349dbc7Sjsg #include "i915_irq.h"
23f005ef32Sjsg #include "i915_reg.h"
24ad8b1aafSjsg #include "intel_breadcrumbs.h"
25c349dbc7Sjsg #include "intel_engine_pm.h"
261bb76ff1Sjsg #include "intel_engine_regs.h"
27c349dbc7Sjsg #include "intel_gt.h"
28c349dbc7Sjsg #include "intel_gt_pm.h"
295ca02815Sjsg #include "intel_gt_requests.h"
301bb76ff1Sjsg #include "intel_mchbar_regs.h"
311bb76ff1Sjsg #include "intel_pci_config.h"
32c349dbc7Sjsg #include "intel_reset.h"
33c349dbc7Sjsg 
34c349dbc7Sjsg #include "uc/intel_guc.h"
35c349dbc7Sjsg 
36c349dbc7Sjsg #define RESET_MAX_RETRIES 3
37c349dbc7Sjsg 
38c349dbc7Sjsg static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
39c349dbc7Sjsg {
40c349dbc7Sjsg 	struct drm_i915_file_private *file_priv = ctx->file_priv;
41c349dbc7Sjsg 	unsigned long prev_hang;
42c349dbc7Sjsg 	unsigned int score;
43c349dbc7Sjsg 
44c349dbc7Sjsg 	if (IS_ERR_OR_NULL(file_priv))
45c349dbc7Sjsg 		return;
46c349dbc7Sjsg 
47c349dbc7Sjsg 	score = 0;
48c349dbc7Sjsg 	if (banned)
49c349dbc7Sjsg 		score = I915_CLIENT_SCORE_CONTEXT_BAN;
50c349dbc7Sjsg 
51c349dbc7Sjsg 	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
52c349dbc7Sjsg 	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
53c349dbc7Sjsg 		score += I915_CLIENT_SCORE_HANG_FAST;
54c349dbc7Sjsg 
55c349dbc7Sjsg 	if (score) {
56c349dbc7Sjsg 		atomic_add(score, &file_priv->ban_score);
57c349dbc7Sjsg 
58c349dbc7Sjsg 		drm_dbg(&ctx->i915->drm,
59c349dbc7Sjsg 			"client %s: gained %u ban score, now %u\n",
60c349dbc7Sjsg 			ctx->name, score,
61c349dbc7Sjsg 			atomic_read(&file_priv->ban_score));
62c349dbc7Sjsg 	}
63c349dbc7Sjsg }
64c349dbc7Sjsg 
65c349dbc7Sjsg static bool mark_guilty(struct i915_request *rq)
66c349dbc7Sjsg {
67c349dbc7Sjsg 	struct i915_gem_context *ctx;
68c349dbc7Sjsg 	unsigned long prev_hang;
69c349dbc7Sjsg 	bool banned;
70c349dbc7Sjsg 	int i;
71c349dbc7Sjsg 
725ca02815Sjsg 	if (intel_context_is_closed(rq->context))
73c349dbc7Sjsg 		return true;
74c349dbc7Sjsg 
75c349dbc7Sjsg 	rcu_read_lock();
76c349dbc7Sjsg 	ctx = rcu_dereference(rq->context->gem_context);
77c349dbc7Sjsg 	if (ctx && !kref_get_unless_zero(&ctx->ref))
78c349dbc7Sjsg 		ctx = NULL;
79c349dbc7Sjsg 	rcu_read_unlock();
80c349dbc7Sjsg 	if (!ctx)
81c349dbc7Sjsg 		return intel_context_is_banned(rq->context);
82c349dbc7Sjsg 
83c349dbc7Sjsg 	atomic_inc(&ctx->guilty_count);
84c349dbc7Sjsg 
85c349dbc7Sjsg 	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
86c349dbc7Sjsg 	if (!i915_gem_context_is_bannable(ctx)) {
87c349dbc7Sjsg 		banned = false;
88c349dbc7Sjsg 		goto out;
89c349dbc7Sjsg 	}
90c349dbc7Sjsg 
91ad8b1aafSjsg 	drm_notice(&ctx->i915->drm,
92c349dbc7Sjsg 		   "%s context reset due to GPU hang\n",
93c349dbc7Sjsg 		   ctx->name);
94c349dbc7Sjsg 
95c349dbc7Sjsg 	/* Record the timestamp for the last N hangs */
96c349dbc7Sjsg 	prev_hang = ctx->hang_timestamp[0];
97c349dbc7Sjsg 	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
98c349dbc7Sjsg 		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
99c349dbc7Sjsg 	ctx->hang_timestamp[i] = jiffies;
100c349dbc7Sjsg 
101c349dbc7Sjsg 	/* If we have hung N+1 times in rapid succession, we ban the context! */
102c349dbc7Sjsg 	banned = !i915_gem_context_is_recoverable(ctx);
103c349dbc7Sjsg 	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
104c349dbc7Sjsg 		banned = true;
1055ca02815Sjsg 	if (banned)
106c349dbc7Sjsg 		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
107c349dbc7Sjsg 			ctx->name, atomic_read(&ctx->guilty_count));
108c349dbc7Sjsg 
109c349dbc7Sjsg 	client_mark_guilty(ctx, banned);
110c349dbc7Sjsg 
111c349dbc7Sjsg out:
112c349dbc7Sjsg 	i915_gem_context_put(ctx);
113c349dbc7Sjsg 	return banned;
114c349dbc7Sjsg }
115c349dbc7Sjsg 
116c349dbc7Sjsg static void mark_innocent(struct i915_request *rq)
117c349dbc7Sjsg {
118c349dbc7Sjsg 	struct i915_gem_context *ctx;
119c349dbc7Sjsg 
120c349dbc7Sjsg 	rcu_read_lock();
121c349dbc7Sjsg 	ctx = rcu_dereference(rq->context->gem_context);
122c349dbc7Sjsg 	if (ctx)
123c349dbc7Sjsg 		atomic_inc(&ctx->active_count);
124c349dbc7Sjsg 	rcu_read_unlock();
125c349dbc7Sjsg }
126c349dbc7Sjsg 
127c349dbc7Sjsg void __i915_request_reset(struct i915_request *rq, bool guilty)
128c349dbc7Sjsg {
1295ca02815Sjsg 	bool banned = false;
130c349dbc7Sjsg 
1311bb76ff1Sjsg 	RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty));
1325ca02815Sjsg 	GEM_BUG_ON(__i915_request_is_complete(rq));
133c349dbc7Sjsg 
134c349dbc7Sjsg 	rcu_read_lock(); /* protect the GEM context */
135c349dbc7Sjsg 	if (guilty) {
136c349dbc7Sjsg 		i915_request_set_error_once(rq, -EIO);
137c349dbc7Sjsg 		__i915_request_skip(rq);
1385ca02815Sjsg 		banned = mark_guilty(rq);
139c349dbc7Sjsg 	} else {
140c349dbc7Sjsg 		i915_request_set_error_once(rq, -EAGAIN);
141c349dbc7Sjsg 		mark_innocent(rq);
142c349dbc7Sjsg 	}
143c349dbc7Sjsg 	rcu_read_unlock();
1445ca02815Sjsg 
1455ca02815Sjsg 	if (banned)
1465ca02815Sjsg 		intel_context_ban(rq->context, rq);
147c349dbc7Sjsg }
148c349dbc7Sjsg 
149c349dbc7Sjsg static bool i915_in_reset(struct pci_dev *pdev)
150c349dbc7Sjsg {
151c349dbc7Sjsg 	u8 gdrst;
152c349dbc7Sjsg 
153c349dbc7Sjsg 	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
154c349dbc7Sjsg 	return gdrst & GRDOM_RESET_STATUS;
155c349dbc7Sjsg }
156c349dbc7Sjsg 
157c349dbc7Sjsg static int i915_do_reset(struct intel_gt *gt,
158c349dbc7Sjsg 			 intel_engine_mask_t engine_mask,
159c349dbc7Sjsg 			 unsigned int retry)
160c349dbc7Sjsg {
161c349dbc7Sjsg 	struct pci_dev *pdev = gt->i915->drm.pdev;
162c349dbc7Sjsg 	int err;
163c349dbc7Sjsg 
164c349dbc7Sjsg 	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
165c349dbc7Sjsg 	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
166c349dbc7Sjsg 	udelay(50);
167c349dbc7Sjsg 	err = wait_for_atomic(i915_in_reset(pdev), 50);
168c349dbc7Sjsg 
169c349dbc7Sjsg 	/* Clear the reset request. */
170c349dbc7Sjsg 	pci_write_config_byte(pdev, I915_GDRST, 0);
171c349dbc7Sjsg 	udelay(50);
172c349dbc7Sjsg 	if (!err)
173c349dbc7Sjsg 		err = wait_for_atomic(!i915_in_reset(pdev), 50);
174c349dbc7Sjsg 
175c349dbc7Sjsg 	return err;
176c349dbc7Sjsg }
177c349dbc7Sjsg 
178c349dbc7Sjsg static bool g4x_reset_complete(struct pci_dev *pdev)
179c349dbc7Sjsg {
180c349dbc7Sjsg 	u8 gdrst;
181c349dbc7Sjsg 
182c349dbc7Sjsg 	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
183c349dbc7Sjsg 	return (gdrst & GRDOM_RESET_ENABLE) == 0;
184c349dbc7Sjsg }
185c349dbc7Sjsg 
186c349dbc7Sjsg static int g33_do_reset(struct intel_gt *gt,
187c349dbc7Sjsg 			intel_engine_mask_t engine_mask,
188c349dbc7Sjsg 			unsigned int retry)
189c349dbc7Sjsg {
190c349dbc7Sjsg 	struct pci_dev *pdev = gt->i915->drm.pdev;
191c349dbc7Sjsg 
192c349dbc7Sjsg 	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
193c349dbc7Sjsg 	return wait_for_atomic(g4x_reset_complete(pdev), 50);
194c349dbc7Sjsg }
195c349dbc7Sjsg 
196c349dbc7Sjsg static int g4x_do_reset(struct intel_gt *gt,
197c349dbc7Sjsg 			intel_engine_mask_t engine_mask,
198c349dbc7Sjsg 			unsigned int retry)
199c349dbc7Sjsg {
200c349dbc7Sjsg 	struct pci_dev *pdev = gt->i915->drm.pdev;
201c349dbc7Sjsg 	struct intel_uncore *uncore = gt->uncore;
202c349dbc7Sjsg 	int ret;
203c349dbc7Sjsg 
204c349dbc7Sjsg 	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
205f005ef32Sjsg 	intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, 0, VCP_UNIT_CLOCK_GATE_DISABLE);
206c349dbc7Sjsg 	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
207c349dbc7Sjsg 
208c349dbc7Sjsg 	pci_write_config_byte(pdev, I915_GDRST,
209c349dbc7Sjsg 			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
210c349dbc7Sjsg 	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
211c349dbc7Sjsg 	if (ret) {
2125ca02815Sjsg 		GT_TRACE(gt, "Wait for media reset failed\n");
213c349dbc7Sjsg 		goto out;
214c349dbc7Sjsg 	}
215c349dbc7Sjsg 
216c349dbc7Sjsg 	pci_write_config_byte(pdev, I915_GDRST,
217c349dbc7Sjsg 			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
218c349dbc7Sjsg 	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
219c349dbc7Sjsg 	if (ret) {
2205ca02815Sjsg 		GT_TRACE(gt, "Wait for render reset failed\n");
221c349dbc7Sjsg 		goto out;
222c349dbc7Sjsg 	}
223c349dbc7Sjsg 
224c349dbc7Sjsg out:
225c349dbc7Sjsg 	pci_write_config_byte(pdev, I915_GDRST, 0);
226c349dbc7Sjsg 
227f005ef32Sjsg 	intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE, 0);
228c349dbc7Sjsg 	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
229c349dbc7Sjsg 
230c349dbc7Sjsg 	return ret;
231c349dbc7Sjsg }
232c349dbc7Sjsg 
233c349dbc7Sjsg static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
234c349dbc7Sjsg 			unsigned int retry)
235c349dbc7Sjsg {
236c349dbc7Sjsg 	struct intel_uncore *uncore = gt->uncore;
237c349dbc7Sjsg 	int ret;
238c349dbc7Sjsg 
239c349dbc7Sjsg 	intel_uncore_write_fw(uncore, ILK_GDSR,
240c349dbc7Sjsg 			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
241c349dbc7Sjsg 	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
242c349dbc7Sjsg 					   ILK_GRDOM_RESET_ENABLE, 0,
243c349dbc7Sjsg 					   5000, 0,
244c349dbc7Sjsg 					   NULL);
245c349dbc7Sjsg 	if (ret) {
2465ca02815Sjsg 		GT_TRACE(gt, "Wait for render reset failed\n");
247c349dbc7Sjsg 		goto out;
248c349dbc7Sjsg 	}
249c349dbc7Sjsg 
250c349dbc7Sjsg 	intel_uncore_write_fw(uncore, ILK_GDSR,
251c349dbc7Sjsg 			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
252c349dbc7Sjsg 	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
253c349dbc7Sjsg 					   ILK_GRDOM_RESET_ENABLE, 0,
254c349dbc7Sjsg 					   5000, 0,
255c349dbc7Sjsg 					   NULL);
256c349dbc7Sjsg 	if (ret) {
2575ca02815Sjsg 		GT_TRACE(gt, "Wait for media reset failed\n");
258c349dbc7Sjsg 		goto out;
259c349dbc7Sjsg 	}
260c349dbc7Sjsg 
261c349dbc7Sjsg out:
262c349dbc7Sjsg 	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
263c349dbc7Sjsg 	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
264c349dbc7Sjsg 	return ret;
265c349dbc7Sjsg }
266c349dbc7Sjsg 
267c349dbc7Sjsg /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
268c349dbc7Sjsg static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
269c349dbc7Sjsg {
270c349dbc7Sjsg 	struct intel_uncore *uncore = gt->uncore;
271f005ef32Sjsg 	int loops;
272c349dbc7Sjsg 	int err;
273c349dbc7Sjsg 
274c349dbc7Sjsg 	/*
275f005ef32Sjsg 	 * On some platforms, e.g. Jasperlake, we see that the engine register
276f005ef32Sjsg 	 * state is not cleared until shortly after GDRST reports completion,
277f005ef32Sjsg 	 * causing a failure as we try to immediately resume while the internal
278f005ef32Sjsg 	 * state is still in flux. If we immediately repeat the reset, the
279f005ef32Sjsg 	 * second reset appears to serialise with the first, and since it is a
280f005ef32Sjsg 	 * no-op, the registers should retain their reset value. However, there
281f005ef32Sjsg 	 * is still a concern that upon leaving the second reset, the internal
282f005ef32Sjsg 	 * engine state is still in flux and not ready for resuming.
283f005ef32Sjsg 	 *
284f005ef32Sjsg 	 * Starting on MTL, there are some prep steps that we need to do when
285f005ef32Sjsg 	 * resetting some engines that need to be applied every time we write to
286f005ef32Sjsg 	 * GEN6_GDRST. As those are time consuming (tens of ms), we don't want
287f005ef32Sjsg 	 * to perform that twice, so, since the Jasperlake issue hasn't been
288f005ef32Sjsg 	 * observed on MTL, we avoid repeating the reset on newer platforms.
289f005ef32Sjsg 	 */
290f005ef32Sjsg 	loops = GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 70) ? 2 : 1;
291f005ef32Sjsg 
292f005ef32Sjsg 	/*
293c349dbc7Sjsg 	 * GEN6_GDRST is not in the gt power well, no need to check
294c349dbc7Sjsg 	 * for fifo space for the write or forcewake the chip for
295c349dbc7Sjsg 	 * the read
296c349dbc7Sjsg 	 */
297bd055151Sjsg 	do {
298c349dbc7Sjsg 		intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
299c349dbc7Sjsg 
300f005ef32Sjsg 		/* Wait for the device to ack the reset requests. */
301bd055151Sjsg 		err = __intel_wait_for_register_fw(uncore, GEN6_GDRST,
302bd055151Sjsg 						   hw_domain_mask, 0,
303bd055151Sjsg 						   2000, 0,
304c349dbc7Sjsg 						   NULL);
305bd055151Sjsg 	} while (err == 0 && --loops);
306c349dbc7Sjsg 	if (err)
3075ca02815Sjsg 		GT_TRACE(gt,
308c349dbc7Sjsg 			 "Wait for 0x%08x engines reset failed\n",
309c349dbc7Sjsg 			 hw_domain_mask);
310c349dbc7Sjsg 
311bd055151Sjsg 	/*
312bd055151Sjsg 	 * As we have observed that the engine state is still volatile
313bd055151Sjsg 	 * after GDRST is acked, impose a small delay to let everything settle.
314bd055151Sjsg 	 */
315bd055151Sjsg 	udelay(50);
316bd055151Sjsg 
317c349dbc7Sjsg 	return err;
318c349dbc7Sjsg }
319c349dbc7Sjsg 
3202e7c6ff7Sjsg static int __gen6_reset_engines(struct intel_gt *gt,
321c349dbc7Sjsg 				intel_engine_mask_t engine_mask,
322c349dbc7Sjsg 				unsigned int retry)
323c349dbc7Sjsg {
324c349dbc7Sjsg 	struct intel_engine_cs *engine;
325c349dbc7Sjsg 	u32 hw_mask;
326c349dbc7Sjsg 
327c349dbc7Sjsg 	if (engine_mask == ALL_ENGINES) {
328c349dbc7Sjsg 		hw_mask = GEN6_GRDOM_FULL;
329c349dbc7Sjsg 	} else {
330c349dbc7Sjsg 		intel_engine_mask_t tmp;
331c349dbc7Sjsg 
332c349dbc7Sjsg 		hw_mask = 0;
333c349dbc7Sjsg 		for_each_engine_masked(engine, gt, engine_mask, tmp) {
3341bb76ff1Sjsg 			hw_mask |= engine->reset_domain;
335c349dbc7Sjsg 		}
336c349dbc7Sjsg 	}
337c349dbc7Sjsg 
338c349dbc7Sjsg 	return gen6_hw_domain_reset(gt, hw_mask);
339c349dbc7Sjsg }
340c349dbc7Sjsg 
3412e7c6ff7Sjsg static int gen6_reset_engines(struct intel_gt *gt,
3422e7c6ff7Sjsg 			      intel_engine_mask_t engine_mask,
3432e7c6ff7Sjsg 			      unsigned int retry)
3442e7c6ff7Sjsg {
3452e7c6ff7Sjsg 	unsigned long flags;
3462e7c6ff7Sjsg 	int ret;
3472e7c6ff7Sjsg 
3482e7c6ff7Sjsg 	spin_lock_irqsave(&gt->uncore->lock, flags);
3492e7c6ff7Sjsg 	ret = __gen6_reset_engines(gt, engine_mask, retry);
3502e7c6ff7Sjsg 	spin_unlock_irqrestore(&gt->uncore->lock, flags);
3512e7c6ff7Sjsg 
3522e7c6ff7Sjsg 	return ret;
3532e7c6ff7Sjsg }
3542e7c6ff7Sjsg 
3555ca02815Sjsg static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
3565ca02815Sjsg {
3575ca02815Sjsg 	int vecs_id;
3585ca02815Sjsg 
3595ca02815Sjsg 	GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);
3605ca02815Sjsg 
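	/*
	 * instance / 2 maps VCS0/VCS1 to VECS0, VCS2/VCS3 to VECS1, and so
	 * on, i.e. it selects the VECS engine that shares an SFC with this
	 * video decode engine.
	 */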
3615ca02815Sjsg 	vecs_id = _VECS((engine->instance) / 2);
3625ca02815Sjsg 
3635ca02815Sjsg 	return engine->gt->engine[vecs_id];
3645ca02815Sjsg }
3655ca02815Sjsg 
3665ca02815Sjsg struct sfc_lock_data {
3675ca02815Sjsg 	i915_reg_t lock_reg;
3685ca02815Sjsg 	i915_reg_t ack_reg;
3695ca02815Sjsg 	i915_reg_t usage_reg;
3705ca02815Sjsg 	u32 lock_bit;
3715ca02815Sjsg 	u32 ack_bit;
3725ca02815Sjsg 	u32 usage_bit;
3735ca02815Sjsg 	u32 reset_bit;
3745ca02815Sjsg };
3755ca02815Sjsg 
3765ca02815Sjsg static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
3775ca02815Sjsg 				     struct sfc_lock_data *sfc_lock)
3785ca02815Sjsg {
3795ca02815Sjsg 	switch (engine->class) {
3805ca02815Sjsg 	default:
3815ca02815Sjsg 		MISSING_CASE(engine->class);
3825ca02815Sjsg 		fallthrough;
3835ca02815Sjsg 	case VIDEO_DECODE_CLASS:
3841bb76ff1Sjsg 		sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base);
3855ca02815Sjsg 		sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
3865ca02815Sjsg 
3871bb76ff1Sjsg 		sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
3885ca02815Sjsg 		sfc_lock->ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
3895ca02815Sjsg 
3901bb76ff1Sjsg 		sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
3915ca02815Sjsg 		sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
3925ca02815Sjsg 		sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
3935ca02815Sjsg 
3945ca02815Sjsg 		break;
3955ca02815Sjsg 	case VIDEO_ENHANCEMENT_CLASS:
3961bb76ff1Sjsg 		sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base);
3975ca02815Sjsg 		sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
3985ca02815Sjsg 
3991bb76ff1Sjsg 		sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base);
4005ca02815Sjsg 		sfc_lock->ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
4015ca02815Sjsg 
4021bb76ff1Sjsg 		sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base);
4035ca02815Sjsg 		sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
4045ca02815Sjsg 		sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
4055ca02815Sjsg 
4065ca02815Sjsg 		break;
4075ca02815Sjsg 	}
4085ca02815Sjsg }
4095ca02815Sjsg 
4105ca02815Sjsg static int gen11_lock_sfc(struct intel_engine_cs *engine,
4115ca02815Sjsg 			  u32 *reset_mask,
4125ca02815Sjsg 			  u32 *unlock_mask)
413c349dbc7Sjsg {
414c349dbc7Sjsg 	struct intel_uncore *uncore = engine->uncore;
415ad8b1aafSjsg 	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
4165ca02815Sjsg 	struct sfc_lock_data sfc_lock;
4175ca02815Sjsg 	bool lock_obtained, lock_to_other = false;
418c349dbc7Sjsg 	int ret;
419c349dbc7Sjsg 
420c349dbc7Sjsg 	switch (engine->class) {
421c349dbc7Sjsg 	case VIDEO_DECODE_CLASS:
422c349dbc7Sjsg 		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
423c349dbc7Sjsg 			return 0;
424c349dbc7Sjsg 
4255ca02815Sjsg 		fallthrough;
426c349dbc7Sjsg 	case VIDEO_ENHANCEMENT_CLASS:
4275ca02815Sjsg 		get_sfc_forced_lock_data(engine, &sfc_lock);
428c349dbc7Sjsg 
429c349dbc7Sjsg 		break;
430c349dbc7Sjsg 	default:
431c349dbc7Sjsg 		return 0;
432c349dbc7Sjsg 	}
433c349dbc7Sjsg 
4345ca02815Sjsg 	if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
4355ca02815Sjsg 		struct intel_engine_cs *paired_vecs;
4365ca02815Sjsg 
4375ca02815Sjsg 		if (engine->class != VIDEO_DECODE_CLASS ||
4385ca02815Sjsg 		    GRAPHICS_VER(engine->i915) != 12)
4395ca02815Sjsg 			return 0;
4405ca02815Sjsg 
441c349dbc7Sjsg 		/*
4425ca02815Sjsg 		 * Wa_14010733141
4435ca02815Sjsg 		 *
4445ca02815Sjsg 		 * If the VCS-MFX isn't using the SFC, we also need to check
4455ca02815Sjsg 		 * whether VCS-HCP is using it.  If so, we need to issue a *VE*
4465ca02815Sjsg 		 * forced lock on the VE engine that shares the same SFC.
4475ca02815Sjsg 		 */
4485ca02815Sjsg 		if (!(intel_uncore_read_fw(uncore,
4491bb76ff1Sjsg 					   GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) &
4505ca02815Sjsg 		      GEN12_HCP_SFC_USAGE_BIT))
4515ca02815Sjsg 			return 0;
4525ca02815Sjsg 
4535ca02815Sjsg 		paired_vecs = find_sfc_paired_vecs_engine(engine);
4545ca02815Sjsg 		get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
4555ca02815Sjsg 		lock_to_other = true;
4565ca02815Sjsg 		*unlock_mask |= paired_vecs->mask;
4575ca02815Sjsg 	} else {
4585ca02815Sjsg 		*unlock_mask |= engine->mask;
4595ca02815Sjsg 	}
4605ca02815Sjsg 
4615ca02815Sjsg 	/*
4625ca02815Sjsg 	 * If the engine is using an SFC, tell the engine that a software reset
463c349dbc7Sjsg 	 * is going to happen. The engine will then try to force lock the SFC.
464c349dbc7Sjsg 	 * If SFC ends up being locked to the engine we want to reset, we have
465c349dbc7Sjsg 	 * to reset it as well (we will unlock it once the reset sequence is
466c349dbc7Sjsg 	 * completed).
467c349dbc7Sjsg 	 */
468f005ef32Sjsg 	intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, 0, sfc_lock.lock_bit);
469c349dbc7Sjsg 
470c349dbc7Sjsg 	ret = __intel_wait_for_register_fw(uncore,
4715ca02815Sjsg 					   sfc_lock.ack_reg,
4725ca02815Sjsg 					   sfc_lock.ack_bit,
4735ca02815Sjsg 					   sfc_lock.ack_bit,
474c349dbc7Sjsg 					   1000, 0, NULL);
475c349dbc7Sjsg 
4765ca02815Sjsg 	/*
4775ca02815Sjsg 	 * Was the SFC released while we were trying to lock it?
4785ca02815Sjsg 	 *
4795ca02815Sjsg 	 * We should reset both the engine and the SFC if:
4805ca02815Sjsg 	 *  - We were locking the SFC to this engine and the lock succeeded
4815ca02815Sjsg 	 *       OR
4825ca02815Sjsg 	 *  - We were locking the SFC to a different engine (Wa_14010733141)
4835ca02815Sjsg 	 *    but the SFC was released before the lock was obtained.
4845ca02815Sjsg 	 *
4855ca02815Sjsg 	 * Otherwise we need only reset the engine by itself and we can
4865ca02815Sjsg 	 * leave the SFC alone.
4875ca02815Sjsg 	 */
4885ca02815Sjsg 	lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
4895ca02815Sjsg 			sfc_lock.usage_bit) != 0;
4905ca02815Sjsg 	if (lock_obtained == lock_to_other)
491c349dbc7Sjsg 		return 0;
492c349dbc7Sjsg 
493c349dbc7Sjsg 	if (ret) {
4945ca02815Sjsg 		ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
495c349dbc7Sjsg 		return ret;
496c349dbc7Sjsg 	}
497c349dbc7Sjsg 
4985ca02815Sjsg 	*reset_mask |= sfc_lock.reset_bit;
499c349dbc7Sjsg 	return 0;
500c349dbc7Sjsg }
501c349dbc7Sjsg 
502c349dbc7Sjsg static void gen11_unlock_sfc(struct intel_engine_cs *engine)
503c349dbc7Sjsg {
504c349dbc7Sjsg 	struct intel_uncore *uncore = engine->uncore;
505ad8b1aafSjsg 	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
5065ca02815Sjsg 	struct sfc_lock_data sfc_lock = {};
507c349dbc7Sjsg 
5085ca02815Sjsg 	if (engine->class != VIDEO_DECODE_CLASS &&
5095ca02815Sjsg 	    engine->class != VIDEO_ENHANCEMENT_CLASS)
510c349dbc7Sjsg 		return;
511c349dbc7Sjsg 
5125ca02815Sjsg 	if (engine->class == VIDEO_DECODE_CLASS &&
5135ca02815Sjsg 	    (BIT(engine->instance) & vdbox_sfc_access) == 0)
514c349dbc7Sjsg 		return;
515c349dbc7Sjsg 
5165ca02815Sjsg 	get_sfc_forced_lock_data(engine, &sfc_lock);
5175ca02815Sjsg 
518f005ef32Sjsg 	intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit, 0);
519c349dbc7Sjsg }
520c349dbc7Sjsg 
5212e7c6ff7Sjsg static int __gen11_reset_engines(struct intel_gt *gt,
522c349dbc7Sjsg 				 intel_engine_mask_t engine_mask,
523c349dbc7Sjsg 				 unsigned int retry)
524c349dbc7Sjsg {
525c349dbc7Sjsg 	struct intel_engine_cs *engine;
526c349dbc7Sjsg 	intel_engine_mask_t tmp;
5275ca02815Sjsg 	u32 reset_mask, unlock_mask = 0;
528c349dbc7Sjsg 	int ret;
529c349dbc7Sjsg 
530c349dbc7Sjsg 	if (engine_mask == ALL_ENGINES) {
5315ca02815Sjsg 		reset_mask = GEN11_GRDOM_FULL;
532c349dbc7Sjsg 	} else {
5335ca02815Sjsg 		reset_mask = 0;
534c349dbc7Sjsg 		for_each_engine_masked(engine, gt, engine_mask, tmp) {
5351bb76ff1Sjsg 			reset_mask |= engine->reset_domain;
5365ca02815Sjsg 			ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
537c349dbc7Sjsg 			if (ret)
538c349dbc7Sjsg 				goto sfc_unlock;
539c349dbc7Sjsg 		}
540c349dbc7Sjsg 	}
541c349dbc7Sjsg 
5425ca02815Sjsg 	ret = gen6_hw_domain_reset(gt, reset_mask);
543c349dbc7Sjsg 
544c349dbc7Sjsg sfc_unlock:
545c349dbc7Sjsg 	/*
546c349dbc7Sjsg 	 * We unlock the SFC based on the lock status and not the result of
547c349dbc7Sjsg 	 * gen11_lock_sfc to make sure that we clean up properly if something
548c349dbc7Sjsg 	 * went wrong during the lock (e.g. lock acquired after timeout
549c349dbc7Sjsg 	 * expiration).
5505ca02815Sjsg 	 *
5515ca02815Sjsg 	 * Due to Wa_14010733141, we may have locked an SFC to an engine that
5525ca02815Sjsg 	 * wasn't being reset.  So instead of calling gen11_unlock_sfc()
5535ca02815Sjsg 	 * on engine_mask, we instead call it on the mask of engines that our
5545ca02815Sjsg 	 * gen11_lock_sfc() calls told us actually had locks attempted.
555c349dbc7Sjsg 	 */
5565ca02815Sjsg 	for_each_engine_masked(engine, gt, unlock_mask, tmp)
557c349dbc7Sjsg 		gen11_unlock_sfc(engine);
558c349dbc7Sjsg 
559c349dbc7Sjsg 	return ret;
560c349dbc7Sjsg }
561c349dbc7Sjsg 
562c349dbc7Sjsg static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
563c349dbc7Sjsg {
564c349dbc7Sjsg 	struct intel_uncore *uncore = engine->uncore;
565c349dbc7Sjsg 	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
566c349dbc7Sjsg 	u32 request, mask, ack;
567c349dbc7Sjsg 	int ret;
568c349dbc7Sjsg 
5695ca02815Sjsg 	if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
5705ca02815Sjsg 		return -ETIMEDOUT;
5715ca02815Sjsg 
572c349dbc7Sjsg 	ack = intel_uncore_read_fw(uncore, reg);
573c349dbc7Sjsg 	if (ack & RESET_CTL_CAT_ERROR) {
574c349dbc7Sjsg 		/*
575c349dbc7Sjsg 		 * For catastrophic errors, ready-for-reset sequence
576c349dbc7Sjsg 		 * needs to be bypassed: HAS#396813
577c349dbc7Sjsg 		 */
578c349dbc7Sjsg 		request = RESET_CTL_CAT_ERROR;
579c349dbc7Sjsg 		mask = RESET_CTL_CAT_ERROR;
580c349dbc7Sjsg 
581c349dbc7Sjsg 		/* Catastrophic errors need to be cleared by HW */
582c349dbc7Sjsg 		ack = 0;
583c349dbc7Sjsg 	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
584c349dbc7Sjsg 		request = RESET_CTL_REQUEST_RESET;
585c349dbc7Sjsg 		mask = RESET_CTL_READY_TO_RESET;
586c349dbc7Sjsg 		ack = RESET_CTL_READY_TO_RESET;
587c349dbc7Sjsg 	} else {
588c349dbc7Sjsg 		return 0;
589c349dbc7Sjsg 	}
590c349dbc7Sjsg 
591c349dbc7Sjsg 	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
592c349dbc7Sjsg 	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
593c349dbc7Sjsg 					   700, 0, NULL);
594c349dbc7Sjsg 	if (ret)
595c349dbc7Sjsg 		drm_err(&engine->i915->drm,
596c349dbc7Sjsg 			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
597c349dbc7Sjsg 			engine->name, request,
598c349dbc7Sjsg 			intel_uncore_read_fw(uncore, reg));
599c349dbc7Sjsg 
600c349dbc7Sjsg 	return ret;
601c349dbc7Sjsg }
602c349dbc7Sjsg 
603c349dbc7Sjsg static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
604c349dbc7Sjsg {
605c349dbc7Sjsg 	intel_uncore_write_fw(engine->uncore,
606c349dbc7Sjsg 			      RING_RESET_CTL(engine->mmio_base),
607c349dbc7Sjsg 			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
608c349dbc7Sjsg }
609c349dbc7Sjsg 
610c349dbc7Sjsg static int gen8_reset_engines(struct intel_gt *gt,
611c349dbc7Sjsg 			      intel_engine_mask_t engine_mask,
612c349dbc7Sjsg 			      unsigned int retry)
613c349dbc7Sjsg {
614c349dbc7Sjsg 	struct intel_engine_cs *engine;
615c349dbc7Sjsg 	const bool reset_non_ready = retry >= 1;
616c349dbc7Sjsg 	intel_engine_mask_t tmp;
6172e7c6ff7Sjsg 	unsigned long flags;
618c349dbc7Sjsg 	int ret;
619c349dbc7Sjsg 
6202e7c6ff7Sjsg 	spin_lock_irqsave(&gt->uncore->lock, flags);
6212e7c6ff7Sjsg 
622c349dbc7Sjsg 	for_each_engine_masked(engine, gt, engine_mask, tmp) {
623c349dbc7Sjsg 		ret = gen8_engine_reset_prepare(engine);
624c349dbc7Sjsg 		if (ret && !reset_non_ready)
625c349dbc7Sjsg 			goto skip_reset;
626c349dbc7Sjsg 
627c349dbc7Sjsg 		/*
628c349dbc7Sjsg 		 * If this is not the first failed attempt to prepare,
629c349dbc7Sjsg 		 * we decide to proceed anyway.
630c349dbc7Sjsg 		 *
631c349dbc7Sjsg 		 * By doing so we risk context corruption and with
632c349dbc7Sjsg 		 * some gens (kbl), possible system hang if reset
633c349dbc7Sjsg 		 * happens during active bb execution.
634c349dbc7Sjsg 		 *
635c349dbc7Sjsg 		 * We would rather take context corruption than a
636c349dbc7Sjsg 		 * failed reset with a wedged driver/gpu. The active
637c349dbc7Sjsg 		 * bb execution case should be covered by the
638c349dbc7Sjsg 		 * stop_engines() we do before the reset.
639c349dbc7Sjsg 		 */
640c349dbc7Sjsg 	}
641c349dbc7Sjsg 
6420398c68bSjsg 	/*
6430398c68bSjsg 	 * Wa_22011100796:dg2, whenever a full soft reset is required,
6440398c68bSjsg 	 * reset all individual engines first, and then do a full soft reset.
6450398c68bSjsg 	 *
6460398c68bSjsg 	 * This is best effort, so ignore any error from the initial reset.
6470398c68bSjsg 	 */
6480398c68bSjsg 	if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES)
6492e7c6ff7Sjsg 		__gen11_reset_engines(gt, gt->info.engine_mask, 0);
6500398c68bSjsg 
6515ca02815Sjsg 	if (GRAPHICS_VER(gt->i915) >= 11)
6522e7c6ff7Sjsg 		ret = __gen11_reset_engines(gt, engine_mask, retry);
653c349dbc7Sjsg 	else
6542e7c6ff7Sjsg 		ret = __gen6_reset_engines(gt, engine_mask, retry);
655c349dbc7Sjsg 
656c349dbc7Sjsg skip_reset:
657c349dbc7Sjsg 	for_each_engine_masked(engine, gt, engine_mask, tmp)
658c349dbc7Sjsg 		gen8_engine_reset_cancel(engine);
659c349dbc7Sjsg 
6602e7c6ff7Sjsg 	spin_unlock_irqrestore(&gt->uncore->lock, flags);
6612e7c6ff7Sjsg 
662c349dbc7Sjsg 	return ret;
663c349dbc7Sjsg }
664c349dbc7Sjsg 
665c349dbc7Sjsg static int mock_reset(struct intel_gt *gt,
666c349dbc7Sjsg 		      intel_engine_mask_t mask,
667c349dbc7Sjsg 		      unsigned int retry)
668c349dbc7Sjsg {
669c349dbc7Sjsg 	return 0;
670c349dbc7Sjsg }
671c349dbc7Sjsg 
672c349dbc7Sjsg typedef int (*reset_func)(struct intel_gt *,
673c349dbc7Sjsg 			  intel_engine_mask_t engine_mask,
674c349dbc7Sjsg 			  unsigned int retry);
675c349dbc7Sjsg 
676c349dbc7Sjsg static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
677c349dbc7Sjsg {
678c349dbc7Sjsg 	struct drm_i915_private *i915 = gt->i915;
679c349dbc7Sjsg 
680c349dbc7Sjsg 	if (is_mock_gt(gt))
681c349dbc7Sjsg 		return mock_reset;
6825ca02815Sjsg 	else if (GRAPHICS_VER(i915) >= 8)
683c349dbc7Sjsg 		return gen8_reset_engines;
6845ca02815Sjsg 	else if (GRAPHICS_VER(i915) >= 6)
685c349dbc7Sjsg 		return gen6_reset_engines;
6865ca02815Sjsg 	else if (GRAPHICS_VER(i915) >= 5)
687c349dbc7Sjsg 		return ilk_do_reset;
688c349dbc7Sjsg 	else if (IS_G4X(i915))
689c349dbc7Sjsg 		return g4x_do_reset;
690c349dbc7Sjsg 	else if (IS_G33(i915) || IS_PINEVIEW(i915))
691c349dbc7Sjsg 		return g33_do_reset;
6925ca02815Sjsg 	else if (GRAPHICS_VER(i915) >= 3)
693c349dbc7Sjsg 		return i915_do_reset;
694c349dbc7Sjsg 	else
695c349dbc7Sjsg 		return NULL;
696c349dbc7Sjsg }
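
/*
 * A minimal sketch (kept under #if 0, not built) of how the reset_func
 * returned above gets dispatched; the real caller is __intel_gt_reset()
 * below, which additionally holds forcewake and retries on timeout.
 * try_reset_once() is a hypothetical helper, shown only for illustration.
 */
#if 0
static int try_reset_once(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	reset_func reset = intel_get_gpu_reset(gt);

	if (!reset)
		return -ENODEV;

	/* retry == 0: first attempt */
	return reset(gt, engine_mask, 0);
}
#endif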
697c349dbc7Sjsg 
698f005ef32Sjsg static int __reset_guc(struct intel_gt *gt)
699f005ef32Sjsg {
700f005ef32Sjsg 	u32 guc_domain =
701f005ef32Sjsg 		GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
702f005ef32Sjsg 
703f005ef32Sjsg 	return gen6_hw_domain_reset(gt, guc_domain);
704f005ef32Sjsg }
705f005ef32Sjsg 
706f005ef32Sjsg static bool needs_wa_14015076503(struct intel_gt *gt, intel_engine_mask_t engine_mask)
707f005ef32Sjsg {
708*ddf58b8fSjsg 	if (MEDIA_VER_FULL(gt->i915) != IP_VER(13, 0) || !HAS_ENGINE(gt, GSC0))
709f005ef32Sjsg 		return false;
710f005ef32Sjsg 
711f005ef32Sjsg 	if (!__HAS_ENGINE(engine_mask, GSC0))
712f005ef32Sjsg 		return false;
713f005ef32Sjsg 
714f005ef32Sjsg 	return intel_gsc_uc_fw_init_done(&gt->uc.gsc);
715f005ef32Sjsg }
716f005ef32Sjsg 
717f005ef32Sjsg static intel_engine_mask_t
718f005ef32Sjsg wa_14015076503_start(struct intel_gt *gt, intel_engine_mask_t engine_mask, bool first)
719f005ef32Sjsg {
720f005ef32Sjsg 	if (!needs_wa_14015076503(gt, engine_mask))
721f005ef32Sjsg 		return engine_mask;
722f005ef32Sjsg 
723f005ef32Sjsg 	/*
724f005ef32Sjsg 	 * wa_14015076503: if the GSC FW is loaded, we need to alert it that
725f005ef32Sjsg 	 * we're going to do a GSC engine reset and then wait for 200ms for the
726f005ef32Sjsg 	 * FW to get ready for it. However, if this is the first ALL_ENGINES
727f005ef32Sjsg 	 * reset attempt and the GSC is not busy, we can try to instead reset
728f005ef32Sjsg 	 * the GuC and all the other engines individually to avoid the 200ms
729f005ef32Sjsg 	 * wait.
730f005ef32Sjsg 	 * Skipping the GSC engine is safe because, differently from other
731f005ef32Sjsg 	 * engines, the GSCCS' only role is to forward commands to the GSC
732f005ef32Sjsg 	 * FW, so it doesn't have any HW outside of the CS itself and therefore
733f005ef32Sjsg 	 * has no state that we don't explicitly re-init on resume or on
734f005ef32Sjsg 	 * context switch (e.g. LRC or power context). The HW for the GSC uC is
735f005ef32Sjsg 	 * managed by the GSC FW, so we don't need to care about that.
736f005ef32Sjsg 	 */
737f005ef32Sjsg 	if (engine_mask == ALL_ENGINES && first && intel_engine_is_idle(gt->engine[GSC0])) {
738f005ef32Sjsg 		__reset_guc(gt);
739f005ef32Sjsg 		engine_mask = gt->info.engine_mask & ~BIT(GSC0);
740f005ef32Sjsg 	} else {
741f005ef32Sjsg 		intel_uncore_rmw(gt->uncore,
742f005ef32Sjsg 				 HECI_H_GS1(MTL_GSC_HECI2_BASE),
743f005ef32Sjsg 				 0, HECI_H_GS1_ER_PREP);
744f005ef32Sjsg 
745f005ef32Sjsg 		/* make sure the reset bit is clear when writing the CSR reg */
746f005ef32Sjsg 		intel_uncore_rmw(gt->uncore,
747f005ef32Sjsg 				 HECI_H_CSR(MTL_GSC_HECI2_BASE),
748f005ef32Sjsg 				 HECI_H_CSR_RST, HECI_H_CSR_IG);
749f005ef32Sjsg 		drm_msleep(200);
750f005ef32Sjsg 	}
751f005ef32Sjsg 
752f005ef32Sjsg 	return engine_mask;
753f005ef32Sjsg }
754f005ef32Sjsg 
755f005ef32Sjsg static void
756f005ef32Sjsg wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask)
757f005ef32Sjsg {
758f005ef32Sjsg 	if (!needs_wa_14015076503(gt, engine_mask))
759f005ef32Sjsg 		return;
760f005ef32Sjsg 
761f005ef32Sjsg 	intel_uncore_rmw(gt->uncore,
762f005ef32Sjsg 			 HECI_H_GS1(MTL_GSC_HECI2_BASE),
763f005ef32Sjsg 			 HECI_H_GS1_ER_PREP, 0);
764f005ef32Sjsg }
765f005ef32Sjsg 
766c349dbc7Sjsg int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
767c349dbc7Sjsg {
768c349dbc7Sjsg 	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
769c349dbc7Sjsg 	reset_func reset;
770c349dbc7Sjsg 	int ret = -ETIMEDOUT;
771c349dbc7Sjsg 	int retry;
772c349dbc7Sjsg 
773c349dbc7Sjsg 	reset = intel_get_gpu_reset(gt);
774c349dbc7Sjsg 	if (!reset)
775c349dbc7Sjsg 		return -ENODEV;
776c349dbc7Sjsg 
777c349dbc7Sjsg 	/*
778c349dbc7Sjsg 	 * If the power well sleeps during the reset, the reset
779c349dbc7Sjsg 	 * request may be dropped and never completes (causing -EIO).
780c349dbc7Sjsg 	 */
781c349dbc7Sjsg 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
782c349dbc7Sjsg 	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
783f005ef32Sjsg 		intel_engine_mask_t reset_mask;
784f005ef32Sjsg 
785f005ef32Sjsg 		reset_mask = wa_14015076503_start(gt, engine_mask, !retry);
786f005ef32Sjsg 
787f005ef32Sjsg 		GT_TRACE(gt, "engine_mask=%x\n", reset_mask);
788c349dbc7Sjsg 		preempt_disable();
789f005ef32Sjsg 		ret = reset(gt, reset_mask, retry);
790c349dbc7Sjsg 		preempt_enable();
791f005ef32Sjsg 
792f005ef32Sjsg 		wa_14015076503_end(gt, reset_mask);
793c349dbc7Sjsg 	}
794c349dbc7Sjsg 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
795c349dbc7Sjsg 
796c349dbc7Sjsg 	return ret;
797c349dbc7Sjsg }
798c349dbc7Sjsg 
799c349dbc7Sjsg bool intel_has_gpu_reset(const struct intel_gt *gt)
800c349dbc7Sjsg {
801ad8b1aafSjsg 	if (!gt->i915->params.reset)
802c349dbc7Sjsg 		return false;
803c349dbc7Sjsg 
804c349dbc7Sjsg 	return intel_get_gpu_reset(gt);
805c349dbc7Sjsg }
806c349dbc7Sjsg 
807c349dbc7Sjsg bool intel_has_reset_engine(const struct intel_gt *gt)
808c349dbc7Sjsg {
809ad8b1aafSjsg 	if (gt->i915->params.reset < 2)
810c349dbc7Sjsg 		return false;
811c349dbc7Sjsg 
812c349dbc7Sjsg 	return INTEL_INFO(gt->i915)->has_reset_engine;
813c349dbc7Sjsg }
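
/*
 * A simplified, hypothetical sketch (also under #if 0) of how the two
 * predicates above gate reset scope: prefer an engine-only reset when the
 * platform and the i915.reset modparam allow it, otherwise fall back to a
 * full-GT reset. The real policy is decided by the error handling path
 * (intel_gt_handle_error()); pick_reset_scope() is not a driver function.
 */
#if 0
static int pick_reset_scope(struct intel_engine_cs *engine)
{
	struct intel_gt *gt = engine->gt;

	if (intel_has_reset_engine(gt))
		return __intel_gt_reset(gt, engine->mask);

	if (intel_has_gpu_reset(gt))
		return __intel_gt_reset(gt, ALL_ENGINES);

	return -ENODEV;
}
#endif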
814c349dbc7Sjsg 
815c349dbc7Sjsg int intel_reset_guc(struct intel_gt *gt)
816c349dbc7Sjsg {
817c349dbc7Sjsg 	int ret;
818c349dbc7Sjsg 
819c349dbc7Sjsg 	GEM_BUG_ON(!HAS_GT_UC(gt->i915));
820c349dbc7Sjsg 
821c349dbc7Sjsg 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
822f005ef32Sjsg 	ret = __reset_guc(gt);
823c349dbc7Sjsg 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
824c349dbc7Sjsg 
825c349dbc7Sjsg 	return ret;
826c349dbc7Sjsg }
827c349dbc7Sjsg 
828c349dbc7Sjsg /*
829c349dbc7Sjsg  * Ensure the irq handler finishes, and is not run again.
830c349dbc7Sjsg  * Also return the active request so that we only search for it once.
831c349dbc7Sjsg  */
832c349dbc7Sjsg static void reset_prepare_engine(struct intel_engine_cs *engine)
833c349dbc7Sjsg {
834c349dbc7Sjsg 	/*
835c349dbc7Sjsg 	 * During the reset sequence, we must prevent the engine from
836c349dbc7Sjsg 	 * entering RC6. As the context state is undefined until we restart
837c349dbc7Sjsg 	 * the engine, if it does enter RC6 during the reset, the state
838c349dbc7Sjsg 	 * written to the powercontext is undefined and so we may lose
839c349dbc7Sjsg 	 * GPU state upon resume, i.e. fail to restart after a reset.
840c349dbc7Sjsg 	 */
841c349dbc7Sjsg 	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
842c349dbc7Sjsg 	if (engine->reset.prepare)
843c349dbc7Sjsg 		engine->reset.prepare(engine);
844c349dbc7Sjsg }
845c349dbc7Sjsg 
846c349dbc7Sjsg static void revoke_mmaps(struct intel_gt *gt)
847c349dbc7Sjsg {
848c349dbc7Sjsg 	int i;
849c349dbc7Sjsg 
850c349dbc7Sjsg 	for (i = 0; i < gt->ggtt->num_fences; i++) {
851c349dbc7Sjsg 		struct drm_vma_offset_node *node;
852c349dbc7Sjsg 		struct i915_vma *vma;
853c349dbc7Sjsg 		u64 vma_offset;
854c349dbc7Sjsg 
855c349dbc7Sjsg 		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
856c349dbc7Sjsg 		if (!vma)
857c349dbc7Sjsg 			continue;
858c349dbc7Sjsg 
859c349dbc7Sjsg 		if (!i915_vma_has_userfault(vma))
860c349dbc7Sjsg 			continue;
861c349dbc7Sjsg 
862c349dbc7Sjsg 		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
863c349dbc7Sjsg 
864c349dbc7Sjsg 		if (!vma->mmo)
865c349dbc7Sjsg 			continue;
866c349dbc7Sjsg 
867c349dbc7Sjsg 		node = &vma->mmo->vma_node;
8681bb76ff1Sjsg 		vma_offset = vma->gtt_view.partial.offset << PAGE_SHIFT;
869c349dbc7Sjsg 
870c349dbc7Sjsg #ifdef __linux__
871c349dbc7Sjsg 		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
872c349dbc7Sjsg 				    drm_vma_node_offset_addr(node) + vma_offset,
873c349dbc7Sjsg 				    vma->size,
874c349dbc7Sjsg 				    1);
875c349dbc7Sjsg #else
876c349dbc7Sjsg {
877c349dbc7Sjsg 		struct drm_i915_private *dev_priv = vma->obj->base.dev->dev_private;
878c349dbc7Sjsg 		struct vm_page *pg;
879c349dbc7Sjsg 
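		/*
		 * OpenBSD has no unmap_mapping_range(), so instead revoke the
		 * CPU mappings by removing access to every physical page
		 * backing this vma's GGTT range.
		 */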
880c349dbc7Sjsg 		for (pg = &dev_priv->pgs[atop(vma->node.start)];
881c349dbc7Sjsg 		     pg != &dev_priv->pgs[atop(vma->node.start + vma->size)];
882c349dbc7Sjsg 		     pg++)
883c349dbc7Sjsg 			pmap_page_protect(pg, PROT_NONE);
884c349dbc7Sjsg }
885c349dbc7Sjsg #endif
886c349dbc7Sjsg 	}
887c349dbc7Sjsg }
888c349dbc7Sjsg 
889c349dbc7Sjsg static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
890c349dbc7Sjsg {
891c349dbc7Sjsg 	struct intel_engine_cs *engine;
892c349dbc7Sjsg 	intel_engine_mask_t awake = 0;
893c349dbc7Sjsg 	enum intel_engine_id id;
894c349dbc7Sjsg 
8951bb76ff1Sjsg 	/* For GuC mode, ensure submission is disabled before stopping ring */
8961bb76ff1Sjsg 	intel_uc_reset_prepare(&gt->uc);
8971bb76ff1Sjsg 
898c349dbc7Sjsg 	for_each_engine(engine, gt, id) {
899c349dbc7Sjsg 		if (intel_engine_pm_get_if_awake(engine))
900c349dbc7Sjsg 			awake |= engine->mask;
901c349dbc7Sjsg 		reset_prepare_engine(engine);
902c349dbc7Sjsg 	}
903c349dbc7Sjsg 
904c349dbc7Sjsg 	return awake;
905c349dbc7Sjsg }
906c349dbc7Sjsg 
907c349dbc7Sjsg static void gt_revoke(struct intel_gt *gt)
908c349dbc7Sjsg {
909c349dbc7Sjsg 	revoke_mmaps(gt);
910c349dbc7Sjsg }
911c349dbc7Sjsg 
912c349dbc7Sjsg static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
913c349dbc7Sjsg {
914c349dbc7Sjsg 	struct intel_engine_cs *engine;
915c349dbc7Sjsg 	enum intel_engine_id id;
916c349dbc7Sjsg 	int err;
917c349dbc7Sjsg 
918c349dbc7Sjsg 	/*
919c349dbc7Sjsg 	 * Everything depends on having the GTT running, so we need to start
920c349dbc7Sjsg 	 * there.
921c349dbc7Sjsg 	 */
922c349dbc7Sjsg 	err = i915_ggtt_enable_hw(gt->i915);
923c349dbc7Sjsg 	if (err)
924c349dbc7Sjsg 		return err;
925c349dbc7Sjsg 
9265ca02815Sjsg 	local_bh_disable();
927c349dbc7Sjsg 	for_each_engine(engine, gt, id)
928c349dbc7Sjsg 		__intel_engine_reset(engine, stalled_mask & engine->mask);
9295ca02815Sjsg 	local_bh_enable();
9305ca02815Sjsg 
9311bb76ff1Sjsg 	intel_uc_reset(&gt->uc, ALL_ENGINES);
932c349dbc7Sjsg 
933ad8b1aafSjsg 	intel_ggtt_restore_fences(gt->ggtt);
934c349dbc7Sjsg 
935c349dbc7Sjsg 	return err;
936c349dbc7Sjsg }
937c349dbc7Sjsg 
938c349dbc7Sjsg static void reset_finish_engine(struct intel_engine_cs *engine)
939c349dbc7Sjsg {
940c349dbc7Sjsg 	if (engine->reset.finish)
941c349dbc7Sjsg 		engine->reset.finish(engine);
942c349dbc7Sjsg 	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
943c349dbc7Sjsg 
944c349dbc7Sjsg 	intel_engine_signal_breadcrumbs(engine);
945c349dbc7Sjsg }
946c349dbc7Sjsg 
947c349dbc7Sjsg static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
948c349dbc7Sjsg {
949c349dbc7Sjsg 	struct intel_engine_cs *engine;
950c349dbc7Sjsg 	enum intel_engine_id id;
951c349dbc7Sjsg 
952c349dbc7Sjsg 	for_each_engine(engine, gt, id) {
953c349dbc7Sjsg 		reset_finish_engine(engine);
954c349dbc7Sjsg 		if (awake & engine->mask)
955c349dbc7Sjsg 			intel_engine_pm_put(engine);
956c349dbc7Sjsg 	}
9575ca02815Sjsg 
9585ca02815Sjsg 	intel_uc_reset_finish(&gt->uc);
959c349dbc7Sjsg }
960c349dbc7Sjsg 
961c349dbc7Sjsg static void nop_submit_request(struct i915_request *request)
962c349dbc7Sjsg {
963c349dbc7Sjsg 	RQ_TRACE(request, "-EIO\n");
964c349dbc7Sjsg 
9655ca02815Sjsg 	request = i915_request_mark_eio(request);
9665ca02815Sjsg 	if (request) {
9675ca02815Sjsg 		i915_request_submit(request);
9685ca02815Sjsg 		intel_engine_signal_breadcrumbs(request->engine);
969c349dbc7Sjsg 
9705ca02815Sjsg 		i915_request_put(request);
9715ca02815Sjsg 	}
972c349dbc7Sjsg }
973c349dbc7Sjsg 
974c349dbc7Sjsg static void __intel_gt_set_wedged(struct intel_gt *gt)
975c349dbc7Sjsg {
976c349dbc7Sjsg 	struct intel_engine_cs *engine;
977c349dbc7Sjsg 	intel_engine_mask_t awake;
978c349dbc7Sjsg 	enum intel_engine_id id;
979c349dbc7Sjsg 
980c349dbc7Sjsg 	if (test_bit(I915_WEDGED, &gt->reset.flags))
981c349dbc7Sjsg 		return;
982c349dbc7Sjsg 
983c349dbc7Sjsg 	GT_TRACE(gt, "start\n");
984c349dbc7Sjsg 
985c349dbc7Sjsg 	/*
986c349dbc7Sjsg 	 * First, stop submission to hw, but do not yet complete requests by
987c349dbc7Sjsg 	 * rolling the global seqno forward (since this would complete requests
988c349dbc7Sjsg 	 * for which we haven't set the fence error to EIO yet).
989c349dbc7Sjsg 	 */
990c349dbc7Sjsg 	awake = reset_prepare(gt);
991c349dbc7Sjsg 
992c349dbc7Sjsg 	/* Even if the GPU reset fails, it should still stop the engines */
993c349dbc7Sjsg 	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
994c349dbc7Sjsg 		__intel_gt_reset(gt, ALL_ENGINES);
995c349dbc7Sjsg 
996c349dbc7Sjsg 	for_each_engine(engine, gt, id)
997c349dbc7Sjsg 		engine->submit_request = nop_submit_request;
998c349dbc7Sjsg 
999c349dbc7Sjsg 	/*
1000c349dbc7Sjsg 	 * Make sure no request can slip through without getting completed by
1001c349dbc7Sjsg 	 * either this call here to intel_engine_write_global_seqno, or the one
1002c349dbc7Sjsg 	 * in nop_submit_request.
1003c349dbc7Sjsg 	 */
1004c349dbc7Sjsg 	synchronize_rcu_expedited();
1005c349dbc7Sjsg 	set_bit(I915_WEDGED, &gt->reset.flags);
1006c349dbc7Sjsg 
1007c349dbc7Sjsg 	/* Mark all executing requests as skipped */
10085ca02815Sjsg 	local_bh_disable();
1009c349dbc7Sjsg 	for_each_engine(engine, gt, id)
1010c349dbc7Sjsg 		if (engine->reset.cancel)
1011c349dbc7Sjsg 			engine->reset.cancel(engine);
10125ca02815Sjsg 	intel_uc_cancel_requests(&gt->uc);
10135ca02815Sjsg 	local_bh_enable();
1014c349dbc7Sjsg 
1015c349dbc7Sjsg 	reset_finish(gt, awake);
1016c349dbc7Sjsg 
1017c349dbc7Sjsg 	GT_TRACE(gt, "end\n");
1018c349dbc7Sjsg }
1019c349dbc7Sjsg 
1020c349dbc7Sjsg void intel_gt_set_wedged(struct intel_gt *gt)
1021c349dbc7Sjsg {
1022c349dbc7Sjsg 	intel_wakeref_t wakeref;
1023c349dbc7Sjsg 
1024c349dbc7Sjsg 	if (test_bit(I915_WEDGED, &gt->reset.flags))
1025c349dbc7Sjsg 		return;
1026c349dbc7Sjsg 
1027c349dbc7Sjsg 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1028c349dbc7Sjsg 	mutex_lock(&gt->reset.mutex);
1029c349dbc7Sjsg 
1030c349dbc7Sjsg 	if (GEM_SHOW_DEBUG()) {
1031c349dbc7Sjsg 		struct drm_printer p = drm_debug_printer(__func__);
1032c349dbc7Sjsg 		struct intel_engine_cs *engine;
1033c349dbc7Sjsg 		enum intel_engine_id id;
1034c349dbc7Sjsg 
1035c349dbc7Sjsg 		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
1036c349dbc7Sjsg 		for_each_engine(engine, gt, id) {
1037c349dbc7Sjsg 			if (intel_engine_is_idle(engine))
1038c349dbc7Sjsg 				continue;
1039c349dbc7Sjsg 
1040c349dbc7Sjsg 			intel_engine_dump(engine, &p, "%s\n", engine->name);
1041c349dbc7Sjsg 		}
1042c349dbc7Sjsg 	}
1043c349dbc7Sjsg 
1044c349dbc7Sjsg 	__intel_gt_set_wedged(gt);
1045c349dbc7Sjsg 
1046c349dbc7Sjsg 	mutex_unlock(&gt->reset.mutex);
1047c349dbc7Sjsg 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1048c349dbc7Sjsg }
1049c349dbc7Sjsg 
1050c349dbc7Sjsg static bool __intel_gt_unset_wedged(struct intel_gt *gt)
1051c349dbc7Sjsg {
1052c349dbc7Sjsg 	struct intel_gt_timelines *timelines = &gt->timelines;
1053c349dbc7Sjsg 	struct intel_timeline *tl;
1054c349dbc7Sjsg 	bool ok;
1055c349dbc7Sjsg 
1056c349dbc7Sjsg 	if (!test_bit(I915_WEDGED, &gt->reset.flags))
1057c349dbc7Sjsg 		return true;
1058c349dbc7Sjsg 
1059c349dbc7Sjsg 	/* Never fully initialised, recovery impossible */
1060ad8b1aafSjsg 	if (intel_gt_has_unrecoverable_error(gt))
1061c349dbc7Sjsg 		return false;
1062c349dbc7Sjsg 
1063c349dbc7Sjsg 	GT_TRACE(gt, "start\n");
1064c349dbc7Sjsg 
1065c349dbc7Sjsg 	/*
1066c349dbc7Sjsg 	 * Before unwedging, make sure that all pending operations
1067c349dbc7Sjsg 	 * are flushed and errored out - we may have requests waiting upon
1068c349dbc7Sjsg 	 * third party fences. We marked all inflight requests as EIO, and
1069c349dbc7Sjsg 	 * every execbuf since returned EIO, for consistency we want all
1070c349dbc7Sjsg 	 * the currently pending requests to also be marked as EIO, which
1071c349dbc7Sjsg 	 * is done inside our nop_submit_request - and so we must wait.
1072c349dbc7Sjsg 	 *
1073c349dbc7Sjsg 	 * No more can be submitted until we reset the wedged bit.
1074c349dbc7Sjsg 	 */
1075c349dbc7Sjsg 	spin_lock(&timelines->lock);
1076c349dbc7Sjsg 	list_for_each_entry(tl, &timelines->active_list, link) {
1077c349dbc7Sjsg 		struct dma_fence *fence;
1078c349dbc7Sjsg 
1079c349dbc7Sjsg 		fence = i915_active_fence_get(&tl->last_request);
1080c349dbc7Sjsg 		if (!fence)
1081c349dbc7Sjsg 			continue;
1082c349dbc7Sjsg 
1083c349dbc7Sjsg 		spin_unlock(&timelines->lock);
1084c349dbc7Sjsg 
1085c349dbc7Sjsg 		/*
1086c349dbc7Sjsg 		 * All internal dependencies (i915_requests) will have
1087c349dbc7Sjsg 		 * been flushed by the set-wedge, but we may be stuck waiting
1088c349dbc7Sjsg 		 * for external fences. These should all be capped to 10s
1089c349dbc7Sjsg 		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
1090c349dbc7Sjsg 		 * in the worst case.
1091c349dbc7Sjsg 		 */
1092c349dbc7Sjsg 		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
1093c349dbc7Sjsg 		dma_fence_put(fence);
1094c349dbc7Sjsg 
1095c349dbc7Sjsg 		/* Restart iteration after dropping the lock */
1096c349dbc7Sjsg 		spin_lock(&timelines->lock);
1097c349dbc7Sjsg 		tl = list_entry(&timelines->active_list, typeof(*tl), link);
1098c349dbc7Sjsg 	}
1099c349dbc7Sjsg 	spin_unlock(&timelines->lock);
1100c349dbc7Sjsg 
1101c349dbc7Sjsg 	/* We must reset pending GPU events before restoring our submission */
1102c349dbc7Sjsg 	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
1103c349dbc7Sjsg 	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1104c349dbc7Sjsg 		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
1105c349dbc7Sjsg 	if (!ok) {
1106c349dbc7Sjsg 		/*
1107c349dbc7Sjsg 		 * Warn CI about the unrecoverable wedged condition.
1108c349dbc7Sjsg 		 * Time for a reboot.
1109c349dbc7Sjsg 		 */
1110ad8b1aafSjsg 		add_taint_for_CI(gt->i915, TAINT_WARN);
1111c349dbc7Sjsg 		return false;
1112c349dbc7Sjsg 	}
1113c349dbc7Sjsg 
1114c349dbc7Sjsg 	/*
1115c349dbc7Sjsg 	 * Undo nop_submit_request. We prevent all new i915 requests from
1116c349dbc7Sjsg 	 * being queued (by disallowing execbuf whilst wedged) so having
1117c349dbc7Sjsg 	 * waited for all active requests above, we know the system is idle
1118c349dbc7Sjsg 	 * and do not have to worry about a thread being inside
1119c349dbc7Sjsg 	 * engine->submit_request() as we swap over. So unlike installing
1120c349dbc7Sjsg 	 * the nop_submit_request on reset, we can do this from normal
1121c349dbc7Sjsg 	 * context and do not require stop_machine().
1122c349dbc7Sjsg 	 */
1123c349dbc7Sjsg 	intel_engines_reset_default_submission(gt);
1124c349dbc7Sjsg 
1125c349dbc7Sjsg 	GT_TRACE(gt, "end\n");
1126c349dbc7Sjsg 
1127c349dbc7Sjsg 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
1128c349dbc7Sjsg 	clear_bit(I915_WEDGED, &gt->reset.flags);
1129c349dbc7Sjsg 
1130c349dbc7Sjsg 	return true;
1131c349dbc7Sjsg }
1132c349dbc7Sjsg 
1133c349dbc7Sjsg bool intel_gt_unset_wedged(struct intel_gt *gt)
1134c349dbc7Sjsg {
1135c349dbc7Sjsg 	bool result;
1136c349dbc7Sjsg 
1137c349dbc7Sjsg 	mutex_lock(&gt->reset.mutex);
1138c349dbc7Sjsg 	result = __intel_gt_unset_wedged(gt);
1139c349dbc7Sjsg 	mutex_unlock(&gt->reset.mutex);
1140c349dbc7Sjsg 
1141c349dbc7Sjsg 	return result;
1142c349dbc7Sjsg }
1143c349dbc7Sjsg 
1144c349dbc7Sjsg static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
1145c349dbc7Sjsg {
1146c349dbc7Sjsg 	int err, i;
1147c349dbc7Sjsg 
1148c349dbc7Sjsg 	err = __intel_gt_reset(gt, ALL_ENGINES);
1149c349dbc7Sjsg 	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
1150c349dbc7Sjsg 		drm_msleep(10 * (i + 1));
1151c349dbc7Sjsg 		err = __intel_gt_reset(gt, ALL_ENGINES);
1152c349dbc7Sjsg 	}
1153c349dbc7Sjsg 	if (err)
1154c349dbc7Sjsg 		return err;
1155c349dbc7Sjsg 
1156c349dbc7Sjsg 	return gt_reset(gt, stalled_mask);
1157c349dbc7Sjsg }
1158c349dbc7Sjsg 
1159c349dbc7Sjsg static int resume(struct intel_gt *gt)
1160c349dbc7Sjsg {
1161c349dbc7Sjsg 	struct intel_engine_cs *engine;
1162c349dbc7Sjsg 	enum intel_engine_id id;
1163c349dbc7Sjsg 	int ret;
1164c349dbc7Sjsg 
1165c349dbc7Sjsg 	for_each_engine(engine, gt, id) {
1166c349dbc7Sjsg 		ret = intel_engine_resume(engine);
1167c349dbc7Sjsg 		if (ret)
1168c349dbc7Sjsg 			return ret;
1169c349dbc7Sjsg 	}
1170c349dbc7Sjsg 
1171c349dbc7Sjsg 	return 0;
1172c349dbc7Sjsg }
1173c349dbc7Sjsg 
1174c349dbc7Sjsg /**
1175c349dbc7Sjsg  * intel_gt_reset - reset chip after a hang
1176c349dbc7Sjsg  * @gt: #intel_gt to reset
1177c349dbc7Sjsg  * @stalled_mask: mask of the stalled engines with the guilty requests
1178c349dbc7Sjsg  * @reason: user error message for why we are resetting
1179c349dbc7Sjsg  *
1180c349dbc7Sjsg  * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
1181c349dbc7Sjsg  * on failure.
1182c349dbc7Sjsg  *
1183c349dbc7Sjsg  * Procedure is fairly simple:
1184c349dbc7Sjsg  *   - reset the chip using the reset reg
1185c349dbc7Sjsg  *   - re-init context state
1186c349dbc7Sjsg  *   - re-init hardware status page
1187c349dbc7Sjsg  *   - re-init ring buffer
1188c349dbc7Sjsg  *   - re-init interrupt state
1189c349dbc7Sjsg  *   - re-init display
1190c349dbc7Sjsg  */
1191c349dbc7Sjsg void intel_gt_reset(struct intel_gt *gt,
1192c349dbc7Sjsg 		    intel_engine_mask_t stalled_mask,
1193c349dbc7Sjsg 		    const char *reason)
1194c349dbc7Sjsg {
1195c349dbc7Sjsg 	intel_engine_mask_t awake;
1196c349dbc7Sjsg 	int ret;
1197c349dbc7Sjsg 
1198c349dbc7Sjsg 	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
1199c349dbc7Sjsg 
1200c349dbc7Sjsg 	might_sleep();
1201c349dbc7Sjsg 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
12025ca02815Sjsg 
12035ca02815Sjsg 	/*
12045ca02815Sjsg 	 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence
12055ca02815Sjsg 	 * critical section like gpu reset.
12065ca02815Sjsg 	 */
12075ca02815Sjsg 	gt_revoke(gt);
12085ca02815Sjsg 
1209c349dbc7Sjsg 	mutex_lock(&gt->reset.mutex);
1210c349dbc7Sjsg 
1211c349dbc7Sjsg 	/* Clear any previous failed attempts at recovery. Time to try again. */
1212c349dbc7Sjsg 	if (!__intel_gt_unset_wedged(gt))
1213c349dbc7Sjsg 		goto unlock;
1214c349dbc7Sjsg 
1215c349dbc7Sjsg 	if (reason)
1216ad8b1aafSjsg 		drm_notice(&gt->i915->drm,
1217c349dbc7Sjsg 			   "Resetting chip for %s\n", reason);
1218c349dbc7Sjsg 	atomic_inc(&gt->i915->gpu_error.reset_count);
1219c349dbc7Sjsg 
1220c349dbc7Sjsg 	awake = reset_prepare(gt);
1221c349dbc7Sjsg 
1222c349dbc7Sjsg 	if (!intel_has_gpu_reset(gt)) {
1223ad8b1aafSjsg 		if (gt->i915->params.reset)
1224ad8b1aafSjsg 			drm_err(&gt->i915->drm, "GPU reset not supported\n");
1225c349dbc7Sjsg 		else
1226c349dbc7Sjsg 			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
1227c349dbc7Sjsg 		goto error;
1228c349dbc7Sjsg 	}
1229c349dbc7Sjsg 
1230c349dbc7Sjsg 	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1231c349dbc7Sjsg 		intel_runtime_pm_disable_interrupts(gt->i915);
1232c349dbc7Sjsg 
1233c349dbc7Sjsg 	if (do_reset(gt, stalled_mask)) {
1234ad8b1aafSjsg 		drm_err(&gt->i915->drm, "Failed to reset chip\n");
1235c349dbc7Sjsg 		goto taint;
1236c349dbc7Sjsg 	}
1237c349dbc7Sjsg 
1238c349dbc7Sjsg 	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1239c349dbc7Sjsg 		intel_runtime_pm_enable_interrupts(gt->i915);
1240c349dbc7Sjsg 
1241c349dbc7Sjsg 	intel_overlay_reset(gt->i915);
1242c349dbc7Sjsg 
1243c349dbc7Sjsg 	/*
1244c349dbc7Sjsg 	 * Next we need to restore the context, but we don't use those
1245c349dbc7Sjsg 	 * yet either...
1246c349dbc7Sjsg 	 *
1247c349dbc7Sjsg 	 * Ring buffer needs to be re-initialized in the KMS case, or if X
1248c349dbc7Sjsg 	 * was running at the time of the reset (i.e. we weren't VT
1249c349dbc7Sjsg 	 * switched away).
1250c349dbc7Sjsg 	 */
1251c349dbc7Sjsg 	ret = intel_gt_init_hw(gt);
1252c349dbc7Sjsg 	if (ret) {
1253c349dbc7Sjsg 		drm_err(&gt->i915->drm,
1254c349dbc7Sjsg 			"Failed to initialise HW following reset (%d)\n",
1255c349dbc7Sjsg 			ret);
1256c349dbc7Sjsg 		goto taint;
1257c349dbc7Sjsg 	}
1258c349dbc7Sjsg 
1259c349dbc7Sjsg 	ret = resume(gt);
1260c349dbc7Sjsg 	if (ret)
1261c349dbc7Sjsg 		goto taint;
1262c349dbc7Sjsg 
1263c349dbc7Sjsg finish:
1264c349dbc7Sjsg 	reset_finish(gt, awake);
1265c349dbc7Sjsg unlock:
1266c349dbc7Sjsg 	mutex_unlock(&gt->reset.mutex);
1267c349dbc7Sjsg 	return;
1268c349dbc7Sjsg 
1269c349dbc7Sjsg taint:
1270c349dbc7Sjsg 	/*
1271c349dbc7Sjsg 	 * History tells us that if we cannot reset the GPU now, we
1272c349dbc7Sjsg 	 * never will. This then impacts everything that is run
1273c349dbc7Sjsg 	 * subsequently. On failing the reset, we mark the driver
1274c349dbc7Sjsg 	 * as wedged, preventing further execution on the GPU.
1275c349dbc7Sjsg 	 * We also want to go one step further and add a taint to the
1276c349dbc7Sjsg 	 * kernel so that any subsequent faults can be traced back to
1277c349dbc7Sjsg 	 * this failure. This is important for CI, where if the
1278c349dbc7Sjsg 	 * GPU/driver fails we would like to reboot and restart testing
1279c349dbc7Sjsg 	 * rather than continue on into oblivion. For everyone else,
1280c349dbc7Sjsg 	 * the system should still plod along, but they have been warned!
1281c349dbc7Sjsg 	 */
1282ad8b1aafSjsg 	add_taint_for_CI(gt->i915, TAINT_WARN);
1283c349dbc7Sjsg error:
1284c349dbc7Sjsg 	__intel_gt_set_wedged(gt);
1285c349dbc7Sjsg 	goto finish;
1286c349dbc7Sjsg }
1287c349dbc7Sjsg 
12885ca02815Sjsg static int intel_gt_reset_engine(struct intel_engine_cs *engine)
1289c349dbc7Sjsg {
1290c349dbc7Sjsg 	return __intel_gt_reset(engine->gt, engine->mask);
1291c349dbc7Sjsg }
1292c349dbc7Sjsg 
12935ca02815Sjsg int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
1294c349dbc7Sjsg {
1295c349dbc7Sjsg 	struct intel_gt *gt = engine->gt;
1296c349dbc7Sjsg 	int ret;
1297c349dbc7Sjsg 
1298c349dbc7Sjsg 	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
1299c349dbc7Sjsg 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1300c349dbc7Sjsg 
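	/* With GuC submission the GuC owns per-engine resets, not the i915. */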
13015ca02815Sjsg 	if (intel_engine_uses_guc(engine))
13025ca02815Sjsg 		return -ENODEV;
13035ca02815Sjsg 
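	/* A parked engine has nothing in flight, so there is nothing to reset. */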
1304c349dbc7Sjsg 	if (!intel_engine_pm_get_if_awake(engine))
1305c349dbc7Sjsg 		return 0;
1306c349dbc7Sjsg 
1307c349dbc7Sjsg 	reset_prepare_engine(engine);
1308c349dbc7Sjsg 
1309c349dbc7Sjsg 	if (msg)
1310ad8b1aafSjsg 		drm_notice(&engine->i915->drm,
1311c349dbc7Sjsg 			   "Resetting %s for %s\n", engine->name, msg);
1312f005ef32Sjsg 	i915_increase_reset_engine_count(&engine->i915->gpu_error, engine);
1313c349dbc7Sjsg 
1314c349dbc7Sjsg 	ret = intel_gt_reset_engine(engine);
1315c349dbc7Sjsg 	if (ret) {
1316c349dbc7Sjsg 		/* If we fail here, we expect to fallback to a global reset */
13175ca02815Sjsg 		ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
1318c349dbc7Sjsg 		goto out;
1319c349dbc7Sjsg 	}
1320c349dbc7Sjsg 
1321c349dbc7Sjsg 	/*
1322c349dbc7Sjsg 	 * The request that caused the hang is stuck on elsp; we know the
1323c349dbc7Sjsg 	 * active request and can drop it, adjusting the head to skip the
1324c349dbc7Sjsg 	 * offending request and resume the remaining requests in the queue.
1325c349dbc7Sjsg 	 */
1326c349dbc7Sjsg 	__intel_engine_reset(engine, true);
1327c349dbc7Sjsg 
1328c349dbc7Sjsg 	/*
1329c349dbc7Sjsg 	 * The engine and its registers (and workarounds in case of render)
1330c349dbc7Sjsg 	 * have been reset to their default values. Follow the init_ring
1331c349dbc7Sjsg 	 * process to program RING_MODE, HWSP and re-enable submission.
1332c349dbc7Sjsg 	 */
1333c349dbc7Sjsg 	ret = intel_engine_resume(engine);
1334c349dbc7Sjsg 
1335c349dbc7Sjsg out:
1336c349dbc7Sjsg 	intel_engine_cancel_stop_cs(engine);
1337c349dbc7Sjsg 	reset_finish_engine(engine);
1338c349dbc7Sjsg 	intel_engine_pm_put_async(engine);
1339c349dbc7Sjsg 	return ret;
1340c349dbc7Sjsg }
1341c349dbc7Sjsg 
13425ca02815Sjsg /**
13435ca02815Sjsg  * intel_engine_reset - reset GPU engine to recover from a hang
13445ca02815Sjsg  * @engine: engine to reset
13455ca02815Sjsg  * @msg: reason for GPU reset; or NULL for no drm_notice()
13465ca02815Sjsg  *
13475ca02815Sjsg  * Reset a specific GPU engine. Useful if a hang is detected.
13485ca02815Sjsg  * Returns zero on successful reset or otherwise an error code.
13495ca02815Sjsg  *
13505ca02815Sjsg  * Procedure is:
13515ca02815Sjsg  *  - identify the request that caused the hang and drop it
13525ca02815Sjsg  *  - reset engine (which will force the engine to idle)
13535ca02815Sjsg  *  - re-init/configure engine
13545ca02815Sjsg  */
13555ca02815Sjsg int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
13565ca02815Sjsg {
13575ca02815Sjsg 	int err;
13585ca02815Sjsg 
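	/* The _bh variant expects softirqs to be disabled around the reset. */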
13595ca02815Sjsg 	local_bh_disable();
13605ca02815Sjsg 	err = __intel_engine_reset_bh(engine, msg);
13615ca02815Sjsg 	local_bh_enable();
13625ca02815Sjsg 
13635ca02815Sjsg 	return err;
13645ca02815Sjsg }
13655ca02815Sjsg 
1366c349dbc7Sjsg static void intel_gt_reset_global(struct intel_gt *gt,
1367c349dbc7Sjsg 				  u32 engine_mask,
1368c349dbc7Sjsg 				  const char *reason)
1369c349dbc7Sjsg {
1370c349dbc7Sjsg #ifdef notyet
1371c349dbc7Sjsg 	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1372c349dbc7Sjsg 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1373c349dbc7Sjsg 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1374c349dbc7Sjsg 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1375c349dbc7Sjsg #endif
1376c349dbc7Sjsg 	struct intel_wedge_me w;
1377c349dbc7Sjsg 
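	/*
	 * XXX: kobj and the uevent strings above are only declared under
	 * "notyet"; the kobject_uevent_env() calls below are presumably
	 * stubbed out in this port.
	 */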
1378c349dbc7Sjsg 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1379c349dbc7Sjsg 
13805ca02815Sjsg 	GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
1381c349dbc7Sjsg 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1382c349dbc7Sjsg 
1383c349dbc7Sjsg 	/* Use a watchdog to ensure that our reset completes */
1384f005ef32Sjsg 	intel_wedge_on_timeout(&w, gt, 60 * HZ) {
1385f005ef32Sjsg 		intel_display_reset_prepare(gt->i915);
1386c349dbc7Sjsg 
1387c349dbc7Sjsg 		intel_gt_reset(gt, engine_mask, reason);
1388c349dbc7Sjsg 
1389f005ef32Sjsg 		intel_display_reset_finish(gt->i915);
1390c349dbc7Sjsg 	}
1391c349dbc7Sjsg 
1392c349dbc7Sjsg 	if (!test_bit(I915_WEDGED, &gt->reset.flags))
1393c349dbc7Sjsg 		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1394c349dbc7Sjsg }
1395c349dbc7Sjsg 
1396c349dbc7Sjsg /**
1397c349dbc7Sjsg  * intel_gt_handle_error - handle a gpu error
1398c349dbc7Sjsg  * @gt: the intel_gt
1399c349dbc7Sjsg  * @engine_mask: mask representing engines that are hung
1400c349dbc7Sjsg  * @flags: control flags
1401c349dbc7Sjsg  * @fmt: Error message format string
1402c349dbc7Sjsg  *
1403c349dbc7Sjsg  * Do some basic checking of register state at error time and
1404c349dbc7Sjsg  * dump it to the syslog.  Also call i915_capture_error_state() to make
1405c349dbc7Sjsg  * sure we get a record and make it available in debugfs.  Fire a uevent
1406c349dbc7Sjsg  * so userspace knows something bad happened (should trigger collection
1407c349dbc7Sjsg  * of a ring dump etc.).
1408c349dbc7Sjsg  */
1409c349dbc7Sjsg void intel_gt_handle_error(struct intel_gt *gt,
1410c349dbc7Sjsg 			   intel_engine_mask_t engine_mask,
1411c349dbc7Sjsg 			   unsigned long flags,
1412c349dbc7Sjsg 			   const char *fmt, ...)
1413c349dbc7Sjsg {
1414c349dbc7Sjsg 	struct intel_engine_cs *engine;
1415c349dbc7Sjsg 	intel_wakeref_t wakeref;
1416c349dbc7Sjsg 	intel_engine_mask_t tmp;
1417c349dbc7Sjsg 	char error_msg[80];
1418c349dbc7Sjsg 	char *msg = NULL;
1419c349dbc7Sjsg 
1420c349dbc7Sjsg 	if (fmt) {
1421c349dbc7Sjsg 		va_list args;
1422c349dbc7Sjsg 
1423c349dbc7Sjsg 		va_start(args, fmt);
1424c349dbc7Sjsg 		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1425c349dbc7Sjsg 		va_end(args);
1426c349dbc7Sjsg 
1427c349dbc7Sjsg 		msg = error_msg;
1428c349dbc7Sjsg 	}
1429c349dbc7Sjsg 
1430c349dbc7Sjsg 	/*
1431c349dbc7Sjsg 	 * In most cases it's guaranteed that we get here with an RPM
1432c349dbc7Sjsg 	 * reference held, for example because there is a pending GPU
1433c349dbc7Sjsg 	 * request that won't finish until the reset is done. This
1434c349dbc7Sjsg 	 * isn't the case at least when we get here by doing a
1435c349dbc7Sjsg 	 * simulated reset via debugfs, so get an RPM reference.
1436c349dbc7Sjsg 	 */
1437c349dbc7Sjsg 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1438c349dbc7Sjsg 
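	/* Only consider engines that are actually present on this GT. */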
1439ad8b1aafSjsg 	engine_mask &= gt->info.engine_mask;
1440c349dbc7Sjsg 
1441c349dbc7Sjsg 	if (flags & I915_ERROR_CAPTURE) {
14421bb76ff1Sjsg 		i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
1443c349dbc7Sjsg 		intel_gt_clear_error_registers(gt, engine_mask);
1444c349dbc7Sjsg 	}
1445c349dbc7Sjsg 
1446c349dbc7Sjsg 	/*
1447c349dbc7Sjsg 	 * Try engine reset when available. We fall back to full reset if
1448c349dbc7Sjsg 	 * the per-engine reset fails.
1449c349dbc7Sjsg 	 */
14505ca02815Sjsg 	if (!intel_uc_uses_guc_submission(&gt->uc) &&
14515ca02815Sjsg 	    intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
14525ca02815Sjsg 		local_bh_disable();
1453c349dbc7Sjsg 		for_each_engine_masked(engine, gt, engine_mask, tmp) {
1454c349dbc7Sjsg 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1455c349dbc7Sjsg 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1456c349dbc7Sjsg 					     &gt->reset.flags))
1457c349dbc7Sjsg 				continue;
1458c349dbc7Sjsg 
14595ca02815Sjsg 			if (__intel_engine_reset_bh(engine, msg) == 0)
1460c349dbc7Sjsg 				engine_mask &= ~engine->mask;
1461c349dbc7Sjsg 
1462c349dbc7Sjsg 			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1463c349dbc7Sjsg 					      &gt->reset.flags);
1464c349dbc7Sjsg 		}
14655ca02815Sjsg 		local_bh_enable();
1466c349dbc7Sjsg 	}
1467c349dbc7Sjsg 
1468c349dbc7Sjsg 	if (!engine_mask)
1469c349dbc7Sjsg 		goto out;
1470c349dbc7Sjsg 
1471c349dbc7Sjsg 	/* Full reset needs the mutex, stop any other user trying to do so. */
1472c349dbc7Sjsg 	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1473c349dbc7Sjsg 		wait_event(gt->reset.queue,
1474c349dbc7Sjsg 			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1475c349dbc7Sjsg 		goto out; /* piggy-back on the other reset */
1476c349dbc7Sjsg 	}
1477c349dbc7Sjsg 
1478c349dbc7Sjsg 	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
1479c349dbc7Sjsg 	synchronize_rcu_expedited();
1480c349dbc7Sjsg 
14811bb76ff1Sjsg 	/*
14821bb76ff1Sjsg 	 * Prevent any other reset-engine attempt. We don't do this for GuC
14831bb76ff1Sjsg 	 * submission, as the GuC owns the per-engine reset, not the i915.
14841bb76ff1Sjsg 	 */
14851bb76ff1Sjsg 	if (!intel_uc_uses_guc_submission(&gt->uc)) {
1486c349dbc7Sjsg 		for_each_engine(engine, gt, tmp) {
1487c349dbc7Sjsg 			while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1488c349dbc7Sjsg 						&gt->reset.flags))
1489c349dbc7Sjsg 				wait_on_bit(&gt->reset.flags,
1490c349dbc7Sjsg 					    I915_RESET_ENGINE + engine->id,
1491c349dbc7Sjsg 					    TASK_UNINTERRUPTIBLE);
1492c349dbc7Sjsg 		}
14931bb76ff1Sjsg 	}
14941bb76ff1Sjsg 
14951bb76ff1Sjsg 	/* Flush everyone using a resource about to be clobbered */
14961bb76ff1Sjsg 	synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1497c349dbc7Sjsg 
1498c349dbc7Sjsg 	intel_gt_reset_global(gt, engine_mask, msg);
1499c349dbc7Sjsg 
15001bb76ff1Sjsg 	if (!intel_uc_uses_guc_submission(&gt->uc)) {
1501c349dbc7Sjsg 		for_each_engine(engine, gt, tmp)
1502c349dbc7Sjsg 			clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1503c349dbc7Sjsg 					 &gt->reset.flags);
15041bb76ff1Sjsg 	}
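	/* Drop the global BACKOFF bit and wake any waiters on gt->reset.queue. */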
1505c349dbc7Sjsg 	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1506c349dbc7Sjsg 	smp_mb__after_atomic();
1507c349dbc7Sjsg 	wake_up_all(&gt->reset.queue);
1508c349dbc7Sjsg 
1509c349dbc7Sjsg out:
1510c349dbc7Sjsg 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1511c349dbc7Sjsg }
1512c349dbc7Sjsg 
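/*
 * Take a read-side reference on reset.backoff_srcu unless a full GPU reset
 * is pending.  With retry, sleep interruptibly until the reset backs off;
 * without it, fail immediately with -EBUSY.
 */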
1513f005ef32Sjsg static int _intel_gt_reset_lock(struct intel_gt *gt, int *srcu, bool retry)
1514c349dbc7Sjsg {
1515c349dbc7Sjsg 	might_lock(&gt->reset.backoff_srcu);
1516f005ef32Sjsg 	if (retry)
1517c349dbc7Sjsg 		might_sleep();
1518c349dbc7Sjsg 
1519c349dbc7Sjsg 	rcu_read_lock();
1520c349dbc7Sjsg 	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1521c349dbc7Sjsg 		rcu_read_unlock();
1522c349dbc7Sjsg 
1523f005ef32Sjsg 		if (!retry)
1524f005ef32Sjsg 			return -EBUSY;
1525f005ef32Sjsg 
1526c349dbc7Sjsg 		if (wait_event_interruptible(gt->reset.queue,
1527c349dbc7Sjsg 					     !test_bit(I915_RESET_BACKOFF,
1528c349dbc7Sjsg 						       &gt->reset.flags)))
1529c349dbc7Sjsg 			return -EINTR;
1530c349dbc7Sjsg 
1531c349dbc7Sjsg 		rcu_read_lock();
1532c349dbc7Sjsg 	}
1533c349dbc7Sjsg 	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1534c349dbc7Sjsg 	rcu_read_unlock();
1535c349dbc7Sjsg 
1536c349dbc7Sjsg 	return 0;
1537c349dbc7Sjsg }
1538c349dbc7Sjsg 
1539f005ef32Sjsg int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1540f005ef32Sjsg {
1541f005ef32Sjsg 	return _intel_gt_reset_lock(gt, srcu, false);
1542f005ef32Sjsg }
1543f005ef32Sjsg 
1544f005ef32Sjsg int intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu)
1545f005ef32Sjsg {
1546f005ef32Sjsg 	return _intel_gt_reset_lock(gt, srcu, true);
1547f005ef32Sjsg }
1548f005ef32Sjsg 
1549c349dbc7Sjsg void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1550c349dbc7Sjsg __releases(&gt->reset.backoff_srcu)
1551c349dbc7Sjsg {
1552c349dbc7Sjsg 	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1553c349dbc7Sjsg }
1554c349dbc7Sjsg 
1555c349dbc7Sjsg int intel_gt_terminally_wedged(struct intel_gt *gt)
1556c349dbc7Sjsg {
1557c349dbc7Sjsg 	might_sleep();
1558c349dbc7Sjsg 
1559c349dbc7Sjsg 	if (!intel_gt_is_wedged(gt))
1560c349dbc7Sjsg 		return 0;
1561c349dbc7Sjsg 
1562ad8b1aafSjsg 	if (intel_gt_has_unrecoverable_error(gt))
1563c349dbc7Sjsg 		return -EIO;
1564c349dbc7Sjsg 
1565c349dbc7Sjsg 	/* Reset still in progress? Maybe we will recover? */
1566c349dbc7Sjsg 	if (wait_event_interruptible(gt->reset.queue,
1567c349dbc7Sjsg 				     !test_bit(I915_RESET_BACKOFF,
1568c349dbc7Sjsg 					       &gt->reset.flags)))
1569c349dbc7Sjsg 		return -EINTR;
1570c349dbc7Sjsg 
1571c349dbc7Sjsg 	return intel_gt_is_wedged(gt) ? -EIO : 0;
1572c349dbc7Sjsg }
1573c349dbc7Sjsg 
1574c349dbc7Sjsg void intel_gt_set_wedged_on_init(struct intel_gt *gt)
1575c349dbc7Sjsg {
1576c349dbc7Sjsg 	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
1577c349dbc7Sjsg 		     I915_WEDGED_ON_INIT);
1578c349dbc7Sjsg 	intel_gt_set_wedged(gt);
15791bb76ff1Sjsg 	i915_disable_error_state(gt->i915, -ENODEV);
1580c349dbc7Sjsg 	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
1581ad8b1aafSjsg 
1582ad8b1aafSjsg 	/* Wedged on init is non-recoverable */
1583ad8b1aafSjsg 	add_taint_for_CI(gt->i915, TAINT_WARN);
1584ad8b1aafSjsg }
1585ad8b1aafSjsg 
1586ad8b1aafSjsg void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
1587ad8b1aafSjsg {
1588ad8b1aafSjsg 	intel_gt_set_wedged(gt);
15891bb76ff1Sjsg 	i915_disable_error_state(gt->i915, -ENODEV);
1590ad8b1aafSjsg 	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
15915ca02815Sjsg 	intel_gt_retire_requests(gt); /* cleanup any wedged requests */
1592c349dbc7Sjsg }
1593c349dbc7Sjsg 
1594c349dbc7Sjsg void intel_gt_init_reset(struct intel_gt *gt)
1595c349dbc7Sjsg {
1596c349dbc7Sjsg 	init_waitqueue_head(&gt->reset.queue);
1597c349dbc7Sjsg 	rw_init(&gt->reset.mutex, "gtres");
1598c349dbc7Sjsg 	init_srcu_struct(&gt->reset.backoff_srcu);
1599c349dbc7Sjsg 
16005ca02815Sjsg 	/*
16015ca02815Sjsg 	 * While undesirable to wait inside the shrinker, complain anyway.
16025ca02815Sjsg 	 *
16035ca02815Sjsg 	 * If we have to wait during shrinking, we guarantee forward progress
16045ca02815Sjsg 	 * by forcing the reset. Therefore during the reset we must not
16055ca02815Sjsg 	 * re-enter the shrinker. By declaring that we take the reset mutex
16065ca02815Sjsg 	 * within the shrinker, we forbid ourselves from performing any
16075ca02815Sjsg 	 * fs-reclaim or taking related locks during reset.
16085ca02815Sjsg 	 */
16095ca02815Sjsg 	i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);
16105ca02815Sjsg 
1611c349dbc7Sjsg 	/* no GPU until we are ready! */
1612c349dbc7Sjsg 	__set_bit(I915_WEDGED, &gt->reset.flags);
1613c349dbc7Sjsg }
1614c349dbc7Sjsg 
1615c349dbc7Sjsg void intel_gt_fini_reset(struct intel_gt *gt)
1616c349dbc7Sjsg {
1617c349dbc7Sjsg 	cleanup_srcu_struct(&gt->reset.backoff_srcu);
1618c349dbc7Sjsg }
1619c349dbc7Sjsg 
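/*
 * Watchdog armed by __intel_init_wedge(): if the section guarded by
 * intel_wedge_on_timeout() does not complete in time, give up and wedge
 * the whole GT.
 */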
1620c349dbc7Sjsg static void intel_wedge_me(struct work_struct *work)
1621c349dbc7Sjsg {
1622c349dbc7Sjsg 	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1623c349dbc7Sjsg 
1624ad8b1aafSjsg 	drm_err(&w->gt->i915->drm,
1625c349dbc7Sjsg 		"%s timed out, cancelling all in-flight rendering.\n",
1626c349dbc7Sjsg 		w->name);
1627c349dbc7Sjsg 	intel_gt_set_wedged(w->gt);
1628c349dbc7Sjsg }
1629c349dbc7Sjsg 
1630c349dbc7Sjsg void __intel_init_wedge(struct intel_wedge_me *w,
1631c349dbc7Sjsg 			struct intel_gt *gt,
1632c349dbc7Sjsg 			long timeout,
1633c349dbc7Sjsg 			const char *name)
1634c349dbc7Sjsg {
1635c349dbc7Sjsg 	w->gt = gt;
1636c349dbc7Sjsg 	w->name = name;
1637c349dbc7Sjsg 
1638c349dbc7Sjsg 	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1639f005ef32Sjsg 	queue_delayed_work(gt->i915->unordered_wq, &w->work, timeout);
1640c349dbc7Sjsg }
1641c349dbc7Sjsg 
1642c349dbc7Sjsg void __intel_fini_wedge(struct intel_wedge_me *w)
1643c349dbc7Sjsg {
1644c349dbc7Sjsg 	cancel_delayed_work_sync(&w->work);
1645c349dbc7Sjsg 	destroy_delayed_work_on_stack(&w->work);
1646c349dbc7Sjsg 	w->gt = NULL;
1647c349dbc7Sjsg }
1648c349dbc7Sjsg 
1649176435d3Sjsg /*
1650176435d3Sjsg  * Wa_22011802037 requires that we (or the GuC) ensure that no command
1651176435d3Sjsg  * streamers are executing MI_FORCE_WAKE while an engine reset is initiated.
1652176435d3Sjsg  */
1653176435d3Sjsg bool intel_engine_reset_needs_wa_22011802037(struct intel_gt *gt)
1654176435d3Sjsg {
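	/*
	 * Applies to graphics version 11 and 12; from 12.70 onwards only the
	 * A0 stepping still needs it.
	 */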
1655176435d3Sjsg 	if (GRAPHICS_VER(gt->i915) < 11)
1656176435d3Sjsg 		return false;
1657176435d3Sjsg 
1658596b6869Sjsg 	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0))
1659176435d3Sjsg 		return true;
1660176435d3Sjsg 
1661176435d3Sjsg 	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
1662176435d3Sjsg 		return false;
1663176435d3Sjsg 
1664176435d3Sjsg 	return true;
1665176435d3Sjsg }
1666176435d3Sjsg 
1667c349dbc7Sjsg #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1668c349dbc7Sjsg #include "selftest_reset.c"
1669c349dbc7Sjsg #include "selftest_hangcheck.c"
1670c349dbc7Sjsg #endif
1671