xref: /openbsd-src/sys/dev/pci/drm/i915/gt/selftest_hangcheck.c (revision f005ef32267c16bdb134f0e9fa4477dbe07c263a)
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

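/*
 * Reusable state for building a "hanging" batch: @obj backs a batch buffer
 * that spins forever by jumping back to its own start, @hws is a page of
 * per-context seqno slots used to observe when a request has actually
 * started executing, and @ctx is a non-bannable kernel context so that
 * repeated hangs do not get the test's context banned.
 */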
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 intel_gt_coherent_map_type(gt, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

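/*
 * Each dma-fence context owns one u32 slot in the HWS page (wrapping at
 * PAGE_SIZE); this returns the GPU address of that slot for @rq.
 */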
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return i915_vma_offset(hws) +
	       offset_in_page(sizeof(u32) * rq->fence.context);
}

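/*
 * Build a request whose batch stores the request's seqno into its HWS slot
 * and then, after a stretch of noops, loops back to its own start with
 * MI_BATCH_BUFFER_START. The batch therefore runs forever ("hangs") until
 * the CPU overwrites its first dword with MI_BATCH_BUFFER_END.
 */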
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, intel_gt_coherent_map_type(gt, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

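	/*
	 * Replace the previous batch object: each request gets a fresh page
	 * so that an earlier, possibly still spinning, batch is left intact.
	 */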
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = igt_vma_move_to_active_unlocked(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = igt_vma_move_to_active_unlocked(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
		*batch++ = upper_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, i915_vma_offset(vma), PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
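	/* Complete the spinning batch: its loop re-reads this first dword */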
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

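/*
 * Poll the request's HWS slot: first a short 10us busy-wait for the common
 * case, then sleep for up to 1s before concluding the batch never started.
 */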
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
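		/*
		 * Claim the per-engine reset bit so nothing else (e.g. the
		 * heartbeat) initiates a reset while the test drives one.
		 */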
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto skip;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

skip:
			if (rq)
				i915_request_put(rq);

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

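/*
 * Background load for the reset-vs-others tests: one kthread worker per
 * engine keeps a small ring of requests in flight while the engine under
 * test is repeatedly reset.
 */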
struct active_engine {
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
	bool stop;
	int result;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static void active_engine(struct kthread_work *work)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = container_of(work, typeof(*arg), work);
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			arg->result = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n",
			       engine->name, count, arg->result);
			while (count--)
				intel_context_put(ce[count]);
			return;
		}
	}

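	/*
	 * Keep up to ARRAY_SIZE(rq) requests in flight: each new submission
	 * replaces (and waits upon) the oldest slot in the ring.
	 */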
	count = 0;
	while (!READ_ONCE(arg->stop)) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err__)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	arg->result = err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	struct active_engine *threads;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	for_each_engine(engine, gt, id) {
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
		for_each_engine(other, gt, tmp) {
			struct kthread_worker *worker;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			worker = kthread_create_worker(0, "igt/%s",
						       other->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				pr_err("[%s] Worker create failed: %d!\n",
				       engine->name, err);
				goto unwind;
			}

			threads[tmp].worker = worker;

			kthread_init_work(&threads[tmp].work, active_engine);
			kthread_queue_work(threads[tmp].worker,
					   &threads[tmp].work);
		}

		st_engine_heartbeat_disable_no_pm(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].worker)
				continue;

			WRITE_ONCE(threads[tmp].stop, true);
			kthread_flush_work(&threads[tmp].work);
			ret = READ_ONCE(threads[tmp].result);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}

			kthread_destroy_worker(threads[tmp].worker);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}
	kfree(threads);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

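/* Simulate hangcheck firing: directly reset the given engines */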
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

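	/* The reset should have killed the hung request; a short wait suffices */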
	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

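/*
 * Helpers run from a separate task: each signals @completion once started
 * and then tries to evict or re-fence @vma, an operation that must block
 * on the hanging request until the reset recovers it.
 */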
1373c349dbc7Sjsg struct evict_vma {
1374c349dbc7Sjsg 	struct completion completion;
1375c349dbc7Sjsg 	struct i915_vma *vma;
1376c349dbc7Sjsg };
1377c349dbc7Sjsg 
evict_vma(void * data)1378c349dbc7Sjsg static int evict_vma(void *data)
1379c349dbc7Sjsg {
1380c349dbc7Sjsg 	struct evict_vma *arg = data;
1381c349dbc7Sjsg 	struct i915_address_space *vm = arg->vma->vm;
1382c349dbc7Sjsg 	struct drm_mm_node evict = arg->vma->node;
1383c349dbc7Sjsg 	int err;
1384c349dbc7Sjsg 
1385c349dbc7Sjsg 	complete(&arg->completion);
1386c349dbc7Sjsg 
1387c349dbc7Sjsg 	mutex_lock(&vm->mutex);
13881bb76ff1Sjsg 	err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
1389c349dbc7Sjsg 	mutex_unlock(&vm->mutex);
1390c349dbc7Sjsg 
1391c349dbc7Sjsg 	return err;
1392c349dbc7Sjsg }
1393c349dbc7Sjsg 
evict_fence(void * data)1394c349dbc7Sjsg static int evict_fence(void *data)
1395c349dbc7Sjsg {
1396c349dbc7Sjsg 	struct evict_vma *arg = data;
1397c349dbc7Sjsg 	int err;
1398c349dbc7Sjsg 
1399c349dbc7Sjsg 	complete(&arg->completion);
1400c349dbc7Sjsg 
1401c349dbc7Sjsg 	/* Mark the fence register as dirty to force the mmio update. */
1402c349dbc7Sjsg 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1403c349dbc7Sjsg 	if (err) {
1404c349dbc7Sjsg 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1405c349dbc7Sjsg 		return err;
1406c349dbc7Sjsg 	}
1407c349dbc7Sjsg 
1408c349dbc7Sjsg 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1409c349dbc7Sjsg 	if (err) {
1410c349dbc7Sjsg 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1411c349dbc7Sjsg 		return err;
1412c349dbc7Sjsg 	}
1413c349dbc7Sjsg 
1414c349dbc7Sjsg 	err = i915_vma_pin_fence(arg->vma);
1415c349dbc7Sjsg 	i915_vma_unpin(arg->vma);
1416c349dbc7Sjsg 	if (err) {
1417c349dbc7Sjsg 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1418c349dbc7Sjsg 		return err;
1419c349dbc7Sjsg 	}
1420c349dbc7Sjsg 
1421c349dbc7Sjsg 	i915_vma_unpin_fence(arg->vma);
1422c349dbc7Sjsg 
1423c349dbc7Sjsg 	return 0;
1424c349dbc7Sjsg }
1425c349dbc7Sjsg 
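/*
 * Common harness for the reset-vs-evict tests: submit a hanging batch with
 * the target vma marked active, spawn a kthread (fn) that blocks trying to
 * evict it, verify the kthread really is waiting on the request, then
 * trigger a reset and check that the thread gets unstuck in time.
 */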
1426c349dbc7Sjsg static int __igt_reset_evict_vma(struct intel_gt *gt,
1427c349dbc7Sjsg 				 struct i915_address_space *vm,
1428c349dbc7Sjsg 				 int (*fn)(void *),
1429c349dbc7Sjsg 				 unsigned int flags)
1430c349dbc7Sjsg {
14311bb76ff1Sjsg 	struct intel_engine_cs *engine;
1432c349dbc7Sjsg 	struct drm_i915_gem_object *obj;
1433c349dbc7Sjsg 	struct task_struct *tsk = NULL;
1434c349dbc7Sjsg 	struct i915_request *rq;
1435c349dbc7Sjsg 	struct evict_vma arg;
1436c349dbc7Sjsg 	struct hang h;
1437c349dbc7Sjsg 	unsigned int pin_flags;
1438c349dbc7Sjsg 	int err;
1439c349dbc7Sjsg 
1440c349dbc7Sjsg 	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1441c349dbc7Sjsg 		return 0;
1442c349dbc7Sjsg 
14431bb76ff1Sjsg 	engine = intel_selftest_find_any_engine(gt);
14441bb76ff1Sjsg 
1445c349dbc7Sjsg 	if (!engine || !intel_engine_can_store_dword(engine))
1446c349dbc7Sjsg 		return 0;
1447c349dbc7Sjsg 
1448c349dbc7Sjsg 	/* Check that we can recover an unbind stuck on a hanging request */
1449c349dbc7Sjsg 
1450c349dbc7Sjsg 	err = hang_init(&h, gt);
14515ca02815Sjsg 	if (err) {
14525ca02815Sjsg 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1453c349dbc7Sjsg 		return err;
14545ca02815Sjsg 	}
1455c349dbc7Sjsg 
1456c349dbc7Sjsg 	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1457c349dbc7Sjsg 	if (IS_ERR(obj)) {
1458c349dbc7Sjsg 		err = PTR_ERR(obj);
14595ca02815Sjsg 		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1460c349dbc7Sjsg 		goto fini;
1461c349dbc7Sjsg 	}
1462c349dbc7Sjsg 
1463c349dbc7Sjsg 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1464c349dbc7Sjsg 		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1465c349dbc7Sjsg 		if (err) {
1466c349dbc7Sjsg 			pr_err("Invalid X-tiling settings; err:%d\n", err);
1467c349dbc7Sjsg 			goto out_obj;
1468c349dbc7Sjsg 		}
1469c349dbc7Sjsg 	}
1470c349dbc7Sjsg 
1471c349dbc7Sjsg 	arg.vma = i915_vma_instance(obj, vm, NULL);
1472c349dbc7Sjsg 	if (IS_ERR(arg.vma)) {
1473c349dbc7Sjsg 		err = PTR_ERR(arg.vma);
14745ca02815Sjsg 		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1475c349dbc7Sjsg 		goto out_obj;
1476c349dbc7Sjsg 	}
1477c349dbc7Sjsg 
1478c349dbc7Sjsg 	rq = hang_create_request(&h, engine);
1479c349dbc7Sjsg 	if (IS_ERR(rq)) {
1480c349dbc7Sjsg 		err = PTR_ERR(rq);
14815ca02815Sjsg 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1482c349dbc7Sjsg 		goto out_obj;
1483c349dbc7Sjsg 	}
1484c349dbc7Sjsg 
1485c349dbc7Sjsg 	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1486c349dbc7Sjsg 
1487c349dbc7Sjsg 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1488c349dbc7Sjsg 		pin_flags |= PIN_MAPPABLE;
1489c349dbc7Sjsg 
1490c349dbc7Sjsg 	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1491c349dbc7Sjsg 	if (err) {
1492c349dbc7Sjsg 		i915_request_add(rq);
14935ca02815Sjsg 		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1494c349dbc7Sjsg 		goto out_obj;
1495c349dbc7Sjsg 	}
1496c349dbc7Sjsg 
1497c349dbc7Sjsg 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1498c349dbc7Sjsg 		err = i915_vma_pin_fence(arg.vma);
1499c349dbc7Sjsg 		if (err) {
1500c349dbc7Sjsg 			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1501c349dbc7Sjsg 			i915_vma_unpin(arg.vma);
1502c349dbc7Sjsg 			i915_request_add(rq);
1503c349dbc7Sjsg 			goto out_obj;
1504c349dbc7Sjsg 		}
1505c349dbc7Sjsg 	}
1506c349dbc7Sjsg 
1507*f005ef32Sjsg 	err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags);
15085ca02815Sjsg 	if (err)
15095ca02815Sjsg 		pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1510c349dbc7Sjsg 
1511c349dbc7Sjsg 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1512c349dbc7Sjsg 		i915_vma_unpin_fence(arg.vma);
1513c349dbc7Sjsg 	i915_vma_unpin(arg.vma);
1514c349dbc7Sjsg 
1515c349dbc7Sjsg 	i915_request_get(rq);
1516c349dbc7Sjsg 	i915_request_add(rq);
1517c349dbc7Sjsg 	if (err)
1518c349dbc7Sjsg 		goto out_rq;
1519c349dbc7Sjsg 
1520c349dbc7Sjsg 	if (!wait_until_running(&h, rq)) {
1521c349dbc7Sjsg 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1522c349dbc7Sjsg 
1523c349dbc7Sjsg 		pr_err("%s: Failed to start request %llx, at %x\n",
1524c349dbc7Sjsg 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1525c349dbc7Sjsg 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1526c349dbc7Sjsg 
1527c349dbc7Sjsg 		intel_gt_set_wedged(gt);
1528c349dbc7Sjsg 		goto out_reset;
1529c349dbc7Sjsg 	}
1530c349dbc7Sjsg 
1531c349dbc7Sjsg 	init_completion(&arg.completion);
1532c349dbc7Sjsg 
1533c349dbc7Sjsg 	tsk = kthread_run(fn, &arg, "igt/evict_vma");
1534c349dbc7Sjsg 	if (IS_ERR(tsk)) {
1535c349dbc7Sjsg 		err = PTR_ERR(tsk);
15365ca02815Sjsg 		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1537c349dbc7Sjsg 		tsk = NULL;
1538c349dbc7Sjsg 		goto out_reset;
1539c349dbc7Sjsg 	}
1540c349dbc7Sjsg 	get_task_struct(tsk);
1541c349dbc7Sjsg 
1542c349dbc7Sjsg 	wait_for_completion(&arg.completion);
1543c349dbc7Sjsg 
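	/*
	 * The eviction has to wait on our hanging request; give the kthread
	 * a few milliseconds to show up on the fence's callback list before
	 * declaring that it never waited at all.
	 */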
1544c349dbc7Sjsg 	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1545c349dbc7Sjsg 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1546c349dbc7Sjsg 
1547c349dbc7Sjsg 		pr_err("igt/evict_vma kthread did not wait\n");
1548c349dbc7Sjsg 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1549c349dbc7Sjsg 
1550c349dbc7Sjsg 		intel_gt_set_wedged(gt);
1551c349dbc7Sjsg 		goto out_reset;
1552c349dbc7Sjsg 	}
1553c349dbc7Sjsg 
1554c349dbc7Sjsg out_reset:
1555c349dbc7Sjsg 	igt_global_reset_lock(gt);
1556c349dbc7Sjsg 	fake_hangcheck(gt, rq->engine->mask);
1557c349dbc7Sjsg 	igt_global_reset_unlock(gt);
1558c349dbc7Sjsg 
1559c349dbc7Sjsg 	if (tsk) {
1560c349dbc7Sjsg 		struct intel_wedge_me w;
1561c349dbc7Sjsg 
1562c349dbc7Sjsg 		/* The reset, even indirectly, should take less than 10ms. */
1563c349dbc7Sjsg 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1564c349dbc7Sjsg 			err = kthread_stop(tsk);
1565c349dbc7Sjsg 
1566c349dbc7Sjsg 		put_task_struct(tsk);
1567c349dbc7Sjsg 	}
1568c349dbc7Sjsg 
1569c349dbc7Sjsg out_rq:
1570c349dbc7Sjsg 	i915_request_put(rq);
1571c349dbc7Sjsg out_obj:
1572c349dbc7Sjsg 	i915_gem_object_put(obj);
1573c349dbc7Sjsg fini:
1574c349dbc7Sjsg 	hang_fini(&h);
1575c349dbc7Sjsg 	if (intel_gt_is_wedged(gt))
1576c349dbc7Sjsg 		return -EIO;
1577c349dbc7Sjsg 
1578c349dbc7Sjsg 	return err;
1579c349dbc7Sjsg }
1580c349dbc7Sjsg 
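/*
 * Three flavours of the same check: eviction from the global GTT, from a
 * full ppGTT, and stealing a fence register, each stuck behind a hanging
 * request until the reset rescues it.
 */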
1581c349dbc7Sjsg static int igt_reset_evict_ggtt(void *arg)
1582c349dbc7Sjsg {
1583c349dbc7Sjsg 	struct intel_gt *gt = arg;
1584c349dbc7Sjsg 
1585c349dbc7Sjsg 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1586c349dbc7Sjsg 				     evict_vma, EXEC_OBJECT_WRITE);
1587c349dbc7Sjsg }
1588c349dbc7Sjsg 
1589c349dbc7Sjsg static int igt_reset_evict_ppgtt(void *arg)
1590c349dbc7Sjsg {
1591c349dbc7Sjsg 	struct intel_gt *gt = arg;
1592c349dbc7Sjsg 	struct i915_ppgtt *ppgtt;
1593c349dbc7Sjsg 	int err;
1594c349dbc7Sjsg 
1595c349dbc7Sjsg 	/* aliasing == global gtt locking, covered above */
1596c349dbc7Sjsg 	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1597c349dbc7Sjsg 		return 0;
1598c349dbc7Sjsg 
15991bb76ff1Sjsg 	ppgtt = i915_ppgtt_create(gt, 0);
1600c349dbc7Sjsg 	if (IS_ERR(ppgtt))
1601c349dbc7Sjsg 		return PTR_ERR(ppgtt);
1602c349dbc7Sjsg 
1603c349dbc7Sjsg 	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1604c349dbc7Sjsg 				    evict_vma, EXEC_OBJECT_WRITE);
1605c349dbc7Sjsg 	i915_vm_put(&ppgtt->vm);
1606c349dbc7Sjsg 
1607c349dbc7Sjsg 	return err;
1608c349dbc7Sjsg }
1609c349dbc7Sjsg 
1610c349dbc7Sjsg static int igt_reset_evict_fence(void *arg)
1611c349dbc7Sjsg {
1612c349dbc7Sjsg 	struct intel_gt *gt = arg;
1613c349dbc7Sjsg 
1614c349dbc7Sjsg 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1615c349dbc7Sjsg 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1616c349dbc7Sjsg }
1617c349dbc7Sjsg 
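/* Wait for every engine except @exclude to settle back to idle. */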
1618c349dbc7Sjsg static int wait_for_others(struct intel_gt *gt,
1619c349dbc7Sjsg 			   struct intel_engine_cs *exclude)
1620c349dbc7Sjsg {
1621c349dbc7Sjsg 	struct intel_engine_cs *engine;
1622c349dbc7Sjsg 	enum intel_engine_id id;
1623c349dbc7Sjsg 
1624c349dbc7Sjsg 	for_each_engine(engine, gt, id) {
1625c349dbc7Sjsg 		if (engine == exclude)
1626c349dbc7Sjsg 			continue;
1627c349dbc7Sjsg 
1628c349dbc7Sjsg 		if (!wait_for_idle(engine))
1629c349dbc7Sjsg 			return -EIO;
1630c349dbc7Sjsg 	}
1631c349dbc7Sjsg 
1632c349dbc7Sjsg 	return 0;
1633c349dbc7Sjsg }
1634c349dbc7Sjsg 
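/*
 * igt_reset_queue() keeps a hanging request on each engine and queues
 * further hangs behind it, checking after every reset that the guilty
 * request is singled out and the queued one is replayed intact.
 */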
1635c349dbc7Sjsg static int igt_reset_queue(void *arg)
1636c349dbc7Sjsg {
1637c349dbc7Sjsg 	struct intel_gt *gt = arg;
1638c349dbc7Sjsg 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1639c349dbc7Sjsg 	struct intel_engine_cs *engine;
1640c349dbc7Sjsg 	enum intel_engine_id id;
1641c349dbc7Sjsg 	struct hang h;
1642c349dbc7Sjsg 	int err;
1643c349dbc7Sjsg 
1644c349dbc7Sjsg 	/* Check that we replay pending requests following a hang */
1645c349dbc7Sjsg 
1646c349dbc7Sjsg 	igt_global_reset_lock(gt);
1647c349dbc7Sjsg 
1648c349dbc7Sjsg 	err = hang_init(&h, gt);
1649c349dbc7Sjsg 	if (err)
1650c349dbc7Sjsg 		goto unlock;
1651c349dbc7Sjsg 
1652c349dbc7Sjsg 	for_each_engine(engine, gt, id) {
16535ca02815Sjsg 		struct intel_selftest_saved_policy saved;
1654c349dbc7Sjsg 		struct i915_request *prev;
1655c349dbc7Sjsg 		IGT_TIMEOUT(end_time);
1656c349dbc7Sjsg 		unsigned int count;
16575ca02815Sjsg 		bool using_guc = intel_engine_uses_guc(engine);
1658c349dbc7Sjsg 
1659c349dbc7Sjsg 		if (!intel_engine_can_store_dword(engine))
1660c349dbc7Sjsg 			continue;
1661c349dbc7Sjsg 
16625ca02815Sjsg 		if (using_guc) {
16635ca02815Sjsg 			err = intel_selftest_modify_policy(engine, &saved,
16645ca02815Sjsg 							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
16655ca02815Sjsg 			if (err) {
16665ca02815Sjsg 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
16675ca02815Sjsg 				goto fini;
16685ca02815Sjsg 			}
16695ca02815Sjsg 		}
16705ca02815Sjsg 
1671c349dbc7Sjsg 		prev = hang_create_request(&h, engine);
1672c349dbc7Sjsg 		if (IS_ERR(prev)) {
1673c349dbc7Sjsg 			err = PTR_ERR(prev);
16745ca02815Sjsg 			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
16755ca02815Sjsg 			goto restore;
1676c349dbc7Sjsg 		}
1677c349dbc7Sjsg 
1678c349dbc7Sjsg 		i915_request_get(prev);
1679c349dbc7Sjsg 		i915_request_add(prev);
1680c349dbc7Sjsg 
1681c349dbc7Sjsg 		count = 0;
1682c349dbc7Sjsg 		do {
1683c349dbc7Sjsg 			struct i915_request *rq;
1684c349dbc7Sjsg 			unsigned int reset_count;
1685c349dbc7Sjsg 
1686c349dbc7Sjsg 			rq = hang_create_request(&h, engine);
1687c349dbc7Sjsg 			if (IS_ERR(rq)) {
1688c349dbc7Sjsg 				err = PTR_ERR(rq);
16895ca02815Sjsg 				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
16905ca02815Sjsg 				goto restore;
1691c349dbc7Sjsg 			}
1692c349dbc7Sjsg 
1693c349dbc7Sjsg 			i915_request_get(rq);
1694c349dbc7Sjsg 			i915_request_add(rq);
1695c349dbc7Sjsg 
1696c349dbc7Sjsg 			/*
1697c349dbc7Sjsg 			 * XXX We don't handle resetting the kernel context
1698c349dbc7Sjsg 			 * very well. If we trigger a device reset twice in
1699c349dbc7Sjsg 			 * quick succession while the kernel context is
1700c349dbc7Sjsg 			 * executing, we may end up skipping the breadcrumb.
1701c349dbc7Sjsg 			 * This is really only a problem for the selftest as
1702c349dbc7Sjsg 			 * normally there is a large interlude between resets
1703c349dbc7Sjsg 			 * (hangcheck), or we focus on resetting just one
1704c349dbc7Sjsg 			 * engine and so avoid repeatedly resetting innocents.
1705c349dbc7Sjsg 			 */
1706c349dbc7Sjsg 			err = wait_for_others(gt, engine);
1707c349dbc7Sjsg 			if (err) {
1708c349dbc7Sjsg 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1709c349dbc7Sjsg 				       __func__, engine->name);
1710c349dbc7Sjsg 				i915_request_put(rq);
1711c349dbc7Sjsg 				i915_request_put(prev);
1712c349dbc7Sjsg 
1713c349dbc7Sjsg 				GEM_TRACE_DUMP();
1714c349dbc7Sjsg 				intel_gt_set_wedged(gt);
17155ca02815Sjsg 				goto restore;
1716c349dbc7Sjsg 			}
1717c349dbc7Sjsg 
1718c349dbc7Sjsg 			if (!wait_until_running(&h, prev)) {
1719c349dbc7Sjsg 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1720c349dbc7Sjsg 
1721c349dbc7Sjsg 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1722c349dbc7Sjsg 				       __func__, engine->name,
1723c349dbc7Sjsg 				       prev->fence.seqno, hws_seqno(&h, prev));
1724c349dbc7Sjsg 				intel_engine_dump(engine, &p,
1725c349dbc7Sjsg 						  "%s\n", engine->name);
1726c349dbc7Sjsg 
1727c349dbc7Sjsg 				i915_request_put(rq);
1728c349dbc7Sjsg 				i915_request_put(prev);
1729c349dbc7Sjsg 
1730c349dbc7Sjsg 				intel_gt_set_wedged(gt);
1731c349dbc7Sjsg 
1732c349dbc7Sjsg 				err = -EIO;
17335ca02815Sjsg 				goto restore;
1734c349dbc7Sjsg 			}
1735c349dbc7Sjsg 
1736c349dbc7Sjsg 			reset_count = fake_hangcheck(gt, BIT(id));
1737c349dbc7Sjsg 
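			/*
			 * Only the hanging 'prev' should be marked guilty
			 * (-EIO); the innocent queued 'rq' must come through
			 * the reset with its fence error untouched.
			 */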
1738c349dbc7Sjsg 			if (prev->fence.error != -EIO) {
1739c349dbc7Sjsg 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1740c349dbc7Sjsg 				       prev->fence.error);
1741c349dbc7Sjsg 				i915_request_put(rq);
1742c349dbc7Sjsg 				i915_request_put(prev);
1743c349dbc7Sjsg 				err = -EINVAL;
17445ca02815Sjsg 				goto restore;
1745c349dbc7Sjsg 			}
1746c349dbc7Sjsg 
1747c349dbc7Sjsg 			if (rq->fence.error) {
1748c349dbc7Sjsg 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1749c349dbc7Sjsg 				       rq->fence.error);
1750c349dbc7Sjsg 				i915_request_put(rq);
1751c349dbc7Sjsg 				i915_request_put(prev);
1752c349dbc7Sjsg 				err = -EINVAL;
17535ca02815Sjsg 				goto restore;
1754c349dbc7Sjsg 			}
1755c349dbc7Sjsg 
1756c349dbc7Sjsg 			if (i915_reset_count(global) == reset_count) {
1757c349dbc7Sjsg 				pr_err("No GPU reset recorded!\n");
1758c349dbc7Sjsg 				i915_request_put(rq);
1759c349dbc7Sjsg 				i915_request_put(prev);
1760c349dbc7Sjsg 				err = -EINVAL;
17615ca02815Sjsg 				goto restore;
1762c349dbc7Sjsg 			}
1763c349dbc7Sjsg 
1764c349dbc7Sjsg 			i915_request_put(prev);
1765c349dbc7Sjsg 			prev = rq;
1766c349dbc7Sjsg 			count++;
1767c349dbc7Sjsg 		} while (time_before(jiffies, end_time));
17685ca02815Sjsg 		pr_info("%s: Completed %d queued resets\n",
17695ca02815Sjsg 			engine->name, count);
1770c349dbc7Sjsg 
1771c349dbc7Sjsg 		*h.batch = MI_BATCH_BUFFER_END;
1772c349dbc7Sjsg 		intel_gt_chipset_flush(engine->gt);
1773c349dbc7Sjsg 
1774c349dbc7Sjsg 		i915_request_put(prev);
1775c349dbc7Sjsg 
17765ca02815Sjsg restore:
17775ca02815Sjsg 		if (using_guc) {
17785ca02815Sjsg 			int err2 = intel_selftest_restore_policy(engine, &saved);
17795ca02815Sjsg 
17805ca02815Sjsg 			if (err2)
17815ca02815Sjsg 				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
17825ca02815Sjsg 				       __func__, __LINE__, engine->name, err2);
17835ca02815Sjsg 			if (err == 0)
17845ca02815Sjsg 				err = err2;
17855ca02815Sjsg 		}
1786c349dbc7Sjsg 		if (err)
17875ca02815Sjsg 			goto fini;
17885ca02815Sjsg 
17895ca02815Sjsg 		err = igt_flush_test(gt->i915);
17905ca02815Sjsg 		if (err) {
17915ca02815Sjsg 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1792c349dbc7Sjsg 			break;
1793c349dbc7Sjsg 		}
17945ca02815Sjsg 	}
1795c349dbc7Sjsg 
1796c349dbc7Sjsg fini:
1797c349dbc7Sjsg 	hang_fini(&h);
1798c349dbc7Sjsg unlock:
1799c349dbc7Sjsg 	igt_global_reset_unlock(gt);
1800c349dbc7Sjsg 
1801c349dbc7Sjsg 	if (intel_gt_is_wedged(gt))
1802c349dbc7Sjsg 		return -EIO;
1803c349dbc7Sjsg 
1804c349dbc7Sjsg 	return err;
1805c349dbc7Sjsg }
1806c349dbc7Sjsg 
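/*
 * igt_handle_error() drives the top-level error handler against a hanging
 * request and checks that the guilty request is flagged with -EIO, with
 * error-state capture temporarily suppressed for the duration.
 */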
1807c349dbc7Sjsg static int igt_handle_error(void *arg)
1808c349dbc7Sjsg {
1809c349dbc7Sjsg 	struct intel_gt *gt = arg;
1810c349dbc7Sjsg 	struct i915_gpu_error *global = &gt->i915->gpu_error;
18111bb76ff1Sjsg 	struct intel_engine_cs *engine;
1812c349dbc7Sjsg 	struct hang h;
1813c349dbc7Sjsg 	struct i915_request *rq;
1814c349dbc7Sjsg 	struct i915_gpu_coredump *error;
1815c349dbc7Sjsg 	int err;
1816c349dbc7Sjsg 
18171bb76ff1Sjsg 	engine = intel_selftest_find_any_engine(gt);
18181bb76ff1Sjsg 
1819c349dbc7Sjsg 	/* Check that we can issue a global GPU and engine reset */
1820c349dbc7Sjsg 
1821c349dbc7Sjsg 	if (!intel_has_reset_engine(gt))
1822c349dbc7Sjsg 		return 0;
1823c349dbc7Sjsg 
1824c349dbc7Sjsg 	if (!engine || !intel_engine_can_store_dword(engine))
1825c349dbc7Sjsg 		return 0;
1826c349dbc7Sjsg 
1827c349dbc7Sjsg 	err = hang_init(&h, gt);
18285ca02815Sjsg 	if (err) {
18295ca02815Sjsg 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1830c349dbc7Sjsg 		return err;
18315ca02815Sjsg 	}
1832c349dbc7Sjsg 
1833c349dbc7Sjsg 	rq = hang_create_request(&h, engine);
1834c349dbc7Sjsg 	if (IS_ERR(rq)) {
1835c349dbc7Sjsg 		err = PTR_ERR(rq);
18365ca02815Sjsg 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1837c349dbc7Sjsg 		goto err_fini;
1838c349dbc7Sjsg 	}
1839c349dbc7Sjsg 
1840c349dbc7Sjsg 	i915_request_get(rq);
1841c349dbc7Sjsg 	i915_request_add(rq);
1842c349dbc7Sjsg 
1843c349dbc7Sjsg 	if (!wait_until_running(&h, rq)) {
1844c349dbc7Sjsg 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1845c349dbc7Sjsg 
1846c349dbc7Sjsg 		pr_err("%s: Failed to start request %llx, at %x\n",
1847c349dbc7Sjsg 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1848c349dbc7Sjsg 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1849c349dbc7Sjsg 
1850c349dbc7Sjsg 		intel_gt_set_wedged(gt);
1851c349dbc7Sjsg 
1852c349dbc7Sjsg 		err = -EIO;
1853c349dbc7Sjsg 		goto err_request;
1854c349dbc7Sjsg 	}
1855c349dbc7Sjsg 
1856c349dbc7Sjsg 	/* Temporarily disable error capture */
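	/* (first_error now non-NULL, so any capture during the reset is dropped) */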
1857c349dbc7Sjsg 	error = xchg(&global->first_error, (void *)-1);
1858c349dbc7Sjsg 
1859c349dbc7Sjsg 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1860c349dbc7Sjsg 
1861c349dbc7Sjsg 	xchg(&global->first_error, error);
1862c349dbc7Sjsg 
1863c349dbc7Sjsg 	if (rq->fence.error != -EIO) {
1864c349dbc7Sjsg 		pr_err("Guilty request not identified!\n");
1865c349dbc7Sjsg 		err = -EINVAL;
1866c349dbc7Sjsg 		goto err_request;
1867c349dbc7Sjsg 	}
1868c349dbc7Sjsg 
1869c349dbc7Sjsg err_request:
1870c349dbc7Sjsg 	i915_request_put(rq);
1871c349dbc7Sjsg err_fini:
1872c349dbc7Sjsg 	hang_fini(&h);
1873c349dbc7Sjsg 	return err;
1874c349dbc7Sjsg }
1875c349dbc7Sjsg 
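/*
 * Perform an engine reset from inside one of the igt_atomic_phases
 * critical sections, with the engine's submission tasklet disabled
 * around the attempt so it cannot run concurrently.
 */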
1876c349dbc7Sjsg static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1877c349dbc7Sjsg 				     const struct igt_atomic_section *p,
1878c349dbc7Sjsg 				     const char *mode)
1879c349dbc7Sjsg {
18805ca02815Sjsg 	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1881c349dbc7Sjsg 	int err;
1882c349dbc7Sjsg 
1883c349dbc7Sjsg 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1884c349dbc7Sjsg 		  engine->name, mode, p->name);
1885c349dbc7Sjsg 
18865ca02815Sjsg 	if (t->func)
1887c349dbc7Sjsg 		tasklet_disable(t);
18885ca02815Sjsg 	if (strcmp(p->name, "softirq"))
18895ca02815Sjsg 		local_bh_disable();
1890c349dbc7Sjsg 	p->critical_section_begin();
1891c349dbc7Sjsg 
18925ca02815Sjsg 	err = __intel_engine_reset_bh(engine, NULL);
1893c349dbc7Sjsg 
1894c349dbc7Sjsg 	p->critical_section_end();
18955ca02815Sjsg 	if (strcmp(p->name, "softirq"))
18965ca02815Sjsg 		local_bh_enable();
18975ca02815Sjsg 	if (t->func) {
1898c349dbc7Sjsg 		tasklet_enable(t);
18995ca02815Sjsg 		tasklet_hi_schedule(t);
19005ca02815Sjsg 	}
1901c349dbc7Sjsg 
1902c349dbc7Sjsg 	if (err)
1903c349dbc7Sjsg 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1904c349dbc7Sjsg 		       engine->name, mode, p->name);
1905c349dbc7Sjsg 
1906c349dbc7Sjsg 	return err;
1907c349dbc7Sjsg }
1908c349dbc7Sjsg 
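/*
 * Exercise the atomic reset first on an idle engine, then again with a
 * hanging request actually executing, finishing with a bounded wait to
 * make sure the reset took effect.
 */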
1909c349dbc7Sjsg static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1910c349dbc7Sjsg 				   const struct igt_atomic_section *p)
1911c349dbc7Sjsg {
1912c349dbc7Sjsg 	struct i915_request *rq;
1913c349dbc7Sjsg 	struct hang h;
1914c349dbc7Sjsg 	int err;
1915c349dbc7Sjsg 
1916c349dbc7Sjsg 	err = __igt_atomic_reset_engine(engine, p, "idle");
1917c349dbc7Sjsg 	if (err)
1918c349dbc7Sjsg 		return err;
1919c349dbc7Sjsg 
1920c349dbc7Sjsg 	err = hang_init(&h, engine->gt);
19215ca02815Sjsg 	if (err) {
19225ca02815Sjsg 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1923c349dbc7Sjsg 		return err;
19245ca02815Sjsg 	}
1925c349dbc7Sjsg 
1926c349dbc7Sjsg 	rq = hang_create_request(&h, engine);
1927c349dbc7Sjsg 	if (IS_ERR(rq)) {
1928c349dbc7Sjsg 		err = PTR_ERR(rq);
19295ca02815Sjsg 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1930c349dbc7Sjsg 		goto out;
1931c349dbc7Sjsg 	}
1932c349dbc7Sjsg 
1933c349dbc7Sjsg 	i915_request_get(rq);
1934c349dbc7Sjsg 	i915_request_add(rq);
1935c349dbc7Sjsg 
1936c349dbc7Sjsg 	if (wait_until_running(&h, rq)) {
1937c349dbc7Sjsg 		err = __igt_atomic_reset_engine(engine, p, "active");
1938c349dbc7Sjsg 	} else {
1939c349dbc7Sjsg 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1940c349dbc7Sjsg 		       __func__, engine->name,
1941c349dbc7Sjsg 		       rq->fence.seqno, hws_seqno(&h, rq));
1942c349dbc7Sjsg 		intel_gt_set_wedged(engine->gt);
1943c349dbc7Sjsg 		err = -EIO;
1944c349dbc7Sjsg 	}
1945c349dbc7Sjsg 
1946c349dbc7Sjsg 	if (err == 0) {
1947c349dbc7Sjsg 		struct intel_wedge_me w;
1948c349dbc7Sjsg 
1949c349dbc7Sjsg 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1950c349dbc7Sjsg 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1951c349dbc7Sjsg 		if (intel_gt_is_wedged(engine->gt))
1952c349dbc7Sjsg 			err = -EIO;
1953c349dbc7Sjsg 	}
1954c349dbc7Sjsg 
1955c349dbc7Sjsg 	i915_request_put(rq);
1956c349dbc7Sjsg out:
1957c349dbc7Sjsg 	hang_fini(&h);
1958c349dbc7Sjsg 	return err;
1959c349dbc7Sjsg }
1960c349dbc7Sjsg 
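/*
 * Walk every atomic phase on every engine. Skipped under GuC submission,
 * where engine resets are owned by the GuC rather than issued directly
 * from the CPU.
 */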
1961c349dbc7Sjsg static int igt_reset_engines_atomic(void *arg)
1962c349dbc7Sjsg {
1963c349dbc7Sjsg 	struct intel_gt *gt = arg;
1964c349dbc7Sjsg 	const typeof(*igt_atomic_phases) *p;
1965c349dbc7Sjsg 	int err = 0;
1966c349dbc7Sjsg 
1967c349dbc7Sjsg 	/* Check that engine resets are usable from atomic context */
1968c349dbc7Sjsg 
1969c349dbc7Sjsg 	if (!intel_has_reset_engine(gt))
1970c349dbc7Sjsg 		return 0;
1971c349dbc7Sjsg 
1972c349dbc7Sjsg 	if (intel_uc_uses_guc_submission(&gt->uc))
1973c349dbc7Sjsg 		return 0;
1974c349dbc7Sjsg 
1975c349dbc7Sjsg 	igt_global_reset_lock(gt);
1976c349dbc7Sjsg 
1977c349dbc7Sjsg 	/* Flush any requests before we get started and check basics */
1978c349dbc7Sjsg 	if (!igt_force_reset(gt))
1979c349dbc7Sjsg 		goto unlock;
1980c349dbc7Sjsg 
1981c349dbc7Sjsg 	for (p = igt_atomic_phases; p->name; p++) {
1982c349dbc7Sjsg 		struct intel_engine_cs *engine;
1983c349dbc7Sjsg 		enum intel_engine_id id;
1984c349dbc7Sjsg 
1985c349dbc7Sjsg 		for_each_engine(engine, gt, id) {
1986c349dbc7Sjsg 			err = igt_atomic_reset_engine(engine, p);
1987c349dbc7Sjsg 			if (err)
1988c349dbc7Sjsg 				goto out;
1989c349dbc7Sjsg 		}
1990c349dbc7Sjsg 	}
1991c349dbc7Sjsg 
1992c349dbc7Sjsg out:
1993c349dbc7Sjsg 	/* As we poke around the guts, do a full reset before continuing. */
1994c349dbc7Sjsg 	igt_force_reset(gt);
1995c349dbc7Sjsg unlock:
1996c349dbc7Sjsg 	igt_global_reset_unlock(gt);
1997c349dbc7Sjsg 
1998c349dbc7Sjsg 	return err;
1999c349dbc7Sjsg }
2000c349dbc7Sjsg 
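/*
 * Entry point for the live hangcheck selftests: requires working GPU
 * reset, refuses to run on an already wedged GT, and holds a runtime PM
 * wakeref across the whole suite.
 */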
2001c349dbc7Sjsg int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2002c349dbc7Sjsg {
2003c349dbc7Sjsg 	static const struct i915_subtest tests[] = {
2004c349dbc7Sjsg 		SUBTEST(igt_hang_sanitycheck),
2005c349dbc7Sjsg 		SUBTEST(igt_reset_nop),
2006c349dbc7Sjsg 		SUBTEST(igt_reset_nop_engine),
2007c349dbc7Sjsg 		SUBTEST(igt_reset_idle_engine),
2008c349dbc7Sjsg 		SUBTEST(igt_reset_active_engine),
20095ca02815Sjsg 		SUBTEST(igt_reset_fail_engine),
2010c349dbc7Sjsg 		SUBTEST(igt_reset_engines),
2011c349dbc7Sjsg 		SUBTEST(igt_reset_engines_atomic),
2012c349dbc7Sjsg 		SUBTEST(igt_reset_queue),
2013c349dbc7Sjsg 		SUBTEST(igt_reset_wait),
2014c349dbc7Sjsg 		SUBTEST(igt_reset_evict_ggtt),
2015c349dbc7Sjsg 		SUBTEST(igt_reset_evict_ppgtt),
2016c349dbc7Sjsg 		SUBTEST(igt_reset_evict_fence),
2017c349dbc7Sjsg 		SUBTEST(igt_handle_error),
2018c349dbc7Sjsg 	};
20191bb76ff1Sjsg 	struct intel_gt *gt = to_gt(i915);
2020c349dbc7Sjsg 	intel_wakeref_t wakeref;
2021c349dbc7Sjsg 	int err;
2022c349dbc7Sjsg 
2023c349dbc7Sjsg 	if (!intel_has_gpu_reset(gt))
2024c349dbc7Sjsg 		return 0;
2025c349dbc7Sjsg 
2026c349dbc7Sjsg 	if (intel_gt_is_wedged(gt))
2027c349dbc7Sjsg 		return -EIO; /* we're long past hope of a successful reset */
2028c349dbc7Sjsg 
2029c349dbc7Sjsg 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2030c349dbc7Sjsg 
2031c349dbc7Sjsg 	err = intel_gt_live_subtests(tests, gt);
2032c349dbc7Sjsg 
2033c349dbc7Sjsg 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2034c349dbc7Sjsg 
2035c349dbc7Sjsg 	return err;
2036c349dbc7Sjsg }
2037