// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

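/*
 * Set up the resources for building a hanging batch: a kernel context,
 * a CPU-mappable HWS page for the batch to report its progress into,
 * and the batch object itself. The HWS page is mapped write-back so
 * the CPU can poll the seqnos the batch writes.
 */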
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 intel_gt_coherent_map_type(gt, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

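/* Each context reports its seqno into a unique u32 slot of the HWS page. */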
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return i915_vma_offset(hws) +
	       offset_in_page(sizeof(u32) * rq->fence.context);
}

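/*
 * Build and submit a batch that writes rq->fence.seqno to the HWS page
 * and then spins forever by branching back to its own start. The
 * trailing MI_BATCH_BUFFER_END is only reached once hang_fini() (or the
 * caller) rewrites the first dword of the batch. A fresh batch object
 * is allocated for every request so that a still-spinning batch is
 * never modified behind the GPU's back.
 */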
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, intel_gt_coherent_map_type(gt, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = igt_vma_move_to_active_unlocked(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = igt_vma_move_to_active_unlocked(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
		*batch++ = upper_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, i915_vma_offset(vma), PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

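/* Read back the last seqno the spinner reported for this request's context. */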
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

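/*
 * Terminate any still-spinning batch by overwriting its first dword
 * with MI_BATCH_BUFFER_END, then drop the objects and the context.
 */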
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

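/*
 * Poll the HWS until the batch reports that it has started running:
 * first a 10us busy-wait, then a sleeping wait of up to 1s.
 */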
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

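/*
 * Use the selftest reset-timeout injection to force the next engine
 * reset(s) to time out; only effective where reset timeouts can be
 * generated (gen8+).
 */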
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto skip;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

skip:
			if (rq)
				i915_request_put(rq);

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
	bool stop;
	int result;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

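/*
 * Wait for a background request to complete and drop our reference;
 * if it does not finish within 10s, wedge the GT and report -EIO.
 */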
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

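/*
 * kthread worker body: keep an engine busy with a rolling window of
 * eight requests (optionally submitted at random priorities) until
 * told to stop via arg->stop.
 */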
static void active_engine(struct kthread_work *work)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = container_of(work, typeof(*arg), work);
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			arg->result = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n",
			       engine->name, count, arg->result);
			while (--count)
				intel_context_put(ce[count]);
			return;
		}
	}

	count = 0;
	while (!READ_ONCE(arg->stop)) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	arg->result = err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	struct active_engine *threads;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	for_each_engine(engine, gt, id) {
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
		for_each_engine(other, gt, tmp) {
			struct kthread_worker *worker;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			worker = kthread_create_worker(0, "igt/%s",
						       other->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				pr_err("[%s] Worker create failed: %d!\n",
				       engine->name, err);
				goto unwind;
			}

			threads[tmp].worker = worker;

			kthread_init_work(&threads[tmp].work, active_engine);
			kthread_queue_work(threads[tmp].worker,
					   &threads[tmp].work);
		}

		st_engine_heartbeat_disable_no_pm(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].worker)
				continue;

			WRITE_ONCE(threads[tmp].stop, true);
			kthread_flush_work(&threads[tmp].work);
			ret = READ_ONCE(threads[tmp].result);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}

			kthread_destroy_worker(threads[tmp].worker);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}
	kfree(threads);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

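/*
 * Walk each combination of background load (idle, active, on others,
 * on self, with priorities) and check engine resets under that load.
 */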
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

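/*
 * Pretend hangcheck fired: trigger a reset directly and return the
 * global reset count sampled beforehand for later comparison.
 */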
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

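/*
 * Thread body: signal that we have started, then try to evict the node
 * out from under the hanging batch, which must wait for it to idle.
 */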
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

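/*
 * Thread body: signal that we have started, then try to steal a fence
 * register by pinning a Y-tiled view of the vma, again waiting upon
 * the hanging batch.
 */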
evict_fence(void * data)1394c349dbc7Sjsg static int evict_fence(void *data)
1395c349dbc7Sjsg {
1396c349dbc7Sjsg struct evict_vma *arg = data;
1397c349dbc7Sjsg int err;
1398c349dbc7Sjsg
1399c349dbc7Sjsg complete(&arg->completion);
1400c349dbc7Sjsg
1401c349dbc7Sjsg /* Mark the fence register as dirty to force the mmio update. */
1402c349dbc7Sjsg err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1403c349dbc7Sjsg if (err) {
1404c349dbc7Sjsg pr_err("Invalid Y-tiling settings; err:%d\n", err);
1405c349dbc7Sjsg return err;
1406c349dbc7Sjsg }
1407c349dbc7Sjsg
1408c349dbc7Sjsg err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1409c349dbc7Sjsg if (err) {
1410c349dbc7Sjsg pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1411c349dbc7Sjsg return err;
1412c349dbc7Sjsg }
1413c349dbc7Sjsg
1414c349dbc7Sjsg err = i915_vma_pin_fence(arg->vma);
1415c349dbc7Sjsg i915_vma_unpin(arg->vma);
1416c349dbc7Sjsg if (err) {
1417c349dbc7Sjsg pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1418c349dbc7Sjsg return err;
1419c349dbc7Sjsg }
1420c349dbc7Sjsg
1421c349dbc7Sjsg i915_vma_unpin_fence(arg->vma);
1422c349dbc7Sjsg
1423c349dbc7Sjsg return 0;
1424c349dbc7Sjsg }
1425c349dbc7Sjsg
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine;
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags);
	if (err)
		pr_err("[%s] Move to active failed: %d!\n", engine->name, err);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

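	/*
	 * The eviction thread should now be blocked on the hanging request,
	 * i.e. it has installed a callback on the fence; an empty callback
	 * list means it completed without waiting at all.
	 */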
	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
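	/* Declare the hang and reset, releasing the blocked eviction thread. */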
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/*
		 * The reset, even indirectly, should take less than 10ms,
		 * so the 100ms timeout leaves ample margin before wedging.
		 */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

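/*
 * Wait for every engine other than @exclude to idle; returns -EIO if any
 * of them fails to settle.
 */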
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

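		/*
		 * Repeatedly queue a fresh hanging request behind 'prev',
		 * reset the engine, and verify that the guilty request is
		 * marked -EIO while the queued one is replayed unharmed.
		 */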
		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

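			/*
			 * Reset just this engine: 'prev' is the guilty
			 * hanging request, 'rq' the innocent one queued
			 * behind it.
			 */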
			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
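	/*
	 * The non-NULL sentinel makes the capture code see an error as
	 * already recorded, so this deliberate hang neither allocates nor
	 * clobbers a real coredump; the old pointer is restored below.
	 */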
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

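/*
 * Reset the engine from within the atomic context described by @p: the
 * submission tasklet is suspended and, for every phase except "softirq",
 * bottom halves are disabled around the reset itself.
 */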
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

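/*
 * Exercise an engine reset from each atomic section, first while the
 * engine is idle and then again while it runs a hanging request.
 */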
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

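/*
 * Live hangcheck selftest entry point: requires GPU reset support and
 * holds a runtime-pm wakeref for the duration of the subtests.
 */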
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = to_gt(i915);
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}