/*	$NetBSD: selftest_hangcheck.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $	*/

/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: selftest_hangcheck.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $");

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

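/*
 * Prepare the shared state for emitting a hanging batch: a non-bannable
 * kernel context, a page of hardware status words (HWS) through which
 * each request reports that it has started executing, and a page to
 * hold the spinning batch itself.  Failures unwind in reverse order of
 * construction.
 */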
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

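/*
 * Each request writes its breadcrumb to a distinct u32 slot in the HWS
 * page, indexed by its fence context, so that concurrent spinners do
 * not overwrite each other's reports.
 */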
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

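/*
 * Order the request after any prior work on the object and track the
 * vma as active, keeping its binding alive until the request retires.
 */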
static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

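/*
 * Emit a batch that first stores the request's seqno into its HWS slot
 * (so wait_until_running() can observe the batch has begun) and then
 * jumps back to its own start with MI_BATCH_BUFFER_START, spinning
 * forever.  Only a reset, or the CPU overwriting the loop with
 * MI_BATCH_BUFFER_END, terminates it.  The MI_ARB_CHECK instructions
 * provide arbitration points, and the command encodings vary with the
 * hardware generation.
 */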
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

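/*
 * Wait for the spinner to report its seqno in the HWS: a brief 10us
 * busy-wait first, then up to a second of sleeping waits.  Returns
 * true once the batch is known to be executing on the GPU.
 */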
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

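/*
 * The engine heartbeat submits its own requests and can itself trigger
 * resets, which would perturb these tests; park it (while holding an
 * engine-pm wakeref) for the duration and restore it afterwards.
 */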
static void engine_heartbeat_disable(struct intel_engine_cs *engine,
				     unsigned long *saved)
{
	*saved = engine->props.heartbeat_interval_ms;
	engine->props.heartbeat_interval_ms = 0;

	intel_engine_pm_get(engine);
	intel_engine_park_heartbeat(engine);
}

static void engine_heartbeat_enable(struct intel_engine_cs *engine,
				    unsigned long saved)
{
	intel_engine_pm_put(engine);

	engine->props.heartbeat_interval_ms = saved;
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

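/*
 * Saturate every engine with trivial requests, then perform full GPU
 * resets in a loop, checking that each reset is recorded and that the
 * device remains usable afterwards.
 */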
static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

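/*
 * Common body for the idle/active engine-reset tests: repeatedly reset
 * a single engine, optionally while a spinner from hang_create_request()
 * is executing on it, and verify that only the per-engine reset count
 * advances (no full GPU reset is recorded).
 */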
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

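/*
 * Retire a background request, wedging the GPU if it fails to complete
 * within 5s.
 */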
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

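/*
 * Kthread body used by __igt_reset_engines(): keep an engine busy with
 * a rolling window of eight requests across eight contexts, optionally
 * at randomised priorities (TEST_PRIORITY), until asked to stop.
 */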
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

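/*
 * Reset one engine while, depending on @flags, the other engines (and
 * possibly the target itself, TEST_SELF) are kept busy by active_engine()
 * kthreads; then verify the reset counts: the target accumulated one
 * reset per loop, no innocent engine of a different class was reset,
 * and no full GPU reset occurred.
 */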
static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

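/*
 * Stand in for the hangcheck worker by resetting the given engine mask
 * directly, returning the prior global reset count so callers can check
 * that a reset was recorded.
 */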
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

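/*
 * Kthread body for igt_reset_evict_fence(): dirty the object's tiling
 * and re-acquire a fence register, which is expected to stall on the
 * hanging request still using the vma until a reset releases it.
 */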
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

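/*
 * Pin a vma, make it busy with a hanging request, and then run @fn
 * (evict_vma or evict_fence) in a kthread.  The kthread must block on
 * the hung request (observed via the fence's callback list); a fake
 * hangcheck reset must then release it, proving that eviction cannot
 * deadlock behind a GPU hang.
 */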
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

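/*
 * Exercise an engine reset from within the atomic section described by
 * @p (from the igt_atomic_phases table), with the execlists submission
 * tasklet disabled to avoid racing against concurrent submission.
 */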
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}