xref: /dflybsd-src/sys/dev/drm/i915/intel_hangcheck.c (revision 3f2dd94a569761201b5b0a18b2f697f97fe1b9dc)
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
244be47400SFrançois Tigeot 
254be47400SFrançois Tigeot #include "i915_drv.h"
264be47400SFrançois Tigeot 
274be47400SFrançois Tigeot static bool
ipehr_is_semaphore_wait(struct intel_engine_cs * engine,u32 ipehr)284be47400SFrançois Tigeot ipehr_is_semaphore_wait(struct intel_engine_cs *engine, u32 ipehr)
294be47400SFrançois Tigeot {
304be47400SFrançois Tigeot 	if (INTEL_GEN(engine->i915) >= 8) {
314be47400SFrançois Tigeot 		return (ipehr >> 23) == 0x1c;
324be47400SFrançois Tigeot 	} else {
334be47400SFrançois Tigeot 		ipehr &= ~MI_SEMAPHORE_SYNC_MASK;
344be47400SFrançois Tigeot 		return ipehr == (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE |
354be47400SFrançois Tigeot 				 MI_SEMAPHORE_REGISTER);
364be47400SFrançois Tigeot 	}
374be47400SFrançois Tigeot }
384be47400SFrançois Tigeot 
394be47400SFrançois Tigeot static struct intel_engine_cs *
semaphore_wait_to_signaller_ring(struct intel_engine_cs * engine,u32 ipehr,u64 offset)404be47400SFrançois Tigeot semaphore_wait_to_signaller_ring(struct intel_engine_cs *engine, u32 ipehr,
414be47400SFrançois Tigeot 				 u64 offset)
424be47400SFrançois Tigeot {
434be47400SFrançois Tigeot 	struct drm_i915_private *dev_priv = engine->i915;
444be47400SFrançois Tigeot 	struct intel_engine_cs *signaller;
454be47400SFrançois Tigeot 	enum intel_engine_id id;
464be47400SFrançois Tigeot 
474be47400SFrançois Tigeot 	if (INTEL_GEN(dev_priv) >= 8) {
484be47400SFrançois Tigeot 		for_each_engine(signaller, dev_priv, id) {
494be47400SFrançois Tigeot 			if (engine == signaller)
504be47400SFrançois Tigeot 				continue;
514be47400SFrançois Tigeot 
524be47400SFrançois Tigeot 			if (offset == signaller->semaphore.signal_ggtt[engine->hw_id])
534be47400SFrançois Tigeot 				return signaller;
544be47400SFrançois Tigeot 		}
554be47400SFrançois Tigeot 	} else {
564be47400SFrançois Tigeot 		u32 sync_bits = ipehr & MI_SEMAPHORE_SYNC_MASK;
574be47400SFrançois Tigeot 
584be47400SFrançois Tigeot 		for_each_engine(signaller, dev_priv, id) {
594be47400SFrançois Tigeot 			if(engine == signaller)
604be47400SFrançois Tigeot 				continue;
614be47400SFrançois Tigeot 
624be47400SFrançois Tigeot 			if (sync_bits == signaller->semaphore.mbox.wait[engine->hw_id])
634be47400SFrançois Tigeot 				return signaller;
644be47400SFrançois Tigeot 		}
654be47400SFrançois Tigeot 	}
664be47400SFrançois Tigeot 
674be47400SFrançois Tigeot 	DRM_DEBUG_DRIVER("No signaller ring found for %s, ipehr 0x%08x, offset 0x%016llx\n",
684be47400SFrançois Tigeot 			 engine->name, ipehr, offset);
694be47400SFrançois Tigeot 
704be47400SFrançois Tigeot 	return ERR_PTR(-ENODEV);
714be47400SFrançois Tigeot }
724be47400SFrançois Tigeot 
734be47400SFrançois Tigeot static struct intel_engine_cs *
semaphore_waits_for(struct intel_engine_cs * engine,u32 * seqno)744be47400SFrançois Tigeot semaphore_waits_for(struct intel_engine_cs *engine, u32 *seqno)
754be47400SFrançois Tigeot {
764be47400SFrançois Tigeot 	struct drm_i915_private *dev_priv = engine->i915;
774be47400SFrançois Tigeot 	void __iomem *vaddr;
784be47400SFrançois Tigeot 	u32 cmd, ipehr, head;
794be47400SFrançois Tigeot 	u64 offset = 0;
804be47400SFrançois Tigeot 	int i, backwards;
814be47400SFrançois Tigeot 
824be47400SFrançois Tigeot 	/*
834be47400SFrançois Tigeot 	 * This function does not support execlist mode - any attempt to
844be47400SFrançois Tigeot 	 * proceed further into this function will result in a kernel panic
854be47400SFrançois Tigeot 	 * when dereferencing ring->buffer, which is not set up in execlist
864be47400SFrançois Tigeot 	 * mode.
874be47400SFrançois Tigeot 	 *
884be47400SFrançois Tigeot 	 * The correct way of doing it would be to derive the currently
894be47400SFrançois Tigeot 	 * executing ring buffer from the current context, which is derived
904be47400SFrançois Tigeot 	 * from the currently running request. Unfortunately, to get the
914be47400SFrançois Tigeot 	 * current request we would have to grab the struct_mutex before doing
924be47400SFrançois Tigeot 	 * anything else, which would be ill-advised since some other thread
934be47400SFrançois Tigeot 	 * might have grabbed it already and managed to hang itself, causing
944be47400SFrançois Tigeot 	 * the hang checker to deadlock.
954be47400SFrançois Tigeot 	 *
964be47400SFrançois Tigeot 	 * Therefore, this function does not support execlist mode in its
974be47400SFrançois Tigeot 	 * current form. Just return NULL and move on.
984be47400SFrançois Tigeot 	 */
994be47400SFrançois Tigeot 	if (engine->buffer == NULL)
1004be47400SFrançois Tigeot 		return NULL;
1014be47400SFrançois Tigeot 
1024be47400SFrançois Tigeot 	ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
1034be47400SFrançois Tigeot 	if (!ipehr_is_semaphore_wait(engine, ipehr))
1044be47400SFrançois Tigeot 		return NULL;
1054be47400SFrançois Tigeot 
1064be47400SFrançois Tigeot 	/*
1074be47400SFrançois Tigeot 	 * HEAD is likely pointing to the dword after the actual command,
1084be47400SFrançois Tigeot 	 * so scan backwards until we find the MBOX. But limit it to just 3
1094be47400SFrançois Tigeot 	 * or 4 dwords depending on the semaphore wait command size.
1104be47400SFrançois Tigeot 	 * Note that we don't care about ACTHD here since that might
1114be47400SFrançois Tigeot 	 * point at at batch, and semaphores are always emitted into the
1124be47400SFrançois Tigeot 	 * ringbuffer itself.
1134be47400SFrançois Tigeot 	 */
1144be47400SFrançois Tigeot 	head = I915_READ_HEAD(engine) & HEAD_ADDR;
1154be47400SFrançois Tigeot 	backwards = (INTEL_GEN(dev_priv) >= 8) ? 5 : 4;
1164be47400SFrançois Tigeot 	vaddr = (void __iomem *)engine->buffer->vaddr;
1174be47400SFrançois Tigeot 
1184be47400SFrançois Tigeot 	for (i = backwards; i; --i) {
1194be47400SFrançois Tigeot 		/*
1204be47400SFrançois Tigeot 		 * Be paranoid and presume the hw has gone off into the wild -
1214be47400SFrançois Tigeot 		 * our ring is smaller than what the hardware (and hence
1224be47400SFrançois Tigeot 		 * HEAD_ADDR) allows. Also handles wrap-around.
1234be47400SFrançois Tigeot 		 */
1244be47400SFrançois Tigeot 		head &= engine->buffer->size - 1;
1254be47400SFrançois Tigeot 
1264be47400SFrançois Tigeot 		/* This here seems to blow up */
1274be47400SFrançois Tigeot 		cmd = ioread32(vaddr + head);
1284be47400SFrançois Tigeot 		if (cmd == ipehr)
1294be47400SFrançois Tigeot 			break;
1304be47400SFrançois Tigeot 
1314be47400SFrançois Tigeot 		head -= 4;
1324be47400SFrançois Tigeot 	}
1334be47400SFrançois Tigeot 
1344be47400SFrançois Tigeot 	if (!i)
1354be47400SFrançois Tigeot 		return NULL;
1364be47400SFrançois Tigeot 
1374be47400SFrançois Tigeot 	*seqno = ioread32(vaddr + head + 4) + 1;
1384be47400SFrançois Tigeot 	if (INTEL_GEN(dev_priv) >= 8) {
1394be47400SFrançois Tigeot 		offset = ioread32(vaddr + head + 12);
1404be47400SFrançois Tigeot 		offset <<= 32;
1414be47400SFrançois Tigeot 		offset |= ioread32(vaddr + head + 8);
1424be47400SFrançois Tigeot 	}
1434be47400SFrançois Tigeot 	return semaphore_wait_to_signaller_ring(engine, ipehr, offset);
1444be47400SFrançois Tigeot }
1454be47400SFrançois Tigeot 
semaphore_passed(struct intel_engine_cs * engine)1464be47400SFrançois Tigeot static int semaphore_passed(struct intel_engine_cs *engine)
1474be47400SFrançois Tigeot {
1484be47400SFrançois Tigeot 	struct drm_i915_private *dev_priv = engine->i915;
1494be47400SFrançois Tigeot 	struct intel_engine_cs *signaller;
1504be47400SFrançois Tigeot 	u32 seqno;
1514be47400SFrançois Tigeot 
1524be47400SFrançois Tigeot 	engine->hangcheck.deadlock++;
1534be47400SFrançois Tigeot 
1544be47400SFrançois Tigeot 	signaller = semaphore_waits_for(engine, &seqno);
1554be47400SFrançois Tigeot 	if (signaller == NULL)
1564be47400SFrançois Tigeot 		return -1;
1574be47400SFrançois Tigeot 
1584be47400SFrançois Tigeot 	if (IS_ERR(signaller))
1594be47400SFrançois Tigeot 		return 0;
1604be47400SFrançois Tigeot 
1614be47400SFrançois Tigeot 	/* Prevent pathological recursion due to driver bugs */
1624be47400SFrançois Tigeot 	if (signaller->hangcheck.deadlock >= I915_NUM_ENGINES)
1634be47400SFrançois Tigeot 		return -1;
1644be47400SFrançois Tigeot 
1654be47400SFrançois Tigeot 	if (i915_seqno_passed(intel_engine_get_seqno(signaller), seqno))
1664be47400SFrançois Tigeot 		return 1;
1674be47400SFrançois Tigeot 
1684be47400SFrançois Tigeot 	/* cursory check for an unkickable deadlock */
1694be47400SFrançois Tigeot 	if (I915_READ_CTL(signaller) & RING_WAIT_SEMAPHORE &&
1704be47400SFrançois Tigeot 	    semaphore_passed(signaller) < 0)
1714be47400SFrançois Tigeot 		return -1;
1724be47400SFrançois Tigeot 
1734be47400SFrançois Tigeot 	return 0;
1744be47400SFrançois Tigeot }
1754be47400SFrançois Tigeot 
semaphore_clear_deadlocks(struct drm_i915_private * dev_priv)1764be47400SFrançois Tigeot static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
1774be47400SFrançois Tigeot {
1784be47400SFrançois Tigeot 	struct intel_engine_cs *engine;
1794be47400SFrançois Tigeot 	enum intel_engine_id id;
1804be47400SFrançois Tigeot 
1814be47400SFrançois Tigeot 	for_each_engine(engine, dev_priv, id)
1824be47400SFrançois Tigeot 		engine->hangcheck.deadlock = 0;
1834be47400SFrançois Tigeot }
1844be47400SFrançois Tigeot 
instdone_unchanged(u32 current_instdone,u32 * old_instdone)1854be47400SFrançois Tigeot static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
1864be47400SFrançois Tigeot {
1874be47400SFrançois Tigeot 	u32 tmp = current_instdone | *old_instdone;
1884be47400SFrançois Tigeot 	bool unchanged;
1894be47400SFrançois Tigeot 
1904be47400SFrançois Tigeot 	unchanged = tmp == *old_instdone;
1914be47400SFrançois Tigeot 	*old_instdone |= tmp;
1924be47400SFrançois Tigeot 
1934be47400SFrançois Tigeot 	return unchanged;
1944be47400SFrançois Tigeot }
1954be47400SFrançois Tigeot 
subunits_stuck(struct intel_engine_cs * engine)1964be47400SFrançois Tigeot static bool subunits_stuck(struct intel_engine_cs *engine)
1974be47400SFrançois Tigeot {
1984be47400SFrançois Tigeot 	struct drm_i915_private *dev_priv = engine->i915;
1994be47400SFrançois Tigeot 	struct intel_instdone instdone;
2004be47400SFrançois Tigeot 	struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
2014be47400SFrançois Tigeot 	bool stuck;
2024be47400SFrançois Tigeot 	int slice;
2034be47400SFrançois Tigeot 	int subslice;
2044be47400SFrançois Tigeot 
2054be47400SFrançois Tigeot 	if (engine->id != RCS)
2064be47400SFrançois Tigeot 		return true;
2074be47400SFrançois Tigeot 
2084be47400SFrançois Tigeot 	intel_engine_get_instdone(engine, &instdone);
2094be47400SFrançois Tigeot 
2104be47400SFrançois Tigeot 	/* There might be unstable subunit states even when
2114be47400SFrançois Tigeot 	 * actual head is not moving. Filter out the unstable ones by
2124be47400SFrançois Tigeot 	 * accumulating the undone -> done transitions and only
2134be47400SFrançois Tigeot 	 * consider those as progress.
2144be47400SFrançois Tigeot 	 */
2154be47400SFrançois Tigeot 	stuck = instdone_unchanged(instdone.instdone,
2164be47400SFrançois Tigeot 				   &accu_instdone->instdone);
2174be47400SFrançois Tigeot 	stuck &= instdone_unchanged(instdone.slice_common,
2184be47400SFrançois Tigeot 				    &accu_instdone->slice_common);
2194be47400SFrançois Tigeot 
2204be47400SFrançois Tigeot 	for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
2214be47400SFrançois Tigeot 		stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
2224be47400SFrançois Tigeot 					    &accu_instdone->sampler[slice][subslice]);
2234be47400SFrançois Tigeot 		stuck &= instdone_unchanged(instdone.row[slice][subslice],
2244be47400SFrançois Tigeot 					    &accu_instdone->row[slice][subslice]);
2254be47400SFrançois Tigeot 	}
2264be47400SFrançois Tigeot 
2274be47400SFrançois Tigeot 	return stuck;
2284be47400SFrançois Tigeot }
2294be47400SFrançois Tigeot 
2304be47400SFrançois Tigeot static enum intel_engine_hangcheck_action
head_stuck(struct intel_engine_cs * engine,u64 acthd)2314be47400SFrançois Tigeot head_stuck(struct intel_engine_cs *engine, u64 acthd)
2324be47400SFrançois Tigeot {
2334be47400SFrançois Tigeot 	if (acthd != engine->hangcheck.acthd) {
2344be47400SFrançois Tigeot 
2354be47400SFrançois Tigeot 		/* Clear subunit states on head movement */
2364be47400SFrançois Tigeot 		memset(&engine->hangcheck.instdone, 0,
2374be47400SFrançois Tigeot 		       sizeof(engine->hangcheck.instdone));
2384be47400SFrançois Tigeot 
239a85cb24fSFrançois Tigeot 		return ENGINE_ACTIVE_HEAD;
2404be47400SFrançois Tigeot 	}
2414be47400SFrançois Tigeot 
2424be47400SFrançois Tigeot 	if (!subunits_stuck(engine))
243a85cb24fSFrançois Tigeot 		return ENGINE_ACTIVE_SUBUNITS;
2444be47400SFrançois Tigeot 
245a85cb24fSFrançois Tigeot 	return ENGINE_DEAD;
2464be47400SFrançois Tigeot }
2474be47400SFrançois Tigeot 
2484be47400SFrançois Tigeot static enum intel_engine_hangcheck_action
engine_stuck(struct intel_engine_cs * engine,u64 acthd)2494be47400SFrançois Tigeot engine_stuck(struct intel_engine_cs *engine, u64 acthd)
2504be47400SFrançois Tigeot {
2514be47400SFrançois Tigeot 	struct drm_i915_private *dev_priv = engine->i915;
2524be47400SFrançois Tigeot 	enum intel_engine_hangcheck_action ha;
2534be47400SFrançois Tigeot 	u32 tmp;
2544be47400SFrançois Tigeot 
2554be47400SFrançois Tigeot 	ha = head_stuck(engine, acthd);
256a85cb24fSFrançois Tigeot 	if (ha != ENGINE_DEAD)
2574be47400SFrançois Tigeot 		return ha;
2584be47400SFrançois Tigeot 
2594be47400SFrançois Tigeot 	if (IS_GEN2(dev_priv))
260a85cb24fSFrançois Tigeot 		return ENGINE_DEAD;
2614be47400SFrançois Tigeot 
2624be47400SFrançois Tigeot 	/* Is the chip hanging on a WAIT_FOR_EVENT?
2634be47400SFrançois Tigeot 	 * If so we can simply poke the RB_WAIT bit
2644be47400SFrançois Tigeot 	 * and break the hang. This should work on
2654be47400SFrançois Tigeot 	 * all but the second generation chipsets.
2664be47400SFrançois Tigeot 	 */
2674be47400SFrançois Tigeot 	tmp = I915_READ_CTL(engine);
2684be47400SFrançois Tigeot 	if (tmp & RING_WAIT) {
2694be47400SFrançois Tigeot 		i915_handle_error(dev_priv, 0,
2704be47400SFrançois Tigeot 				  "Kicking stuck wait on %s",
2714be47400SFrançois Tigeot 				  engine->name);
2724be47400SFrançois Tigeot 		I915_WRITE_CTL(engine, tmp);
273a85cb24fSFrançois Tigeot 		return ENGINE_WAIT_KICK;
2744be47400SFrançois Tigeot 	}
2754be47400SFrançois Tigeot 
2764be47400SFrançois Tigeot 	if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) {
2774be47400SFrançois Tigeot 		switch (semaphore_passed(engine)) {
2784be47400SFrançois Tigeot 		default:
279a85cb24fSFrançois Tigeot 			return ENGINE_DEAD;
2804be47400SFrançois Tigeot 		case 1:
2814be47400SFrançois Tigeot 			i915_handle_error(dev_priv, 0,
2824be47400SFrançois Tigeot 					  "Kicking stuck semaphore on %s",
2834be47400SFrançois Tigeot 					  engine->name);
2844be47400SFrançois Tigeot 			I915_WRITE_CTL(engine, tmp);
285a85cb24fSFrançois Tigeot 			return ENGINE_WAIT_KICK;
2864be47400SFrançois Tigeot 		case 0:
287a85cb24fSFrançois Tigeot 			return ENGINE_WAIT;
2884be47400SFrançois Tigeot 		}
2894be47400SFrançois Tigeot 	}
2904be47400SFrançois Tigeot 
291a85cb24fSFrançois Tigeot 	return ENGINE_DEAD;
292a85cb24fSFrançois Tigeot }
293a85cb24fSFrançois Tigeot 
hangcheck_load_sample(struct intel_engine_cs * engine,struct intel_engine_hangcheck * hc)294a85cb24fSFrançois Tigeot static void hangcheck_load_sample(struct intel_engine_cs *engine,
295a85cb24fSFrançois Tigeot 				  struct intel_engine_hangcheck *hc)
296a85cb24fSFrançois Tigeot {
297a85cb24fSFrançois Tigeot 	/* We don't strictly need an irq-barrier here, as we are not
298a85cb24fSFrançois Tigeot 	 * serving an interrupt request, be paranoid in case the
299a85cb24fSFrançois Tigeot 	 * barrier has side-effects (such as preventing a broken
300a85cb24fSFrançois Tigeot 	 * cacheline snoop) and so be sure that we can see the seqno
301a85cb24fSFrançois Tigeot 	 * advance. If the seqno should stick, due to a stale
302a85cb24fSFrançois Tigeot 	 * cacheline, we would erroneously declare the GPU hung.
303a85cb24fSFrançois Tigeot 	 */
304a85cb24fSFrançois Tigeot 	if (engine->irq_seqno_barrier)
305a85cb24fSFrançois Tigeot 		engine->irq_seqno_barrier(engine);
306a85cb24fSFrançois Tigeot 
307a85cb24fSFrançois Tigeot 	hc->acthd = intel_engine_get_active_head(engine);
308a85cb24fSFrançois Tigeot 	hc->seqno = intel_engine_get_seqno(engine);
309a85cb24fSFrançois Tigeot }
310a85cb24fSFrançois Tigeot 
hangcheck_store_sample(struct intel_engine_cs * engine,const struct intel_engine_hangcheck * hc)311a85cb24fSFrançois Tigeot static void hangcheck_store_sample(struct intel_engine_cs *engine,
312a85cb24fSFrançois Tigeot 				   const struct intel_engine_hangcheck *hc)
313a85cb24fSFrançois Tigeot {
314a85cb24fSFrançois Tigeot 	engine->hangcheck.acthd = hc->acthd;
315a85cb24fSFrançois Tigeot 	engine->hangcheck.seqno = hc->seqno;
316a85cb24fSFrançois Tigeot 	engine->hangcheck.action = hc->action;
317a85cb24fSFrançois Tigeot 	engine->hangcheck.stalled = hc->stalled;
318a85cb24fSFrançois Tigeot }
319a85cb24fSFrançois Tigeot 
320a85cb24fSFrançois Tigeot static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs * engine,const struct intel_engine_hangcheck * hc)321a85cb24fSFrançois Tigeot hangcheck_get_action(struct intel_engine_cs *engine,
322a85cb24fSFrançois Tigeot 		     const struct intel_engine_hangcheck *hc)
323a85cb24fSFrançois Tigeot {
324a85cb24fSFrançois Tigeot 	if (engine->hangcheck.seqno != hc->seqno)
325a85cb24fSFrançois Tigeot 		return ENGINE_ACTIVE_SEQNO;
326a85cb24fSFrançois Tigeot 
327*3f2dd94aSFrançois Tigeot 	if (intel_engine_is_idle(engine))
328a85cb24fSFrançois Tigeot 		return ENGINE_IDLE;
329a85cb24fSFrançois Tigeot 
330a85cb24fSFrançois Tigeot 	return engine_stuck(engine, hc->acthd);
331a85cb24fSFrançois Tigeot }
332a85cb24fSFrançois Tigeot 
hangcheck_accumulate_sample(struct intel_engine_cs * engine,struct intel_engine_hangcheck * hc)333a85cb24fSFrançois Tigeot static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
334a85cb24fSFrançois Tigeot 					struct intel_engine_hangcheck *hc)
335a85cb24fSFrançois Tigeot {
336a85cb24fSFrançois Tigeot 	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
337a85cb24fSFrançois Tigeot 
338a85cb24fSFrançois Tigeot 	hc->action = hangcheck_get_action(engine, hc);
339a85cb24fSFrançois Tigeot 
340a85cb24fSFrançois Tigeot 	/* We always increment the progress
341a85cb24fSFrançois Tigeot 	 * if the engine is busy and still processing
342a85cb24fSFrançois Tigeot 	 * the same request, so that no single request
343a85cb24fSFrançois Tigeot 	 * can run indefinitely (such as a chain of
344a85cb24fSFrançois Tigeot 	 * batches). The only time we do not increment
345a85cb24fSFrançois Tigeot 	 * the hangcheck score on this ring, if this
346a85cb24fSFrançois Tigeot 	 * engine is in a legitimate wait for another
347a85cb24fSFrançois Tigeot 	 * engine. In that case the waiting engine is a
348a85cb24fSFrançois Tigeot 	 * victim and we want to be sure we catch the
349a85cb24fSFrançois Tigeot 	 * right culprit. Then every time we do kick
350a85cb24fSFrançois Tigeot 	 * the ring, make it as a progress as the seqno
351a85cb24fSFrançois Tigeot 	 * advancement might ensure and if not, it
352a85cb24fSFrançois Tigeot 	 * will catch the hanging engine.
353a85cb24fSFrançois Tigeot 	 */
354a85cb24fSFrançois Tigeot 
355a85cb24fSFrançois Tigeot 	switch (hc->action) {
356a85cb24fSFrançois Tigeot 	case ENGINE_IDLE:
357a85cb24fSFrançois Tigeot 	case ENGINE_ACTIVE_SEQNO:
358a85cb24fSFrançois Tigeot 		/* Clear head and subunit states on seqno movement */
359a85cb24fSFrançois Tigeot 		hc->acthd = 0;
360a85cb24fSFrançois Tigeot 
361a85cb24fSFrançois Tigeot 		memset(&engine->hangcheck.instdone, 0,
362a85cb24fSFrançois Tigeot 		       sizeof(engine->hangcheck.instdone));
363a85cb24fSFrançois Tigeot 
364a85cb24fSFrançois Tigeot 		/* Intentional fall through */
365a85cb24fSFrançois Tigeot 	case ENGINE_WAIT_KICK:
366a85cb24fSFrançois Tigeot 	case ENGINE_WAIT:
367a85cb24fSFrançois Tigeot 		engine->hangcheck.action_timestamp = jiffies;
368a85cb24fSFrançois Tigeot 		break;
369a85cb24fSFrançois Tigeot 
370a85cb24fSFrançois Tigeot 	case ENGINE_ACTIVE_HEAD:
371a85cb24fSFrançois Tigeot 	case ENGINE_ACTIVE_SUBUNITS:
372a85cb24fSFrançois Tigeot 		/* Seqno stuck with still active engine gets leeway,
373a85cb24fSFrançois Tigeot 		 * in hopes that it is just a long shader.
374a85cb24fSFrançois Tigeot 		 */
375a85cb24fSFrançois Tigeot 		timeout = I915_SEQNO_DEAD_TIMEOUT;
376a85cb24fSFrançois Tigeot 		break;
377a85cb24fSFrançois Tigeot 
378a85cb24fSFrançois Tigeot 	case ENGINE_DEAD:
379a85cb24fSFrançois Tigeot 		break;
380a85cb24fSFrançois Tigeot 
381a85cb24fSFrançois Tigeot 	default:
382a85cb24fSFrançois Tigeot 		MISSING_CASE(hc->action);
383a85cb24fSFrançois Tigeot 	}
384a85cb24fSFrançois Tigeot 
385a85cb24fSFrançois Tigeot 	hc->stalled = time_after(jiffies,
386a85cb24fSFrançois Tigeot 				 engine->hangcheck.action_timestamp + timeout);
387a85cb24fSFrançois Tigeot }
388a85cb24fSFrançois Tigeot 
hangcheck_declare_hang(struct drm_i915_private * i915,unsigned int hung,unsigned int stuck)389a85cb24fSFrançois Tigeot static void hangcheck_declare_hang(struct drm_i915_private *i915,
390a85cb24fSFrançois Tigeot 				   unsigned int hung,
391a85cb24fSFrançois Tigeot 				   unsigned int stuck)
392a85cb24fSFrançois Tigeot {
393a85cb24fSFrançois Tigeot 	struct intel_engine_cs *engine;
394a85cb24fSFrançois Tigeot 	char msg[80];
395a85cb24fSFrançois Tigeot 	unsigned int tmp;
396a85cb24fSFrançois Tigeot 	int len;
397a85cb24fSFrançois Tigeot 
398a85cb24fSFrançois Tigeot 	/* If some rings hung but others were still busy, only
399a85cb24fSFrançois Tigeot 	 * blame the hanging rings in the synopsis.
400a85cb24fSFrançois Tigeot 	 */
401a85cb24fSFrançois Tigeot 	if (stuck != hung)
402a85cb24fSFrançois Tigeot 		hung &= ~stuck;
403a85cb24fSFrançois Tigeot 	len = scnprintf(msg, sizeof(msg),
404a85cb24fSFrançois Tigeot 			"%s on ", stuck == hung ? "No progress" : "Hang");
405a85cb24fSFrançois Tigeot 	for_each_engine_masked(engine, i915, hung, tmp)
406a85cb24fSFrançois Tigeot 		len += scnprintf(msg + len, sizeof(msg) - len,
407a85cb24fSFrançois Tigeot 				 "%s, ", engine->name);
408a85cb24fSFrançois Tigeot 	msg[len-2] = '\0';
409a85cb24fSFrançois Tigeot 
410*3f2dd94aSFrançois Tigeot 	return i915_handle_error(i915, hung, "%s", msg);
4114be47400SFrançois Tigeot }
4124be47400SFrançois Tigeot 
4134be47400SFrançois Tigeot /*
4144be47400SFrançois Tigeot  * This is called when the chip hasn't reported back with completed
4154be47400SFrançois Tigeot  * batchbuffers in a long time. We keep track per ring seqno progress and
4164be47400SFrançois Tigeot  * if there are no progress, hangcheck score for that ring is increased.
4174be47400SFrançois Tigeot  * Further, acthd is inspected to see if the ring is stuck. On stuck case
4184be47400SFrançois Tigeot  * we kick the ring. If we see no progress on three subsequent calls
4194be47400SFrançois Tigeot  * we assume chip is wedged and try to fix it by resetting the chip.
4204be47400SFrançois Tigeot  */
i915_hangcheck_elapsed(struct work_struct * work)4214be47400SFrançois Tigeot static void i915_hangcheck_elapsed(struct work_struct *work)
4224be47400SFrançois Tigeot {
4234be47400SFrançois Tigeot 	struct drm_i915_private *dev_priv =
4244be47400SFrançois Tigeot 		container_of(work, typeof(*dev_priv),
4254be47400SFrançois Tigeot 			     gpu_error.hangcheck_work.work);
4264be47400SFrançois Tigeot 	struct intel_engine_cs *engine;
4274be47400SFrançois Tigeot 	enum intel_engine_id id;
4284be47400SFrançois Tigeot 	unsigned int hung = 0, stuck = 0;
4294be47400SFrançois Tigeot 	int busy_count = 0;
4304be47400SFrançois Tigeot 
431*3f2dd94aSFrançois Tigeot 	if (!i915_modparams.enable_hangcheck)
4324be47400SFrançois Tigeot 		return;
4334be47400SFrançois Tigeot 
4344be47400SFrançois Tigeot 	if (!READ_ONCE(dev_priv->gt.awake))
4354be47400SFrançois Tigeot 		return;
4364be47400SFrançois Tigeot 
437a85cb24fSFrançois Tigeot 	if (i915_terminally_wedged(&dev_priv->gpu_error))
438a85cb24fSFrançois Tigeot 		return;
439a85cb24fSFrançois Tigeot 
4404be47400SFrançois Tigeot 	/* As enabling the GPU requires fairly extensive mmio access,
4414be47400SFrançois Tigeot 	 * periodically arm the mmio checker to see if we are triggering
4424be47400SFrançois Tigeot 	 * any invalid access.
4434be47400SFrançois Tigeot 	 */
4444be47400SFrançois Tigeot 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
4454be47400SFrançois Tigeot 
4464be47400SFrançois Tigeot 	for_each_engine(engine, dev_priv, id) {
447a85cb24fSFrançois Tigeot 		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
448a85cb24fSFrançois Tigeot 		const bool busy = intel_engine_has_waiter(engine);
4494be47400SFrançois Tigeot 
4504be47400SFrançois Tigeot 		semaphore_clear_deadlocks(dev_priv);
4514be47400SFrançois Tigeot 
452a85cb24fSFrançois Tigeot 		hangcheck_load_sample(engine, hc);
453a85cb24fSFrançois Tigeot 		hangcheck_accumulate_sample(engine, hc);
454a85cb24fSFrançois Tigeot 		hangcheck_store_sample(engine, hc);
4554be47400SFrançois Tigeot 
456a85cb24fSFrançois Tigeot 		if (engine->hangcheck.stalled) {
4574be47400SFrançois Tigeot 			hung |= intel_engine_flag(engine);
458a85cb24fSFrançois Tigeot 			if (hc->action != ENGINE_DEAD)
4594be47400SFrançois Tigeot 				stuck |= intel_engine_flag(engine);
4604be47400SFrançois Tigeot 		}
4614be47400SFrançois Tigeot 
4624be47400SFrançois Tigeot 		busy_count += busy;
4634be47400SFrançois Tigeot 	}
4644be47400SFrançois Tigeot 
465a85cb24fSFrançois Tigeot 	if (hung)
466a85cb24fSFrançois Tigeot 		hangcheck_declare_hang(dev_priv, hung, stuck);
4674be47400SFrançois Tigeot 
4684be47400SFrançois Tigeot 	/* Reset timer in case GPU hangs without another request being added */
4694be47400SFrançois Tigeot 	if (busy_count)
4704be47400SFrançois Tigeot 		i915_queue_hangcheck(dev_priv);
4714be47400SFrançois Tigeot }
4724be47400SFrançois Tigeot 
intel_engine_init_hangcheck(struct intel_engine_cs * engine)4734be47400SFrançois Tigeot void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
4744be47400SFrançois Tigeot {
4754be47400SFrançois Tigeot 	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
4764be47400SFrançois Tigeot }
4774be47400SFrançois Tigeot 
intel_hangcheck_init(struct drm_i915_private * i915)4784be47400SFrançois Tigeot void intel_hangcheck_init(struct drm_i915_private *i915)
4794be47400SFrançois Tigeot {
4804be47400SFrançois Tigeot 	INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work,
4814be47400SFrançois Tigeot 			  i915_hangcheck_elapsed);
4824be47400SFrançois Tigeot }
483a85cb24fSFrançois Tigeot 
484a85cb24fSFrançois Tigeot #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
485a85cb24fSFrançois Tigeot #include "selftests/intel_hangcheck.c"
486a85cb24fSFrançois Tigeot #endif
487