1  // SPDX-License-Identifier: MIT
2  /*
3   * Copyright © 2019 Intel Corporation
4   */
5  
6  #include "i915_drv.h"
7  #include "i915_request.h"
8  
9  #include "intel_context.h"
10  #include "intel_engine_heartbeat.h"
11  #include "intel_engine_pm.h"
12  #include "intel_engine.h"
13  #include "intel_gt.h"
14  #include "intel_reset.h"
15  
16  /*
17   * While the engine is active, we send a periodic pulse along the engine
18   * to check on its health and to flush any idle-barriers. If that request
19   * is stuck, and we fail to preempt it, we declare the engine hung and
20   * issue a reset -- in the hope that restores progress.
21   */
22  
next_heartbeat(struct intel_engine_cs * engine)23  static bool next_heartbeat(struct intel_engine_cs *engine)
24  {
25  	struct i915_request *rq;
26  	long delay;
27  
28  	delay = READ_ONCE(engine->props.heartbeat_interval_ms);
29  
30  	rq = engine->heartbeat.systole;
31  
32  	/*
33  	 * FIXME: The final period extension is disabled if the period has been
34  	 * modified from the default. This is to prevent issues with certain
35  	 * selftests which override the value and expect specific behaviour.
36  	 * Once the selftests have been updated to either cope with variable
37  	 * heartbeat periods (or to override the pre-emption timeout as well,
38  	 * or just to add a selftest specific override of the extension), the
39  	 * generic override can be removed.
40  	 */
41  	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
42  	    delay == engine->defaults.heartbeat_interval_ms) {
43  		long longer;
44  
45  		/*
46  		 * The final try is at the highest priority possible. Up until now
47  		 * a pre-emption might not even have been attempted. So make sure
48  		 * this last attempt allows enough time for a pre-emption to occur.
49  		 */
50  		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
51  		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
52  		if (longer > delay)
53  			delay = longer;
54  	}
55  
56  	if (!delay)
57  		return false;
58  
59  	delay = msecs_to_jiffies_timeout(delay);
60  	if (delay >= HZ)
61  		delay = round_jiffies_up_relative(delay);
62  	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
63  
64  	return true;
65  }
66  
67  static struct i915_request *
heartbeat_create(struct intel_context * ce,gfp_t gfp)68  heartbeat_create(struct intel_context *ce, gfp_t gfp)
69  {
70  	struct i915_request *rq;
71  
72  	intel_context_enter(ce);
73  	rq = __i915_request_create(ce, gfp);
74  	intel_context_exit(ce);
75  
76  	return rq;
77  }
78  
idle_pulse(struct intel_engine_cs * engine,struct i915_request * rq)79  static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
80  {
81  	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
82  	i915_request_add_active_barriers(rq);
83  	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
84  		engine->heartbeat.systole = i915_request_get(rq);
85  }
86  
heartbeat_commit(struct i915_request * rq,const struct i915_sched_attr * attr)87  static void heartbeat_commit(struct i915_request *rq,
88  			     const struct i915_sched_attr *attr)
89  {
90  	idle_pulse(rq->engine, rq);
91  
92  	__i915_request_commit(rq);
93  	__i915_request_queue(rq, attr);
94  }
95  
show_heartbeat(const struct i915_request * rq,struct intel_engine_cs * engine)96  static void show_heartbeat(const struct i915_request *rq,
97  			   struct intel_engine_cs *engine)
98  {
99  	struct drm_printer p =
100  		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");
101  
102  	if (!rq) {
103  		intel_engine_dump(engine, &p,
104  				  "%s heartbeat not ticking\n",
105  				  engine->name);
106  	} else {
107  		intel_engine_dump(engine, &p,
108  				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
109  				  engine->name,
110  				  rq->fence.context,
111  				  rq->fence.seqno,
112  				  rq->sched.attr.priority);
113  	}
114  }
115  
116  static void
reset_engine(struct intel_engine_cs * engine,struct i915_request * rq)117  reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
118  {
119  	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
120  		show_heartbeat(rq, engine);
121  
122  	if (intel_engine_uses_guc(engine))
123  		/*
124  		 * GuC itself is toast or GuC's hang detection
125  		 * is disabled. Either way, need to find the
126  		 * hang culprit manually.
127  		 */
128  		intel_guc_find_hung_context(engine);
129  
130  	intel_gt_handle_error(engine->gt, engine->mask,
131  			      I915_ERROR_CAPTURE,
132  			      "stopped heartbeat on %s",
133  			      engine->name);
134  }
135  
heartbeat(struct work_struct * wrk)136  static void heartbeat(struct work_struct *wrk)
137  {
138  	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
139  	struct intel_engine_cs *engine =
140  		container_of(wrk, typeof(*engine), heartbeat.work.work);
141  	struct intel_context *ce = engine->kernel_context;
142  	struct i915_request *rq;
143  	unsigned long serial;
144  
145  	/* Just in case everything has gone horribly wrong, give it a kick */
146  	intel_engine_flush_submission(engine);
147  
148  	rq = engine->heartbeat.systole;
149  	if (rq && i915_request_completed(rq)) {
150  		i915_request_put(rq);
151  		engine->heartbeat.systole = NULL;
152  	}
153  
154  	if (!intel_engine_pm_get_if_awake(engine))
155  		return;
156  
157  	if (intel_gt_is_wedged(engine->gt))
158  		goto out;
159  
160  	if (i915_sched_engine_disabled(engine->sched_engine)) {
161  		reset_engine(engine, engine->heartbeat.systole);
162  		goto out;
163  	}
164  
165  	if (engine->heartbeat.systole) {
166  		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
167  
168  		/* Safeguard against too-fast worker invocations */
169  		if (!time_after(jiffies,
170  				rq->emitted_jiffies + msecs_to_jiffies(delay)))
171  			goto out;
172  
173  		if (!i915_sw_fence_signaled(&rq->submit)) {
174  			/*
175  			 * Not yet submitted, system is stalled.
176  			 *
177  			 * This more often happens for ring submission,
178  			 * where all contexts are funnelled into a common
179  			 * ringbuffer. If one context is blocked on an
180  			 * external fence, not only is it not submitted,
181  			 * but all other contexts, including the kernel
182  			 * context are stuck waiting for the signal.
183  			 */
184  		} else if (engine->sched_engine->schedule &&
185  			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
186  			/*
187  			 * Gradually raise the priority of the heartbeat to
188  			 * give high priority work [which presumably desires
189  			 * low latency and no jitter] the chance to naturally
190  			 * complete before being preempted.
191  			 */
192  			attr.priority = I915_PRIORITY_NORMAL;
193  			if (rq->sched.attr.priority >= attr.priority)
194  				attr.priority = I915_PRIORITY_HEARTBEAT;
195  			if (rq->sched.attr.priority >= attr.priority)
196  				attr.priority = I915_PRIORITY_BARRIER;
197  
198  			local_bh_disable();
199  			engine->sched_engine->schedule(rq, &attr);
200  			local_bh_enable();
201  		} else {
202  			reset_engine(engine, rq);
203  		}
204  
205  		rq->emitted_jiffies = jiffies;
206  		goto out;
207  	}
208  
209  	serial = READ_ONCE(engine->serial);
210  	if (engine->wakeref_serial == serial)
211  		goto out;
212  
213  	if (!mutex_trylock(&ce->timeline->mutex)) {
214  		/* Unable to lock the kernel timeline, is the engine stuck? */
215  		if (xchg(&engine->heartbeat.blocked, serial) == serial)
216  			intel_gt_handle_error(engine->gt, engine->mask,
217  					      I915_ERROR_CAPTURE,
218  					      "no heartbeat on %s",
219  					      engine->name);
220  		goto out;
221  	}
222  
223  	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
224  	if (IS_ERR(rq))
225  		goto unlock;
226  
227  	heartbeat_commit(rq, &attr);
228  
229  unlock:
230  	mutex_unlock(&ce->timeline->mutex);
231  out:
232  	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
233  		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
234  	intel_engine_pm_put(engine);
235  }
236  
intel_engine_unpark_heartbeat(struct intel_engine_cs * engine)237  void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
238  {
239  	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
240  		return;
241  
242  	next_heartbeat(engine);
243  }
244  
intel_engine_park_heartbeat(struct intel_engine_cs * engine)245  void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
246  {
247  	if (cancel_delayed_work(&engine->heartbeat.work))
248  		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
249  }
250  
intel_gt_unpark_heartbeats(struct intel_gt * gt)251  void intel_gt_unpark_heartbeats(struct intel_gt *gt)
252  {
253  	struct intel_engine_cs *engine;
254  	enum intel_engine_id id;
255  
256  	for_each_engine(engine, gt, id)
257  		if (intel_engine_pm_is_awake(engine))
258  			intel_engine_unpark_heartbeat(engine);
259  }
260  
intel_gt_park_heartbeats(struct intel_gt * gt)261  void intel_gt_park_heartbeats(struct intel_gt *gt)
262  {
263  	struct intel_engine_cs *engine;
264  	enum intel_engine_id id;
265  
266  	for_each_engine(engine, gt, id)
267  		intel_engine_park_heartbeat(engine);
268  }
269  
intel_engine_init_heartbeat(struct intel_engine_cs * engine)270  void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
271  {
272  	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
273  }
274  
__intel_engine_pulse(struct intel_engine_cs * engine)275  static int __intel_engine_pulse(struct intel_engine_cs *engine)
276  {
277  	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
278  	struct intel_context *ce = engine->kernel_context;
279  	struct i915_request *rq;
280  
281  	lockdep_assert_held(&ce->timeline->mutex);
282  	GEM_BUG_ON(!intel_engine_has_preemption(engine));
283  	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
284  
285  	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
286  	if (IS_ERR(rq))
287  		return PTR_ERR(rq);
288  
289  	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
290  
291  	heartbeat_commit(rq, &attr);
292  	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
293  
294  	/* Ensure the forced pulse gets a full period to execute */
295  	next_heartbeat(engine);
296  
297  	return 0;
298  }
299  
set_heartbeat(struct intel_engine_cs * engine,unsigned long delay)300  static unsigned long set_heartbeat(struct intel_engine_cs *engine,
301  				   unsigned long delay)
302  {
303  	unsigned long old;
304  
305  	old = xchg(&engine->props.heartbeat_interval_ms, delay);
306  	if (delay)
307  		intel_engine_unpark_heartbeat(engine);
308  	else
309  		intel_engine_park_heartbeat(engine);
310  
311  	return old;
312  }
313  
intel_engine_set_heartbeat(struct intel_engine_cs * engine,unsigned long delay)314  int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
315  			       unsigned long delay)
316  {
317  	struct intel_context *ce = engine->kernel_context;
318  	int err = 0;
319  
320  	if (!delay && !intel_engine_has_preempt_reset(engine))
321  		return -ENODEV;
322  
323  	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
324  	if (delay != engine->defaults.heartbeat_interval_ms &&
325  	    delay < 2 * engine->props.preempt_timeout_ms) {
326  		if (intel_engine_uses_guc(engine))
327  			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
328  				   engine->name);
329  		else
330  			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
331  				   engine->name);
332  	}
333  
334  	intel_engine_pm_get(engine);
335  
336  	err = mutex_lock_interruptible(&ce->timeline->mutex);
337  	if (err)
338  		goto out_rpm;
339  
340  	if (delay != engine->props.heartbeat_interval_ms) {
341  		unsigned long saved = set_heartbeat(engine, delay);
342  
343  		/* recheck current execution */
344  		if (intel_engine_has_preemption(engine)) {
345  			err = __intel_engine_pulse(engine);
346  			if (err)
347  				set_heartbeat(engine, saved);
348  		}
349  	}
350  
351  	mutex_unlock(&ce->timeline->mutex);
352  
353  out_rpm:
354  	intel_engine_pm_put(engine);
355  	return err;
356  }
357  
intel_engine_pulse(struct intel_engine_cs * engine)358  int intel_engine_pulse(struct intel_engine_cs *engine)
359  {
360  	struct intel_context *ce = engine->kernel_context;
361  	int err;
362  
363  	if (!intel_engine_has_preemption(engine))
364  		return -ENODEV;
365  
366  	if (!intel_engine_pm_get_if_awake(engine))
367  		return 0;
368  
369  	err = -EINTR;
370  	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
371  		err = __intel_engine_pulse(engine);
372  		mutex_unlock(&ce->timeline->mutex);
373  	}
374  
375  	intel_engine_flush_submission(engine);
376  	intel_engine_pm_put(engine);
377  	return err;
378  }
379  
intel_engine_flush_barriers(struct intel_engine_cs * engine)380  int intel_engine_flush_barriers(struct intel_engine_cs *engine)
381  {
382  	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
383  	struct intel_context *ce = engine->kernel_context;
384  	struct i915_request *rq;
385  	int err;
386  
387  	if (llist_empty(&engine->barrier_tasks))
388  		return 0;
389  
390  	if (!intel_engine_pm_get_if_awake(engine))
391  		return 0;
392  
393  	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
394  		err = -EINTR;
395  		goto out_rpm;
396  	}
397  
398  	rq = heartbeat_create(ce, GFP_KERNEL);
399  	if (IS_ERR(rq)) {
400  		err = PTR_ERR(rq);
401  		goto out_unlock;
402  	}
403  
404  	heartbeat_commit(rq, &attr);
405  
406  	err = 0;
407  out_unlock:
408  	mutex_unlock(&ce->timeline->mutex);
409  out_rpm:
410  	intel_engine_pm_put(engine);
411  	return err;
412  }
413  
414  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
415  #include "selftest_engine_heartbeat.c"
416  #endif
417