// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

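/*
 * Byte offset of the scratch dword inside the hardware status page, used
 * below as the post-sync write target for MI_FLUSH_DW.
 */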
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

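/*
 * Full flush/invalidate for the gen6 render engine, preceded by the
 * post-sync-nonzero workaround sequence described above.
 */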
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

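/*
 * Write the request's seqno with a flushing PIPE_CONTROL and raise a user
 * interrupt. The first two PIPE_CONTROLs replay the post-sync-nonzero
 * workaround so that the final flushing write is valid.
 */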
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

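/*
 * Emit an MI_FLUSH_DW with a post-sync dword write to the HWSP scratch
 * slot; @flags lets callers add invalidate bits on top of the flush.
 */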
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

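/* Translate EMIT_INVALIDATE into the per-engine MI_FLUSH_DW invalidate bits. */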
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

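/*
 * Start a batch buffer on gen6. The batch is marked non-secure
 * (unprivileged) unless the caller requested I915_DISPATCH_SECURE.
 */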
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

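/*
 * Haswell variant of the above: non-secure batches additionally set
 * MI_BATCH_PPGTT_HSW to select the per-process GTT address space.
 */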
int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

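/*
 * Emit a CS-stall-only PIPE_CONTROL (stall at pixel scoreboard, no
 * flushes), used as a workaround barrier before certain invalidations.
 */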
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

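/*
 * gen7 render-engine flush: an unconditional CS stall plus a post-sync
 * QW write to scratch, with flush/invalidate bits selected from @mode.
 */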
int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

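/*
 * gen7 render breadcrumb: a single flushing PIPE_CONTROL writes the
 * seqno, followed by MI_USER_INTERRUPT.
 */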
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

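/*
 * Non-render (xcs) breadcrumb: MI_FLUSH_DW writes the seqno into the
 * status page, then the user interrupt is raised.
 */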
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

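/*
 * gen7 xcs breadcrumb. The extra GEN7_XCS_WA stores of the seqno are a
 * workaround, presumably to make sure the seqno write has actually landed
 * in the status page before the user interrupt is raised.
 */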
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

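/*
 * Unmask this engine's interrupts in both the ring IMR and the GT IMR;
 * the posting read orders the ring IMR write ahead of the GT IMR write.
 */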
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

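/*
 * On Haswell the VECS engine's interrupts are handled via the PM
 * interrupt registers, hence the gen6_gt_pm_*_irq() helpers below.
 */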
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}