// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_sriov.h"
#include "xe_vm.h"
#include "xe_wa.h"

#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
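/*
 * Illustrative example (not part of the programming interface): for a
 * PPGTT-mode context, xe_lrc_init() below builds a descriptor of
 * LRC_VALID | FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT) |
 * LRC_PRIVILEGE, i.e. 0x1 | 0x18 | 0x100 = 0x119 in the low bits;
 * xe_lrc_descriptor() then ORs in the context's GGTT address.
 */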

#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

struct xe_lrc_snapshot {
	struct xe_bo *lrc_bo;
	void *lrc_snapshot;
	unsigned long lrc_size, lrc_offset;

	u32 context_desc;
	u32 indirect_context_desc;
	u32 head;
	struct {
		u32 internal;
		u32 memory;
	} tail;
	u32 start_seqno;
	u32 seqno;
	u32 ctx_timestamp;
	u32 ctx_job_timestamp;
};

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size;

	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			size = 4 * SZ_4K;
		else
			size = 14 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		/* 14 pages since graphics_ver == 11 */
		if (GRAPHICS_VER(xe) >= 20)
			size = 3 * SZ_4K;
		else
			size = 14 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size = 2 * SZ_4K;
	}

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size;
}
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating a MI_LOAD_REGISTER_IMM command, allows setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values for in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for offsets bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: when a register offset needs more than 6 bits, use additional bytes,
 *      which follow, for the lower bits
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
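/*
 * Worked example (illustrative only), using entries from the tables below:
 *
 *   NOP(1)          -> 0x81: bit 7 set, skip one dword in @regs
 *   LRI(13, POSTED) -> 0x4d: emit MI_LOAD_REGISTER_IMM for 13 registers
 *                      with MI_LRI_FORCE_POSTED set
 *   REG(0x034)      -> 0x0d: one byte, decoded as 0x0d << 2 = 0x034
 *   REG16(0x244)    -> 0x81, 0x11: bit 7 of the first byte requests a
 *                      continuation byte, so the decoded offset is
 *                      ((0x01 << 7) | 0x11) << 2 = 0x244
 *
 * Every decoded offset is finally biased by the engine's mmio_base.
 */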
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x41] */
	LRI(1, 0),              /* [0x47] */
	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),                 /* [0x00] */
	LRI(5, POSTED),         /* [0x01] */
	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
	REG(0x038),             /* [0x06] RING_BUFFER_START */
	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),                 /* [0x0c] */
	LRI(9, POSTED),         /* [0x11] */
	REG(0x168),             /* [0x12] BB_ADDR_UDW */
	REG(0x140),             /* [0x14] BB_ADDR */
	REG(0x110),             /* [0x16] BB_STATE */
	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),                 /* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);

	/* TODO: Timestamp */
}

static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_PPHWSP_SIZE SZ_4K
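/*
 * Resulting driver-defined PPHWSP layout (illustrative summary):
 *
 *	0x000-0x1ff	HW-defined area
 *	0x200		seqno
 *	0x208		start seqno
 *	0x210		ctx job timestamp
 *	0x800		parallel execution scratch area
 *
 * The whole PPHWSP occupies the 4K page right after the ring buffer.
 */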

u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

static size_t lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}
static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* The ctx job timestamp is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel data is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	/* Indirect ring state page is at the very end of LRC */
	return lrc->size - LRC_INDIRECT_RING_STATE_SIZE;
}

#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)

#undef DECL_MAP_ADDR_HELPERS
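/*
 * For instance, DECL_MAP_ADDR_HELPERS(seqno) above expands to
 * __xe_lrc_seqno_map(), which returns an iosys_map advanced to
 * __xe_lrc_seqno_offset() within the LRC BO, and to
 * __xe_lrc_seqno_ggtt_addr(), which returns the matching GGTT address.
 */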

/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}
/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process HW Status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_lock(lrc->bo, false);
	xe_bo_unpin(lrc->bo);
	xe_bo_unlock(lrc->bo);
	xe_bo_put(lrc->bo);
}

#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)

static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, u32 ring_size)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	u32 lrc_size;
	int err;

	kref_init(&lrc->refcount);
	lrc->flags = 0;
	lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_VRAM_IF_DGFX(tile) |
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->size = lrc_size;
	lrc->tile = gt_to_tile(hwe->gt);
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;
	lrc->ctx_timestamp = 0;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init the Per-Process HW Status Page and the LRC / context state to
	 * known values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_gt_lrc_size(gt, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/*
	 * While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @ring_size: LRC ring size
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return: pointer to the created LRC upon success or an error pointer
 * upon failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     u32 ring_size)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, ring_size);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}
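/*
 * Example (lrc->ring.size is a power of two): with size = SZ_4K,
 * head = 0x40 and tail = 0x80, the free space is
 * ((0x40 - 0x80 - 1) & 0xfff) + 1 = 0xfc0 bytes; an empty ring
 * (head == tail) reports the full size.
 */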

static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
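/*
 * Usage sketch (illustrative): with ring.size = SZ_4K and ring.tail = 0xff8,
 * writing 16 bytes stores the first 8 bytes at offset 0xff8, wraps, stores
 * the remaining 8 bytes at offset 0 and leaves ring.tail at 0x8.
 */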

u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * an error pointer on failure.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
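/*
 * Example (illustrative): a GFXPIPE header whose bits 7:0 read 0x3 spans
 * 0x3 + 2 = 5 dwords in total, header included.
 */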

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords.  We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}

static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers.  Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited.  However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		bb->cs[bb->len] = instr;
		if (!is_single_dw)
			bb->cs[bb->len] |= (num_dw - 2);

		bb->len += num_dw;
	}
}

struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	if (lrc->bo->vm)
		xe_vm_get(lrc->bo->vm);

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct xe_vm *vm;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	vm = bo->vm;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
	if (vm)
		xe_vm_put(vm);
}

void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo) {
		struct xe_vm *vm;

		vm = snapshot->lrc_bo->vm;
		xe_bo_put(snapshot->lrc_bo);
		if (vm)
			xe_vm_put(vm);
	}
	kfree(snapshot);
}

/**
 * xe_lrc_update_timestamp() - Update ctx timestamp
 * @lrc: Pointer to the lrc.
 * @old_ts: Old timestamp value
 *
 * Populate @old_ts with the currently saved ctx timestamp, read the new ctx
 * timestamp and update the saved value.
 *
 * Returns: New ctx timestamp value
 */
u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
{
	*old_ts = lrc->ctx_timestamp;

	lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);

	return lrc->ctx_timestamp;
}