// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "i915_selftest.h"

#include "gem/i915_gem_internal.h"
#include "gem/i915_gem_lmem.h"
#include "gem/i915_gem_region.h"

#include "gen8_engine_cs.h"
#include "i915_gem_ww.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_context.h"
#include "intel_gt.h"
#include "intel_ring.h"

#include "selftests/igt_flush_test.h"
#include "selftests/i915_random.h"

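/*
 * Write a single qword of @val into the CPU mapping of the object backing
 * @vma, at the offset corresponding to GTT address @addr.
 */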
static void vma_set_qw(struct i915_vma *vma, u64 addr, u64 val)
{
	GEM_BUG_ON(addr < i915_vma_offset(vma));
	GEM_BUG_ON(addr >= i915_vma_offset(vma) + i915_vma_size(vma) + sizeof(val));
	memset64(page_mask_bits(vma->obj->mm.mapping) +
		 (addr - i915_vma_offset(vma)), val, 1);
}

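/*
 * Build a batch that spins on MI_CONDITIONAL_BATCH_BUFFER_END, sampling a
 * qword at a randomly chosen, aligned GTT address backed by @va. Once the
 * spinner is running, rewrite the same PTE to point at @vb (which holds
 * zero at that address) and call @tlbinv; the request can only complete if
 * the invalidation makes the engine observe the new backing store. Passing
 * @va for both vmas (with a NULL @tlbinv) merely sanitychecks that the
 * semaphore wakes up on its own.
 */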
static int
pte_tlbinv(struct intel_context *ce,
	   struct i915_vma *va,
	   struct i915_vma *vb,
	   u64 align,
	   void (*tlbinv)(struct i915_address_space *vm, u64 addr, u64 length),
	   u64 length,
	   struct rnd_state *prng)
{
	const unsigned int pat_index =
		i915_gem_get_pat_index(ce->vm->i915, I915_CACHE_NONE);
	struct drm_i915_gem_object *batch;
	struct drm_mm_node vb_node;
	struct i915_request *rq;
	struct i915_vma *vma;
	u64 addr;
	int err;
	u32 *cs;

	batch = i915_gem_object_create_internal(ce->vm->i915, 4096);
	if (IS_ERR(batch))
		return PTR_ERR(batch);

	vma = i915_vma_instance(batch, ce->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto out;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		goto out;

	/* Pin va at a random but aligned offset after vma */
	addr = round_up(vma->node.start + vma->node.size, align);
	/* MI_CONDITIONAL_BATCH_BUFFER_END limits address to 48b */
	addr = igt_random_offset(prng, addr, min(ce->vm->total, BIT_ULL(48)),
				 va->size, align);
	err = i915_vma_pin(va, 0, 0, addr | PIN_OFFSET_FIXED | PIN_USER);
	if (err) {
		pr_err("Cannot pin at %llx+%llx\n", addr, va->size);
		goto out;
	}
	GEM_BUG_ON(i915_vma_offset(va) != addr);
	if (vb != va) {
		vb_node = vb->node;
		vb->node = va->node; /* overwrites the _same_ PTE */
	}

	/*
	 * Now choose a random dword within the first pinned page.
	 *
	 * SZ_64K pages on dg1 require that the whole PT be marked as
	 * containing 64KiB entries. So we make sure that the vma covers
	 * the whole PT, despite being randomly aligned to 64KiB, and
	 * restrict our sampling to the 2MiB PT within which we know we
	 * will be using 64KiB pages.
	 */
	if (align == SZ_64K)
		addr = round_up(addr, SZ_2M);
	addr = igt_random_offset(prng, addr, addr + align, 8, 8);

	if (va != vb)
		pr_info("%s(%s): Sampling %llx, with alignment %llx, using PTE size %x (phys %x, sg %x), invalidate:%llx+%llx\n",
			ce->engine->name, va->obj->mm.region->name ?: "smem",
			addr, align, va->resource->page_sizes_gtt,
			va->page_sizes.phys, va->page_sizes.sg,
			addr & -length, length);

	cs = i915_gem_object_pin_map_unlocked(batch, I915_MAP_WC);
	if (IS_ERR(cs)) {
		err = PTR_ERR(cs);
		goto out_va;
	}

	*cs++ = MI_NOOP; /* for later termination */
	/*
	 * Sample the target to see if we spot the updated backing store.
	 * Gen8 VCS compares the immediate value with the bitwise AND of two
	 * consecutive dwords pointed to by addr; other gens/engines compare
	 * the value with the single dword pointed to by addr. Moreover, we
	 * want to exercise dword-sized invalidations. The values below were
	 * chosen to fulfill all of these requirements.
	 */
	*cs++ = MI_CONDITIONAL_BATCH_BUFFER_END | MI_DO_COMPARE | 2;
	*cs++ = 0; /* break if *addr == 0 */
	*cs++ = lower_32_bits(addr);
	*cs++ = upper_32_bits(addr);
	vma_set_qw(va, addr, -1);
	vma_set_qw(vb, addr, 0);

	/* Keep sampling until we get bored */
	*cs++ = MI_BATCH_BUFFER_START | BIT(8) | 1;
	*cs++ = lower_32_bits(i915_vma_offset(vma));
	*cs++ = upper_32_bits(i915_vma_offset(vma));

	i915_gem_object_flush_map(batch);

	rq = i915_request_create(ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_va;
	}

	err = rq->engine->emit_bb_start(rq, i915_vma_offset(vma), 0, 0);
	if (err) {
		i915_request_add(rq);
		goto out_va;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	/*
	 * Short sleep to sanitycheck the batch is spinning before we begin.
	 * FIXME: Why is GSC so slow?
	 */
	if (ce->engine->class == OTHER_CLASS)
		msleep(200);
	else
		msleep(10);

	if (va == vb) {
		if (!i915_request_completed(rq)) {
			pr_err("%s(%s): Semaphore sanitycheck failed %llx, with alignment %llx, using PTE size %x (phys %x, sg %x)\n",
			       ce->engine->name, va->obj->mm.region->name ?: "smem",
			       addr, align, va->resource->page_sizes_gtt,
			       va->page_sizes.phys, va->page_sizes.sg);
			err = -EIO;
		}
	} else if (!i915_request_completed(rq)) {
		struct i915_vma_resource vb_res = {
			.bi.pages = vb->obj->mm.pages,
			.bi.page_sizes = vb->obj->mm.page_sizes,
			.start = i915_vma_offset(vb),
			.vma_size = i915_vma_size(vb)
		};
		unsigned int pte_flags = 0;

		/* Flip the PTE between A and B */
		if (i915_gem_object_is_lmem(vb->obj))
			pte_flags |= PTE_LM;
		ce->vm->insert_entries(ce->vm, &vb_res, pat_index, pte_flags);

		/* Flush the PTE update to concurrent HW */
		tlbinv(ce->vm, addr & -length, length);

		if (wait_for(i915_request_completed(rq), HZ / 2)) {
			pr_err("%s: Request did not complete; the COND_BBE did not read the updated PTE\n",
			       ce->engine->name);
			err = -EINVAL;
		}
	} else {
		pr_err("Spinner ended unexpectedly\n");
		err = -EIO;
	}
	i915_request_put(rq);

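	/* Terminate the spinner by replacing its leading MI_NOOP with a BB_END */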
	cs = page_mask_bits(batch->mm.mapping);
	*cs = MI_BATCH_BUFFER_END;
	wmb();

out_va:
	if (vb != va)
		vb->node = vb_node;
	i915_vma_unpin(va);
	if (i915_vma_unbind_unlocked(va))
		err = -EIO;
out:
	i915_gem_object_put(batch);
	return err;
}

static struct drm_i915_gem_object *create_lmem(struct intel_gt *gt)
{
	struct intel_memory_region *mr = gt->i915->mm.regions[INTEL_REGION_LMEM_0];
	resource_size_t size = SZ_1G;

	/*
	 * Allocating the largest possible page size allows us to test all
	 * page types. To succeed with both allocations, especially in the
	 * case of Small BAR, try to allocate no more than a quarter of the
	 * mappable memory.
	 */
	if (mr && size > resource_size(&mr->io) / 4)
		size = resource_size(&mr->io) / 4;

	return i915_gem_object_create_lmem(gt->i915, size, I915_BO_ALLOC_CONTIGUOUS);
}

static struct drm_i915_gem_object *create_smem(struct intel_gt *gt)
{
	/*
	 * SZ_64K pages require covering the whole 2M PT (gen8 to tgl/dg1).
	 * While that does not require the whole 2M block to be contiguous,
	 * it is easier to make it so, since we need that for SZ_2M pages.
	 * Since we randomly offset the start of the vma, we need a 4M object
	 * so that there is a 2M range within it that is suitable for SZ_64K
	 * PTEs.
	 */
	return i915_gem_object_create_internal(gt->i915, SZ_4M);
}

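/*
 * Run pte_tlbinv() against every engine and every supported page size,
 * backing the vmas with objects allocated by @create_fn and flushing the
 * PTE rewrite with @tlbinv.
 */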
static int
mem_tlbinv(struct intel_gt *gt,
	   struct drm_i915_gem_object *(*create_fn)(struct intel_gt *),
	   void (*tlbinv)(struct i915_address_space *vm, u64 addr, u64 length))
{
	unsigned int ppgtt_size = RUNTIME_INFO(gt->i915)->ppgtt_size;
	struct intel_engine_cs *engine;
	struct drm_i915_gem_object *A, *B;
	struct i915_ppgtt *ppgtt;
	struct i915_vma *va, *vb;
	enum intel_engine_id id;
	I915_RND_STATE(prng);
	void *vaddr;
	int err;

	/*
	 * Check that the TLB invalidate is able to revoke an active
	 * page. We load a page into a spinning COND_BBE loop and then
	 * remap that page to a new physical address. The old address is
	 * retained in the TLB cache (and so the loop keeps spinning) until
	 * we issue an invalidate.
	 */

	A = create_fn(gt);
	if (IS_ERR(A))
		return PTR_ERR(A);

	vaddr = i915_gem_object_pin_map_unlocked(A, I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto out_a;
	}

	B = create_fn(gt);
	if (IS_ERR(B)) {
		err = PTR_ERR(B);
		goto out_a;
	}

	vaddr = i915_gem_object_pin_map_unlocked(B, I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto out_b;
	}

	GEM_BUG_ON(A->base.size != B->base.size);
	if ((A->mm.page_sizes.phys | B->mm.page_sizes.phys) & (A->base.size - 1))
		pr_warn("Failed to allocate contiguous pages for size %zx\n",
			A->base.size);

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt)) {
		err = PTR_ERR(ppgtt);
		goto out_b;
	}

	va = i915_vma_instance(A, &ppgtt->vm, NULL);
	if (IS_ERR(va)) {
		err = PTR_ERR(va);
		goto out_vm;
	}

	vb = i915_vma_instance(B, &ppgtt->vm, NULL);
	if (IS_ERR(vb)) {
		err = PTR_ERR(vb);
		goto out_vm;
	}

	err = 0;
	for_each_engine(engine, gt, id) {
		struct i915_gem_ww_ctx ww;
		struct intel_context *ce;
		int bit;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			break;
		}

		i915_vm_put(ce->vm);
		ce->vm = i915_vm_get(&ppgtt->vm);

		for_i915_gem_ww(&ww, err, true)
			err = intel_context_pin_ww(ce, &ww);
		if (err)
			goto err_put;

		for_each_set_bit(bit,
				 (unsigned long *)&RUNTIME_INFO(gt->i915)->page_sizes,
				 BITS_PER_TYPE(RUNTIME_INFO(gt->i915)->page_sizes)) {
			unsigned int len;

			if (BIT_ULL(bit) < i915_vm_obj_min_alignment(va->vm, va->obj))
				continue;

			/* sanitycheck the semaphore wake up */
			err = pte_tlbinv(ce, va, va,
					 BIT_ULL(bit),
					 NULL, SZ_4K,
					 &prng);
			if (err)
				goto err_unpin;

			for (len = 2; len <= ppgtt_size; len = min(2 * len, ppgtt_size)) {
				err = pte_tlbinv(ce, va, vb,
						 BIT_ULL(bit),
						 tlbinv,
						 BIT_ULL(len),
						 &prng);
				if (err)
					goto err_unpin;
				if (len == ppgtt_size)
					break;
			}
		}
err_unpin:
		intel_context_unpin(ce);
err_put:
		intel_context_put(ce);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

out_vm:
	i915_vm_put(&ppgtt->vm);
out_b:
	i915_gem_object_put(B);
out_a:
	i915_gem_object_put(A);
	return err;
}

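/* Invalidate the GT's TLBs in full, ignoring the requested address range */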
static void tlbinv_full(struct i915_address_space *vm, u64 addr, u64 length)
{
	intel_gt_invalidate_tlb_full(vm->gt, intel_gt_tlb_seqno(vm->gt) | 1);
}

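/*
 * Check full-GT TLB invalidation against both smem and lmem backed objects,
 * treating -ENODEV/-ENXIO (e.g. no local memory on this platform) as a skip.
 */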
static int invalidate_full(void *arg)
{
	struct intel_gt *gt = arg;
	int err;

	if (GRAPHICS_VER(gt->i915) < 8)
		return 0; /* TLB invalidate not implemented */

	err = mem_tlbinv(gt, create_smem, tlbinv_full);
	if (err == 0)
		err = mem_tlbinv(gt, create_lmem, tlbinv_full);
	if (err == -ENODEV || err == -ENXIO)
		err = 0;

	return err;
}

int intel_tlb_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(invalidate_full),
	};
	struct intel_gt *gt;
	unsigned int i;

	for_each_gt(gt, i915, i) {
		int err;

		if (intel_gt_is_wedged(gt))
			continue;

		err = intel_gt_live_subtests(tests, gt);
		if (err)
			return err;
	}

	return 0;
}