// SPDX-License-Identifier: GPL-2.0
/*
 * KVM dirty page logging test
 *
 * Copyright (C) 2018, Red Hat, Inc.
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/atomic.h>
#include <asm/barrier.h>

#include "kvm_util.h"
#include "test_util.h"
#include "guest_modes.h"
#include "processor.h"
#include "ucall_common.h"

#define DIRTY_MEM_BITS 30 /* 1G */
#define PAGE_SHIFT_4K  12

/* The memory slot index to track dirty pages */
#define TEST_MEM_SLOT_INDEX		1

/* Default guest test virtual memory offset */
#define DEFAULT_GUEST_TEST_MEM		0xc0000000

/* How many pages to dirty for each guest loop */
#define TEST_PAGES_PER_LOOP		1024

/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
#define TEST_HOST_LOOP_N		32UL

/* Interval for each host loop (ms) */
#define TEST_HOST_LOOP_INTERVAL		10UL

/* Dirty bitmaps are always little endian, so we need to swap on big endian */
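/*
 * On big endian, XORing a bit number with BITOP_LE_SWIZZLE (56 with 64-bit
 * longs) mirrors the byte position within the word while keeping the bit
 * position within the byte, mirroring the kernel's asm-generic little-endian
 * bitops.
 */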
#if defined(__s390x__)
# define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
# define test_bit_le(nr, addr) \
	test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define __set_bit_le(nr, addr) \
	__set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define __clear_bit_le(nr, addr) \
	__clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define __test_and_set_bit_le(nr, addr) \
	__test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define __test_and_clear_bit_le(nr, addr) \
	__test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
#else
# define test_bit_le			test_bit
# define __set_bit_le			__set_bit
# define __clear_bit_le			__clear_bit
# define __test_and_set_bit_le		__test_and_set_bit
# define __test_and_clear_bit_le	__test_and_clear_bit
#endif

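/*
 * Default number of entries in the per-vcpu dirty ring; a different size can
 * be requested with -c and is capped in dirty_ring_create_vm_done().
 */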
#define TEST_DIRTY_RING_COUNT		65536

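/* Signal used by vcpu_kick() to force the vcpu thread out of KVM_RUN */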
#define SIG_IPI SIGUSR1

/*
 * Guest/Host shared variables. Ensure addr_gva2hva() and/or
 * sync_global_to/from_guest() are used when accessing from
 * the host. READ/WRITE_ONCE() should also be used with anything
 * that may change.
 */
static uint64_t host_page_size;
static uint64_t guest_page_size;
static uint64_t guest_num_pages;
static uint64_t iteration;

/*
 * Guest physical memory offset of the testing memory slot.
 * This will be set to the topmost valid physical address minus
 * the test memory size.
 */
static uint64_t guest_test_phys_mem;

/*
 * Guest virtual memory offset of the testing memory slot.
 * Must not conflict with identity mapped test code.
 */
static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;

/*
 * Continuously write to the first 8 bytes of random pages within
 * the testing memory region.
 */
static void guest_code(void)
{
	uint64_t addr;
	int i;

	/*
	 * On s390x, all pages of a 1M segment are initially marked as dirty
	 * when a page of the segment is written to for the very first time.
	 * To compensate for this peculiarity, the test needs to touch all
	 * pages during the first iteration.
	 */
	for (i = 0; i < guest_num_pages; i++) {
		addr = guest_test_virt_mem + i * guest_page_size;
		vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration));
	}

	while (true) {
		for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
			addr = guest_test_virt_mem;
			addr += (guest_random_u64(&guest_rng) % guest_num_pages)
				* guest_page_size;
			addr = align_down(addr, host_page_size);

			vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration));
		}

		GUEST_SYNC(1);
	}
}

/* Host variables */
static bool host_quit;

/* Points to the test VM memory region on which we track dirty logs */
static void *host_test_mem;
static uint64_t host_num_pages;

/* For statistics only */
static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;

/* Whether dirty ring reset is requested, or finished */
static sem_t sem_vcpu_stop;
static sem_t sem_vcpu_cont;
/*
 * This is only set by the main thread, and only cleared by the vcpu thread.
 * It is used to request that the vcpu thread stop at the next GUEST_SYNC,
 * since GUEST_SYNC is the only place where we can guarantee that both the
 * "dirty bit" and the "dirty data" match.  E.g., SIG_IPI gives no such
 * guarantee if the vcpu is interrupted after setting the dirty bit but
 * before the data is written.
 */
static atomic_t vcpu_sync_stop_requested;
/*
 * This is updated by the vcpu thread to tell the host whether it's a
 * ring-full event.  It should only be read after a sem_wait() of
 * sem_vcpu_stop and before the vcpu continues to run.
 */
static bool dirty_ring_vcpu_ring_full;
/*
 * This is only used for verifying the dirty pages.  The dirty ring has a very
 * tricky case when the ring just got full: KVM will do a userspace exit due
 * to the ring being full.  When that happens, the very last PFN is set but
 * the data is not actually changed (the guest WRITE is not really applied
 * yet), because we found that the dirty ring is full, refused to continue
 * running the vcpu, and recorded the dirty gfn with the old contents.
 *
 * For this specific case, it's safe to skip checking this pfn for this
 * bit, because it's a redundant bit, and when the write happens later the bit
 * will be set again.  We use this variable to always keep track of the latest
 * dirty gfn we've collected, so that if a data mismatch is found later in the
 * verifying process, we let it pass.
 */
static uint64_t dirty_ring_last_page;

enum log_mode_t {
	/* Only use KVM_GET_DIRTY_LOG for logging */
	LOG_MODE_DIRTY_LOG = 0,

	/* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
	LOG_MODE_CLEAR_LOG = 1,

	/* Use dirty ring for logging */
	LOG_MODE_DIRTY_RING = 2,

	LOG_MODE_NUM,

	/* Run all supported modes */
	LOG_MODE_ALL = LOG_MODE_NUM,
};

/* Mode of logging to test.  Default is to run all supported modes */
static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
/* Logging mode for current run */
static enum log_mode_t host_log_mode;
static pthread_t vcpu_thread;
static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT;

static void vcpu_kick(void)
{
	pthread_kill(vcpu_thread, SIG_IPI);
}

/*
 * Our test plays signal tricks, so use a wrapper around sem_wait() that
 * retries if it is interrupted by a signal.
 */
static void sem_wait_until(sem_t *sem)
{
	int ret;

	do
		ret = sem_wait(sem);
	while (ret == -1 && errno == EINTR);
}

static bool clear_log_supported(void)
{
	return kvm_has_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
}

static void clear_log_create_vm_done(struct kvm_vm *vm)
{
	u64 manual_caps;

	manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
	TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
	manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			KVM_DIRTY_LOG_INITIALLY_SET);
	vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, manual_caps);
}

static void dirty_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
					  void *bitmap, uint32_t num_pages,
					  uint32_t *unused)
{
	kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap);
}

static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
					  void *bitmap, uint32_t num_pages,
					  uint32_t *unused)
{
	kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap);
	kvm_vm_clear_dirty_log(vcpu->vm, slot, bitmap, 0, num_pages);
}

/* Should only be called after a GUEST_SYNC */
static void vcpu_handle_sync_stop(void)
{
	if (atomic_read(&vcpu_sync_stop_requested)) {
		/* The main thread is sleeping, waiting for us to stop */
		atomic_set(&vcpu_sync_stop_requested, false);
		sem_post(&sem_vcpu_stop);
		sem_wait_until(&sem_vcpu_cont);
	}
}

static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
{
	struct kvm_run *run = vcpu->run;

	TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR),
		    "vcpu run failed: errno=%d", err);

	TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
		    "Invalid guest sync status: exit_reason=%s",
		    exit_reason_str(run->exit_reason));

	vcpu_handle_sync_stop();
}

static bool dirty_ring_supported(void)
{
	return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) ||
		kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL));
}

static void dirty_ring_create_vm_done(struct kvm_vm *vm)
{
	uint64_t pages;
	uint32_t limit;

	/*
	 * We rely on vcpu exit due to full dirty ring state. Adjust
	 * the ring buffer size to ensure we're able to reach the
	 * full dirty ring state.
	 */
	pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3;
	pages = vm_adjust_num_guest_pages(vm->mode, pages);
	if (vm->page_size < getpagesize())
		pages = vm_num_host_pages(vm->mode, pages);

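	/*
	 * Round both the page count and the requested ring size down to a
	 * power of two (1 << (31 - clz(x)) is the largest power of two <= x)
	 * and cap the ring at the number of test pages, so that one round of
	 * dirtying can actually fill the ring.  KVM also expects the ring
	 * size to be a power of two.
	 */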
	limit = 1 << (31 - __builtin_clz(pages));
	test_dirty_ring_count = 1 << (31 - __builtin_clz(test_dirty_ring_count));
	test_dirty_ring_count = min(limit, test_dirty_ring_count);
	pr_info("dirty ring count: 0x%x\n", test_dirty_ring_count);

	/*
	 * Switch to dirty ring mode after VM creation but before any
	 * vcpus are created.
	 */
	vm_enable_dirty_ring(vm, test_dirty_ring_count *
			     sizeof(struct kvm_dirty_gfn));
}

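/*
 * Ring entry handshake with KVM: KVM publishes a harvestable entry by
 * setting KVM_DIRTY_GFN_F_DIRTY in ->flags, and userspace hands it back by
 * writing KVM_DIRTY_GFN_F_RESET.  The acquire/release pairing ensures the
 * slot/offset fields are read only after the entry is seen as dirty, and
 * that KVM recycles the entry only after we are done reading it.
 */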
static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
	return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
}

static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
	smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
}

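/*
 * Harvest all entries KVM has published since the last call: set the
 * corresponding bits in @bitmap, mark each entry as collected and advance
 * *@fetch_index.  Returns the number of entries harvested.
 */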
static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
				       int slot, void *bitmap,
				       uint32_t num_pages, uint32_t *fetch_index)
{
	struct kvm_dirty_gfn *cur;
	uint32_t count = 0;

	while (true) {
		cur = &dirty_gfns[*fetch_index % test_dirty_ring_count];
		if (!dirty_gfn_is_dirtied(cur))
			break;
		TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
			    "%u != %u", cur->slot, slot);
		TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
			    "0x%llx >= 0x%x", cur->offset, num_pages);
		//pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset);
		__set_bit_le(cur->offset, bitmap);
		dirty_ring_last_page = cur->offset;
		dirty_gfn_set_collected(cur);
		(*fetch_index)++;
		count++;
	}

	return count;
}

static void dirty_ring_wait_vcpu(void)
{
	/* This makes sure that the hardware PML cache is flushed */
	vcpu_kick();
	sem_wait_until(&sem_vcpu_stop);
}

static void dirty_ring_continue_vcpu(void)
{
	pr_info("Notifying vcpu to continue\n");
	sem_post(&sem_vcpu_cont);
}

static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
					   void *bitmap, uint32_t num_pages,
					   uint32_t *ring_buf_idx)
{
	uint32_t count = 0, cleared;
	bool continued_vcpu = false;

	dirty_ring_wait_vcpu();

	if (!dirty_ring_vcpu_ring_full) {
		/*
		 * This is not a ring-full event, it's safe to allow
		 * vcpu to continue
		 */
		dirty_ring_continue_vcpu();
		continued_vcpu = true;
	}

	/* Only have one vcpu */
	count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu),
				       slot, bitmap, num_pages,
				       ring_buf_idx);

	cleared = kvm_vm_reset_dirty_ring(vcpu->vm);

	/*
	 * Cleared pages should be the same as collected, as KVM is supposed to
	 * clear only the entries that have been harvested.
	 */
	TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
		    "with collected (%u)", cleared, count);

	if (!continued_vcpu) {
		TEST_ASSERT(dirty_ring_vcpu_ring_full,
			    "Didn't continue vcpu even without ring full");
		dirty_ring_continue_vcpu();
	}

	pr_info("Iteration %ld collected %u pages\n", iteration, count);
}

static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
{
	struct kvm_run *run = vcpu->run;

	/* A ucall-sync or ring-full event is allowed */
	if (get_ucall(vcpu, NULL) == UCALL_SYNC) {
		/* We should allow this to continue */
		;
	} else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL ||
		   (ret == -1 && err == EINTR)) {
		/* Update the flag first before pause */
		WRITE_ONCE(dirty_ring_vcpu_ring_full,
			   run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
		sem_post(&sem_vcpu_stop);
		pr_info("vcpu stops because %s...\n",
			dirty_ring_vcpu_ring_full ?
			"dirty ring is full" : "vcpu is kicked out");
		sem_wait_until(&sem_vcpu_cont);
		pr_info("vcpu continues now.\n");
	} else {
		TEST_ASSERT(false, "Invalid guest sync status: "
			    "exit_reason=%s",
			    exit_reason_str(run->exit_reason));
	}
}

struct log_mode {
	const char *name;
	/* Return true if this mode is supported, otherwise false */
	bool (*supported)(void);
	/* Hook when the vm creation is done (before vcpu creation) */
	void (*create_vm_done)(struct kvm_vm *vm);
	/* Hook to collect the dirty pages into the bitmap provided */
	void (*collect_dirty_pages) (struct kvm_vcpu *vcpu, int slot,
				     void *bitmap, uint32_t num_pages,
				     uint32_t *ring_buf_idx);
	/* Hook to call after each vcpu run */
	void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err);
} log_modes[LOG_MODE_NUM] = {
	{
		.name = "dirty-log",
		.collect_dirty_pages = dirty_log_collect_dirty_pages,
		.after_vcpu_run = default_after_vcpu_run,
	},
	{
		.name = "clear-log",
		.supported = clear_log_supported,
		.create_vm_done = clear_log_create_vm_done,
		.collect_dirty_pages = clear_log_collect_dirty_pages,
		.after_vcpu_run = default_after_vcpu_run,
	},
	{
		.name = "dirty-ring",
		.supported = dirty_ring_supported,
		.create_vm_done = dirty_ring_create_vm_done,
		.collect_dirty_pages = dirty_ring_collect_dirty_pages,
		.after_vcpu_run = dirty_ring_after_vcpu_run,
	},
};

/*
 * We use this bitmap to track pages that should have their dirty bits set
 * in the _next_ iteration.  For example, if we detect that a page's value
 * has changed to the current iteration number but its bit is cleared in the
 * latest bitmap, then the system must report that write in the next get
 * dirty log call.
 */
static unsigned long *host_bmap_track;

static void log_modes_dump(void)
{
	int i;

	printf("all");
	for (i = 0; i < LOG_MODE_NUM; i++)
		printf(", %s", log_modes[i].name);
	printf("\n");
}

static bool log_mode_supported(void)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->supported)
		return mode->supported();

	return true;
}

static void log_mode_create_vm_done(struct kvm_vm *vm)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->create_vm_done)
		mode->create_vm_done(vm);
}

static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot,
					 void *bitmap, uint32_t num_pages,
					 uint32_t *ring_buf_idx)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	TEST_ASSERT(mode->collect_dirty_pages != NULL,
		    "collect_dirty_pages() is required for any log mode!");
	mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages, ring_buf_idx);
}

static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
{
	struct log_mode *mode = &log_modes[host_log_mode];

	if (mode->after_vcpu_run)
		mode->after_vcpu_run(vcpu, ret, err);
}

static void *vcpu_worker(void *data)
{
	int ret;
	struct kvm_vcpu *vcpu = data;
	uint64_t pages_count = 0;
	struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset)
						 + sizeof(sigset_t));
	sigset_t *sigset = (sigset_t *) &sigmask->sigset;

	/*
	 * SIG_IPI is unblocked atomically while in KVM_RUN.  It causes the
	 * ioctl to return with -EINTR, but it is still pending and we need
	 * to accept it with the sigwait.
	 */
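	/*
	 * KVM_SET_SIGNAL_MASK expects the kernel's sigset size (8 bytes),
	 * not the much larger glibc sigset_t.
	 */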
	sigmask->len = 8;
	pthread_sigmask(0, NULL, sigset);
	sigdelset(sigset, SIG_IPI);
	vcpu_ioctl(vcpu, KVM_SET_SIGNAL_MASK, sigmask);

	sigemptyset(sigset);
	sigaddset(sigset, SIG_IPI);

	while (!READ_ONCE(host_quit)) {
		/* Clear any existing kick signals */
		pages_count += TEST_PAGES_PER_LOOP;
		/* Let the guest dirty the random pages */
		ret = __vcpu_run(vcpu);
		if (ret == -1 && errno == EINTR) {
			int sig = -1;
			sigwait(sigset, &sig);
			assert(sig == SIG_IPI);
		}
		log_mode_after_vcpu_run(vcpu, ret, errno);
	}

	pr_info("Dirtied %"PRIu64" pages\n", pages_count);

	return NULL;
}

static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
{
	uint64_t step = vm_num_host_pages(mode, 1);
	uint64_t page;
	uint64_t *value_ptr;
	uint64_t min_iter = 0;

	for (page = 0; page < host_num_pages; page += step) {
		value_ptr = host_test_mem + page * host_page_size;

		/* If this is a special page that we were tracking... */
		if (__test_and_clear_bit_le(page, host_bmap_track)) {
			host_track_next_count++;
			TEST_ASSERT(test_bit_le(page, bmap),
				    "Page %"PRIu64" should have its dirty bit "
				    "set in this iteration but it is missing",
				    page);
		}

		if (__test_and_clear_bit_le(page, bmap)) {
			bool matched;

			host_dirty_count++;

			/*
			 * If the bit is set, the value written onto
			 * the corresponding page should be either the
			 * previous iteration number or the current one.
			 */
			matched = (*value_ptr == iteration ||
				   *value_ptr == iteration - 1);

			if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) {
				if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) {
					/*
					 * Short answer: this case is special
					 * only for dirty ring test where the
					 * page is the last page before a kvm
					 * dirty ring full in iteration N-2.
					 *
					 * Long answer: Assuming ring size R,
					 * one possible condition is:
					 *
					 *      main thr       vcpu thr
					 *      --------       --------
					 *    iter=1
					 *                   write 1 to page 0~(R-1)
					 *                   full, vmexit
					 *    collect 0~(R-1)
					 *    kick vcpu
					 *                   write 1 to (R-1)~(2R-2)
					 *                   full, vmexit
					 *    iter=2
					 *    collect (R-1)~(2R-2)
					 *    kick vcpu
					 *                   write 1 to (2R-2)
					 *                   (NOTE!!! "1" cached in cpu reg)
					 *                   write 2 to (2R-1)~(3R-3)
					 *                   full, vmexit
					 *    iter=3
					 *    collect (2R-2)~(3R-3)
					 *    (here if we read value on page
					 *     "2R-2" is 1, while iter=3!!!)
					 *
					 * This however can only happen once per iteration.
					 */
					min_iter = iteration - 1;
					continue;
				} else if (page == dirty_ring_last_page) {
					/*
					 * Please refer to comments in
					 * dirty_ring_last_page.
					 */
					continue;
				}
			}

			TEST_ASSERT(matched,
				    "Set page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
		} else {
			host_clear_count++;
			/*
			 * If cleared, the value written can be any
			 * value smaller than or equal to the iteration
			 * number.  Note that the value can be exactly
			 * (iteration-1) if that write can happen
			 * like this:
			 *
			 * (1) increase loop count to "iteration-1"
			 * (2) write to page P happens (with value
			 *     "iteration-1")
			 * (3) get dirty log for "iteration-1"; we'll
			 *     see that page P bit is set (dirtied),
			 *     and not set the bit in host_bmap_track
			 * (4) increase loop count to "iteration"
			 *     (which is current iteration)
			 * (5) get dirty log for current iteration,
			 *     we'll see that page P is cleared, with
			 *     value "iteration-1".
			 */
			TEST_ASSERT(*value_ptr <= iteration,
				    "Clear page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
			if (*value_ptr == iteration) {
				/*
				 * This page is _just_ modified; it
				 * should report its dirtiness in the
				 * next run
				 */
				__set_bit_le(page, host_bmap_track);
			}
		}
	}
}

static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu,
				uint64_t extra_mem_pages, void *guest_code)
{
	struct kvm_vm *vm;

	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));

	vm = __vm_create(VM_SHAPE(mode), 1, extra_mem_pages);

	log_mode_create_vm_done(vm);
	*vcpu = vm_vcpu_add(vm, 0, guest_code);
	return vm;
}

struct test_params {
	unsigned long iterations;
	unsigned long interval;
	uint64_t phys_offset;
};

static void run_test(enum vm_guest_mode mode, void *arg)
{
	struct test_params *p = arg;
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;
	unsigned long *bmap;
	uint32_t ring_buf_idx = 0;
	int sem_val;

	if (!log_mode_supported()) {
		print_skip("Log mode '%s' not supported",
			   log_modes[host_log_mode].name);
		return;
	}

	/*
	 * Reserve extra memory worth two times the dirty test memory for
	 * page tables, which will definitely cover the original (1G+) test
	 * range.  Do the calculation with the 4K page size, which is the
	 * smallest, so the page count is enough for all architectures (e.g.
	 * a 64K page size guest will need even less memory for page tables).
	 */
	vm = create_vm(mode, &vcpu,
		       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code);

	guest_page_size = vm->page_size;
	/*
	 * A little more than 1G of guest page sized pages.  Cover the
	 * case where the size is not aligned to 64 pages.
	 */
	guest_num_pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3;
	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);

	host_page_size = getpagesize();
	host_num_pages = vm_num_host_pages(mode, guest_num_pages);

	if (!p->phys_offset) {
		guest_test_phys_mem = (vm->max_gfn - guest_num_pages) *
				      guest_page_size;
		guest_test_phys_mem = align_down(guest_test_phys_mem, host_page_size);
	} else {
		guest_test_phys_mem = p->phys_offset;
	}

#ifdef __s390x__
	/* Align to 1M (segment size) */
	guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20);
#endif

	pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);

	bmap = bitmap_zalloc(host_num_pages);
	host_bmap_track = bitmap_zalloc(host_num_pages);

	/* Add an extra memory slot for testing dirty logging */
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
				    guest_test_phys_mem,
				    TEST_MEM_SLOT_INDEX,
				    guest_num_pages,
				    KVM_MEM_LOG_DIRTY_PAGES);

	/* Do mapping for the dirty track memory slot */
	virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages);

	/* Cache the HVA pointer of the region */
	host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);

	/* Export the shared variables to the guest */
	sync_global_to_guest(vm, host_page_size);
	sync_global_to_guest(vm, guest_page_size);
	sync_global_to_guest(vm, guest_test_virt_mem);
	sync_global_to_guest(vm, guest_num_pages);

	/* Start the iterations */
	iteration = 1;
	sync_global_to_guest(vm, iteration);
	WRITE_ONCE(host_quit, false);
	host_dirty_count = 0;
	host_clear_count = 0;
	host_track_next_count = 0;
	WRITE_ONCE(dirty_ring_vcpu_ring_full, false);

	/*
	 * Ensure the previous iteration didn't leave a dangling semaphore, i.e.
	 * that the main task and vCPU worker were synchronized and completed
	 * verification of all iterations.
	 */
	sem_getvalue(&sem_vcpu_stop, &sem_val);
	TEST_ASSERT_EQ(sem_val, 0);
	sem_getvalue(&sem_vcpu_cont, &sem_val);
	TEST_ASSERT_EQ(sem_val, 0);

	pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu);

	while (iteration < p->iterations) {
		/* Give the vcpu thread some time to dirty some pages */
		usleep(p->interval * 1000);
		log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX,
					     bmap, host_num_pages,
					     &ring_buf_idx);

		/*
		 * See the vcpu_sync_stop_requested definition for details on
		 * why we need to stop the vcpu while verifying the data.
		 */
		atomic_set(&vcpu_sync_stop_requested, true);
		sem_wait_until(&sem_vcpu_stop);
		/*
		 * NOTE: for dirty ring, it's possible that we didn't stop at
		 * GUEST_SYNC but instead we stopped because ring is full;
		 * that's okay too because ring full means we're only missing
		 * the flush of the last page, and since we handle the last
		 * page specially verification will succeed anyway.
		 */
		assert(host_log_mode == LOG_MODE_DIRTY_RING ||
		       atomic_read(&vcpu_sync_stop_requested) == false);
		vm_dirty_log_verify(mode, bmap);

		/*
		 * Set host_quit before sem_vcpu_cont in the final iteration to
		 * ensure that the vCPU worker doesn't resume the guest.  As
		 * noted above, the dirty ring test may stop and wait even when
		 * not explicitly requested to do so, i.e. it would hang
		 * waiting for a "continue" if it's allowed to resume the guest.
		 */
		if (++iteration == p->iterations)
			WRITE_ONCE(host_quit, true);

		sem_post(&sem_vcpu_cont);
		sync_global_to_guest(vm, iteration);
	}

	pthread_join(vcpu_thread, NULL);

	pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
		"track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
		host_track_next_count);

	free(bmap);
	free(host_bmap_track);
	kvm_vm_free(vm);
}

static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-i iterations] [-I interval] "
	       "[-p offset] [-m mode] [-M log_mode] [-c ring_count]\n", name);
	puts("");
	printf(" -c: hint to dirty ring size, in number of entries\n");
	printf("     (only useful for dirty-ring test; default: %"PRIu32")\n",
	       TEST_DIRTY_RING_COUNT);
	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
	       TEST_HOST_LOOP_N);
	printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
	       TEST_HOST_LOOP_INTERVAL);
	printf(" -p: specify guest physical test memory offset\n"
	       "     Warning: a low offset can conflict with the loaded test code.\n");
	printf(" -M: specify the host logging mode "
	       "(default: run all log modes).  Supported modes: \n\t");
	log_modes_dump();
	guest_modes_help();
	puts("");
	exit(0);
}

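/*
 * Example invocations, assuming the selftest binary is built as
 * "dirty_log_test":
 *
 *   ./dirty_log_test                        # all log modes, default params
 *   ./dirty_log_test -M dirty-ring -c 4096  # dirty ring only, smaller ring
 *   ./dirty_log_test -i 64 -I 5             # more iterations, 5ms interval
 */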
int main(int argc, char *argv[])
{
	struct test_params p = {
		.iterations = TEST_HOST_LOOP_N,
		.interval = TEST_HOST_LOOP_INTERVAL,
	};
	int opt, i;
	sigset_t sigset;

	sem_init(&sem_vcpu_stop, 0, 0);
	sem_init(&sem_vcpu_cont, 0, 0);

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) {
		switch (opt) {
		case 'c':
			test_dirty_ring_count = strtol(optarg, NULL, 10);
			break;
		case 'i':
			p.iterations = strtol(optarg, NULL, 10);
			break;
		case 'I':
			p.interval = strtol(optarg, NULL, 10);
			break;
		case 'p':
			p.phys_offset = strtoull(optarg, NULL, 0);
			break;
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'M':
			if (!strcmp(optarg, "all")) {
				host_log_mode_option = LOG_MODE_ALL;
				break;
			}
			for (i = 0; i < LOG_MODE_NUM; i++) {
				if (!strcmp(optarg, log_modes[i].name)) {
					pr_info("Setting log mode to: '%s'\n",
						optarg);
					host_log_mode_option = i;
					break;
				}
			}
			if (i == LOG_MODE_NUM) {
				printf("Log mode '%s' invalid. Please choose "
				       "from: ", optarg);
				log_modes_dump();
				exit(1);
			}
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	TEST_ASSERT(p.iterations > 2, "Iterations must be greater than two");
	TEST_ASSERT(p.interval > 0, "Interval must be greater than zero");

	pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
		p.iterations, p.interval);

	srandom(time(0));

	/* Ensure that vCPU threads start with SIG_IPI blocked.  */
	sigemptyset(&sigset);
	sigaddset(&sigset, SIG_IPI);
	pthread_sigmask(SIG_BLOCK, &sigset, NULL);

	if (host_log_mode_option == LOG_MODE_ALL) {
		/* Run each log mode */
		for (i = 0; i < LOG_MODE_NUM; i++) {
			pr_info("Testing Log Mode '%s'\n", log_modes[i].name);
			host_log_mode = i;
			for_each_guest_mode(run_test, &p);
		}
	} else {
		host_log_mode = host_log_mode_option;
		for_each_guest_mode(run_test, &p);
	}

	return 0;
}