1130803Smarcel// SPDX-License-Identifier: GPL-2.0
2130803Smarcel/*
3130803Smarcel * xapic_ipi_test
4130803Smarcel *
5130803Smarcel * Copyright (C) 2020, Google LLC.
6130803Smarcel *
7130803Smarcel * This work is licensed under the terms of the GNU GPL, version 2.
8130803Smarcel *
9130803Smarcel * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake
10130803Smarcel * another vCPU that is halted when KVM's backing page for the APIC access
11130803Smarcel * address has been moved by mm.
12130803Smarcel *
13130803Smarcel * The test starts two vCPUs: one that sends IPIs and one that continually
14130803Smarcel * executes HLT. The sender checks that the halter has woken from the HLT and
15130803Smarcel * has reentered HLT before sending the next IPI. While the vCPUs are running,
16130803Smarcel * the host continually calls migrate_pages to move all of the process' pages
17130803Smarcel * amongst the available numa nodes on the machine.
18130803Smarcel *
 * Migration is enabled by a command line option. When that option is used on
 * a non-NUMA machine the test exits with an error. Without it, the test is
 * still useful on non-NUMA machines for testing IPIs.
21130803Smarcel */
22130803Smarcel
23130803Smarcel#define _GNU_SOURCE /* for program_invocation_short_name */
24130803Smarcel#include <getopt.h>
25130803Smarcel#include <pthread.h>
26130803Smarcel#include <inttypes.h>
27130803Smarcel#include <string.h>
28130803Smarcel#include <time.h>
29130803Smarcel
30130803Smarcel#include "kvm_util.h"
31130803Smarcel#include "numaif.h"
32130803Smarcel#include "processor.h"
33130803Smarcel#include "test_util.h"
34130803Smarcel#include "vmx.h"
35130803Smarcel
36130803Smarcel/* Default running time for the test */
37130803Smarcel#define DEFAULT_RUN_SECS 3
38130803Smarcel
39130803Smarcel/* Default delay between migrate_pages calls (microseconds) */
40130803Smarcel#define DEFAULT_DELAY_USECS 500000
41130803Smarcel
42130803Smarcel/*
43130803Smarcel * Vector for IPI from sender vCPU to halting vCPU.
44130803Smarcel * Value is arbitrary and was chosen for the alternating bit pattern. Any
45130803Smarcel * value should work.
46130803Smarcel */
47130803Smarcel#define IPI_VECTOR	 0xa5
48130803Smarcel
/*
 * Incremented in the IPI handler. Provides evidence to the sender that the IPI
 * arrived at the destination.
 *
 * Declared volatile because it is incremented from the guest's IPI handler
 * while the sender vCPU polls it in a loop. The host reads the same counter
 * through a host address obtained by translating this symbol's guest virtual
 * address (see main()).
 */
static volatile uint64_t ipis_rcvd;
54130803Smarcel
55130803Smarcel/* Data struct shared between host main thread and vCPUs */
struct test_data_page {
	/* APIC ID of the halter vCPU; set by the halter, used by the sender */
	uint32_t halter_apic_id;
	/* Incremented by the halter immediately before each HLT */
	volatile uint64_t hlt_count;
	/* Incremented by the halter after it resumes from each HLT */
	volatile uint64_t wake_count;
	/* Incremented by the sender after each IPI it issues */
	uint64_t ipis_sent;
	/* migrate_pages() calls made by the host in do_migrations() */
	uint64_t migrations_attempted;
	/* migrate_pages() calls that reported zero pages left unmoved */
	uint64_t migrations_completed;
	/* ICR and ICR2 values the sender writes for each IPI (diagnostics) */
	uint32_t icr;
	uint32_t icr2;
	/* Halter's TPR and PPR, sampled before each HLT (diagnostics) */
	uint32_t halter_tpr;
	uint32_t halter_ppr;

	/*
	 *  Record local version register as a cross-check that APIC access
	 *  worked. Value should match what KVM reports (APIC_VERSION in
	 *  arch/x86/kvm/lapic.c). If test is failing, check that values match
	 *  to determine whether APIC access exits are working.
	 */
	uint32_t halter_lvr;
};
76130803Smarcel
/* Per-thread arguments handed to vcpu_thread() via pthread_create() */
struct thread_params {
	/* Host mapping of the test data page shared with the guest */
	struct test_data_page *data;
	/* The vCPU this thread runs */
	struct kvm_vcpu *vcpu;
	uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */
};
82130803Smarcel
83130803Smarcelvoid verify_apic_base_addr(void)
84130803Smarcel{
85130803Smarcel	uint64_t msr = rdmsr(MSR_IA32_APICBASE);
86130803Smarcel	uint64_t base = GET_APIC_BASE(msr);
87130803Smarcel
88130803Smarcel	GUEST_ASSERT(base == APIC_DEFAULT_GPA);
89130803Smarcel}
90130803Smarcel
91130803Smarcelstatic void halter_guest_code(struct test_data_page *data)
92130803Smarcel{
93130803Smarcel	verify_apic_base_addr();
94130803Smarcel	xapic_enable();
95130803Smarcel
96130803Smarcel	data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID));
97130803Smarcel	data->halter_lvr = xapic_read_reg(APIC_LVR);
98130803Smarcel
99130803Smarcel	/*
100130803Smarcel	 * Loop forever HLTing and recording halts & wakes. Disable interrupts
101130803Smarcel	 * each time around to minimize window between signaling the pending
102130803Smarcel	 * halt to the sender vCPU and executing the halt. No need to disable on
103130803Smarcel	 * first run as this vCPU executes first and the host waits for it to
104130803Smarcel	 * signal going into first halt before starting the sender vCPU. Record
105130803Smarcel	 * TPR and PPR for diagnostic purposes in case the test fails.
106130803Smarcel	 */
107130803Smarcel	for (;;) {
108130803Smarcel		data->halter_tpr = xapic_read_reg(APIC_TASKPRI);
109130803Smarcel		data->halter_ppr = xapic_read_reg(APIC_PROCPRI);
110130803Smarcel		data->hlt_count++;
111130803Smarcel		asm volatile("sti; hlt; cli");
112130803Smarcel		data->wake_count++;
113130803Smarcel	}
114130803Smarcel}
115130803Smarcel
116130803Smarcel/*
117130803Smarcel * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to
118130803Smarcel * enable diagnosing errant writes to the APIC access address backing page in
119130803Smarcel * case of test failure.
120130803Smarcel */
121130803Smarcelstatic void guest_ipi_handler(struct ex_regs *regs)
122130803Smarcel{
123130803Smarcel	ipis_rcvd++;
124130803Smarcel	xapic_write_reg(APIC_EOI, 77);
125130803Smarcel}
126130803Smarcel
127130803Smarcelstatic void sender_guest_code(struct test_data_page *data)
128130803Smarcel{
129130803Smarcel	uint64_t last_wake_count;
130130803Smarcel	uint64_t last_hlt_count;
131130803Smarcel	uint64_t last_ipis_rcvd_count;
132130803Smarcel	uint32_t icr_val;
133130803Smarcel	uint32_t icr2_val;
134130803Smarcel	uint64_t tsc_start;
135130803Smarcel
136130803Smarcel	verify_apic_base_addr();
137130803Smarcel	xapic_enable();
138130803Smarcel
139130803Smarcel	/*
140130803Smarcel	 * Init interrupt command register for sending IPIs
141130803Smarcel	 *
142130803Smarcel	 * Delivery mode=fixed, per SDM:
143130803Smarcel	 *   "Delivers the interrupt specified in the vector field to the target
144130803Smarcel	 *    processor."
145130803Smarcel	 *
146130803Smarcel	 * Destination mode=physical i.e. specify target by its local APIC
147130803Smarcel	 * ID. This vCPU assumes that the halter vCPU has already started and
148130803Smarcel	 * set data->halter_apic_id.
149130803Smarcel	 */
150130803Smarcel	icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR);
151130803Smarcel	icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id);
152130803Smarcel	data->icr = icr_val;
153130803Smarcel	data->icr2 = icr2_val;
154130803Smarcel
155130803Smarcel	last_wake_count = data->wake_count;
156130803Smarcel	last_hlt_count = data->hlt_count;
157130803Smarcel	last_ipis_rcvd_count = ipis_rcvd;
158130803Smarcel	for (;;) {
159130803Smarcel		/*
160130803Smarcel		 * Send IPI to halter vCPU.
161130803Smarcel		 * First IPI can be sent unconditionally because halter vCPU
162130803Smarcel		 * starts earlier.
163130803Smarcel		 */
164		xapic_write_reg(APIC_ICR2, icr2_val);
165		xapic_write_reg(APIC_ICR, icr_val);
166		data->ipis_sent++;
167
168		/*
169		 * Wait up to ~1 sec for halter to indicate that it has:
170		 * 1. Received the IPI
171		 * 2. Woken up from the halt
172		 * 3. Gone back into halt
173		 * Current CPUs typically run at 2.x Ghz which is ~2
174		 * billion ticks per second.
175		 */
176		tsc_start = rdtsc();
177		while (rdtsc() - tsc_start < 2000000000) {
178			if ((ipis_rcvd != last_ipis_rcvd_count) &&
179			    (data->wake_count != last_wake_count) &&
180			    (data->hlt_count != last_hlt_count))
181				break;
182		}
183
184		GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) &&
185			     (data->wake_count != last_wake_count) &&
186			     (data->hlt_count != last_hlt_count));
187
188		last_wake_count = data->wake_count;
189		last_hlt_count = data->hlt_count;
190		last_ipis_rcvd_count = ipis_rcvd;
191	}
192}
193
194static void *vcpu_thread(void *arg)
195{
196	struct thread_params *params = (struct thread_params *)arg;
197	struct kvm_vcpu *vcpu = params->vcpu;
198	struct ucall uc;
199	int old;
200	int r;
201
202	r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
203	TEST_ASSERT(r == 0,
204		    "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
205		    vcpu->id, r);
206
207	fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id);
208	vcpu_run(vcpu);
209
210	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
211
212	if (get_ucall(vcpu, &uc) == UCALL_ABORT) {
213		TEST_ASSERT(false,
214			    "vCPU %u exited with error: %s.\n"
215			    "Sending vCPU sent %lu IPIs to halting vCPU\n"
216			    "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
217			    "Halter TPR=%#x PPR=%#x LVR=%#x\n"
218			    "Migrations attempted: %lu\n"
219			    "Migrations completed: %lu",
220			    vcpu->id, (const char *)uc.args[0],
221			    params->data->ipis_sent, params->data->hlt_count,
222			    params->data->wake_count,
223			    *params->pipis_rcvd, params->data->halter_tpr,
224			    params->data->halter_ppr, params->data->halter_lvr,
225			    params->data->migrations_attempted,
226			    params->data->migrations_completed);
227	}
228
229	return NULL;
230}
231
232static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu)
233{
234	void *retval;
235	int r;
236
237	r = pthread_cancel(thread);
238	TEST_ASSERT(r == 0,
239		    "pthread_cancel on vcpu_id=%d failed with errno=%d",
240		    vcpu->id, r);
241
242	r = pthread_join(thread, &retval);
243	TEST_ASSERT(r == 0,
244		    "pthread_join on vcpu_id=%d failed with errno=%d",
245		    vcpu->id, r);
246	TEST_ASSERT(retval == PTHREAD_CANCELED,
247		    "expected retval=%p, got %p", PTHREAD_CANCELED,
248		    retval);
249}
250
251void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs,
252		   uint64_t *pipis_rcvd)
253{
254	long pages_not_moved;
255	unsigned long nodemask = 0;
256	unsigned long nodemasks[sizeof(nodemask) * 8];
257	int nodes = 0;
258	time_t start_time, last_update, now;
259	time_t interval_secs = 1;
260	int i, r;
261	int from, to;
262	unsigned long bit;
263	uint64_t hlt_count;
264	uint64_t wake_count;
265	uint64_t ipis_sent;
266
267	fprintf(stderr, "Calling migrate_pages every %d microseconds\n",
268		delay_usecs);
269
270	/* Get set of first 64 numa nodes available */
271	r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8,
272			  0, MPOL_F_MEMS_ALLOWED);
273	TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno);
274
275	fprintf(stderr, "Numa nodes found amongst first %lu possible nodes "
276		"(each 1-bit indicates node is present): %#lx\n",
277		sizeof(nodemask) * 8, nodemask);
278
279	/* Init array of masks containing a single-bit in each, one for each
280	 * available node. migrate_pages called below requires specifying nodes
281	 * as bit masks.
282	 */
283	for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) {
284		if (nodemask & bit) {
285			nodemasks[nodes] = nodemask & bit;
286			nodes++;
287		}
288	}
289
290	TEST_ASSERT(nodes > 1,
291		    "Did not find at least 2 numa nodes. Can't do migration");
292
293	fprintf(stderr, "Migrating amongst %d nodes found\n", nodes);
294
295	from = 0;
296	to = 1;
297	start_time = time(NULL);
298	last_update = start_time;
299
300	ipis_sent = data->ipis_sent;
301	hlt_count = data->hlt_count;
302	wake_count = data->wake_count;
303
304	while ((int)(time(NULL) - start_time) < run_secs) {
305		data->migrations_attempted++;
306
307		/*
308		 * migrate_pages with PID=0 will migrate all pages of this
309		 * process between the nodes specified as bitmasks. The page
310		 * backing the APIC access address belongs to this process
311		 * because it is allocated by KVM in the context of the
312		 * KVM_CREATE_VCPU ioctl. If that assumption ever changes this
313		 * test may break or give a false positive signal.
314		 */
315		pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]),
316						&nodemasks[from],
317						&nodemasks[to]);
318		if (pages_not_moved < 0)
319			fprintf(stderr,
320				"migrate_pages failed, errno=%d\n", errno);
321		else if (pages_not_moved > 0)
322			fprintf(stderr,
323				"migrate_pages could not move %ld pages\n",
324				pages_not_moved);
325		else
326			data->migrations_completed++;
327
328		from = to;
329		to++;
330		if (to == nodes)
331			to = 0;
332
333		now = time(NULL);
334		if (((now - start_time) % interval_secs == 0) &&
335		    (now != last_update)) {
336			last_update = now;
337			fprintf(stderr,
338				"%lu seconds: Migrations attempted=%lu completed=%lu, "
339				"IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n",
340				now - start_time, data->migrations_attempted,
341				data->migrations_completed,
342				data->ipis_sent, *pipis_rcvd,
343				data->hlt_count, data->wake_count);
344
345			TEST_ASSERT(ipis_sent != data->ipis_sent &&
346				    hlt_count != data->hlt_count &&
347				    wake_count != data->wake_count,
348				    "IPI, HLT and wake count have not increased "
349				    "in the last %lu seconds. "
350				    "HLTer is likely hung.", interval_secs);
351
352			ipis_sent = data->ipis_sent;
353			hlt_count = data->hlt_count;
354			wake_count = data->wake_count;
355		}
356		usleep(delay_usecs);
357	}
358}
359
360void get_cmdline_args(int argc, char *argv[], int *run_secs,
361		      bool *migrate, int *delay_usecs)
362{
363	for (;;) {
364		int opt = getopt(argc, argv, "s:d:m");
365
366		if (opt == -1)
367			break;
368		switch (opt) {
369		case 's':
370			*run_secs = parse_size(optarg);
371			break;
372		case 'm':
373			*migrate = true;
374			break;
375		case 'd':
376			*delay_usecs = parse_size(optarg);
377			break;
378		default:
379			TEST_ASSERT(false,
380				    "Usage: -s <runtime seconds>. Default is %d seconds.\n"
381				    "-m adds calls to migrate_pages while vCPUs are running."
382				    " Default is no migrations.\n"
383				    "-d <delay microseconds> - delay between migrate_pages() calls."
384				    " Default is %d microseconds.",
385				    DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS);
386		}
387	}
388}
389
390int main(int argc, char *argv[])
391{
392	int r;
393	int wait_secs;
394	const int max_halter_wait = 10;
395	int run_secs = 0;
396	int delay_usecs = 0;
397	struct test_data_page *data;
398	vm_vaddr_t test_data_page_vaddr;
399	bool migrate = false;
400	pthread_t threads[2];
401	struct thread_params params[2];
402	struct kvm_vm *vm;
403	uint64_t *pipis_rcvd;
404
405	get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs);
406	if (run_secs <= 0)
407		run_secs = DEFAULT_RUN_SECS;
408	if (delay_usecs <= 0)
409		delay_usecs = DEFAULT_DELAY_USECS;
410
411	vm = vm_create_with_one_vcpu(&params[0].vcpu, halter_guest_code);
412
413	vm_init_descriptor_tables(vm);
414	vcpu_init_descriptor_tables(params[0].vcpu);
415	vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler);
416
417	virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
418
419	params[1].vcpu = vm_vcpu_add(vm, 1, sender_guest_code);
420
421	test_data_page_vaddr = vm_vaddr_alloc_page(vm);
422	data = addr_gva2hva(vm, test_data_page_vaddr);
423	memset(data, 0, sizeof(*data));
424	params[0].data = data;
425	params[1].data = data;
426
427	vcpu_args_set(params[0].vcpu, 1, test_data_page_vaddr);
428	vcpu_args_set(params[1].vcpu, 1, test_data_page_vaddr);
429
430	pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd);
431	params[0].pipis_rcvd = pipis_rcvd;
432	params[1].pipis_rcvd = pipis_rcvd;
433
434	/* Start halter vCPU thread and wait for it to execute first HLT. */
435	r = pthread_create(&threads[0], NULL, vcpu_thread, &params[0]);
436	TEST_ASSERT(r == 0,
437		    "pthread_create halter failed errno=%d", errno);
438	fprintf(stderr, "Halter vCPU thread started\n");
439
440	wait_secs = 0;
441	while ((wait_secs < max_halter_wait) && !data->hlt_count) {
442		sleep(1);
443		wait_secs++;
444	}
445
446	TEST_ASSERT(data->hlt_count,
447		    "Halter vCPU did not execute first HLT within %d seconds",
448		    max_halter_wait);
449
450	fprintf(stderr,
451		"Halter vCPU thread reported its APIC ID: %u after %d seconds.\n",
452		data->halter_apic_id, wait_secs);
453
454	r = pthread_create(&threads[1], NULL, vcpu_thread, &params[1]);
455	TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno);
456
457	fprintf(stderr,
458		"IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n",
459		run_secs);
460
461	if (!migrate)
462		sleep(run_secs);
463	else
464		do_migrations(data, run_secs, delay_usecs, pipis_rcvd);
465
466	/*
467	 * Cancel threads and wait for them to stop.
468	 */
469	cancel_join_vcpu_thread(threads[0], params[0].vcpu);
470	cancel_join_vcpu_thread(threads[1], params[1].vcpu);
471
472	fprintf(stderr,
473		"Test successful after running for %d seconds.\n"
474		"Sending vCPU sent %lu IPIs to halting vCPU\n"
475		"Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
476		"Halter APIC ID=%#x\n"
477		"Sender ICR value=%#x ICR2 value=%#x\n"
478		"Halter TPR=%#x PPR=%#x LVR=%#x\n"
479		"Migrations attempted: %lu\n"
480		"Migrations completed: %lu\n",
481		run_secs, data->ipis_sent,
482		data->hlt_count, data->wake_count, *pipis_rcvd,
483		data->halter_apic_id,
484		data->icr, data->icr2,
485		data->halter_tpr, data->halter_ppr, data->halter_lvr,
486		data->migrations_attempted, data->migrations_completed);
487
488	kvm_vm_free(vm);
489
490	return 0;
491}
492