/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 * Copyright (c) 2019 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/smp.h>

#include <x86/specialreg.h>
#include <x86/apicreg.h>

#include <machine/clock.h>
#include <machine/smp.h>

#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>

#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vlapic.h"
#include "vlapic_priv.h"
#include "vioapic.h"

#define	PRIO(x)			((x) >> 4)

#define	VLAPIC_VERSION		(0x14)

#define	x2apic(vlapic)	(((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)

/*
 * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
 * vlapic_callout_handler() and vcpu accesses to:
 * - timer_freq_bt, timer_period_bt, timer_fire_bt
 * - timer LVT register
 */
#define	VLAPIC_TIMER_LOCK(vlapic)	mtx_lock_spin(&((vlapic)->timer_mtx))
#define	VLAPIC_TIMER_UNLOCK(vlapic)	mtx_unlock_spin(&((vlapic)->timer_mtx))
#define	VLAPIC_TIMER_LOCKED(vlapic)	mtx_owned(&((vlapic)->timer_mtx))

/*
 * APIC timer frequency:
 * - arbitrary but chosen to be in the ballpark of contemporary hardware.
 * - power-of-two to avoid loss of precision when converted to a bintime.
 */
#define	VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
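
/*
 * A minimal worked example of the power-of-two note above: FREQ2BT() stores
 * one tick as a fraction of a second in units of 2^-64, and with a 2^27 Hz
 * bus that fraction is exactly 2^64 / 2^27 = 2^37, so the bintime arithmetic
 * on the timer period below loses no precision.
 */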

static void vlapic_set_error(struct vlapic *, uint32_t, bool);
static void vlapic_callout_handler(void *arg);
static void vlapic_reset(struct vlapic *vlapic);

static __inline uint32_t
vlapic_get_id(struct vlapic *vlapic)
{

	if (x2apic(vlapic))
		return (vlapic->vcpuid);
	else
		return (vlapic->vcpuid << 24);
}

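/*
 * Per the x2APIC LDR format, the low 4 bits of the APIC ID select one of 16
 * logical bits and the remaining bits form the cluster ID in LDR[31:16];
 * e.g. APIC ID 0x21 yields LDR 0x00020002 (cluster 2, logical bit 1).
 */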
static uint32_t
x2apic_ldr(struct vlapic *vlapic)
{
	int apicid;
	uint32_t ldr;

	apicid = vlapic_get_id(vlapic);
	ldr = 1 << (apicid & 0xf);
	ldr |= (apicid & 0xffff0) << 12;
	return (ldr);
}

void
vlapic_dfr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	if (x2apic(vlapic)) {
		VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
		    lapic->dfr);
		lapic->dfr = 0;
		return;
	}

	lapic->dfr &= APIC_DFR_MODEL_MASK;
	lapic->dfr |= APIC_DFR_RESERVED;

	if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
		VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
	else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
		VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
	else
		VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr);
}

void
vlapic_ldr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;

	/* LDR is read-only in x2apic mode */
	if (x2apic(vlapic)) {
		VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
		    lapic->ldr);
		lapic->ldr = x2apic_ldr(vlapic);
	} else {
		lapic->ldr &= ~APIC_LDR_RESERVED;
		VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
	}
}

void
vlapic_id_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	/*
	 * We don't allow the ID register to be modified so reset it back to
	 * its default value.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
}

static int
vlapic_timer_divisor(uint32_t dcr)
{
	switch (dcr & 0xB) {
	case APIC_TDCR_1:
		return (1);
	case APIC_TDCR_2:
		return (2);
	case APIC_TDCR_4:
		return (4);
	case APIC_TDCR_8:
		return (8);
	case APIC_TDCR_16:
		return (16);
	case APIC_TDCR_32:
		return (32);
	case APIC_TDCR_64:
		return (64);
	case APIC_TDCR_128:
		return (128);
	default:
		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
	}
}

#if 0
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
	    *lvt & APIC_LVTT_M);
}
#endif

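/*
 * Note on the CCR: the current count is not maintained as a running counter.
 * It is derived on demand from the absolute time at which the callout is
 * scheduled to fire: the remaining bintime multiplied by the timer frequency
 * gives the number of ticks left.
 */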
static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
	struct bintime bt_now, bt_rem;
	struct LAPIC *lapic __diagused;
	uint32_t ccr;

	ccr = 0;
	lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_active(&vlapic->callout)) {
		/*
		 * If the timer is scheduled to expire in the future then
		 * compute the value of 'ccr' based on the remaining time.
		 */
		binuptime(&bt_now);
		if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
			bt_rem = vlapic->timer_fire_bt;
			bintime_sub(&bt_rem, &bt_now);
			ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
			ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
		}
	}
	KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
	    "icr_timer is %#x", ccr, lapic->icr_timer));
	VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
	    ccr, lapic->icr_timer);
	VLAPIC_TIMER_UNLOCK(vlapic);
	return (ccr);
}

void
vlapic_dcr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	int divisor;

	lapic = vlapic->apic_page;
	VLAPIC_TIMER_LOCK(vlapic);

	divisor = vlapic_timer_divisor(lapic->dcr_timer);
	VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
	    lapic->dcr_timer, divisor);

	/*
	 * Update the timer frequency and the timer period.
	 *
	 * XXX changes to the frequency divider will not take effect until
	 * the timer is reloaded.
	 */
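	/*
	 * Illustrative numbers: with divide-by-2 the effective frequency is
	 * 2^26 Hz, so an initial count of 1,000,000 ticks corresponds to a
	 * period of roughly 14.9 msec.
	 */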
	FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
	vlapic->timer_period_bt = vlapic->timer_freq_bt;
	bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);

	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_esr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->esr = vlapic->esr_pending;
	vlapic->esr_pending = 0;
}

int
vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
	struct LAPIC *lapic;
	uint32_t *irrptr, *tmrptr, mask;
	int idx;

	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));

	lapic = vlapic->apic_page;
	if (!(lapic->svr & APIC_SVR_ENABLE)) {
		VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
		    "interrupt %d", vector);
		return (0);
	}

	if (vector < 16) {
		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR,
		    false);
		VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d",
		    vector);
		return (1);
	}

	if (vlapic->ops.set_intr_ready)
		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));

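	/*
	 * The 256 vectors map onto eight 32-bit IRR/ISR/TMR registers that
	 * are spaced 16 bytes apart in the APIC page, hence the word index
	 * advances in strides of 4 uint32_t slots.
	 */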
	idx = (vector / 32) * 4;
	mask = 1 << (vector % 32);

	irrptr = &lapic->irr0;
	atomic_set_int(&irrptr[idx], mask);

	/*
	 * Verify that the trigger-mode of the interrupt matches with
	 * the vlapic TMR registers.
	 */
	tmrptr = &lapic->tmr0;
	if ((tmrptr[idx] & mask) != (level ? mask : 0)) {
		VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but "
		    "interrupt is %s-triggered", idx / 4, tmrptr[idx],
		    level ? "level" : "edge");
	}

	VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
	return (1);
}

static __inline uint32_t *
vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	int		 i;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		return (&lapic->lvt_cmci);
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
		return ((&lapic->lvt_timer) + i);
	default:
		panic("vlapic_get_lvt: invalid LVT\n");
	}
}

static __inline int
lvt_off_to_idx(uint32_t offset)
{
	int index;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		index = APIC_LVT_CMCI;
		break;
	case APIC_OFFSET_TIMER_LVT:
		index = APIC_LVT_TIMER;
		break;
	case APIC_OFFSET_THERM_LVT:
		index = APIC_LVT_THERMAL;
		break;
	case APIC_OFFSET_PERF_LVT:
		index = APIC_LVT_PMC;
		break;
	case APIC_OFFSET_LINT0_LVT:
		index = APIC_LVT_LINT0;
		break;
	case APIC_OFFSET_LINT1_LVT:
		index = APIC_LVT_LINT1;
		break;
	case APIC_OFFSET_ERROR_LVT:
		index = APIC_LVT_ERROR;
		break;
	default:
		index = -1;
		break;
	}
	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
	    "invalid lvt index %d for offset %#x", index, offset));

	return (index);
}

static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
	int idx;
	uint32_t val;

	idx = lvt_off_to_idx(offset);
	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
	return (val);
}

void
vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
{
	uint32_t *lvtptr, mask, val;
	struct LAPIC *lapic;
	int idx;

	lapic = vlapic->apic_page;
	lvtptr = vlapic_get_lvtptr(vlapic, offset);
	val = *lvtptr;
	idx = lvt_off_to_idx(offset);

	if (!(lapic->svr & APIC_SVR_ENABLE))
		val |= APIC_LVT_M;
	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
	switch (offset) {
	case APIC_OFFSET_TIMER_LVT:
		mask |= APIC_LVTT_TM;
		break;
	case APIC_OFFSET_ERROR_LVT:
		break;
	case APIC_OFFSET_LINT0_LVT:
	case APIC_OFFSET_LINT1_LVT:
		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
		/* FALLTHROUGH */
	default:
		mask |= APIC_LVT_DM;
		break;
	}
	val &= mask;
	*lvtptr = val;
	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
}

static void
vlapic_mask_lvts(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	lapic->lvt_cmci |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);

	lapic->lvt_timer |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);

	lapic->lvt_thermal |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);

	lapic->lvt_pcint |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);

	lapic->lvt_lint0 |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);

	lapic->lvt_lint1 |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);

	lapic->lvt_error |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
}

static int
vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt)
{
	uint32_t mode, reg, vec;

	reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]);

	if (reg & APIC_LVT_M)
		return (0);
	vec = reg & APIC_LVT_VECTOR;
	mode = reg & APIC_LVT_DM;

	switch (mode) {
	case APIC_LVT_DM_FIXED:
		if (vec < 16) {
			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
			    lvt == APIC_LVT_ERROR);
			return (0);
		}
		if (vlapic_set_intr_ready(vlapic, vec, false))
			vcpu_notify_event(vlapic->vcpu, true);
		break;
	case APIC_LVT_DM_NMI:
		vm_inject_nmi(vlapic->vcpu);
		break;
	case APIC_LVT_DM_EXTINT:
		vm_inject_extint(vlapic->vcpu);
		break;
	default:
		/* Other modes ignored */
		return (0);
	}
	return (1);
}

#if 1
static void
dump_isrvec_stk(struct vlapic *vlapic)
{
	int i;
	uint32_t *isrptr;

	isrptr = &vlapic->apic_page->isr0;
	for (i = 0; i < 8; i++)
		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);

	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
}
#endif

/*
 * Algorithm adopted from section "Interrupt, Task and Processor Priority"
 * in Intel Architecture Manual Vol 3a.
 */
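/*
 * Worked example: with in-service vector 0x35 (priority class 3) and a TPR
 * of 0x4f (class 4), the TPR class wins and PPR = TPR = 0x4f; with a TPR of
 * 0x20 (class 2) the result would instead be PPR = 0x35 & 0xf0 = 0x30.
 */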
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
	int isrvec, tpr, ppr;

	/*
	 * Note that the value on the stack at index 0 is always 0.
	 *
	 * This is a placeholder for the value of ISRV when none of the
	 * bits is set in the ISRx registers.
	 */
	isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
	tpr = vlapic->apic_page->tpr;

#if 1
	{
		int i, lastprio, curprio, vector, idx;
		uint32_t *isrptr;

		if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
			panic("isrvec_stk is corrupted: %d", isrvec);

		/*
		 * Make sure that the priority of the nested interrupts is
		 * always increasing.
		 */
		lastprio = -1;
		for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
			curprio = PRIO(vlapic->isrvec_stk[i]);
			if (curprio <= lastprio) {
				dump_isrvec_stk(vlapic);
				panic("isrvec_stk does not satisfy invariant");
			}
			lastprio = curprio;
		}

		/*
		 * Make sure that each bit set in the ISRx registers has a
		 * corresponding entry on the isrvec stack.
		 */
		i = 1;
		isrptr = &vlapic->apic_page->isr0;
		for (vector = 0; vector < 256; vector++) {
			idx = (vector / 32) * 4;
			if (isrptr[idx] & (1 << (vector % 32))) {
				if (i > vlapic->isrvec_stk_top ||
				    vlapic->isrvec_stk[i] != vector) {
					dump_isrvec_stk(vlapic);
					panic("ISR and isrvec_stk out of sync");
				}
				i++;
			}
		}
	}
#endif

	if (PRIO(tpr) >= PRIO(isrvec))
		ppr = tpr;
	else
		ppr = isrvec & 0xf0;

	vlapic->apic_page->ppr = ppr;
	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}

void
vlapic_sync_tpr(struct vlapic *vlapic)
{
	vlapic_update_ppr(vlapic);
}

static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");

static void
vlapic_process_eoi(struct vlapic *vlapic)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*isrptr, *tmrptr;
	int		i, idx, bitpos, vector;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;

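	/*
	 * Scan the ISR words from highest to lowest; the most significant
	 * set bit identifies the vector currently in service, which is the
	 * one retired by this EOI.
	 */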
	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		bitpos = fls(isrptr[idx]);
		if (bitpos-- != 0) {
			if (vlapic->isrvec_stk_top <= 0) {
				panic("invalid vlapic isrvec_stk_top %d",
				      vlapic->isrvec_stk_top);
			}
			isrptr[idx] &= ~(1 << bitpos);
			vector = i * 32 + bitpos;
			VLAPIC_CTR1(vlapic, "EOI vector %d", vector);
			VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
			vlapic->isrvec_stk_top--;
			vlapic_update_ppr(vlapic);
			if ((tmrptr[idx] & (1 << bitpos)) != 0) {
				vioapic_process_eoi(vlapic->vm, vector);
			}
			return;
		}
	}
	VLAPIC_CTR0(vlapic, "Gratuitous EOI");
	vmm_stat_incr(vlapic->vcpu, VLAPIC_GRATUITOUS_EOI, 1);
}

static __inline int
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{

	return (lvt & mask);
}

static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
	uint32_t lvt;

	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);

	return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
}

static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");

static void
vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error)
{

	vlapic->esr_pending |= mask;

	/*
	 * Avoid infinite recursion if the error LVT itself is configured with
	 * an illegal vector.
	 */
	if (lvt_error)
		return;

	if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) {
		vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_ERROR, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");

static void
vlapic_fire_timer(struct vlapic *vlapic)
{

	KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));

	if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) {
		VLAPIC_CTR0(vlapic, "vlapic timer fired");
		vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_TIMER, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_CMC,
    "corrected machine check interrupts generated by vlapic");

void
vlapic_fire_cmci(struct vlapic *vlapic)
{

	if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) {
		vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_CMC, 1);
	}
}

static VMM_STAT_ARRAY(LVTS_TRIGGERED, VLAPIC_MAXLVT_INDEX + 1,
    "lvts triggered");

int
vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
{

	if (vlapic_enabled(vlapic) == false) {
		/*
		 * When the local APIC is global/hardware disabled,
		 * LINT[1:0] pins are configured as INTR and NMI pins,
		 * respectively.
		 */
		switch (vector) {
		case APIC_LVT_LINT0:
			vm_inject_extint(vlapic->vcpu);
			break;
		case APIC_LVT_LINT1:
			vm_inject_nmi(vlapic->vcpu);
			break;
		default:
			break;
		}
		return (0);
	}

	switch (vector) {
	case APIC_LVT_LINT0:
	case APIC_LVT_LINT1:
	case APIC_LVT_TIMER:
	case APIC_LVT_ERROR:
	case APIC_LVT_PMC:
	case APIC_LVT_THERMAL:
	case APIC_LVT_CMCI:
		if (vlapic_fire_lvt(vlapic, vector)) {
			vmm_stat_array_incr(vlapic->vcpu, LVTS_TRIGGERED,
			    vector, 1);
		}
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

static void
vlapic_callout_reset(struct vlapic *vlapic, sbintime_t t)
{
	callout_reset_sbt_curcpu(&vlapic->callout, t, 0,
	    vlapic_callout_handler, vlapic, 0);
}

static void
vlapic_callout_handler(void *arg)
{
	struct vlapic *vlapic;
	struct bintime bt, btnow;
	sbintime_t rem_sbt;

	vlapic = arg;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_pending(&vlapic->callout))	/* callout was reset */
		goto done;

	if (!callout_active(&vlapic->callout))	/* callout was stopped */
		goto done;

	callout_deactivate(&vlapic->callout);

	vlapic_fire_timer(vlapic);

	if (vlapic_periodic_timer(vlapic)) {
		binuptime(&btnow);
		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
		    ("vlapic callout at %#lx.%#lx, expected at %#lx.%#lx",
		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
		    vlapic->timer_fire_bt.frac));

		/*
		 * Compute the delta between when the timer was supposed to
		 * fire and the present time.
		 */
		bt = btnow;
		bintime_sub(&bt, &vlapic->timer_fire_bt);

		rem_sbt = bttosbt(vlapic->timer_period_bt);
		if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
			/*
			 * Adjust the time until the next countdown downward
			 * to account for the lost time.
			 */
			rem_sbt -= bttosbt(bt);
		} else {
			/*
			 * If the delta is greater than the timer period then
			 * just reset our time base instead of trying to catch
			 * up.
			 */
			vlapic->timer_fire_bt = btnow;
			VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
			    "usecs, period is %lu usecs - resetting time base",
			    bttosbt(bt) / SBT_1US,
			    bttosbt(vlapic->timer_period_bt) / SBT_1US);
		}

		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
		vlapic_callout_reset(vlapic, rem_sbt);
	}
done:
	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	sbintime_t sbt;
	uint32_t icr_timer;

	VLAPIC_TIMER_LOCK(vlapic);

	lapic = vlapic->apic_page;
	icr_timer = lapic->icr_timer;

	vlapic->timer_period_bt = vlapic->timer_freq_bt;
	bintime_mul(&vlapic->timer_period_bt, icr_timer);

	if (icr_timer != 0) {
		binuptime(&vlapic->timer_fire_bt);
		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);

		sbt = bttosbt(vlapic->timer_period_bt);
		vlapic_callout_reset(vlapic, sbt);
	} else
		callout_stop(&vlapic->callout);

	VLAPIC_TIMER_UNLOCK(vlapic);
}

/*
 * This function populates 'dmask' with the set of vcpus that match the
 * addressing specified by the (dest, phys, lowprio) tuple.
 *
 * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
 * or xAPIC (8-bit) destination field.
 */
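/*
 * For instance, in the xAPIC flat model an MDA of 0x03 selects every active
 * vcpu whose LDR[31:24] intersects 0x03, so LDRs of 0x01000000 and
 * 0x02000000 would both match.
 */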
static void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
    bool lowprio, bool x2apic_dest)
{
	struct vlapic *vlapic;
	uint32_t dfr, ldr, ldest, cluster;
	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
	cpuset_t amask;
	int vcpuid;

	if ((x2apic_dest && dest == 0xffffffff) ||
	    (!x2apic_dest && dest == 0xff)) {
		/*
		 * Broadcast in both logical and physical modes.
		 */
		*dmask = vm_active_cpus(vm);
		return;
	}

	if (phys) {
		/*
		 * Physical mode: destination is APIC ID.
		 */
		CPU_ZERO(dmask);
		vcpuid = vm_apicid2vcpuid(vm, dest);
		amask = vm_active_cpus(vm);
		if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask))
			CPU_SET(vcpuid, dmask);
	} else {
		/*
		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
		 * bitmask. This model is only available in the xAPIC mode.
		 */
		mda_flat_ldest = dest & 0xff;

		/*
		 * In the "Cluster Model" the MDA is used to identify a
		 * specific cluster and a set of APICs in that cluster.
		 */
		if (x2apic_dest) {
			mda_cluster_id = dest >> 16;
			mda_cluster_ldest = dest & 0xffff;
		} else {
			mda_cluster_id = (dest >> 4) & 0xf;
			mda_cluster_ldest = dest & 0xf;
		}

		/*
		 * Logical mode: match each APIC that has a bit set
		 * in its LDR that matches a bit in the ldest.
		 */
		CPU_ZERO(dmask);
		amask = vm_active_cpus(vm);
		CPU_FOREACH_ISSET(vcpuid, &amask) {
			vlapic = vm_lapic(vm_vcpu(vm, vcpuid));
			dfr = vlapic->apic_page->dfr;
			ldr = vlapic->apic_page->ldr;

			if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_FLAT) {
				ldest = ldr >> 24;
				mda_ldest = mda_flat_ldest;
			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_CLUSTER) {
				if (x2apic(vlapic)) {
					cluster = ldr >> 16;
					ldest = ldr & 0xffff;
				} else {
					cluster = ldr >> 28;
					ldest = (ldr >> 24) & 0xf;
				}
				if (cluster != mda_cluster_id)
					continue;
				mda_ldest = mda_cluster_ldest;
			} else {
				/*
				 * Guest has configured a bad logical
				 * model for this vcpu - skip it.
				 */
				VLAPIC_CTR1(vlapic, "vlapic has bad logical "
				    "model %x - cannot deliver interrupt", dfr);
				continue;
			}

			if ((mda_ldest & ldest) != 0) {
				CPU_SET(vcpuid, dmask);
				if (lowprio)
					break;
			}
		}
	}
}

static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu");
static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu");

static void
vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
{
	struct LAPIC *lapic = vlapic->apic_page;

	if (lapic->tpr != val) {
		VLAPIC_CTR2(vlapic, "vlapic TPR changed from %#x to %#x",
		    lapic->tpr, val);
		lapic->tpr = val;
		vlapic_update_ppr(vlapic);
	}
}

static uint8_t
vlapic_get_tpr(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	return (lapic->tpr);
}

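/*
 * CR8 is the architectural alias for the task-priority class: CR8[3:0]
 * mirrors TPR[7:4], hence the shifts below (e.g. CR8 = 0x9 maps to
 * TPR = 0x90). Values with bits above 3 set raise #GP.
 */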
void
vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
{
	uint8_t tpr;

	if (val & ~0xf) {
		vm_inject_gp(vlapic->vcpu);
		return;
	}

	tpr = val << 4;
	vlapic_set_tpr(vlapic, tpr);
}

uint64_t
vlapic_get_cr8(struct vlapic *vlapic)
{
	uint8_t tpr;

	tpr = vlapic_get_tpr(vlapic);
	return (tpr >> 4);
}

static bool
vlapic_is_icr_valid(uint64_t icrval)
{
	uint32_t mode = icrval & APIC_DELMODE_MASK;
	uint32_t level = icrval & APIC_LEVEL_MASK;
	uint32_t trigger = icrval & APIC_TRIGMOD_MASK;
	uint32_t shorthand = icrval & APIC_DEST_MASK;

	switch (mode) {
	case APIC_DELMODE_FIXED:
		if (trigger == APIC_TRIGMOD_EDGE)
			return (true);
		/*
		 * AMD allows a level assert IPI and Intel converts a level
		 * assert IPI into an edge IPI.
		 */
		if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT)
			return (true);
		break;
	case APIC_DELMODE_LOWPRIO:
	case APIC_DELMODE_SMI:
	case APIC_DELMODE_NMI:
	case APIC_DELMODE_INIT:
		if (trigger == APIC_TRIGMOD_EDGE &&
		    (shorthand == APIC_DEST_DESTFLD ||
			shorthand == APIC_DEST_ALLESELF))
			return (true);
		/*
		 * AMD allows a level assert IPI and Intel converts a level
		 * assert IPI into an edge IPI.
		 */
		if (trigger == APIC_TRIGMOD_LEVEL &&
		    level == APIC_LEVEL_ASSERT &&
		    (shorthand == APIC_DEST_DESTFLD ||
			shorthand == APIC_DEST_ALLESELF))
			return (true);
		/*
		 * A level-triggered deassert INIT is defined in the Intel
		 * Multiprocessor Specification and the Intel Software Developer
		 * Manual. Due to the MPS it's required to send a level assert
		 * INIT to a cpu and then a level deassert INIT. Some operating
		 * systems e.g. FreeBSD or Linux use that algorithm. According
		 * to the SDM a level deassert INIT is only supported by Pentium
		 * and P6 processors. It's always sent to all cpus regardless of
		 * the destination or shorthand field. It resets the arbitration
		 * id register. This register is not software accessible and
		 * only required for the APIC bus arbitration. So, the level
		 * deassert INIT doesn't need any emulation and we should ignore
		 * it. The SDM also defines that newer processors don't support
		 * the level deassert INIT and it's not valid any more. As it's
		 * defined for older systems, it can't be invalid per se.
		 * Otherwise, backward compatibility would be broken. However,
		 * when returning false here, it'll be ignored which is the
		 * desired behaviour.
		 */
		if (mode == APIC_DELMODE_INIT &&
		    trigger == APIC_TRIGMOD_LEVEL &&
		    level == APIC_LEVEL_DEASSERT)
			return (false);
		break;
	case APIC_DELMODE_STARTUP:
		if (shorthand == APIC_DEST_DESTFLD ||
		    shorthand == APIC_DEST_ALLESELF)
			return (true);
		break;
	case APIC_DELMODE_RR:
		/* Only available on AMD! */
		if (trigger == APIC_TRIGMOD_EDGE &&
		    shorthand == APIC_DEST_DESTFLD)
			return (true);
		break;
	case APIC_DELMODE_RESV:
		return (false);
	default:
		__assert_unreachable();
	}

	return (false);
}

int
vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
{
	int i;
	bool phys;
	cpuset_t dmask, ipimask;
	uint64_t icrval;
	uint32_t dest, vec, mode, shorthand;
	struct vcpu *vcpu;
	struct vm_exit *vmexit;
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;

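	/*
	 * The destination is ICR[63:56] in xAPIC mode but widens to the
	 * full 32 bits of ICR[63:32] in x2APIC mode.
	 */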
	if (x2apic(vlapic))
		dest = icrval >> 32;
	else
		dest = icrval >> (32 + 24);
	vec = icrval & APIC_VECTOR_MASK;
	mode = icrval & APIC_DELMODE_MASK;
	phys = (icrval & APIC_DESTMODE_LOG) == 0;
	shorthand = icrval & APIC_DEST_MASK;

	VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);

	switch (shorthand) {
	case APIC_DEST_DESTFLD:
		vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
		    x2apic(vlapic));
		break;
	case APIC_DEST_SELF:
		CPU_SETOF(vlapic->vcpuid, &dmask);
		break;
	case APIC_DEST_ALLISELF:
		dmask = vm_active_cpus(vlapic->vm);
		break;
	case APIC_DEST_ALLESELF:
		dmask = vm_active_cpus(vlapic->vm);
		CPU_CLR(vlapic->vcpuid, &dmask);
		break;
	default:
		__assert_unreachable();
	}

	/*
	 * Ignore invalid combinations of the icr.
	 */
	if (!vlapic_is_icr_valid(icrval)) {
		VLAPIC_CTR1(vlapic, "Ignoring invalid ICR %016lx", icrval);
		return (0);
	}

	/*
	 * ipimask is a set of vCPUs needing userland handling of the current
	 * IPI.
	 */
	CPU_ZERO(&ipimask);

	switch (mode) {
	case APIC_DELMODE_FIXED:
		if (vec < 16) {
			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
			    false);
			VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
			return (0);
		}

		CPU_FOREACH_ISSET(i, &dmask) {
			vcpu = vm_vcpu(vlapic->vm, i);
			lapic_intr_edge(vcpu, vec);
			vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_SEND, 1);
			vmm_stat_incr(vcpu, VLAPIC_IPI_RECV, 1);
			VLAPIC_CTR2(vlapic,
			    "vlapic sending ipi %d to vcpuid %d", vec, i);
		}

		break;
	case APIC_DELMODE_NMI:
		CPU_FOREACH_ISSET(i, &dmask) {
			vcpu = vm_vcpu(vlapic->vm, i);
			vm_inject_nmi(vcpu);
			VLAPIC_CTR1(vlapic,
			    "vlapic sending ipi nmi to vcpuid %d", i);
		}

		break;
	case APIC_DELMODE_INIT:
	case APIC_DELMODE_STARTUP:
		if (!vlapic->ipi_exit) {
			if (!phys)
				break;

			i = vm_apicid2vcpuid(vlapic->vm, dest);
			if (i >= vm_get_maxcpus(vlapic->vm) ||
			    i == vlapic->vcpuid)
				break;

			CPU_SETOF(i, &ipimask);

			break;
		}

		CPU_COPY(&dmask, &ipimask);
		break;
	default:
		return (1);
	}

	if (!CPU_EMPTY(&ipimask)) {
		vmexit = vm_exitinfo(vlapic->vcpu);
		vmexit->exitcode = VM_EXITCODE_IPI;
		vmexit->u.ipi.mode = mode;
		vmexit->u.ipi.vector = vec;
		*vm_exitinfo_cpuset(vlapic->vcpu) = ipimask;

		*retu = true;
	}

	return (0);
}

static void
vlapic_handle_init(struct vcpu *vcpu, void *arg)
{
	struct vlapic *vlapic = vm_lapic(vcpu);

	vlapic_reset(vlapic);
}

int
vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct vlapic *vlapic = vm_lapic(vcpu);
	cpuset_t *dmask = vm_exitinfo_cpuset(vcpu);
	uint8_t vec = vme->u.ipi.vector;

	*retu = true;
	switch (vme->u.ipi.mode) {
	case APIC_DELMODE_INIT: {
		cpuset_t active, reinit;

		active = vm_active_cpus(vcpu_vm(vcpu));
		CPU_AND(&reinit, &active, dmask);
		if (!CPU_EMPTY(&reinit)) {
			vm_smp_rendezvous(vcpu, reinit, vlapic_handle_init,
			    NULL);
		}
		vm_await_start(vcpu_vm(vcpu), dmask);

		if (!vlapic->ipi_exit)
			*retu = false;

		break;
	}
	case APIC_DELMODE_STARTUP:
		/*
		 * Ignore SIPIs in any state other than wait-for-SIPI.
		 */
		*dmask = vm_start_cpus(vcpu_vm(vcpu), dmask);

		if (CPU_EMPTY(dmask)) {
			*retu = false;
			break;
		}

		/*
		 * Old bhyve versions don't support the IPI
		 * exit. Translate it into the old style.
		 */
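		/*
		 * A SIPI vector names a real-mode start page: the AP will
		 * begin execution at physical address 'vector << 12', which
		 * is what the shift below computes.
		 */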
		if (!vlapic->ipi_exit) {
			vme->exitcode = VM_EXITCODE_SPINUP_AP;
			vme->u.spinup_ap.vcpu = CPU_FFS(dmask) - 1;
			vme->u.spinup_ap.rip = vec << PAGE_SHIFT;
		}

		break;
	default:
		__assert_unreachable();
	}

	return (0);
}

void
vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val)
{
	int vec;

	KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode"));

	vec = val & 0xff;
	lapic_intr_edge(vlapic->vcpu, vec);
	vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_SEND, 1);
	vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_RECV, 1);
	VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec);
}

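/*
 * The loop below walks the IRR words from index 7 down to 0 and fls() picks
 * the highest set bit, so the first hit is the highest-priority pending
 * vector; it is deliverable only if its priority class exceeds the PPR's.
 */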
int
vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	int		 idx, i, bitpos, vector;
	uint32_t	*irrptr, val;

	vlapic_update_ppr(vlapic);

	if (vlapic->ops.pending_intr)
		return ((*vlapic->ops.pending_intr)(vlapic, vecptr));

	irrptr = &lapic->irr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		val = atomic_load_acq_int(&irrptr[idx]);
		bitpos = fls(val);
		if (bitpos != 0) {
			vector = i * 32 + (bitpos - 1);
			if (PRIO(vector) > PRIO(lapic->ppr)) {
				VLAPIC_CTR1(vlapic, "pending intr %d", vector);
				if (vecptr != NULL)
					*vecptr = vector;
				return (1);
			} else
				break;
		}
	}
	return (0);
}

void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*irrptr, *isrptr;
	int		idx, stk_top;

	if (vlapic->ops.intr_accepted)
		return ((*vlapic->ops.intr_accepted)(vlapic, vector));

	/*
	 * clear the ready bit for vector being accepted in irr
	 * and set the vector as in service in isr.
	 */
	idx = (vector / 32) * 4;

	irrptr = &lapic->irr0;
	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
	VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");

	isrptr = &lapic->isr0;
	isrptr[idx] |= 1 << (vector % 32);
	VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");

	/*
	 * Update the PPR
	 */
	vlapic->isrvec_stk_top++;

	stk_top = vlapic->isrvec_stk_top;
	if (stk_top >= ISRVEC_STK_SIZE)
		panic("isrvec_stk_top overflow %d", stk_top);

	vlapic->isrvec_stk[stk_top] = vector;
}

void
vlapic_svr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t old, new, changed;

	lapic = vlapic->apic_page;

	new = lapic->svr;
	old = vlapic->svr_last;
	vlapic->svr_last = new;

	changed = old ^ new;
	if ((changed & APIC_SVR_ENABLE) != 0) {
		if ((new & APIC_SVR_ENABLE) == 0) {
			/*
			 * The apic is now disabled so stop the apic timer
			 * and mask all the LVT entries.
			 */
			VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
			VLAPIC_TIMER_LOCK(vlapic);
			callout_stop(&vlapic->callout);
			VLAPIC_TIMER_UNLOCK(vlapic);
			vlapic_mask_lvts(vlapic);
		} else {
			/*
			 * The apic is now enabled so restart the apic timer
			 * if it is configured in periodic mode.
			 */
			VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
			if (vlapic_periodic_timer(vlapic))
				vlapic_icrtmr_write_handler(vlapic);
		}
	}
}

int
vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
    uint64_t *data, bool *retu)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*reg;
	int		 i;

	/* Ignore MMIO accesses in x2APIC mode */
	if (x2apic(vlapic) && mmio_access) {
		VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode",
		    offset);
		*data = 0;
		goto done;
	}

	if (!x2apic(vlapic) && !mmio_access) {
		/*
		 * XXX Generate GP fault for MSR accesses in xAPIC mode
		 */
		VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in "
		    "xAPIC mode", offset);
		*data = 0;
		goto done;
	}

	if (offset > sizeof(*lapic)) {
		*data = 0;
		goto done;
	}

	offset &= ~3;
	switch (offset)
	{
		case APIC_OFFSET_ID:
			*data = lapic->id;
			break;
		case APIC_OFFSET_VER:
			*data = lapic->version;
			break;
		case APIC_OFFSET_TPR:
			*data = vlapic_get_tpr(vlapic);
			break;
		case APIC_OFFSET_APR:
			*data = lapic->apr;
			break;
		case APIC_OFFSET_PPR:
			*data = lapic->ppr;
			break;
		case APIC_OFFSET_EOI:
			*data = lapic->eoi;
			break;
		case APIC_OFFSET_LDR:
			*data = lapic->ldr;
			break;
		case APIC_OFFSET_DFR:
			*data = lapic->dfr;
			break;
		case APIC_OFFSET_SVR:
			*data = lapic->svr;
			break;
		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
			i = (offset - APIC_OFFSET_ISR0) >> 2;
			reg = &lapic->isr0;
			*data = *(reg + i);
			break;
		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
			i = (offset - APIC_OFFSET_TMR0) >> 2;
			reg = &lapic->tmr0;
			*data = *(reg + i);
			break;
		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
			i = (offset - APIC_OFFSET_IRR0) >> 2;
			reg = &lapic->irr0;
			*data = atomic_load_acq_int(reg + i);
			break;
		case APIC_OFFSET_ESR:
			*data = lapic->esr;
			break;
		case APIC_OFFSET_ICR_LOW:
			*data = lapic->icr_lo;
			if (x2apic(vlapic))
				*data |= (uint64_t)lapic->icr_hi << 32;
			break;
		case APIC_OFFSET_ICR_HI:
			*data = lapic->icr_hi;
			break;
		case APIC_OFFSET_CMCI_LVT:
		case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
			*data = vlapic_get_lvt(vlapic, offset);
#ifdef INVARIANTS
			reg = vlapic_get_lvtptr(vlapic, offset);
			KASSERT(*data == *reg, ("inconsistent lvt value at "
			    "offset %#lx: %#lx/%#x", offset, *data, *reg));
#endif
			break;
		case APIC_OFFSET_TIMER_ICR:
			*data = lapic->icr_timer;
			break;
		case APIC_OFFSET_TIMER_CCR:
			*data = vlapic_get_ccr(vlapic);
			break;
		case APIC_OFFSET_TIMER_DCR:
			*data = lapic->dcr_timer;
			break;
		case APIC_OFFSET_SELF_IPI:
			/*
			 * XXX generate a GP fault if vlapic is in x2apic mode
			 */
			*data = 0;
			break;
		case APIC_OFFSET_RRR:
		default:
			*data = 0;
			break;
	}
done:
	VLAPIC_CTR2(vlapic, "vlapic read offset %#lx, data %#lx", offset, *data);
	return (0);
}

int
vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
    uint64_t data, bool *retu)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*regptr;
	int		retval;

	KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
	    ("vlapic_write: invalid offset %#lx", offset));

	VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx",
	    offset, data);

	if (offset > sizeof(*lapic))
		return (0);

	/* Ignore MMIO accesses in x2APIC mode */
	if (x2apic(vlapic) && mmio_access) {
		VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx "
		    "in x2APIC mode", data, offset);
		return (0);
	}

	/*
	 * XXX Generate GP fault for MSR accesses in xAPIC mode
	 */
	if (!x2apic(vlapic) && !mmio_access) {
		VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx "
		    "in xAPIC mode", data, offset);
		return (0);
	}

	retval = 0;
	switch (offset)
	{
		case APIC_OFFSET_ID:
			lapic->id = data;
			vlapic_id_write_handler(vlapic);
			break;
		case APIC_OFFSET_TPR:
			vlapic_set_tpr(vlapic, data & 0xff);
			break;
		case APIC_OFFSET_EOI:
			vlapic_process_eoi(vlapic);
			break;
		case APIC_OFFSET_LDR:
			lapic->ldr = data;
			vlapic_ldr_write_handler(vlapic);
			break;
		case APIC_OFFSET_DFR:
			lapic->dfr = data;
			vlapic_dfr_write_handler(vlapic);
			break;
		case APIC_OFFSET_SVR:
			lapic->svr = data;
			vlapic_svr_write_handler(vlapic);
			break;
		case APIC_OFFSET_ICR_LOW:
			lapic->icr_lo = data;
			if (x2apic(vlapic))
				lapic->icr_hi = data >> 32;
			retval = vlapic_icrlo_write_handler(vlapic, retu);
			break;
		case APIC_OFFSET_ICR_HI:
			lapic->icr_hi = data;
			break;
		case APIC_OFFSET_CMCI_LVT:
		case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
			regptr = vlapic_get_lvtptr(vlapic, offset);
			*regptr = data;
			vlapic_lvt_write_handler(vlapic, offset);
			break;
		case APIC_OFFSET_TIMER_ICR:
			lapic->icr_timer = data;
			vlapic_icrtmr_write_handler(vlapic);
			break;

		case APIC_OFFSET_TIMER_DCR:
			lapic->dcr_timer = data;
			vlapic_dcr_write_handler(vlapic);
			break;

		case APIC_OFFSET_ESR:
			vlapic_esr_write_handler(vlapic);
			break;

		case APIC_OFFSET_SELF_IPI:
			if (x2apic(vlapic))
				vlapic_self_ipi_handler(vlapic, data);
			break;

		case APIC_OFFSET_VER:
		case APIC_OFFSET_APR:
		case APIC_OFFSET_PPR:
		case APIC_OFFSET_RRR:
		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
		case APIC_OFFSET_TIMER_CCR:
		default:
			/* Read only. */
			break;
	}

	return (retval);
}

static void
vlapic_reset(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	bzero(lapic, sizeof(struct LAPIC));

	lapic->id = vlapic_get_id(vlapic);
	lapic->version = VLAPIC_VERSION;
	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
	lapic->dfr = 0xffffffff;
	lapic->svr = APIC_SVR_VECTOR;
	vlapic_mask_lvts(vlapic);
	vlapic_reset_tmr(vlapic);

	lapic->dcr_timer = 0;
	vlapic_dcr_write_handler(vlapic);

	vlapic->svr_last = lapic->svr;
}

void
vlapic_init(struct vlapic *vlapic)
{
	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
	KASSERT(vlapic->vcpuid >= 0 &&
	    vlapic->vcpuid < vm_get_maxcpus(vlapic->vm),
	    ("vlapic_init: vcpuid is not initialized"));
	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
	    "initialized"));

	/*
	 * If the vlapic is configured in x2apic mode then it will be
	 * accessed in the critical section via the MSR emulation code.
	 *
	 * Therefore the timer mutex must be a spinlock because blockable
	 * mutexes cannot be acquired in a critical section.
	 */
	mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN);
	callout_init(&vlapic->callout, 1);

	vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;

	if (vlapic->vcpuid == 0)
		vlapic->msr_apicbase |= APICBASE_BSP;

	vlapic->ipi_exit = false;

	vlapic_reset(vlapic);
}

void
vlapic_cleanup(struct vlapic *vlapic)
{

	callout_drain(&vlapic->callout);
	mtx_destroy(&vlapic->timer_mtx);
}

uint64_t
vlapic_get_apicbase(struct vlapic *vlapic)
{

	return (vlapic->msr_apicbase);
}

int
vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new)
{

	if (vlapic->msr_apicbase != new) {
		VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx "
		    "not supported", vlapic->msr_apicbase, new);
		return (-1);
	}

	return (0);
}

void
vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state)
{
	struct vlapic *vlapic;
	struct LAPIC *lapic;

	vlapic = vm_lapic(vcpu);

	if (state == X2APIC_DISABLED)
		vlapic->msr_apicbase &= ~APICBASE_X2APIC;
	else
		vlapic->msr_apicbase |= APICBASE_X2APIC;

	/*
	 * Reset the local APIC registers whose values are mode-dependent.
	 *
	 * XXX this works because the APIC mode can be changed only at vcpu
	 * initialization time.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
	if (x2apic(vlapic)) {
		lapic->ldr = x2apic_ldr(vlapic);
		lapic->dfr = 0;
	} else {
		lapic->ldr = 0;
		lapic->dfr = 0xffffffff;
	}

	if (state == X2APIC_ENABLED) {
		if (vlapic->ops.enable_x2apic_mode)
			(*vlapic->ops.enable_x2apic_mode)(vlapic);
	}
}

void
vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
    int delmode, int vec)
{
	struct vcpu *vcpu;
	bool lowprio;
	int vcpuid;
	cpuset_t dmask;

	if (delmode != IOART_DELFIXED &&
	    delmode != IOART_DELLOPRI &&
	    delmode != IOART_DELEXINT) {
		VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
		return;
	}
	lowprio = (delmode == IOART_DELLOPRI);

	/*
	 * We don't provide any virtual interrupt redirection hardware so
	 * all interrupts originating from the ioapic or MSI specify the
	 * 'dest' in the legacy xAPIC format.
	 */
	vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);

	CPU_FOREACH_ISSET(vcpuid, &dmask) {
		vcpu = vm_vcpu(vm, vcpuid);
		if (delmode == IOART_DELEXINT) {
			vm_inject_extint(vcpu);
		} else {
			lapic_set_intr(vcpu, vec, level);
		}
	}
}

void
vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
{
	/*
	 * Post an interrupt to the vcpu currently running on 'hostcpu'.
	 *
	 * This is done by leveraging features like Posted Interrupts (Intel)
	 * or the Doorbell MSR (AMD AVIC) that avoid a VM exit.
	 *
	 * If neither of these features is available then fall back to
	 * sending an IPI to 'hostcpu'.
	 */
	if (vlapic->ops.post_intr)
		(*vlapic->ops.post_intr)(vlapic, hostcpu);
	else
		ipi_cpu(hostcpu, ipinum);
}

bool
vlapic_enabled(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
	    (lapic->svr & APIC_SVR_ENABLE) != 0)
		return (true);
	else
		return (false);
}

static void
vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level)
{
	struct LAPIC *lapic;
	uint32_t *tmrptr, mask;
	int idx;

	lapic = vlapic->apic_page;
	tmrptr = &lapic->tmr0;
	idx = (vector / 32) * 4;
	mask = 1 << (vector % 32);
	if (level)
		tmrptr[idx] |= mask;
	else
		tmrptr[idx] &= ~mask;

	if (vlapic->ops.set_tmr != NULL)
		(*vlapic->ops.set_tmr)(vlapic, vector, level);
}

void
vlapic_reset_tmr(struct vlapic *vlapic)
{
	int vector;

	VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");

	for (vector = 0; vector <= 255; vector++)
		vlapic_set_tmr(vlapic, vector, false);
}

void
vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
    int delmode, int vector)
{
	cpuset_t dmask;
	bool lowprio;

	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));

	/*
	 * A level trigger is valid only for fixed and lowprio delivery modes.
	 */
	if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
		VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
		    "delivery-mode %d", delmode);
		return;
	}

	lowprio = (delmode == APIC_DELMODE_LOWPRIO);
	vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);

	if (!CPU_ISSET(vlapic->vcpuid, &dmask))
		return;

	VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
	vlapic_set_tmr(vlapic, vector, true);
}

#ifdef BHYVE_SNAPSHOT
static void
vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr)
{
	/*
	 * The implementation is similar to the one in the
	 * vlapic_icrtmr_write_handler() function.
	 */
	sbintime_t sbt;
	struct bintime bt;

	VLAPIC_TIMER_LOCK(vlapic);

	bt = vlapic->timer_freq_bt;
	bintime_mul(&bt, ccr);

	if (ccr != 0) {
		binuptime(&vlapic->timer_fire_bt);
		bintime_add(&vlapic->timer_fire_bt, &bt);

		sbt = bttosbt(bt);
		vlapic_callout_reset(vlapic, sbt);
	} else {
		/* Even if the CCR was 0, periodic timers should be reset. */
		if (vlapic_periodic_timer(vlapic)) {
			binuptime(&vlapic->timer_fire_bt);
			bintime_add(&vlapic->timer_fire_bt,
				    &vlapic->timer_period_bt);
			sbt = bttosbt(vlapic->timer_period_bt);

			callout_stop(&vlapic->callout);
			vlapic_callout_reset(vlapic, sbt);
		}
	}

	VLAPIC_TIMER_UNLOCK(vlapic);
}

int
vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
{
	int ret;
	struct vcpu *vcpu;
	struct vlapic *vlapic;
	struct LAPIC *lapic;
	uint32_t ccr;
	uint16_t i, maxcpus;

	KASSERT(vm != NULL, ("%s: arg was NULL", __func__));

	ret = 0;

	maxcpus = vm_get_maxcpus(vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_vcpu(vm, i);
		if (vcpu == NULL)
			continue;
		vlapic = vm_lapic(vcpu);

		/* Snapshot the page first; timer period depends on icr_timer. */
		lapic = vlapic->apic_page;
		SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done);

		SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done);

		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec,
				      meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac,
				      meta, ret, done);

		/*
		 * Timer period is equal to 'icr_timer' ticks at a frequency of
		 * 'timer_freq_bt'.
		 */
		if (meta->op == VM_SNAPSHOT_RESTORE) {
			vlapic->timer_period_bt = vlapic->timer_freq_bt;
			bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
		}

		SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk,
				      sizeof(vlapic->isrvec_stk),
				      meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done);

		SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last,
				      sizeof(vlapic->lvt_last),
				      meta, ret, done);

		if (meta->op == VM_SNAPSHOT_SAVE)
			ccr = vlapic_get_ccr(vlapic);

		SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done);

		if (meta->op == VM_SNAPSHOT_RESTORE &&
		    vlapic_enabled(vlapic) && lapic->icr_timer != 0) {
			/*
			 * Reset the value of the 'timer_fire_bt' and the
			 * vlapic callout based on the value of the current
			 * count register saved when the VM snapshot was
			 * created. If the initial count register is 0, the
			 * timer is not in use. See section 10.5.4 "APIC
			 * Timer" in the Intel SDM.
			 */
			vlapic_reset_callout(vlapic, ccr);
		}
	}

done:
	return (ret);
}
#endif
