// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2019, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/irqdomain.h>
#include <linux/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"

static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
{
	u64 val;

	/*
	 * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
	 * load operation, so there is no need to enforce load-after-store
	 * ordering.
	 */

	val = in_be64(xd->eoi_mmio + offset);
	return (u8)val;
}

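/*
 * Tear down one event queue of a vCPU: disable the EQ for that
 * priority in OPAL and release the reference taken on the guest
 * queue page when the queue was configured.
 */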
static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct xive_q *q = &xc->queues[prio];

	xive_native_disable_queue(xc->vp_id, q, prio);
	if (q->qpage) {
		put_page(virt_to_page(q->qpage));
		q->qpage = NULL;
	}
}

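/*
 * Wrapper around xive_native_configure_queue() which only drops the
 * reference on the previous queue page once the new configuration has
 * succeeded, so a failed reconfiguration never releases a page that
 * is still in use.
 */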
static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
					      u8 prio, __be32 *qpage,
					      u32 order, bool can_escalate)
{
	int rc;
	__be32 *qpage_prev = q->qpage;

	rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
					 can_escalate);
	if (rc)
		return rc;

	if (qpage_prev)
		put_page(virt_to_page(qpage_prev));

	return rc;
}

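/*
 * Undo kvmppc_xive_native_connect_vcpu(): mask the sources routed to
 * this vCPU, free its escalation interrupts and queues, disable the
 * VP in OPAL and release the presenter state.
 */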
void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	int i;

	if (!kvmppc_xive_enabled(vcpu))
		return;

	if (!xc)
		return;

	pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);

	/* Ensure no interrupt is still routed to that VP */
	xc->valid = false;
	kvmppc_xive_disable_vcpu_interrupts(vcpu);

	/* Free escalations */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		/* Free the escalation irq */
		if (xc->esc_virq[i]) {
			if (kvmppc_xive_has_single_escalation(xc->xive))
				xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]);
			free_irq(xc->esc_virq[i], vcpu);
			irq_dispose_mapping(xc->esc_virq[i]);
			kfree(xc->esc_virq_names[i]);
			xc->esc_virq[i] = 0;
		}
	}

	/* Disable the VP */
	xive_native_disable_vp(xc->vp_id);

	/* Clear the cam word so guest entry won't try to push context */
	vcpu->arch.xive_cam_word = 0;

	/* Free the queues */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		kvmppc_xive_native_cleanup_queue(vcpu, i);
	}

	/* Free the VP */
	kfree(xc);

	/* Cleanup the vcpu */
	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
	vcpu->arch.xive_vcpu = NULL;
}

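/*
 * Connect a vCPU to the device: allocate the presenter state, compute
 * and enable the underlying XIVE VP in OPAL, and set up the CAM line
 * used by the assembly push/pull code on guest entry and exit.
 */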
int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
				    struct kvm_vcpu *vcpu, u32 server_num)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_vcpu *xc = NULL;
	int rc;
	u32 vp_id;

	pr_devel("native_connect_vcpu(server=%d)\n", server_num);

	if (dev->ops != &kvm_xive_native_ops) {
		pr_devel("Wrong ops !\n");
		return -EPERM;
	}
	if (xive->kvm != vcpu->kvm)
		return -EPERM;
	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
		return -EBUSY;

	mutex_lock(&xive->lock);

	rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
	if (rc)
		goto bail;

	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
	if (!xc) {
		rc = -ENOMEM;
		goto bail;
	}

	vcpu->arch.xive_vcpu = xc;
	xc->xive = xive;
	xc->vcpu = vcpu;
	xc->server_num = server_num;

	xc->vp_id = vp_id;
	xc->valid = true;
	vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;

	rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
	if (rc) {
		pr_err("Failed to get VP info from OPAL: %d\n", rc);
		goto bail;
	}

	if (!kvmppc_xive_check_save_restore(vcpu)) {
		pr_err("inconsistent save-restore setup for VCPU %d\n", server_num);
		rc = -EIO;
		goto bail;
	}

	/*
	 * Enable the VP first, as single escalation mode affects the
	 * numbering of the escalation interrupts.
	 */
	rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
	if (rc) {
		pr_err("Failed to enable VP in OPAL: %d\n", rc);
		goto bail;
	}

	/* Configure VCPU fields for use by assembly push/pull */
	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

	/* TODO: reset all queues to a clean state ? */
bail:
	mutex_unlock(&xive->lock);
	if (rc)
		kvmppc_xive_native_cleanup_vcpu(vcpu);

	return rc;
}

/*
 * Device passthrough support
 */
static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

	if (irq >= KVMPPC_XIVE_NR_IRQS)
		return -EINVAL;

	/*
	 * Clear the ESB pages of the IRQ number being mapped (or
	 * unmapped) into the guest and let the VM fault handler
	 * repopulate with the appropriate ESB pages (device or IC)
	 */
	pr_debug("clearing esb pages for girq 0x%lx\n", irq);
	mutex_lock(&xive->mapping_lock);
	if (xive->mapping)
		unmap_mapping_range(xive->mapping,
				    esb_pgoff << PAGE_SHIFT,
				    2ull << PAGE_SHIFT, 1);
	mutex_unlock(&xive->mapping_lock);
	return 0;
}

static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
	.reset_mapped = kvmppc_xive_native_reset_mapped,
};

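/*
 * Fault handler for the ESB mapping of the device. Each guest IRQ
 * number owns two consecutive pages in this mapping: the even page is
 * the trigger page and the odd page is the EOI/management page of the
 * underlying source (IPI or passed-through HW interrupt).
 */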
static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct kvm_device *dev = vma->vm_file->private_data;
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	u64 page;
	unsigned long irq;
	u64 page_offset;

	/*
	 * Linux/KVM uses a two-page ESB setting: one page for trigger
	 * and one for EOI.
	 */
	page_offset = vmf->pgoff - vma->vm_pgoff;
	irq = page_offset / 2;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb) {
		pr_devel("%s: source %lx not found !\n", __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	state = &sb->irq_state[src];

	/* Some sanity checking */
	if (!state->valid) {
		pr_devel("%s: source %lx invalid !\n", __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	kvmppc_xive_select_irq(state, &hw_num, &xd);

	arch_spin_lock(&sb->lock);

	/*
	 * first/even page is for trigger
	 * second/odd page is for EOI and management.
	 */
	page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
	arch_spin_unlock(&sb->lock);

	if (WARN_ON(!page)) {
		pr_err("%s: accessing invalid ESB page for source %lx !\n",
		       __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
	return VM_FAULT_NOPAGE;
}

static const struct vm_operations_struct xive_native_esb_vmops = {
	.fault = xive_native_esb_fault,
};

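/*
 * Fault handler for the TIMA mapping. The TIMA exposes four views
 * (HW, HV, OS and USER); only the OS view is mapped into user space,
 * accesses to the other pages are refused with SIGBUS.
 */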
static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	switch (vmf->pgoff - vma->vm_pgoff) {
	case 0: /* HW - forbid access */
	case 1: /* HV - forbid access */
		return VM_FAULT_SIGBUS;
	case 2: /* OS */
		vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
		return VM_FAULT_NOPAGE;
	case 3: /* USER - TODO */
	default:
		return VM_FAULT_SIGBUS;
	}
}

static const struct vm_operations_struct xive_native_tima_vmops = {
	.fault = xive_native_tima_fault,
};

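/*
 * mmap handler of the device fd. Only two fixed page offsets are
 * supported: KVM_XIVE_TIMA_PAGE_OFFSET for the TIMA (4 pages, of
 * which only the OS view is accessible) and KVM_XIVE_ESB_PAGE_OFFSET
 * for the ESB pages (two pages per interrupt number).
 *
 * A rough sketch of how user space might map the TIMA, assuming
 * "dev_fd" is the fd returned by KVM_CREATE_DEVICE for this device
 * and "page_size" is the host page size (the exact layout is up to
 * the VMM):
 *
 *	tima = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, dev_fd,
 *		    KVM_XIVE_TIMA_PAGE_OFFSET * page_size);
 */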
static int kvmppc_xive_native_mmap(struct kvm_device *dev,
				   struct vm_area_struct *vma)
{
	struct kvmppc_xive *xive = dev->private;

	/* We only allow mappings at fixed offset for now */
	if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
		if (vma_pages(vma) > 4)
			return -EINVAL;
		vma->vm_ops = &xive_native_tima_vmops;
	} else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
		if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
			return -EINVAL;
		vma->vm_ops = &xive_native_esb_vmops;
	} else {
		return -EINVAL;
	}

	vm_flags_set(vma, VM_IO | VM_PFNMAP);
	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);

	/*
	 * Grab the KVM device file address_space to be able to clear
	 * the ESB pages mapping when a device is passed-through into
	 * the guest.
	 */
	xive->mapping = vma->vm_file->f_mapping;
	return 0;
}

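/*
 * KVM_DEV_XIVE_GRP_SOURCE: create or update a guest interrupt source.
 * The source block is allocated on demand, an IPI is allocated for
 * the source if it does not have one yet, the LSI state is restored
 * from the attribute value and the source starts out masked.
 */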
static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
					 u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u64 val;
	u16 idx;
	int rc;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
		return -E2BIG;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb) {
		pr_debug("No source, creating source block...\n");
		sb = kvmppc_xive_create_src_block(xive, irq);
		if (!sb) {
			pr_err("Failed to create block...\n");
			return -ENOMEM;
		}
	}
	state = &sb->irq_state[idx];

	if (get_user(val, ubufp)) {
		pr_err("fault getting user info !\n");
		return -EFAULT;
	}

	arch_spin_lock(&sb->lock);

	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_err("Failed to allocate IRQ !\n");
			rc = -ENXIO;
			goto unlock;
		}
		xive_native_populate_irq_data(state->ipi_number,
					      &state->ipi_data);
		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
			 state->ipi_number, irq);
	}

	/* Restore LSI state */
	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XIVE_LEVEL_ASSERTED)
			state->asserted = true;
		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
	}

	/* Mask IRQ to start with */
	state->act_server = 0;
	state->act_priority = MASKED;
	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);

	/* Increment the number of valid sources and mark this one valid */
	if (!state->valid)
		xive->src_count++;
	state->valid = true;

	rc = 0;

unlock:
	arch_spin_unlock(&sb->lock);

	return rc;
}

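/*
 * Route a source to a target server/priority or mask it. The EAS of
 * the underlying interrupt (IPI or passed-through HW interrupt) is
 * reconfigured in OPAL accordingly.
 */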
static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
					struct kvmppc_xive_src_block *sb,
					struct kvmppc_xive_irq_state *state,
					u32 server, u8 priority, bool masked,
					u32 eisn)
{
	struct kvm *kvm = xive->kvm;
	u32 hw_num;
	int rc = 0;

	arch_spin_lock(&sb->lock);

	if (state->act_server == server && state->act_priority == priority &&
	    state->eisn == eisn)
		goto unlock;

	pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
		 priority, server, masked, state->act_server,
		 state->act_priority);

	kvmppc_xive_select_irq(state, &hw_num, NULL);

	if (priority != MASKED && !masked) {
		rc = kvmppc_xive_select_target(kvm, &server, priority);
		if (rc)
			goto unlock;

		state->act_priority = priority;
		state->act_server = server;
		state->eisn = eisn;

		rc = xive_native_configure_irq(hw_num,
					       kvmppc_xive_vp(xive, server),
					       priority, eisn);
	} else {
		state->act_priority = MASKED;
		state->act_server = 0;
		state->eisn = 0;

		rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
	}

unlock:
	arch_spin_unlock(&sb->lock);
	return rc;
}

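/*
 * KVM_DEV_XIVE_GRP_SOURCE_CONFIG: decode the 64-bit configuration
 * word (priority, server, masked flag and EISN fields) and apply it
 * to the source.
 */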
static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
						long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u16 src;
	u64 kvm_cfg;
	u32 server;
	u8 priority;
	bool masked;
	u32 eisn;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	if (!state->valid)
		return -EINVAL;

	if (get_user(kvm_cfg, ubufp))
		return -EFAULT;

	pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);

	priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
		KVM_XIVE_SOURCE_PRIORITY_SHIFT;
	server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
		KVM_XIVE_SOURCE_SERVER_SHIFT;
	masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
		KVM_XIVE_SOURCE_MASKED_SHIFT;
	eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
		KVM_XIVE_SOURCE_EISN_SHIFT;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}

	return kvmppc_xive_native_update_source_config(xive, sb, state, server,
						       priority, masked, eisn);
}

static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
					  long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	int rc = 0;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	rc = -EINVAL;

	arch_spin_lock(&sb->lock);

	if (state->valid) {
		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		rc = 0;
	}

	arch_spin_unlock(&sb->lock);
	return rc;
}

static int xive_native_validate_queue_size(u32 qshift)
{
	/*
	 * We only support 64K pages for the moment. This is also
	 * advertised in the DT property "ibm,xive-eq-sizes"
	 */
	switch (qshift) {
	case 0: /* EQ reset */
	case 16:
		return 0;
	case 12:
	case 21:
	case 24:
	default:
		return -EINVAL;
	}
}

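/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (set): configure one event queue of a
 * vCPU. The server/priority pair is demangled from the attribute
 * index, the flags and queue size are validated, the guest EQ page is
 * translated and pinned, the EQ is configured in OPAL, the toggle
 * bit and index are optionally restored and the escalation interrupt
 * is attached. A zero qshift resets the queue.
 */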
static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	void __user *ubufp = (void __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	int rc;
	__be32 *qaddr = NULL;
	struct page *page;
	struct xive_q *q;
	gfn_t gfn;
	unsigned long page_size;
	int srcu_idx;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
		return -EFAULT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	/* reset queue and disable queueing */
	if (!kvm_eq.qshift) {
		q->guest_qaddr  = 0;
		q->guest_qshift = 0;

		rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
							NULL, 0, true);
		if (rc) {
			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
			       priority, xc->server_num, rc);
			return rc;
		}

		return 0;
	}

	/*
	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
	 * without using the coalescing mechanisms provided by the
	 * XIVE END ESBs. This is required on KVM as notification
	 * using the END ESBs is not supported.
	 */
	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
		pr_err("invalid flags %d\n", kvm_eq.flags);
		return -EINVAL;
	}

	rc = xive_native_validate_queue_size(kvm_eq.qshift);
	if (rc) {
		pr_err("invalid queue size %d\n", kvm_eq.qshift);
		return rc;
	}

	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
		       1ull << kvm_eq.qshift);
		return -EINVAL;
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	gfn = gpa_to_gfn(kvm_eq.qaddr);

	page_size = kvm_host_page_size(vcpu, gfn);
	if (1ull << kvm_eq.qshift > page_size) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_warn("Incompatible host page size %lx!\n", page_size);
		return -EINVAL;
	}

	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
		return -EINVAL;
	}

	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	/*
	 * Back up the queue page guest address so that the EQ page
	 * can be marked dirty for migration.
	 */
	q->guest_qaddr  = kvm_eq.qaddr;
	q->guest_qshift = kvm_eq.qshift;

	/*
	 * Unconditional Notification is forced by default at the
	 * OPAL level because the use of END ESBs is not supported by
	 * Linux.
	 */
	rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
					(__be32 *) qaddr, kvm_eq.qshift, true);
	if (rc) {
		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
		       priority, xc->server_num, rc);
		put_page(page);
		return rc;
	}

	/*
	 * Only restore the queue state when needed. When handling the
	 * H_INT_SET_QUEUE_CONFIG hcall, it should not be restored.
	 */
	if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
		rc = xive_native_set_queue_state(xc->vp_id, priority,
						 kvm_eq.qtoggle,
						 kvm_eq.qindex);
		if (rc)
			goto error;
	}

	rc = kvmppc_xive_attach_escalation(vcpu, priority,
					   kvmppc_xive_has_single_escalation(xive));
error:
	if (rc)
		kvmppc_xive_native_cleanup_queue(vcpu, priority);
	return rc;
}

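/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (get): retrieve the current
 * configuration and state (toggle bit, index) of one event queue,
 * typically for migration.
 */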
static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	struct xive_q *q;
	void __user *ubufp = (u64 __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	u64 qaddr;
	u64 qshift;
	u64 qeoi_page;
	u32 escalate_irq;
	u64 qflags;
	int rc;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	memset(&kvm_eq, 0, sizeof(kvm_eq));

	if (!q->qpage)
		return 0;

	rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
					&qeoi_page, &escalate_irq, &qflags);
	if (rc)
		return rc;

	kvm_eq.flags = 0;
	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
		kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;

	kvm_eq.qshift = q->guest_qshift;
	kvm_eq.qaddr  = q->guest_qaddr;

	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
					 &kvm_eq.qindex);
	if (rc)
		return rc;

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
		return -EFAULT;

	return 0;
}

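/*
 * Mask and unroute all valid sources of a block as part of a device
 * reset. The caller holds the source block lock.
 */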
static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
{
	int i;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

		if (!state->valid)
			continue;

		if (state->act_priority == MASKED)
			continue;

		state->eisn = 0;
		state->act_server = 0;
		state->act_priority = MASKED;
		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
		xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
		if (state->pt_number) {
			xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
			xive_native_configure_irq(state->pt_number,
						  0, MASKED, 0);
		}
	}
}

static int kvmppc_xive_reset(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
		unsigned int prio;

		if (!xc)
			continue;

		kvmppc_xive_disable_vcpu_interrupts(vcpu);

		for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {

			/* Single escalation, no queue 7 */
			if (prio == 7 && kvmppc_xive_has_single_escalation(xive))
				break;

			if (xc->esc_virq[prio]) {
				free_irq(xc->esc_virq[prio], vcpu);
				irq_dispose_mapping(xc->esc_virq[prio]);
				kfree(xc->esc_virq_names[prio]);
				xc->esc_virq[prio] = 0;
			}

			kvmppc_xive_native_cleanup_queue(vcpu, prio);
		}
	}

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_reset_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	mutex_unlock(&xive->lock);

	return 0;
}

static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
{
	int j;

	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
		struct xive_irq_data *xd;
		u32 hw_num;

		if (!state->valid)
			continue;

		/*
		 * The struct kvmppc_xive_irq_state reflects the state
		 * of the EAS configuration and not the state of the
		 * source. The source is masked by setting the PQ bits
		 * to '-Q', which is what is done before calling the
		 * KVM_DEV_XIVE_EQ_SYNC control.
		 *
		 * If a source EAS is configured, OPAL syncs the XIVE
		 * IC of the source and the XIVE IC of the previous
		 * target if any.
		 *
		 * So it should be fine ignoring MASKED sources as
		 * they have been synced already.
		 */
		if (state->act_priority == MASKED)
			continue;

		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		xive_native_sync_queue(hw_num);
	}
}

static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	unsigned int prio;
	int srcu_idx;

	if (!xc)
		return -ENOENT;

	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
		struct xive_q *q = &xc->queues[prio];

		if (!q->qpage)
			continue;

		/* Mark EQ page dirty for migration */
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
	}
	return 0;
}

static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_native_sync_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvmppc_xive_native_vcpu_eq_sync(vcpu);
	}
	mutex_unlock(&xive->lock);

	return 0;
}

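/*
 * Dispatch of the KVM_SET_DEVICE_ATTR groups: global controls (reset,
 * EQ sync, number of servers), source creation and configuration, EQ
 * configuration and source synchronisation.
 */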
static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
			return kvmppc_xive_reset(xive);
		case KVM_DEV_XIVE_EQ_SYNC:
			return kvmppc_xive_native_eq_sync(xive);
		case KVM_DEV_XIVE_NR_SERVERS:
			return kvmppc_xive_set_nr_servers(xive, attr->addr);
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
		return kvmppc_xive_native_set_source(xive, attr->attr,
						     attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
		return kvmppc_xive_native_set_source_config(xive, attr->attr,
							    attr->addr);
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
							   attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		return kvmppc_xive_native_sync_source(xive, attr->attr,
						      attr->addr);
	}
	return -ENXIO;
}

static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
							   attr->addr);
	}
	return -ENXIO;
}

static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
		case KVM_DEV_XIVE_EQ_SYNC:
		case KVM_DEV_XIVE_NR_SERVERS:
			return 0;
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
		    attr->attr < KVMPPC_XIVE_NR_IRQS)
			return 0;
		break;
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return 0;
	}
	return -ENXIO;
}

/*
 * Called when device fd is closed.  kvm->lock is held.
 */
static void kvmppc_xive_native_release(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	pr_devel("Releasing xive native device\n");

	/*
	 * Clear the KVM device file address_space which is used to
	 * unmap the ESB pages when a device is passed-through.
	 */
	mutex_lock(&xive->mapping_lock);
	xive->mapping = NULL;
	mutex_unlock(&xive->mapping_lock);

	/*
	 * Since this is the device release function, we know that
	 * userspace does not have any open fd or mmap referring to
	 * the device. Therefore there cannot be any of the device
	 * attribute set/get, mmap, or page fault functions being
	 * executed concurrently, and similarly, the connect_vcpu and
	 * set/clr_mapped functions cannot be running either.
	 */

	debugfs_remove(xive->dentry);

	/*
	 * We should clean up the vCPU interrupt presenters first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/*
		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
		 * Holding the vcpu->mutex also means that the vcpu cannot
		 * be executing the KVM_RUN ioctl, and therefore it cannot
		 * be executing the XIVE push or pull code or accessing
		 * the XIVE MMIO regions.
		 */
		mutex_lock(&vcpu->mutex);
		kvmppc_xive_native_cleanup_vcpu(vcpu);
		mutex_unlock(&vcpu->mutex);
	}

	/*
	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
	 * against xive code getting called during vcpu execution or
	 * set/get one_reg operations.
	 */
	kvm->arch.xive = NULL;

	for (i = 0; i <= xive->max_sbid; i++) {
		if (xive->src_blocks[i])
			kvmppc_xive_free_sources(xive->src_blocks[i]);
		kfree(xive->src_blocks[i]);
		xive->src_blocks[i] = NULL;
	}

	if (xive->vp_base != XIVE_INVALID_VP)
		xive_native_free_vp_block(xive->vp_base);

	/*
	 * A reference to the kvmppc_xive pointer is now kept under
	 * the xive_devices struct of the machine for reuse. It is
	 * freed when the VM is destroyed, for now, until we fix all
	 * the execution paths.
	 */

	kfree(dev);
}

/*
 * Create a XIVE device.  kvm->lock is held.
 */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
	struct kvmppc_xive *xive;
	struct kvm *kvm = dev->kvm;

	pr_devel("Creating xive native device\n");

	if (kvm->arch.xive)
		return -EEXIST;

	xive = kvmppc_xive_get_device(kvm, type);
	if (!xive)
		return -ENOMEM;

	dev->private = xive;
	xive->dev = dev;
	xive->kvm = kvm;
	mutex_init(&xive->mapping_lock);
	mutex_init(&xive->lock);

	/* VP allocation is delayed to the first call to connect_vcpu */
	xive->vp_base = XIVE_INVALID_VP;
	/*
	 * KVM_MAX_VCPUS limits the number of VMs to roughly 64 per
	 * socket on a POWER9 system.
	 */
	xive->nr_servers = KVM_MAX_VCPUS;

	if (xive_native_has_single_escalation())
		xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;

	if (xive_native_has_save_restore())
		xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;

	xive->ops = &kvmppc_xive_native_ops;

	kvm->arch.xive = xive;
	return 0;
}

/*
 * Interrupt Pending Buffer (IPB) offset
 */
#define TM_IPB_SHIFT 40
#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)

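/*
 * Capture the thread interrupt context of a vCPU for migration: the
 * w01 word saved at guest exit, merged with the IPB backup kept in
 * the NVT structure and retrieved through OPAL.
 */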
int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	u64 opal_state;
	int rc;

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc)
		return -ENOENT;

	/* Thread context registers. We only care about IPB and CPPR */
	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;

	/* Get the VP state from OPAL */
	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
	if (rc)
		return rc;

	/*
	 * Capture the backup of IPB register in the NVT structure and
	 * merge it in our KVM VP state.
	 */
	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);

	pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
		 __func__,
		 vcpu->arch.xive_saved_state.nsr,
		 vcpu->arch.xive_saved_state.cppr,
		 vcpu->arch.xive_saved_state.ipb,
		 vcpu->arch.xive_saved_state.pipr,
		 vcpu->arch.xive_saved_state.w01,
		 (u32) vcpu->arch.xive_cam_word, opal_state);

	return 0;
}

int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;

	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
		 val->xive_timaval[0], val->xive_timaval[1]);

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc || !xive)
		return -ENOENT;

	/* We can't update the state of a "pushed" VCPU */
	if (WARN_ON(vcpu->arch.xive_pushed))
		return -EBUSY;

	/*
	 * Restore the thread context registers. IPB and CPPR should
	 * be the only ones that matter.
	 */
	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];

	/*
	 * There is no need to restore the XIVE internal state (IPB
	 * stored in the NVT) as the IPB register was merged in KVM VP
	 * state when captured.
	 */
	return 0;
}

bool kvmppc_xive_native_supported(void)
{
	return xive_native_has_queue_state_support();
}

static int xive_native_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xive *xive = m->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	if (!kvm)
		return 0;

	seq_puts(m, "=========\nVCPU state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		seq_printf(m, "VCPU %d: VP=%#x/%02x\n"
			   "    NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
			   xc->server_num, xc->vp_id, xc->vp_chip_id,
			   vcpu->arch.xive_saved_state.nsr,
			   vcpu->arch.xive_saved_state.cppr,
			   vcpu->arch.xive_saved_state.ipb,
			   vcpu->arch.xive_saved_state.pipr,
			   be64_to_cpu(vcpu->arch.xive_saved_state.w01),
			   be32_to_cpu(vcpu->arch.xive_cam_word));

		kvmppc_xive_debug_show_queues(m, vcpu);
	}

	seq_puts(m, "=========\nSources\n=========\n");

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_debug_show_sources(m, sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(xive_native_debug);

static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
	xive->dentry = debugfs_create_file("xive", 0444, xive->kvm->debugfs_dentry,
					   xive, &xive_native_debug_fops);

	pr_debug("%s: created\n", __func__);
}

static void kvmppc_xive_native_init(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;

	/* Register some debug interfaces */
	xive_native_debugfs_init(xive);
}

struct kvm_device_ops kvm_xive_native_ops = {
	.name = "kvm-xive-native",
	.create = kvmppc_xive_native_create,
	.init = kvmppc_xive_native_init,
	.release = kvmppc_xive_native_release,
	.set_attr = kvmppc_xive_native_set_attr,
	.get_attr = kvmppc_xive_native_get_attr,
	.has_attr = kvmppc_xive_native_has_attr,
	.mmap = kvmppc_xive_native_mmap,
};