1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
4 * Author: Joerg Roedel <jroedel@suse.de>
5 *         Leo Duran <leo.duran@amd.com>
6 */
7
8#define pr_fmt(fmt)     "AMD-Vi: " fmt
9#define dev_fmt(fmt)    pr_fmt(fmt)
10
11#include <linux/ratelimit.h>
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci-ats.h>
15#include <linux/bitmap.h>
16#include <linux/slab.h>
17#include <linux/debugfs.h>
18#include <linux/scatterlist.h>
19#include <linux/dma-map-ops.h>
20#include <linux/dma-direct.h>
21#include <linux/iommu-helper.h>
22#include <linux/delay.h>
23#include <linux/amd-iommu.h>
24#include <linux/notifier.h>
25#include <linux/export.h>
26#include <linux/irq.h>
27#include <linux/msi.h>
28#include <linux/irqdomain.h>
29#include <linux/percpu.h>
30#include <linux/io-pgtable.h>
31#include <linux/cc_platform.h>
32#include <asm/irq_remapping.h>
33#include <asm/io_apic.h>
34#include <asm/apic.h>
35#include <asm/hw_irq.h>
36#include <asm/proto.h>
37#include <asm/iommu.h>
38#include <asm/gart.h>
39#include <asm/dma.h>
40#include <uapi/linux/iommufd.h>
41
42#include "amd_iommu.h"
43#include "../dma-iommu.h"
44#include "../irq_remapping.h"
45
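/*
 * CMD_SET_TYPE() stores the command opcode in bits 31:28 of the second
 * command dword (data[1]), e.g. CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT) once
 * the build_* helpers below have filled in the payload.
 */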
46#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
47
48/* Reserved IOVA ranges */
49#define MSI_RANGE_START		(0xfee00000)
50#define MSI_RANGE_END		(0xfeefffff)
51#define HT_RANGE_START		(0xfd00000000ULL)
52#define HT_RANGE_END		(0xffffffffffULL)
53
54#define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
55
56static DEFINE_SPINLOCK(pd_bitmap_lock);
57
58LIST_HEAD(ioapic_map);
59LIST_HEAD(hpet_map);
60LIST_HEAD(acpihid_map);
61
62const struct iommu_ops amd_iommu_ops;
63static const struct iommu_dirty_ops amd_dirty_ops;
64
65int amd_iommu_max_glx_val = -1;
66
/*
 * General struct to manage commands sent to an IOMMU
 */
70struct iommu_cmd {
71	u32 data[4];
72};
73
74struct kmem_cache *amd_iommu_irq_cache;
75
76static void detach_device(struct device *dev);
77
78static void set_dte_entry(struct amd_iommu *iommu,
79			  struct iommu_dev_data *dev_data);
80
81/****************************************************************************
82 *
83 * Helper functions
84 *
85 ****************************************************************************/
86
87static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
88{
89	return (pdom && (pdom->pd_mode == PD_MODE_V2));
90}
91
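/*
 * Look up the IVRS-provided device ID for an ACPI HID device by matching
 * its ACPI companion's HID/UID against the acpihid_map. Returns -ENODEV
 * if the device has no ACPI companion and -EINVAL if no entry matches.
 */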
92static inline int get_acpihid_device_id(struct device *dev,
93					struct acpihid_map_entry **entry)
94{
95	struct acpi_device *adev = ACPI_COMPANION(dev);
96	struct acpihid_map_entry *p;
97
98	if (!adev)
99		return -ENODEV;
100
101	list_for_each_entry(p, &acpihid_map, list) {
102		if (acpi_dev_hid_uid_match(adev, p->hid,
103					   p->uid[0] ? p->uid : NULL)) {
104			if (entry)
105				*entry = p;
106			return p->devid;
107		}
108	}
109	return -EINVAL;
110}
111
112static inline int get_device_sbdf_id(struct device *dev)
113{
114	int sbdf;
115
116	if (dev_is_pci(dev))
117		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
118	else
119		sbdf = get_acpihid_device_id(dev, NULL);
120
121	return sbdf;
122}
123
124struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
125{
126	struct dev_table_entry *dev_table;
127	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
128
129	BUG_ON(pci_seg == NULL);
130	dev_table = pci_seg->dev_table;
131	BUG_ON(dev_table == NULL);
132
133	return dev_table;
134}
135
136static inline u16 get_device_segment(struct device *dev)
137{
138	u16 seg;
139
140	if (dev_is_pci(dev)) {
141		struct pci_dev *pdev = to_pci_dev(dev);
142
143		seg = pci_domain_nr(pdev->bus);
144	} else {
145		u32 devid = get_acpihid_device_id(dev, NULL);
146
147		seg = PCI_SBDF_TO_SEGID(devid);
148	}
149
150	return seg;
151}
152
153/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
154void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
155{
156	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
157
158	pci_seg->rlookup_table[devid] = iommu;
159}
160
161static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
162{
163	struct amd_iommu_pci_seg *pci_seg;
164
165	for_each_pci_segment(pci_seg) {
166		if (pci_seg->id == seg)
167			return pci_seg->rlookup_table[devid];
168	}
169	return NULL;
170}
171
172static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
173{
174	u16 seg = get_device_segment(dev);
175	int devid = get_device_sbdf_id(dev);
176
177	if (devid < 0)
178		return NULL;
179	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
180}
181
182static struct protection_domain *to_pdomain(struct iommu_domain *dom)
183{
184	return container_of(dom, struct protection_domain, domain);
185}
186
187static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
188{
189	struct iommu_dev_data *dev_data;
190	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
191
192	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
193	if (!dev_data)
194		return NULL;
195
196	spin_lock_init(&dev_data->lock);
197	dev_data->devid = devid;
198	ratelimit_default_init(&dev_data->rs);
199
200	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
201	return dev_data;
202}
203
204static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
205{
206	struct iommu_dev_data *dev_data;
207	struct llist_node *node;
208	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
209
210	if (llist_empty(&pci_seg->dev_data_list))
211		return NULL;
212
213	node = pci_seg->dev_data_list.first;
214	llist_for_each_entry(dev_data, node, dev_data_list) {
215		if (dev_data->devid == devid)
216			return dev_data;
217	}
218
219	return NULL;
220}
221
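/*
 * clone_alias() copies the DTE of the real device to the DTE of @alias and
 * records the IOMMU in the rlookup table for the alias, so both request IDs
 * get the same translation settings. It is used both directly for the IVRS
 * alias and as a pci_for_each_dma_alias() callback.
 */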
222static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
223{
224	struct amd_iommu *iommu;
225	struct dev_table_entry *dev_table;
226	u16 devid = pci_dev_id(pdev);
227
228	if (devid == alias)
229		return 0;
230
231	iommu = rlookup_amd_iommu(&pdev->dev);
232	if (!iommu)
233		return 0;
234
235	amd_iommu_set_rlookup_table(iommu, alias);
236	dev_table = get_dev_table(iommu);
237	memcpy(dev_table[alias].data,
238	       dev_table[devid].data,
239	       sizeof(dev_table[alias].data));
240
241	return 0;
242}
243
244static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
245{
246	struct pci_dev *pdev;
247
248	if (!dev_is_pci(dev))
249		return;
250	pdev = to_pci_dev(dev);
251
	/*
	 * The IVRS alias stored in the alias table may not be
	 * part of the PCI DMA aliases if its bus differs
	 * from that of the original device.
	 */
257	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
258
259	pci_for_each_dma_alias(pdev, clone_alias, NULL);
260}
261
262static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
263{
264	struct pci_dev *pdev = to_pci_dev(dev);
265	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
266	u16 ivrs_alias;
267
268	/* For ACPI HID devices, there are no aliases */
269	if (!dev_is_pci(dev))
270		return;
271
272	/*
273	 * Add the IVRS alias to the pci aliases if it is on the same
274	 * bus. The IVRS table may know about a quirk that we don't.
275	 */
276	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
277	if (ivrs_alias != pci_dev_id(pdev) &&
278	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
279		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
280
281	clone_aliases(iommu, dev);
282}
283
284static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
285{
286	struct iommu_dev_data *dev_data;
287
288	dev_data = search_dev_data(iommu, devid);
289
290	if (dev_data == NULL) {
291		dev_data = alloc_dev_data(iommu, devid);
292		if (!dev_data)
293			return NULL;
294
295		if (translation_pre_enabled(iommu))
296			dev_data->defer_attach = true;
297	}
298
299	return dev_data;
300}
301
/*
 * Find or create an IOMMU group for an acpihid device.
 */
305static struct iommu_group *acpihid_device_group(struct device *dev)
306{
307	struct acpihid_map_entry *p, *entry = NULL;
308	int devid;
309
310	devid = get_acpihid_device_id(dev, &entry);
311	if (devid < 0)
312		return ERR_PTR(devid);
313
314	list_for_each_entry(p, &acpihid_map, list) {
315		if ((devid == p->devid) && p->group)
316			entry->group = p->group;
317	}
318
319	if (!entry->group)
320		entry->group = generic_device_group(dev);
321	else
322		iommu_group_ref_get(entry->group);
323
324	return entry->group;
325}
326
327static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
328{
329	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
330}
331
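/*
 * Query the ATS, PRI and PASID capabilities of a PCI device and translate
 * them into AMD_IOMMU_DEVICE_FLAG_* bits for dev_data->flags.
 */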
332static u32 pdev_get_caps(struct pci_dev *pdev)
333{
334	int features;
335	u32 flags = 0;
336
337	if (pci_ats_supported(pdev))
338		flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
339
340	if (pci_pri_supported(pdev))
341		flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
342
343	features = pci_pasid_features(pdev);
344	if (features >= 0) {
345		flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
346
347		if (features & PCI_PASID_CAP_EXEC)
348			flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
349
350		if (features & PCI_PASID_CAP_PRIV)
351			flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
352	}
353
354	return flags;
355}
356
357static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
358{
359	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
360	int ret = -EINVAL;
361
362	if (dev_data->ats_enabled)
363		return 0;
364
365	if (amd_iommu_iotlb_sup &&
366	    (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
367		ret = pci_enable_ats(pdev, PAGE_SHIFT);
368		if (!ret) {
369			dev_data->ats_enabled = 1;
370			dev_data->ats_qdep    = pci_ats_queue_depth(pdev);
371		}
372	}
373
374	return ret;
375}
376
377static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
378{
379	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
380
381	if (dev_data->ats_enabled) {
382		pci_disable_ats(pdev);
383		dev_data->ats_enabled = 0;
384	}
385}
386
387int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev)
388{
389	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
390	int ret = -EINVAL;
391
392	if (dev_data->pri_enabled)
393		return 0;
394
395	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
396		/*
397		 * First reset the PRI state of the device.
398		 * FIXME: Hardcode number of outstanding requests for now
399		 */
400		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
401			dev_data->pri_enabled = 1;
402			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
403
404			ret = 0;
405		}
406	}
407
408	return ret;
409}
410
411void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev)
412{
413	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
414
415	if (dev_data->pri_enabled) {
416		pci_disable_pri(pdev);
417		dev_data->pri_enabled = 0;
418	}
419}
420
421static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
422{
423	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
424	int ret = -EINVAL;
425
426	if (dev_data->pasid_enabled)
427		return 0;
428
429	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
430		/* Only allow access to user-accessible pages */
431		ret = pci_enable_pasid(pdev, 0);
432		if (!ret)
433			dev_data->pasid_enabled = 1;
434	}
435
436	return ret;
437}
438
439static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
440{
441	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
442
443	if (dev_data->pasid_enabled) {
444		pci_disable_pasid(pdev);
445		dev_data->pasid_enabled = 0;
446	}
447}
448
449static void pdev_enable_caps(struct pci_dev *pdev)
450{
451	pdev_enable_cap_ats(pdev);
452	pdev_enable_cap_pasid(pdev);
453	amd_iommu_pdev_enable_cap_pri(pdev);
454
455}
456
457static void pdev_disable_caps(struct pci_dev *pdev)
458{
459	pdev_disable_cap_ats(pdev);
460	pdev_disable_cap_pasid(pdev);
461	amd_iommu_pdev_disable_cap_pri(pdev);
462}
463
464/*
465 * This function checks if the driver got a valid device from the caller to
466 * avoid dereferencing invalid pointers.
467 */
468static bool check_device(struct device *dev)
469{
470	struct amd_iommu_pci_seg *pci_seg;
471	struct amd_iommu *iommu;
472	int devid, sbdf;
473
474	if (!dev)
475		return false;
476
477	sbdf = get_device_sbdf_id(dev);
478	if (sbdf < 0)
479		return false;
480	devid = PCI_SBDF_TO_DEVID(sbdf);
481
482	iommu = rlookup_amd_iommu(dev);
483	if (!iommu)
484		return false;
485
486	/* Out of our scope? */
487	pci_seg = iommu->pci_seg;
488	if (devid > pci_seg->last_bdf)
489		return false;
490
491	return true;
492}
493
494static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
495{
496	struct iommu_dev_data *dev_data;
497	int devid, sbdf;
498
499	if (dev_iommu_priv_get(dev))
500		return 0;
501
502	sbdf = get_device_sbdf_id(dev);
503	if (sbdf < 0)
504		return sbdf;
505
506	devid = PCI_SBDF_TO_DEVID(sbdf);
507	dev_data = find_dev_data(iommu, devid);
508	if (!dev_data)
509		return -ENOMEM;
510
511	dev_data->dev = dev;
512	setup_aliases(iommu, dev);
513
	/*
	 * By default we use passthrough mode for IOMMUv2-capable devices.
	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
	 * an invalid address), we ignore the capability for the device so
	 * it'll be forced to go into translation mode.
	 */
520	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
521	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
522		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
523	}
524
525	dev_iommu_priv_set(dev, dev_data);
526
527	return 0;
528}
529
530static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
531{
532	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
533	struct dev_table_entry *dev_table = get_dev_table(iommu);
534	int devid, sbdf;
535
536	sbdf = get_device_sbdf_id(dev);
537	if (sbdf < 0)
538		return;
539
540	devid = PCI_SBDF_TO_DEVID(sbdf);
541	pci_seg->rlookup_table[devid] = NULL;
542	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
543
544	setup_aliases(iommu, dev);
545}
546
547static void amd_iommu_uninit_device(struct device *dev)
548{
549	struct iommu_dev_data *dev_data;
550
551	dev_data = dev_iommu_priv_get(dev);
552	if (!dev_data)
553		return;
554
555	if (dev_data->domain)
556		detach_device(dev);
557
558	/*
559	 * We keep dev_data around for unplugged devices and reuse it when the
560	 * device is re-plugged - not doing so would introduce a ton of races.
561	 */
562}
563
564/****************************************************************************
565 *
566 * Interrupt handling functions
567 *
568 ****************************************************************************/
569
570static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
571{
572	int i;
573	struct dev_table_entry *dev_table = get_dev_table(iommu);
574
575	for (i = 0; i < 4; ++i)
576		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
577}
578
579static void dump_command(unsigned long phys_addr)
580{
581	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
582	int i;
583
584	for (i = 0; i < 4; ++i)
585		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
586}
587
588static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
589{
590	struct iommu_dev_data *dev_data = NULL;
591	int devid, vmg_tag, flags;
592	struct pci_dev *pdev;
593	u64 spa;
594
595	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
596	vmg_tag = (event[1]) & 0xFFFF;
597	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
598	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
599
600	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
601					   devid & 0xff);
602	if (pdev)
603		dev_data = dev_iommu_priv_get(&pdev->dev);
604
605	if (dev_data) {
606		if (__ratelimit(&dev_data->rs)) {
607			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
608				vmg_tag, spa, flags);
609		}
610	} else {
611		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
612			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
613			vmg_tag, spa, flags);
614	}
615
616	if (pdev)
617		pci_dev_put(pdev);
618}
619
620static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
621{
622	struct iommu_dev_data *dev_data = NULL;
623	int devid, flags_rmp, vmg_tag, flags;
624	struct pci_dev *pdev;
625	u64 gpa;
626
627	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
628	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
629	vmg_tag   = (event[1]) & 0xFFFF;
630	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
631	gpa       = ((u64)event[3] << 32) | event[2];
632
633	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
634					   devid & 0xff);
635	if (pdev)
636		dev_data = dev_iommu_priv_get(&pdev->dev);
637
638	if (dev_data) {
639		if (__ratelimit(&dev_data->rs)) {
640			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
641				vmg_tag, gpa, flags_rmp, flags);
642		}
643	} else {
644		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
645			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
646			vmg_tag, gpa, flags_rmp, flags);
647	}
648
649	if (pdev)
650		pci_dev_put(pdev);
651}
652
653#define IS_IOMMU_MEM_TRANSACTION(flags)		\
654	(((flags) & EVENT_FLAG_I) == 0)
655
656#define IS_WRITE_REQUEST(flags)			\
657	((flags) & EVENT_FLAG_RW)
658
659static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
660					u16 devid, u16 domain_id,
661					u64 address, int flags)
662{
663	struct iommu_dev_data *dev_data = NULL;
664	struct pci_dev *pdev;
665
666	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
667					   devid & 0xff);
668	if (pdev)
669		dev_data = dev_iommu_priv_get(&pdev->dev);
670
671	if (dev_data) {
672		/*
673		 * If this is a DMA fault (for which the I(nterrupt)
674		 * bit will be unset), allow report_iommu_fault() to
675		 * prevent logging it.
676		 */
677		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
678			/* Device not attached to domain properly */
679			if (dev_data->domain == NULL) {
680				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
681				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
682						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
683						   PCI_FUNC(devid), domain_id);
684				goto out;
685			}
686
687			if (!report_iommu_fault(&dev_data->domain->domain,
688						&pdev->dev, address,
689						IS_WRITE_REQUEST(flags) ?
690							IOMMU_FAULT_WRITE :
691							IOMMU_FAULT_READ))
692				goto out;
693		}
694
695		if (__ratelimit(&dev_data->rs)) {
696			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
697				domain_id, address, flags);
698		}
699	} else {
700		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
701			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
702			domain_id, address, flags);
703	}
704
705out:
706	if (pdev)
707		pci_dev_put(pdev);
708}
709
710static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
711{
712	struct device *dev = iommu->iommu.dev;
713	int type, devid, flags, tag;
714	volatile u32 *event = __evt;
715	int count = 0;
716	u64 address;
717	u32 pasid;
718
719retry:
720	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
721	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
722	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
723		  (event[1] & EVENT_DOMID_MASK_LO);
724	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
725	address = (u64)(((u64)event[3]) << 32) | event[2];
726
727	if (type == 0) {
728		/* Did we hit the erratum? */
729		if (++count == LOOP_TIMEOUT) {
730			pr_err("No event written to event log\n");
731			return;
732		}
733		udelay(1);
734		goto retry;
735	}
736
737	if (type == EVENT_TYPE_IO_FAULT) {
738		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
739		return;
740	}
741
742	switch (type) {
743	case EVENT_TYPE_ILL_DEV:
744		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
745			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
746			pasid, address, flags);
747		dump_dte_entry(iommu, devid);
748		break;
749	case EVENT_TYPE_DEV_TAB_ERR:
750		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
751			"address=0x%llx flags=0x%04x]\n",
752			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
753			address, flags);
754		break;
755	case EVENT_TYPE_PAGE_TAB_ERR:
756		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
757			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
758			pasid, address, flags);
759		break;
760	case EVENT_TYPE_ILL_CMD:
761		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
762		dump_command(address);
763		break;
764	case EVENT_TYPE_CMD_HARD_ERR:
765		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
766			address, flags);
767		break;
768	case EVENT_TYPE_IOTLB_INV_TO:
769		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
770			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
771			address);
772		break;
773	case EVENT_TYPE_INV_DEV_REQ:
774		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
775			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
776			pasid, address, flags);
777		break;
778	case EVENT_TYPE_RMP_FAULT:
779		amd_iommu_report_rmp_fault(iommu, event);
780		break;
781	case EVENT_TYPE_RMP_HW_ERR:
782		amd_iommu_report_rmp_hw_error(iommu, event);
783		break;
784	case EVENT_TYPE_INV_PPR_REQ:
785		pasid = PPR_PASID(*((u64 *)__evt));
786		tag = event[1] & 0x03FF;
787		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
788			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
789			pasid, address, flags, tag);
790		break;
791	default:
792		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
793			event[0], event[1], event[2], event[3]);
794	}
795
	/*
	 * To detect hardware erratum 732 we need to clear the
	 * entry back to zero. This issue does not exist on SNP-enabled
	 * systems, and on such systems this buffer is not writable
	 * anyway.
	 */
802	if (!amd_iommu_snp_en)
803		memset(__evt, 0, 4 * sizeof(u32));
804}
805
806static void iommu_poll_events(struct amd_iommu *iommu)
807{
808	u32 head, tail;
809
810	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
811	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
812
813	while (head != tail) {
814		iommu_print_event(iommu, iommu->evt_buf + head);
815		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
816	}
817
818	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
819}
820
821static void iommu_poll_ppr_log(struct amd_iommu *iommu)
822{
823	u32 head, tail;
824
825	if (iommu->ppr_log == NULL)
826		return;
827
828	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
829	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
830
831	while (head != tail) {
832		volatile u64 *raw;
833		u64 entry[2];
834		int i;
835
836		raw = (u64 *)(iommu->ppr_log + head);
837
838		/*
839		 * Hardware bug: Interrupt may arrive before the entry is
840		 * written to memory. If this happens we need to wait for the
841		 * entry to arrive.
842		 */
843		for (i = 0; i < LOOP_TIMEOUT; ++i) {
844			if (PPR_REQ_TYPE(raw[0]) != 0)
845				break;
846			udelay(1);
847		}
848
849		/* Avoid memcpy function-call overhead */
850		entry[0] = raw[0];
851		entry[1] = raw[1];
852
		/*
		 * To detect hardware erratum 733 we need to clear the
		 * entry back to zero. This issue does not exist on SNP-enabled
		 * systems, and on such systems this buffer is not writable
		 * anyway.
		 */
859		if (!amd_iommu_snp_en)
860			raw[0] = raw[1] = 0UL;
861
862		/* Update head pointer of hardware ring-buffer */
863		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
864		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
865
866		/* TODO: PPR Handler will be added when we add IOPF support */
867
868		/* Refresh ring-buffer information */
869		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
870		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
871	}
872}
873
874#ifdef CONFIG_IRQ_REMAP
875static int (*iommu_ga_log_notifier)(u32);
876
877int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
878{
879	iommu_ga_log_notifier = notifier;
880
881	return 0;
882}
883EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
884
885static void iommu_poll_ga_log(struct amd_iommu *iommu)
886{
887	u32 head, tail;
888
889	if (iommu->ga_log == NULL)
890		return;
891
892	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
893	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
894
895	while (head != tail) {
896		volatile u64 *raw;
897		u64 log_entry;
898
899		raw = (u64 *)(iommu->ga_log + head);
900
901		/* Avoid memcpy function-call overhead */
902		log_entry = *raw;
903
904		/* Update head pointer of hardware ring-buffer */
905		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
906		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
907
908		/* Handle GA entry */
909		switch (GA_REQ_TYPE(log_entry)) {
910		case GA_GUEST_NR:
911			if (!iommu_ga_log_notifier)
912				break;
913
914			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
915				 __func__, GA_DEVID(log_entry),
916				 GA_TAG(log_entry));
917
918			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
919				pr_err("GA log notifier failed.\n");
920			break;
921		default:
922			break;
923		}
924	}
925}
926
927static void
928amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
929{
930	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
931	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
932		return;
933
934	dev_set_msi_domain(dev, iommu->ir_domain);
935}
936
937#else /* CONFIG_IRQ_REMAP */
938static inline void
939amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
940#endif /* !CONFIG_IRQ_REMAP */
941
942static void amd_iommu_handle_irq(void *data, const char *evt_type,
943				 u32 int_mask, u32 overflow_mask,
944				 void (*int_handler)(struct amd_iommu *),
945				 void (*overflow_handler)(struct amd_iommu *))
946{
947	struct amd_iommu *iommu = (struct amd_iommu *) data;
948	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
949	u32 mask = int_mask | overflow_mask;
950
951	while (status & mask) {
952		/* Enable interrupt sources again */
953		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
954
955		if (int_handler) {
956			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
957				 iommu->index, evt_type);
958			int_handler(iommu);
959		}
960
961		if ((status & overflow_mask) && overflow_handler)
962			overflow_handler(iommu);
963
		/*
		 * Hardware bug: ERBT1312
		 * When re-enabling the interrupt (by writing 1
		 * to clear the bit), the hardware might also try to set
		 * the interrupt bit in the event status register.
		 * In this scenario, the bit stays set and disables
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, the driver has to go through the interrupt handler
		 * again and re-clear the bits.
		 */
977		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
978	}
979}
980
981irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
982{
983	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
984			     MMIO_STATUS_EVT_OVERFLOW_MASK,
985			     iommu_poll_events, amd_iommu_restart_event_logging);
986
987	return IRQ_HANDLED;
988}
989
990irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
991{
992	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
993			     MMIO_STATUS_PPR_OVERFLOW_MASK,
994			     iommu_poll_ppr_log, amd_iommu_restart_ppr_log);
995
996	return IRQ_HANDLED;
997}
998
999irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
1000{
1001#ifdef CONFIG_IRQ_REMAP
1002	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
1003			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
1004			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
1005#endif
1006
1007	return IRQ_HANDLED;
1008}
1009
1010irqreturn_t amd_iommu_int_thread(int irq, void *data)
1011{
1012	amd_iommu_int_thread_evtlog(irq, data);
1013	amd_iommu_int_thread_pprlog(irq, data);
1014	amd_iommu_int_thread_galog(irq, data);
1015
1016	return IRQ_HANDLED;
1017}
1018
1019irqreturn_t amd_iommu_int_handler(int irq, void *data)
1020{
1021	return IRQ_WAKE_THREAD;
1022}
1023
1024/****************************************************************************
1025 *
1026 * IOMMU command queuing functions
1027 *
1028 ****************************************************************************/
1029
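/*
 * Poll the completion-wait semaphore until the IOMMU writes back @data,
 * giving up after LOOP_TIMEOUT iterations of udelay(1).
 */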
1030static int wait_on_sem(struct amd_iommu *iommu, u64 data)
1031{
1032	int i = 0;
1033
1034	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
1035		udelay(1);
1036		i += 1;
1037	}
1038
1039	if (i == LOOP_TIMEOUT) {
1040		pr_alert("Completion-Wait loop timed out\n");
1041		return -EIO;
1042	}
1043
1044	return 0;
1045}
1046
1047static void copy_cmd_to_buffer(struct amd_iommu *iommu,
1048			       struct iommu_cmd *cmd)
1049{
1050	u8 *target;
1051	u32 tail;
1052
1053	/* Copy command to buffer */
1054	tail = iommu->cmd_buf_tail;
1055	target = iommu->cmd_buf + tail;
1056	memcpy(target, cmd, sizeof(*cmd));
1057
1058	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1059	iommu->cmd_buf_tail = tail;
1060
1061	/* Tell the IOMMU about it */
1062	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1063}
1064
1065static void build_completion_wait(struct iommu_cmd *cmd,
1066				  struct amd_iommu *iommu,
1067				  u64 data)
1068{
1069	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
1070
1071	memset(cmd, 0, sizeof(*cmd));
1072	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
1073	cmd->data[1] = upper_32_bits(paddr);
1074	cmd->data[2] = lower_32_bits(data);
1075	cmd->data[3] = upper_32_bits(data);
1076	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
1077}
1078
1079static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1080{
1081	memset(cmd, 0, sizeof(*cmd));
1082	cmd->data[0] = devid;
1083	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1084}
1085
/*
 * Builds an invalidation address which is suitable for one page or multiple
 * pages. Sets the size bit (S) if more than one page is flushed.
 */
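/*
 * For example, flushing 16 KiB at IOVA 0x10000 gives end = 0x13fff and
 * msb_diff = 13; filling in the lower bits and clearing bits 11:0 again
 * yields 0x11000, which is returned with the size bit set so that the
 * address bits above bit 11 encode the extent of the flush.
 */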
1090static inline u64 build_inv_address(u64 address, size_t size)
1091{
1092	u64 pages, end, msb_diff;
1093
1094	pages = iommu_num_pages(address, size, PAGE_SIZE);
1095
1096	if (pages == 1)
1097		return address & PAGE_MASK;
1098
1099	end = address + size - 1;
1100
	/*
	 * msb_diff holds the index of the most significant bit that
	 * differs between the start and end addresses.
	 */
1105	msb_diff = fls64(end ^ address) - 1;
1106
1107	/*
1108	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1109	 * between the start and the end, invalidate everything.
1110	 */
1111	if (unlikely(msb_diff > 51)) {
1112		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1113	} else {
		/*
		 * The msb_diff bit must be clear in the address. Just set
		 * all the lower bits.
		 */
1118		address |= (1ull << msb_diff) - 1;
1119	}
1120
1121	/* Clear bits 11:0 */
1122	address &= PAGE_MASK;
1123
1124	/* Set the size bit - we flush more than one 4kb page */
1125	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1126}
1127
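/*
 * Build a CMD_INV_IOMMU_PAGES command: data[1] carries the domain ID,
 * data[2]/data[3] carry the encoded invalidation address plus the S, PDE
 * and (optional) GN bits, and data[0] carries the PASID when GN is set.
 */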
1128static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1129				  size_t size, u16 domid,
1130				  ioasid_t pasid, bool gn)
1131{
1132	u64 inv_address = build_inv_address(address, size);
1133
1134	memset(cmd, 0, sizeof(*cmd));
1135
1136	cmd->data[1] |= domid;
1137	cmd->data[2]  = lower_32_bits(inv_address);
1138	cmd->data[3]  = upper_32_bits(inv_address);
1139	/* PDE bit - we want to flush everything, not only the PTEs */
1140	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1141	if (gn) {
1142		cmd->data[0] |= pasid;
1143		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1144	}
1145	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1146}
1147
1148static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1149				  u64 address, size_t size,
1150				  ioasid_t pasid, bool gn)
1151{
1152	u64 inv_address = build_inv_address(address, size);
1153
1154	memset(cmd, 0, sizeof(*cmd));
1155
1156	cmd->data[0]  = devid;
1157	cmd->data[0] |= (qdep & 0xff) << 24;
1158	cmd->data[1]  = devid;
1159	cmd->data[2]  = lower_32_bits(inv_address);
1160	cmd->data[3]  = upper_32_bits(inv_address);
1161	if (gn) {
1162		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1163		cmd->data[1] |= (pasid & 0xff) << 16;
1164		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1165	}
1166
1167	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1168}
1169
1170static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1171			       int status, int tag, u8 gn)
1172{
1173	memset(cmd, 0, sizeof(*cmd));
1174
1175	cmd->data[0]  = devid;
1176	if (gn) {
1177		cmd->data[1]  = pasid;
1178		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1179	}
1180	cmd->data[3]  = tag & 0x1ff;
1181	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1182
1183	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1184}
1185
1186static void build_inv_all(struct iommu_cmd *cmd)
1187{
1188	memset(cmd, 0, sizeof(*cmd));
1189	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1190}
1191
1192static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1193{
1194	memset(cmd, 0, sizeof(*cmd));
1195	cmd->data[0] = devid;
1196	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1197}
1198
/*
 * Writes the command to the IOMMU's command buffer and informs the
 * hardware about the new command.
 */
1203static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1204				      struct iommu_cmd *cmd,
1205				      bool sync)
1206{
1207	unsigned int count = 0;
1208	u32 left, next_tail;
1209
1210	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1211again:
1212	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1213
1214	if (left <= 0x20) {
1215		/* Skip udelay() the first time around */
1216		if (count++) {
1217			if (count == LOOP_TIMEOUT) {
1218				pr_err("Command buffer timeout\n");
1219				return -EIO;
1220			}
1221
1222			udelay(1);
1223		}
1224
1225		/* Update head and recheck remaining space */
1226		iommu->cmd_buf_head = readl(iommu->mmio_base +
1227					    MMIO_CMD_HEAD_OFFSET);
1228
1229		goto again;
1230	}
1231
1232	copy_cmd_to_buffer(iommu, cmd);
1233
1234	/* Do we need to make sure all commands are processed? */
1235	iommu->need_sync = sync;
1236
1237	return 0;
1238}
1239
1240static int iommu_queue_command_sync(struct amd_iommu *iommu,
1241				    struct iommu_cmd *cmd,
1242				    bool sync)
1243{
1244	unsigned long flags;
1245	int ret;
1246
1247	raw_spin_lock_irqsave(&iommu->lock, flags);
1248	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1249	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1250
1251	return ret;
1252}
1253
1254static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1255{
1256	return iommu_queue_command_sync(iommu, cmd, true);
1257}
1258
/*
 * This function queues a completion-wait command into the command
 * buffer of an IOMMU.
 */
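/*
 * The command built by build_completion_wait() carries the physical address
 * of iommu->cmd_sem and a fresh value from cmd_sem_val; the IOMMU stores
 * that value there once all previously queued commands have completed, and
 * wait_on_sem() polls for it.
 */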
1263static int iommu_completion_wait(struct amd_iommu *iommu)
1264{
1265	struct iommu_cmd cmd;
1266	unsigned long flags;
1267	int ret;
1268	u64 data;
1269
1270	if (!iommu->need_sync)
1271		return 0;
1272
1273	data = atomic64_add_return(1, &iommu->cmd_sem_val);
1274	build_completion_wait(&cmd, iommu, data);
1275
1276	raw_spin_lock_irqsave(&iommu->lock, flags);
1277
1278	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1279	if (ret)
1280		goto out_unlock;
1281
1282	ret = wait_on_sem(iommu, data);
1283
1284out_unlock:
1285	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1286
1287	return ret;
1288}
1289
1290static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1291{
1292	struct iommu_cmd cmd;
1293
1294	build_inv_dte(&cmd, devid);
1295
1296	return iommu_queue_command(iommu, &cmd);
1297}
1298
1299static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1300{
1301	u32 devid;
1302	u16 last_bdf = iommu->pci_seg->last_bdf;
1303
1304	for (devid = 0; devid <= last_bdf; ++devid)
1305		iommu_flush_dte(iommu, devid);
1306
1307	iommu_completion_wait(iommu);
1308}
1309
1310/*
1311 * This function uses heavy locking and may disable irqs for some time. But
1312 * this is no issue because it is only called during resume.
1313 */
1314static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1315{
1316	u32 dom_id;
1317	u16 last_bdf = iommu->pci_seg->last_bdf;
1318
1319	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1320		struct iommu_cmd cmd;
1321		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1322				      dom_id, IOMMU_NO_PASID, false);
1323		iommu_queue_command(iommu, &cmd);
1324	}
1325
1326	iommu_completion_wait(iommu);
1327}
1328
1329static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1330{
1331	struct iommu_cmd cmd;
1332
1333	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1334			      dom_id, IOMMU_NO_PASID, false);
1335	iommu_queue_command(iommu, &cmd);
1336
1337	iommu_completion_wait(iommu);
1338}
1339
1340static void amd_iommu_flush_all(struct amd_iommu *iommu)
1341{
1342	struct iommu_cmd cmd;
1343
1344	build_inv_all(&cmd);
1345
1346	iommu_queue_command(iommu, &cmd);
1347	iommu_completion_wait(iommu);
1348}
1349
1350static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1351{
1352	struct iommu_cmd cmd;
1353
1354	build_inv_irt(&cmd, devid);
1355
1356	iommu_queue_command(iommu, &cmd);
1357}
1358
1359static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1360{
1361	u32 devid;
1362	u16 last_bdf = iommu->pci_seg->last_bdf;
1363
1364	if (iommu->irtcachedis_enabled)
1365		return;
1366
1367	for (devid = 0; devid <= last_bdf; devid++)
1368		iommu_flush_irt(iommu, devid);
1369
1370	iommu_completion_wait(iommu);
1371}
1372
1373void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1374{
1375	if (check_feature(FEATURE_IA)) {
1376		amd_iommu_flush_all(iommu);
1377	} else {
1378		amd_iommu_flush_dte_all(iommu);
1379		amd_iommu_flush_irt_all(iommu);
1380		amd_iommu_flush_tlb_all(iommu);
1381	}
1382}
1383
/*
 * Command send function for flushing the on-device TLB (IOTLB)
 */
1387static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1388			      size_t size, ioasid_t pasid, bool gn)
1389{
1390	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1391	struct iommu_cmd cmd;
1392	int qdep = dev_data->ats_qdep;
1393
1394	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
1395			      size, pasid, gn);
1396
1397	return iommu_queue_command(iommu, &cmd);
1398}
1399
1400static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1401{
1402	struct amd_iommu *iommu = data;
1403
1404	return iommu_flush_dte(iommu, alias);
1405}
1406
1407/*
1408 * Command send function for invalidating a device table entry
1409 */
1410static int device_flush_dte(struct iommu_dev_data *dev_data)
1411{
1412	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1413	struct pci_dev *pdev = NULL;
1414	struct amd_iommu_pci_seg *pci_seg;
1415	u16 alias;
1416	int ret;
1417
1418	if (dev_is_pci(dev_data->dev))
1419		pdev = to_pci_dev(dev_data->dev);
1420
1421	if (pdev)
1422		ret = pci_for_each_dma_alias(pdev,
1423					     device_flush_dte_alias, iommu);
1424	else
1425		ret = iommu_flush_dte(iommu, dev_data->devid);
1426	if (ret)
1427		return ret;
1428
1429	pci_seg = iommu->pci_seg;
1430	alias = pci_seg->alias_table[dev_data->devid];
1431	if (alias != dev_data->devid) {
1432		ret = iommu_flush_dte(iommu, alias);
1433		if (ret)
1434			return ret;
1435	}
1436
1437	if (dev_data->ats_enabled) {
1438		/* Invalidate the entire contents of an IOTLB */
1439		ret = device_flush_iotlb(dev_data, 0, ~0UL,
1440					 IOMMU_NO_PASID, false);
1441	}
1442
1443	return ret;
1444}
1445
1446static int domain_flush_pages_v2(struct protection_domain *pdom,
1447				 u64 address, size_t size)
1448{
1449	struct iommu_dev_data *dev_data;
1450	struct iommu_cmd cmd;
1451	int ret = 0;
1452
1453	list_for_each_entry(dev_data, &pdom->dev_list, list) {
1454		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1455		u16 domid = dev_data->gcr3_info.domid;
1456
1457		build_inv_iommu_pages(&cmd, address, size,
1458				      domid, IOMMU_NO_PASID, true);
1459
1460		ret |= iommu_queue_command(iommu, &cmd);
1461	}
1462
1463	return ret;
1464}
1465
1466static int domain_flush_pages_v1(struct protection_domain *pdom,
1467				 u64 address, size_t size)
1468{
1469	struct iommu_cmd cmd;
1470	int ret = 0, i;
1471
1472	build_inv_iommu_pages(&cmd, address, size,
1473			      pdom->id, IOMMU_NO_PASID, false);
1474
1475	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1476		if (!pdom->dev_iommu[i])
1477			continue;
1478
		/*
		 * Devices of this domain are behind this IOMMU;
		 * we need a TLB flush.
		 */
1483		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1484	}
1485
1486	return ret;
1487}
1488
/*
 * TLB invalidation function which is called from the mapping functions.
 * It flushes a range of PTEs of the domain.
 */
1493static void __domain_flush_pages(struct protection_domain *domain,
1494				 u64 address, size_t size)
1495{
1496	struct iommu_dev_data *dev_data;
1497	int ret = 0;
1498	ioasid_t pasid = IOMMU_NO_PASID;
1499	bool gn = false;
1500
1501	if (pdom_is_v2_pgtbl_mode(domain)) {
1502		gn = true;
1503		ret = domain_flush_pages_v2(domain, address, size);
1504	} else {
1505		ret = domain_flush_pages_v1(domain, address, size);
1506	}
1507
1508	list_for_each_entry(dev_data, &domain->dev_list, list) {
1509
1510		if (!dev_data->ats_enabled)
1511			continue;
1512
1513		ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1514	}
1515
1516	WARN_ON(ret);
1517}
1518
1519void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1520				  u64 address, size_t size)
1521{
1522	if (likely(!amd_iommu_np_cache)) {
1523		__domain_flush_pages(domain, address, size);
1524
1525		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1526		amd_iommu_domain_flush_complete(domain);
1527
1528		return;
1529	}
1530
	/*
	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
	 * In such setups it is best to avoid flushes of ranges which are not
	 * naturally aligned, since they would lead to flushes of unmodified
	 * PTEs. Such flushes would require the hypervisor to do more work than
	 * necessary. Therefore, perform repeated flushes of aligned ranges
	 * until the whole range is covered. Each iteration flushes the smaller
	 * of the natural alignment of the address that we flush and the
	 * greatest naturally aligned region that fits in the remaining range.
	 */
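	/*
	 * For example, flushing 12 KiB at IOVA 0x5000 is done in two steps:
	 * the first iteration is limited by the address alignment and flushes
	 * 4 KiB at 0x5000, the second flushes the remaining 8 KiB at 0x6000,
	 * which is now both size- and address-aligned.
	 */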
1541	while (size != 0) {
1542		int addr_alignment = __ffs(address);
1543		int size_alignment = __fls(size);
1544		int min_alignment;
1545		size_t flush_size;
1546
1547		/*
1548		 * size is always non-zero, but address might be zero, causing
1549		 * addr_alignment to be negative. As the casting of the
1550		 * argument in __ffs(address) to long might trim the high bits
1551		 * of the address on x86-32, cast to long when doing the check.
1552		 */
1553		if (likely((unsigned long)address != 0))
1554			min_alignment = min(addr_alignment, size_alignment);
1555		else
1556			min_alignment = size_alignment;
1557
1558		flush_size = 1ul << min_alignment;
1559
1560		__domain_flush_pages(domain, address, flush_size);
1561		address += flush_size;
1562		size -= flush_size;
1563	}
1564
1565	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1566	amd_iommu_domain_flush_complete(domain);
1567}
1568
1569/* Flush the whole IO/TLB for a given protection domain - including PDE */
1570static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1571{
1572	amd_iommu_domain_flush_pages(domain, 0,
1573				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1574}
1575
1576void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
1577				     ioasid_t pasid, u64 address, size_t size)
1578{
1579	struct iommu_cmd cmd;
1580	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1581
1582	build_inv_iommu_pages(&cmd, address, size,
1583			      dev_data->gcr3_info.domid, pasid, true);
1584	iommu_queue_command(iommu, &cmd);
1585
1586	if (dev_data->ats_enabled)
1587		device_flush_iotlb(dev_data, address, size, pasid, true);
1588
1589	iommu_completion_wait(iommu);
1590}
1591
1592void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data,
1593				   ioasid_t pasid)
1594{
1595	amd_iommu_dev_flush_pasid_pages(dev_data, 0,
1596					CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid);
1597}
1598
1599void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1600{
1601	int i;
1602
1603	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1604		if (domain && !domain->dev_iommu[i])
1605			continue;
1606
		/*
		 * Devices of this domain are behind this IOMMU;
		 * we need to wait for completion of all commands.
		 */
1611		iommu_completion_wait(amd_iommus[i]);
1612	}
1613}
1614
/* Flush the not-present cache if it exists */
1616static void domain_flush_np_cache(struct protection_domain *domain,
1617		dma_addr_t iova, size_t size)
1618{
1619	if (unlikely(amd_iommu_np_cache)) {
1620		unsigned long flags;
1621
1622		spin_lock_irqsave(&domain->lock, flags);
1623		amd_iommu_domain_flush_pages(domain, iova, size);
1624		spin_unlock_irqrestore(&domain->lock, flags);
1625	}
1626}
1627
1628
/*
 * This function flushes the DTEs for all devices in the domain
 */
1632static void domain_flush_devices(struct protection_domain *domain)
1633{
1634	struct iommu_dev_data *dev_data;
1635
1636	list_for_each_entry(dev_data, &domain->dev_list, list)
1637		device_flush_dte(dev_data);
1638}
1639
1640static void update_device_table(struct protection_domain *domain)
1641{
1642	struct iommu_dev_data *dev_data;
1643
1644	list_for_each_entry(dev_data, &domain->dev_list, list) {
1645		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1646
1647		set_dte_entry(iommu, dev_data);
1648		clone_aliases(iommu, dev_data->dev);
1649	}
1650}
1651
1652void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1653{
1654	update_device_table(domain);
1655	domain_flush_devices(domain);
1656}
1657
1658void amd_iommu_domain_update(struct protection_domain *domain)
1659{
1660	/* Update device table */
1661	amd_iommu_update_and_flush_device_table(domain);
1662
1663	/* Flush domain TLB(s) and wait for completion */
1664	amd_iommu_domain_flush_all(domain);
1665}
1666
1667int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
1668			   int status, int tag)
1669{
1670	struct iommu_dev_data *dev_data;
1671	struct amd_iommu *iommu;
1672	struct iommu_cmd cmd;
1673
1674	dev_data = dev_iommu_priv_get(&pdev->dev);
1675	iommu    = get_amd_iommu_from_dev(&pdev->dev);
1676
1677	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
1678			   tag, dev_data->pri_tlp);
1679
1680	return iommu_queue_command(iommu, &cmd);
1681}
1682
1683/****************************************************************************
1684 *
 * The next functions deal with domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
1690 *
1691 ****************************************************************************/
1692
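/*
 * Allocate a protection domain ID from the global bitmap. ID 0 is
 * reserved (its bit is expected to be set already, hence the BUG_ON),
 * and 0 is returned when no free ID is left.
 */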
1693static u16 domain_id_alloc(void)
1694{
1695	unsigned long flags;
1696	int id;
1697
1698	spin_lock_irqsave(&pd_bitmap_lock, flags);
1699	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1700	BUG_ON(id == 0);
1701	if (id > 0 && id < MAX_DOMAIN_ID)
1702		__set_bit(id, amd_iommu_pd_alloc_bitmap);
1703	else
1704		id = 0;
1705	spin_unlock_irqrestore(&pd_bitmap_lock, flags);
1706
1707	return id;
1708}
1709
1710static void domain_id_free(int id)
1711{
1712	unsigned long flags;
1713
1714	spin_lock_irqsave(&pd_bitmap_lock, flags);
1715	if (id > 0 && id < MAX_DOMAIN_ID)
1716		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
1717	spin_unlock_irqrestore(&pd_bitmap_lock, flags);
1718}
1719
1720static void free_gcr3_tbl_level1(u64 *tbl)
1721{
1722	u64 *ptr;
1723	int i;
1724
1725	for (i = 0; i < 512; ++i) {
1726		if (!(tbl[i] & GCR3_VALID))
1727			continue;
1728
1729		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1730
1731		free_page((unsigned long)ptr);
1732	}
1733}
1734
1735static void free_gcr3_tbl_level2(u64 *tbl)
1736{
1737	u64 *ptr;
1738	int i;
1739
1740	for (i = 0; i < 512; ++i) {
1741		if (!(tbl[i] & GCR3_VALID))
1742			continue;
1743
1744		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1745
1746		free_gcr3_tbl_level1(ptr);
1747	}
1748}
1749
1750static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
1751{
1752	if (gcr3_info->glx == 2)
1753		free_gcr3_tbl_level2(gcr3_info->gcr3_tbl);
1754	else if (gcr3_info->glx == 1)
1755		free_gcr3_tbl_level1(gcr3_info->gcr3_tbl);
1756	else
1757		WARN_ON_ONCE(gcr3_info->glx != 0);
1758
1759	gcr3_info->glx = 0;
1760
1761	/* Free per device domain ID */
1762	domain_id_free(gcr3_info->domid);
1763
1764	free_page((unsigned long)gcr3_info->gcr3_tbl);
1765	gcr3_info->gcr3_tbl = NULL;
1766}
1767
/*
 * Number of GCR3 table levels required. Each level is a 4-Kbyte
 * page and can contain up to 512 entries.
 */
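/*
 * For example, supporting up to 512 PASIDs needs get_count_order() = 9,
 * i.e. a single level (return value 0), while 2^16 PASIDs needs 16 bits
 * and therefore two levels (return value 1). Passing -1 simply requests
 * the maximum the IOMMU supports.
 */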
1772static int get_gcr3_levels(int pasids)
1773{
1774	int levels;
1775
1776	if (pasids == -1)
1777		return amd_iommu_max_glx_val;
1778
1779	levels = get_count_order(pasids);
1780
1781	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1782}
1783
1784static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
1785			    struct amd_iommu *iommu, int pasids)
1786{
1787	int levels = get_gcr3_levels(pasids);
1788	int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
1789
1790	if (levels > amd_iommu_max_glx_val)
1791		return -EINVAL;
1792
1793	if (gcr3_info->gcr3_tbl)
1794		return -EBUSY;
1795
1796	/* Allocate per device domain ID */
1797	gcr3_info->domid = domain_id_alloc();
1798
1799	gcr3_info->gcr3_tbl = alloc_pgtable_page(nid, GFP_ATOMIC);
1800	if (gcr3_info->gcr3_tbl == NULL) {
1801		domain_id_free(gcr3_info->domid);
1802		return -ENOMEM;
1803	}
1804
1805	gcr3_info->glx = levels;
1806
1807	return 0;
1808}
1809
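/*
 * Walk the GCR3 table for @pasid, nine index bits per level. If @alloc is
 * true, missing intermediate tables are allocated with GFP_ATOMIC.
 * Returns a pointer to the leaf entry, or NULL when a level is missing and
 * @alloc is false or an allocation fails.
 */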
1810static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
1811			   ioasid_t pasid, bool alloc)
1812{
1813	int index;
1814	u64 *pte;
1815	u64 *root = gcr3_info->gcr3_tbl;
1816	int level = gcr3_info->glx;
1817
1818	while (true) {
1819
1820		index = (pasid >> (9 * level)) & 0x1ff;
1821		pte   = &root[index];
1822
1823		if (level == 0)
1824			break;
1825
1826		if (!(*pte & GCR3_VALID)) {
1827			if (!alloc)
1828				return NULL;
1829
1830			root = (void *)get_zeroed_page(GFP_ATOMIC);
1831			if (root == NULL)
1832				return NULL;
1833
1834			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
1835		}
1836
1837		root = iommu_phys_to_virt(*pte & PAGE_MASK);
1838
1839		level -= 1;
1840	}
1841
1842	return pte;
1843}
1844
1845static int update_gcr3(struct iommu_dev_data *dev_data,
1846		       ioasid_t pasid, unsigned long gcr3, bool set)
1847{
1848	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1849	u64 *pte;
1850
1851	pte = __get_gcr3_pte(gcr3_info, pasid, true);
1852	if (pte == NULL)
1853		return -ENOMEM;
1854
1855	if (set)
1856		*pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
1857	else
1858		*pte = 0;
1859
1860	amd_iommu_dev_flush_pasid_all(dev_data, pasid);
1861	return 0;
1862}
1863
1864int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
1865		       unsigned long gcr3)
1866{
1867	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1868	int ret;
1869
1870	iommu_group_mutex_assert(dev_data->dev);
1871
1872	ret = update_gcr3(dev_data, pasid, gcr3, true);
1873	if (ret)
1874		return ret;
1875
1876	gcr3_info->pasid_cnt++;
1877	return ret;
1878}
1879
1880int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
1881{
1882	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1883	int ret;
1884
1885	iommu_group_mutex_assert(dev_data->dev);
1886
1887	ret = update_gcr3(dev_data, pasid, 0, false);
1888	if (ret)
1889		return ret;
1890
1891	gcr3_info->pasid_cnt--;
1892	return ret;
1893}
1894
1895static void set_dte_entry(struct amd_iommu *iommu,
1896			  struct iommu_dev_data *dev_data)
1897{
1898	u64 pte_root = 0;
1899	u64 flags = 0;
1900	u32 old_domid;
1901	u16 devid = dev_data->devid;
1902	u16 domid;
1903	struct protection_domain *domain = dev_data->domain;
1904	struct dev_table_entry *dev_table = get_dev_table(iommu);
1905	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1906
1907	if (gcr3_info && gcr3_info->gcr3_tbl)
1908		domid = dev_data->gcr3_info.domid;
1909	else
1910		domid = domain->id;
1911
1912	if (domain->iop.mode != PAGE_MODE_NONE)
1913		pte_root = iommu_virt_to_phys(domain->iop.root);
1914
1915	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1916		    << DEV_ENTRY_MODE_SHIFT;
1917
1918	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1919
	/*
	 * When SNP is enabled, only set the TV bit when IOMMU
	 * page translation is in use.
	 */
1924	if (!amd_iommu_snp_en || (domid != 0))
1925		pte_root |= DTE_FLAG_TV;
1926
1927	flags = dev_table[devid].data[1];
1928
1929	if (dev_data->ats_enabled)
1930		flags |= DTE_FLAG_IOTLB;
1931
1932	if (dev_data->ppr)
1933		pte_root |= 1ULL << DEV_ENTRY_PPR;
1934
1935	if (domain->dirty_tracking)
1936		pte_root |= DTE_FLAG_HAD;
1937
1938	if (gcr3_info && gcr3_info->gcr3_tbl) {
1939		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
1940		u64 glx  = gcr3_info->glx;
1941		u64 tmp;
1942
1943		pte_root |= DTE_FLAG_GV;
1944		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1945
1946		/* First mask out possible old values for GCR3 table */
1947		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1948		flags    &= ~tmp;
1949
1950		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1951		flags    &= ~tmp;
1952
1953		/* Encode GCR3 table into DTE */
1954		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1955		pte_root |= tmp;
1956
1957		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1958		flags    |= tmp;
1959
1960		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1961		flags    |= tmp;
1962
1963		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1964			dev_table[devid].data[2] |=
1965				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1966		}
1967
1968		/* GIOV is supported with V2 page table mode only */
1969		if (pdom_is_v2_pgtbl_mode(domain))
1970			pte_root |= DTE_FLAG_GIOV;
1971	}
1972
1973	flags &= ~DEV_DOMID_MASK;
1974	flags |= domid;
1975
1976	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1977	dev_table[devid].data[1]  = flags;
1978	dev_table[devid].data[0]  = pte_root;
1979
	/*
	 * A kdump kernel might be replacing a domain ID that was copied from
	 * the previous kernel. If so, it needs to flush the translation cache
	 * entries for the old domain ID that is being overwritten.
	 */
1985	if (old_domid) {
1986		amd_iommu_flush_tlb_domid(iommu, old_domid);
1987	}
1988}
1989
1990static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1991{
1992	struct dev_table_entry *dev_table = get_dev_table(iommu);
1993
1994	/* remove entry from the device table seen by the hardware */
1995	dev_table[devid].data[0]  = DTE_FLAG_V;
1996
1997	if (!amd_iommu_snp_en)
1998		dev_table[devid].data[0] |= DTE_FLAG_TV;
1999
2000	dev_table[devid].data[1] &= DTE_FLAG_MASK;
2001
2002	amd_iommu_apply_erratum_63(iommu, devid);
2003}
2004
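/*
 * Bind @dev_data to @domain: update the bookkeeping and reference counts,
 * set up a per-device GCR3 table (single PASID) when the domain uses V2
 * page tables, then write and flush the device table entry.
 */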
2005static int do_attach(struct iommu_dev_data *dev_data,
2006		     struct protection_domain *domain)
2007{
2008	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2009	int ret = 0;
2010
2011	/* Update data structures */
2012	dev_data->domain = domain;
2013	list_add(&dev_data->list, &domain->dev_list);
2014
2015	/* Update NUMA Node ID */
2016	if (domain->nid == NUMA_NO_NODE)
2017		domain->nid = dev_to_node(dev_data->dev);
2018
2019	/* Do reference counting */
2020	domain->dev_iommu[iommu->index] += 1;
2021	domain->dev_cnt                 += 1;
2022
2023	/* Init GCR3 table and update device table */
2024	if (domain->pd_mode == PD_MODE_V2) {
2025		/* By default, setup GCR3 table to support single PASID */
2026		ret = setup_gcr3_table(&dev_data->gcr3_info, iommu, 1);
2027		if (ret)
2028			return ret;
2029
2030		ret = update_gcr3(dev_data, 0,
2031				  iommu_virt_to_phys(domain->iop.pgd), true);
2032		if (ret) {
2033			free_gcr3_table(&dev_data->gcr3_info);
2034			return ret;
2035		}
2036	}
2037
2038	/* Update device table */
2039	set_dte_entry(iommu, dev_data);
2040	clone_aliases(iommu, dev_data->dev);
2041
2042	device_flush_dte(dev_data);
2043
2044	return ret;
2045}
2046
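/*
 * Reverse of do_attach(): tears down the GCR3 table for V2 page table
 * domains, clears and flushes the device table entry, flushes the
 * domain's IOTLB and drops the per-IOMMU and per-domain reference counts.
 */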
2047static void do_detach(struct iommu_dev_data *dev_data)
2048{
2049	struct protection_domain *domain = dev_data->domain;
2050	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2051
2052	/* Clear GCR3 table */
2053	if (domain->pd_mode == PD_MODE_V2) {
2054		update_gcr3(dev_data, 0, 0, false);
2055		free_gcr3_table(&dev_data->gcr3_info);
2056	}
2057
2058	/* Update data structures */
2059	dev_data->domain = NULL;
2060	list_del(&dev_data->list);
2061	clear_dte_entry(iommu, dev_data->devid);
2062	clone_aliases(iommu, dev_data->dev);
2063
2064	/* Flush the DTE entry */
2065	device_flush_dte(dev_data);
2066
2067	/* Flush IOTLB and wait for the flushes to finish */
2068	amd_iommu_domain_flush_all(domain);
2069
2070	/* decrease reference counters - needs to happen after the flushes */
2071	domain->dev_iommu[iommu->index] -= 1;
2072	domain->dev_cnt                 -= 1;
2073}
2074
2075/*
2076 * If a device is not yet associated with a domain, this function makes the
2077 * device visible in the domain
2078 */
2079static int attach_device(struct device *dev,
2080			 struct protection_domain *domain)
2081{
2082	struct iommu_dev_data *dev_data;
2083	unsigned long flags;
2084	int ret = 0;
2085
2086	spin_lock_irqsave(&domain->lock, flags);
2087
2088	dev_data = dev_iommu_priv_get(dev);
2089
2090	spin_lock(&dev_data->lock);
2091
2092	if (dev_data->domain != NULL) {
2093		ret = -EBUSY;
2094		goto out;
2095	}
2096
2097	if (dev_is_pci(dev))
2098		pdev_enable_caps(to_pci_dev(dev));
2099
2100	ret = do_attach(dev_data, domain);
2101
2102out:
2103	spin_unlock(&dev_data->lock);
2104
2105	spin_unlock_irqrestore(&domain->lock, flags);
2106
2107	return ret;
2108}
2109
/*
 * Removes a device from its protection domain; takes the domain and
 * device locks itself.
 */
2113static void detach_device(struct device *dev)
2114{
2115	struct protection_domain *domain;
2116	struct iommu_dev_data *dev_data;
2117	unsigned long flags;
2118
2119	dev_data = dev_iommu_priv_get(dev);
2120	domain   = dev_data->domain;
2121
2122	spin_lock_irqsave(&domain->lock, flags);
2123
2124	spin_lock(&dev_data->lock);
2125
2126	/*
2127	 * First check if the device is still attached. It might already
2128	 * be detached from its domain because the generic
2129	 * iommu_detach_group code detached it and we try again here in
2130	 * our alias handling.
2131	 */
2132	if (WARN_ON(!dev_data->domain))
2133		goto out;
2134
2135	do_detach(dev_data);
2136
2137	if (dev_is_pci(dev))
2138		pdev_disable_caps(to_pci_dev(dev));
2139
2140out:
2141	spin_unlock(&dev_data->lock);
2142
2143	spin_unlock_irqrestore(&domain->lock, flags);
2144}
2145
2146static struct iommu_device *amd_iommu_probe_device(struct device *dev)
2147{
2148	struct iommu_device *iommu_dev;
2149	struct amd_iommu *iommu;
2150	int ret;
2151
2152	if (!check_device(dev))
2153		return ERR_PTR(-ENODEV);
2154
2155	iommu = rlookup_amd_iommu(dev);
2156	if (!iommu)
2157		return ERR_PTR(-ENODEV);
2158
2159	/* Not registered yet? */
2160	if (!iommu->iommu.ops)
2161		return ERR_PTR(-ENODEV);
2162
2163	if (dev_iommu_priv_get(dev))
2164		return &iommu->iommu;
2165
2166	ret = iommu_init_device(iommu, dev);
2167	if (ret) {
2168		dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
2169		iommu_dev = ERR_PTR(ret);
2170		iommu_ignore_device(iommu, dev);
2171	} else {
2172		amd_iommu_set_pci_msi_domain(dev, iommu);
2173		iommu_dev = &iommu->iommu;
2174	}
2175
2176	iommu_completion_wait(iommu);
2177
2178	return iommu_dev;
2179}
2180
2181static void amd_iommu_probe_finalize(struct device *dev)
2182{
	/* Domains are initialized for this device - have a look at what we ended up with */
2184	set_dma_ops(dev, NULL);
2185	iommu_setup_dma_ops(dev, 0, U64_MAX);
2186}
2187
2188static void amd_iommu_release_device(struct device *dev)
2189{
2190	struct amd_iommu *iommu;
2191
2192	if (!check_device(dev))
2193		return;
2194
2195	iommu = rlookup_amd_iommu(dev);
2196	if (!iommu)
2197		return;
2198
2199	amd_iommu_uninit_device(dev);
2200	iommu_completion_wait(iommu);
2201}
2202
2203static struct iommu_group *amd_iommu_device_group(struct device *dev)
2204{
2205	if (dev_is_pci(dev))
2206		return pci_device_group(dev);
2207
2208	return acpihid_device_group(dev);
2209}
2210
2211/*****************************************************************************
2212 *
2213 * The following functions belong to the exported interface of AMD IOMMU
2214 *
2215 * This interface allows access to lower level functions of the IOMMU
 * like protection domain handling and assignment of devices to domains
2217 * which is not possible with the dma_ops interface.
2218 *
2219 *****************************************************************************/
2220
2221static void cleanup_domain(struct protection_domain *domain)
2222{
2223	struct iommu_dev_data *entry;
2224
2225	lockdep_assert_held(&domain->lock);
2226
2227	if (!domain->dev_cnt)
2228		return;
2229
2230	while (!list_empty(&domain->dev_list)) {
2231		entry = list_first_entry(&domain->dev_list,
2232					 struct iommu_dev_data, list);
2233		BUG_ON(!entry->domain);
2234		do_detach(entry);
2235	}
2236	WARN_ON(domain->dev_cnt != 0);
2237}
2238
2239static void protection_domain_free(struct protection_domain *domain)
2240{
2241	if (!domain)
2242		return;
2243
2244	if (domain->iop.pgtbl_cfg.tlb)
2245		free_io_pgtable_ops(&domain->iop.iop.ops);
2246
2247	if (domain->iop.root)
2248		free_page((unsigned long)domain->iop.root);
2249
2250	if (domain->id)
2251		domain_id_free(domain->id);
2252
2253	kfree(domain);
2254}
2255
2256static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2257{
2258	u64 *pt_root = NULL;
2259
2260	BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2261
2262	if (mode != PAGE_MODE_NONE) {
2263		pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2264		if (!pt_root)
2265			return -ENOMEM;
2266	}
2267
2268	domain->pd_mode = PD_MODE_V1;
2269	amd_iommu_domain_set_pgtable(domain, pt_root, mode);
2270
2271	return 0;
2272}
2273
2274static int protection_domain_init_v2(struct protection_domain *pdom)
2275{
2276	pdom->pd_mode = PD_MODE_V2;
2277	pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
2278
2279	return 0;
2280}
2281
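/*
 * Allocate and initialize a protection domain for the given domain type.
 * Identity domains get no page table; DMA domains use the page table mode
 * selected at boot (amd_iommu_pgtable), while unmanaged domains are forced
 * to the V1 page table. Returns NULL on any failure.
 */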
2282static struct protection_domain *protection_domain_alloc(unsigned int type)
2283{
2284	struct io_pgtable_ops *pgtbl_ops;
2285	struct protection_domain *domain;
2286	int pgtable;
2287	int ret;
2288
2289	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2290	if (!domain)
2291		return NULL;
2292
2293	domain->id = domain_id_alloc();
2294	if (!domain->id)
2295		goto out_err;
2296
2297	spin_lock_init(&domain->lock);
2298	INIT_LIST_HEAD(&domain->dev_list);
2299	domain->nid = NUMA_NO_NODE;
2300
2301	switch (type) {
2302	/* No need to allocate io pgtable ops in passthrough mode */
2303	case IOMMU_DOMAIN_IDENTITY:
2304		return domain;
2305	case IOMMU_DOMAIN_DMA:
2306		pgtable = amd_iommu_pgtable;
2307		break;
2308	/*
2309	 * Force IOMMU v1 page table when allocating
2310	 * domain for pass-through devices.
2311	 */
2312	case IOMMU_DOMAIN_UNMANAGED:
2313		pgtable = AMD_IOMMU_V1;
2314		break;
2315	default:
2316		goto out_err;
2317	}
2318
2319	switch (pgtable) {
2320	case AMD_IOMMU_V1:
2321		ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL);
2322		break;
2323	case AMD_IOMMU_V2:
2324		ret = protection_domain_init_v2(domain);
2325		break;
2326	default:
2327		ret = -EINVAL;
2328		break;
2329	}
2330
2331	if (ret)
2332		goto out_err;
2333
2334	pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
2335	if (!pgtbl_ops)
2336		goto out_err;
2337
2338	return domain;
2339out_err:
2340	protection_domain_free(domain);
2341	return NULL;
2342}
2343
2344static inline u64 dma_max_address(void)
2345{
2346	if (amd_iommu_pgtable == AMD_IOMMU_V1)
2347		return ~0ULL;
2348
2349	/* V2 with 4/5 level page table */
2350	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2351}
2352
2353static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2354{
2355	return iommu && (iommu->features & FEATURE_HDSUP);
2356}
2357
2358static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
2359						  struct device *dev, u32 flags)
2360{
2361	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2362	struct protection_domain *domain;
2363	struct amd_iommu *iommu = NULL;
2364
2365	if (dev)
2366		iommu = get_amd_iommu_from_dev(dev);
2367
	/*
	 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
	 * default to IOMMU_DOMAIN_DMA[_FQ].
	 */
2372	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2373		return ERR_PTR(-EINVAL);
2374
2375	if (dirty_tracking && !amd_iommu_hd_support(iommu))
2376		return ERR_PTR(-EOPNOTSUPP);
2377
2378	domain = protection_domain_alloc(type);
2379	if (!domain)
2380		return ERR_PTR(-ENOMEM);
2381
2382	domain->domain.geometry.aperture_start = 0;
2383	domain->domain.geometry.aperture_end   = dma_max_address();
2384	domain->domain.geometry.force_aperture = true;
2385
2386	if (iommu) {
2387		domain->domain.type = type;
2388		domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
2389		domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2390
2391		if (dirty_tracking)
2392			domain->domain.dirty_ops = &amd_dirty_ops;
2393	}
2394
2395	return &domain->domain;
2396}
2397
2398static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
2399{
2400	struct iommu_domain *domain;
2401
2402	domain = do_iommu_domain_alloc(type, NULL, 0);
2403	if (IS_ERR(domain))
2404		return NULL;
2405
2406	return domain;
2407}
2408
2409static struct iommu_domain *
2410amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
2411			    struct iommu_domain *parent,
2412			    const struct iommu_user_data *user_data)
2413
2414{
2415	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
2416
2417	if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
2418		return ERR_PTR(-EOPNOTSUPP);
2419
2420	return do_iommu_domain_alloc(type, dev, flags);
2421}
2422
2423static void amd_iommu_domain_free(struct iommu_domain *dom)
2424{
2425	struct protection_domain *domain;
2426	unsigned long flags;
2427
2428	if (!dom)
2429		return;
2430
2431	domain = to_pdomain(dom);
2432
2433	spin_lock_irqsave(&domain->lock, flags);
2434
2435	cleanup_domain(domain);
2436
2437	spin_unlock_irqrestore(&domain->lock, flags);
2438
2439	protection_domain_free(domain);
2440}
2441
2442static int amd_iommu_attach_device(struct iommu_domain *dom,
2443				   struct device *dev)
2444{
2445	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2446	struct protection_domain *domain = to_pdomain(dom);
2447	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2448	int ret;
2449
	/*
	 * Skip attaching the device to the domain if the new domain is
	 * the same as the device's current domain.
	 */
2454	if (dev_data->domain == domain)
2455		return 0;
2456
2457	dev_data->defer_attach = false;
2458
2459	/*
2460	 * Restrict to devices with compatible IOMMU hardware support
2461	 * when enforcement of dirty tracking is enabled.
2462	 */
2463	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2464		return -EINVAL;
2465
2466	if (dev_data->domain)
2467		detach_device(dev);
2468
2469	ret = attach_device(dev, domain);
2470
2471#ifdef CONFIG_IRQ_REMAP
2472	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2473		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2474			dev_data->use_vapic = 1;
2475		else
2476			dev_data->use_vapic = 0;
2477	}
2478#endif
2479
2480	iommu_completion_wait(iommu);
2481
2482	return ret;
2483}
2484
2485static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2486				    unsigned long iova, size_t size)
2487{
2488	struct protection_domain *domain = to_pdomain(dom);
2489	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2490
2491	if (ops->map_pages)
2492		domain_flush_np_cache(domain, iova, size);
2493	return 0;
2494}
2495
2496static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2497			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2498			       int iommu_prot, gfp_t gfp, size_t *mapped)
2499{
2500	struct protection_domain *domain = to_pdomain(dom);
2501	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2502	int prot = 0;
2503	int ret = -EINVAL;
2504
2505	if ((domain->pd_mode == PD_MODE_V1) &&
2506	    (domain->iop.mode == PAGE_MODE_NONE))
2507		return -EINVAL;
2508
2509	if (iommu_prot & IOMMU_READ)
2510		prot |= IOMMU_PROT_IR;
2511	if (iommu_prot & IOMMU_WRITE)
2512		prot |= IOMMU_PROT_IW;
2513
2514	if (ops->map_pages) {
2515		ret = ops->map_pages(ops, iova, paddr, pgsize,
2516				     pgcount, prot, gfp, mapped);
2517	}
2518
2519	return ret;
2520}
2521
2522static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2523					    struct iommu_iotlb_gather *gather,
2524					    unsigned long iova, size_t size)
2525{
2526	/*
2527	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2528	 * Unless we run in a virtual machine, which can be inferred according
2529	 * to whether "non-present cache" is on, it is probably best to prefer
2530	 * (potentially) too extensive TLB flushing (i.e., more misses) over
	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2532	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2533	 * the guest, and the trade-off is different: unnecessary TLB flushes
2534	 * should be avoided.
2535	 */
2536	if (amd_iommu_np_cache &&
2537	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2538		iommu_iotlb_sync(domain, gather);
2539
2540	iommu_iotlb_gather_add_range(gather, iova, size);
2541}
2542
2543static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2544				    size_t pgsize, size_t pgcount,
2545				    struct iommu_iotlb_gather *gather)
2546{
2547	struct protection_domain *domain = to_pdomain(dom);
2548	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2549	size_t r;
2550
2551	if ((domain->pd_mode == PD_MODE_V1) &&
2552	    (domain->iop.mode == PAGE_MODE_NONE))
2553		return 0;
2554
2555	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2556
2557	if (r)
2558		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2559
2560	return r;
2561}
2562
2563static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2564					  dma_addr_t iova)
2565{
2566	struct protection_domain *domain = to_pdomain(dom);
2567	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2568
2569	return ops->iova_to_phys(ops, iova);
2570}
2571
2572static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2573{
2574	switch (cap) {
2575	case IOMMU_CAP_CACHE_COHERENCY:
2576		return true;
2577	case IOMMU_CAP_NOEXEC:
2578		return false;
2579	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2580		return amdr_ivrs_remap_support;
2581	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2582		return true;
2583	case IOMMU_CAP_DEFERRED_FLUSH:
2584		return true;
2585	case IOMMU_CAP_DIRTY_TRACKING: {
2586		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2587
2588		return amd_iommu_hd_support(iommu);
2589	}
2590	default:
2591		break;
2592	}
2593
2594	return false;
2595}
2596
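/*
 * Enable or disable hardware dirty tracking for a domain: toggle the HAD
 * bit in the DTE of every device attached to the domain, flush the
 * affected DTEs and then flush the IOTLB so that IOPTEs are (or are no
 * longer) marked dirty on subsequent translations.
 */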
2597static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2598					bool enable)
2599{
2600	struct protection_domain *pdomain = to_pdomain(domain);
2601	struct dev_table_entry *dev_table;
2602	struct iommu_dev_data *dev_data;
2603	bool domain_flush = false;
2604	struct amd_iommu *iommu;
2605	unsigned long flags;
2606	u64 pte_root;
2607
2608	spin_lock_irqsave(&pdomain->lock, flags);
2609	if (!(pdomain->dirty_tracking ^ enable)) {
2610		spin_unlock_irqrestore(&pdomain->lock, flags);
2611		return 0;
2612	}
2613
2614	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2615		iommu = get_amd_iommu_from_dev_data(dev_data);
2616
2617		dev_table = get_dev_table(iommu);
2618		pte_root = dev_table[dev_data->devid].data[0];
2619
2620		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2621				     pte_root & ~DTE_FLAG_HAD);
2622
2623		/* Flush device DTE */
2624		dev_table[dev_data->devid].data[0] = pte_root;
2625		device_flush_dte(dev_data);
2626		domain_flush = true;
2627	}
2628
2629	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2630	if (domain_flush)
2631		amd_iommu_domain_flush_all(pdomain);
2632
2633	pdomain->dirty_tracking = enable;
2634	spin_unlock_irqrestore(&pdomain->lock, flags);
2635
2636	return 0;
2637}
2638
2639static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2640					  unsigned long iova, size_t size,
2641					  unsigned long flags,
2642					  struct iommu_dirty_bitmap *dirty)
2643{
2644	struct protection_domain *pdomain = to_pdomain(domain);
2645	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
2646	unsigned long lflags;
2647
2648	if (!ops || !ops->read_and_clear_dirty)
2649		return -EOPNOTSUPP;
2650
2651	spin_lock_irqsave(&pdomain->lock, lflags);
2652	if (!pdomain->dirty_tracking && dirty->bitmap) {
2653		spin_unlock_irqrestore(&pdomain->lock, lflags);
2654		return -EINVAL;
2655	}
2656	spin_unlock_irqrestore(&pdomain->lock, lflags);
2657
2658	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2659}
2660
2661static void amd_iommu_get_resv_regions(struct device *dev,
2662				       struct list_head *head)
2663{
2664	struct iommu_resv_region *region;
2665	struct unity_map_entry *entry;
2666	struct amd_iommu *iommu;
2667	struct amd_iommu_pci_seg *pci_seg;
2668	int devid, sbdf;
2669
2670	sbdf = get_device_sbdf_id(dev);
2671	if (sbdf < 0)
2672		return;
2673
2674	devid = PCI_SBDF_TO_DEVID(sbdf);
2675	iommu = get_amd_iommu_from_dev(dev);
2676	pci_seg = iommu->pci_seg;
2677
2678	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2679		int type, prot = 0;
2680		size_t length;
2681
2682		if (devid < entry->devid_start || devid > entry->devid_end)
2683			continue;
2684
2685		type   = IOMMU_RESV_DIRECT;
2686		length = entry->address_end - entry->address_start;
2687		if (entry->prot & IOMMU_PROT_IR)
2688			prot |= IOMMU_READ;
2689		if (entry->prot & IOMMU_PROT_IW)
2690			prot |= IOMMU_WRITE;
2691		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2692			/* Exclusion range */
2693			type = IOMMU_RESV_RESERVED;
2694
2695		region = iommu_alloc_resv_region(entry->address_start,
2696						 length, prot, type,
2697						 GFP_KERNEL);
2698		if (!region) {
2699			dev_err(dev, "Out of memory allocating dm-regions\n");
2700			return;
2701		}
2702		list_add_tail(&region->list, head);
2703	}
2704
2705	region = iommu_alloc_resv_region(MSI_RANGE_START,
2706					 MSI_RANGE_END - MSI_RANGE_START + 1,
2707					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2708	if (!region)
2709		return;
2710	list_add_tail(&region->list, head);
2711
2712	region = iommu_alloc_resv_region(HT_RANGE_START,
2713					 HT_RANGE_END - HT_RANGE_START + 1,
2714					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2715	if (!region)
2716		return;
2717	list_add_tail(&region->list, head);
2718}
2719
2720bool amd_iommu_is_attach_deferred(struct device *dev)
2721{
2722	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2723
2724	return dev_data->defer_attach;
2725}
2726
2727static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2728{
2729	struct protection_domain *dom = to_pdomain(domain);
2730	unsigned long flags;
2731
2732	spin_lock_irqsave(&dom->lock, flags);
2733	amd_iommu_domain_flush_all(dom);
2734	spin_unlock_irqrestore(&dom->lock, flags);
2735}
2736
2737static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2738				 struct iommu_iotlb_gather *gather)
2739{
2740	struct protection_domain *dom = to_pdomain(domain);
2741	unsigned long flags;
2742
2743	spin_lock_irqsave(&dom->lock, flags);
2744	amd_iommu_domain_flush_pages(dom, gather->start,
2745				     gather->end - gather->start + 1);
2746	spin_unlock_irqrestore(&dom->lock, flags);
2747}
2748
2749static int amd_iommu_def_domain_type(struct device *dev)
2750{
2751	struct iommu_dev_data *dev_data;
2752
2753	dev_data = dev_iommu_priv_get(dev);
2754	if (!dev_data)
2755		return 0;
2756
2757	/*
2758	 * Do not identity map IOMMUv2 capable devices when:
2759	 *  - memory encryption is active, because some of those devices
2760	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2761	 *    and require remapping.
2762	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2763	 */
2764	if (pdev_pasid_supported(dev_data) &&
2765	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2766	    !amd_iommu_snp_en) {
2767		return IOMMU_DOMAIN_IDENTITY;
2768	}
2769
2770	return 0;
2771}
2772
2773static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2774{
2775	/* IOMMU_PTE_FC is always set */
2776	return true;
2777}
2778
2779static const struct iommu_dirty_ops amd_dirty_ops = {
2780	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2781	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2782};
2783
2784const struct iommu_ops amd_iommu_ops = {
2785	.capable = amd_iommu_capable,
2786	.domain_alloc = amd_iommu_domain_alloc,
2787	.domain_alloc_user = amd_iommu_domain_alloc_user,
2788	.probe_device = amd_iommu_probe_device,
2789	.release_device = amd_iommu_release_device,
2790	.probe_finalize = amd_iommu_probe_finalize,
2791	.device_group = amd_iommu_device_group,
2792	.get_resv_regions = amd_iommu_get_resv_regions,
2793	.is_attach_deferred = amd_iommu_is_attach_deferred,
2794	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
2795	.def_domain_type = amd_iommu_def_domain_type,
2796	.default_domain_ops = &(const struct iommu_domain_ops) {
2797		.attach_dev	= amd_iommu_attach_device,
2798		.map_pages	= amd_iommu_map_pages,
2799		.unmap_pages	= amd_iommu_unmap_pages,
2800		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2801		.iova_to_phys	= amd_iommu_iova_to_phys,
2802		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2803		.iotlb_sync	= amd_iommu_iotlb_sync,
2804		.free		= amd_iommu_domain_free,
2805		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2806	}
2807};
2808
2809#ifdef CONFIG_IRQ_REMAP
2810
2811/*****************************************************************************
2812 *
2813 * Interrupt Remapping Implementation
2814 *
2815 *****************************************************************************/
2816
2817static struct irq_chip amd_ir_chip;
2818static DEFINE_SPINLOCK(iommu_table_lock);
2819
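/*
 * Invalidate the cached interrupt remapping table entries for @devid and
 * wait for the invalidation to complete. Both commands are queued under
 * iommu->lock so nothing can slip in between them. This is a no-op when
 * IRT caching is disabled.
 */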
2820static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2821{
2822	int ret;
2823	u64 data;
2824	unsigned long flags;
2825	struct iommu_cmd cmd, cmd2;
2826
2827	if (iommu->irtcachedis_enabled)
2828		return;
2829
2830	build_inv_irt(&cmd, devid);
2831	data = atomic64_add_return(1, &iommu->cmd_sem_val);
2832	build_completion_wait(&cmd2, iommu, data);
2833
2834	raw_spin_lock_irqsave(&iommu->lock, flags);
2835	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2836	if (ret)
2837		goto out;
2838	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2839	if (ret)
2840		goto out;
2841	wait_on_sem(iommu, data);
2842out:
2843	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2844}
2845
2846static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2847			      struct irq_remap_table *table)
2848{
2849	u64 dte;
2850	struct dev_table_entry *dev_table = get_dev_table(iommu);
2851
2852	dte	= dev_table[devid].data[2];
2853	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2854	dte	|= iommu_virt_to_phys(table->table);
2855	dte	|= DTE_IRQ_REMAP_INTCTL;
2856	dte	|= DTE_INTTABLEN;
2857	dte	|= DTE_IRQ_REMAP_ENABLE;
2858
2859	dev_table[devid].data[2] = dte;
2860}
2861
2862static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2863{
2864	struct irq_remap_table *table;
2865	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2866
2867	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2868		      "%s: no iommu for devid %x:%x\n",
2869		      __func__, pci_seg->id, devid))
2870		return NULL;
2871
2872	table = pci_seg->irq_lookup_table[devid];
2873	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2874		      __func__, pci_seg->id, devid))
2875		return NULL;
2876
2877	return table;
2878}
2879
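/*
 * Allocate an empty interrupt remapping table. Entries are 32-bit IRTEs,
 * or 128-bit IRTEs when guest virtual APIC (GA) interrupt mode is enabled,
 * hence the different sizes passed to memset().
 */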
2880static struct irq_remap_table *__alloc_irq_table(void)
2881{
2882	struct irq_remap_table *table;
2883
2884	table = kzalloc(sizeof(*table), GFP_KERNEL);
2885	if (!table)
2886		return NULL;
2887
2888	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2889	if (!table->table) {
2890		kfree(table);
2891		return NULL;
2892	}
2893	raw_spin_lock_init(&table->lock);
2894
2895	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2896		memset(table->table, 0,
2897		       MAX_IRQS_PER_TABLE * sizeof(u32));
2898	else
2899		memset(table->table, 0,
2900		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2901	return table;
2902}
2903
2904static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2905				  struct irq_remap_table *table)
2906{
2907	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2908
2909	pci_seg->irq_lookup_table[devid] = table;
2910	set_dte_irq_entry(iommu, devid, table);
2911	iommu_flush_dte(iommu, devid);
2912}
2913
2914static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2915				       void *data)
2916{
2917	struct irq_remap_table *table = data;
2918	struct amd_iommu_pci_seg *pci_seg;
2919	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
2920
2921	if (!iommu)
2922		return -EINVAL;
2923
2924	pci_seg = iommu->pci_seg;
2925	pci_seg->irq_lookup_table[alias] = table;
2926	set_dte_irq_entry(iommu, alias, table);
2927	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
2928
2929	return 0;
2930}
2931
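/*
 * Look up, or allocate, the interrupt remapping table for @devid. The
 * table may already exist for the device's DMA alias, in which case it is
 * shared. The allocation itself happens with iommu_table_lock dropped, so
 * the lookup is repeated after re-taking the lock and a table that lost
 * the race is freed at the end.
 */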
2932static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2933					       u16 devid, struct pci_dev *pdev)
2934{
2935	struct irq_remap_table *table = NULL;
2936	struct irq_remap_table *new_table = NULL;
2937	struct amd_iommu_pci_seg *pci_seg;
2938	unsigned long flags;
2939	u16 alias;
2940
2941	spin_lock_irqsave(&iommu_table_lock, flags);
2942
2943	pci_seg = iommu->pci_seg;
2944	table = pci_seg->irq_lookup_table[devid];
2945	if (table)
2946		goto out_unlock;
2947
2948	alias = pci_seg->alias_table[devid];
2949	table = pci_seg->irq_lookup_table[alias];
2950	if (table) {
2951		set_remap_table_entry(iommu, devid, table);
2952		goto out_wait;
2953	}
2954	spin_unlock_irqrestore(&iommu_table_lock, flags);
2955
2956	/* Nothing there yet, allocate new irq remapping table */
2957	new_table = __alloc_irq_table();
2958	if (!new_table)
2959		return NULL;
2960
2961	spin_lock_irqsave(&iommu_table_lock, flags);
2962
2963	table = pci_seg->irq_lookup_table[devid];
2964	if (table)
2965		goto out_unlock;
2966
2967	table = pci_seg->irq_lookup_table[alias];
2968	if (table) {
2969		set_remap_table_entry(iommu, devid, table);
2970		goto out_wait;
2971	}
2972
2973	table = new_table;
2974	new_table = NULL;
2975
2976	if (pdev)
2977		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
2978				       table);
2979	else
2980		set_remap_table_entry(iommu, devid, table);
2981
2982	if (devid != alias)
2983		set_remap_table_entry(iommu, alias, table);
2984
2985out_wait:
2986	iommu_completion_wait(iommu);
2987
2988out_unlock:
2989	spin_unlock_irqrestore(&iommu_table_lock, flags);
2990
2991	if (new_table) {
2992		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
2993		kfree(new_table);
2994	}
2995	return table;
2996}
2997
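/*
 * Find and mark as allocated @count consecutive free entries in the
 * device's interrupt remapping table. When @align is set (multi-MSI), the
 * first index is aligned to the next power of two of @count. Returns the
 * first allocated index or a negative error code.
 */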
2998static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
2999			   bool align, struct pci_dev *pdev)
3000{
3001	struct irq_remap_table *table;
3002	int index, c, alignment = 1;
3003	unsigned long flags;
3004
3005	table = alloc_irq_table(iommu, devid, pdev);
3006	if (!table)
3007		return -ENODEV;
3008
3009	if (align)
3010		alignment = roundup_pow_of_two(count);
3011
3012	raw_spin_lock_irqsave(&table->lock, flags);
3013
3014	/* Scan table for free entries */
3015	for (index = ALIGN(table->min_index, alignment), c = 0;
3016	     index < MAX_IRQS_PER_TABLE;) {
3017		if (!iommu->irte_ops->is_allocated(table, index)) {
3018			c += 1;
3019		} else {
3020			c     = 0;
3021			index = ALIGN(index + 1, alignment);
3022			continue;
3023		}
3024
3025		if (c == count)	{
3026			for (; c != 0; --c)
3027				iommu->irte_ops->set_allocated(table, index - c + 1);
3028
3029			index -= count - 1;
3030			goto out;
3031		}
3032
3033		index++;
3034	}
3035
3036	index = -ENOSPC;
3037
3038out:
3039	raw_spin_unlock_irqrestore(&table->lock, flags);
3040
3041	return index;
3042}
3043
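/*
 * Update a 128-bit IRTE in place under the table lock. The interrupt
 * table cache is not flushed here; see modify_irte_ga() for the flushing
 * variant.
 */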
3044static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3045			    struct irte_ga *irte)
3046{
3047	struct irq_remap_table *table;
3048	struct irte_ga *entry;
3049	unsigned long flags;
3050	u128 old;
3051
3052	table = get_irq_table(iommu, devid);
3053	if (!table)
3054		return -ENOMEM;
3055
3056	raw_spin_lock_irqsave(&table->lock, flags);
3057
3058	entry = (struct irte_ga *)table->table;
3059	entry = &entry[index];
3060
3061	/*
3062	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
3063	 * and it cannot be updated by the hardware or other processors
3064	 * behind us, so the return value of cmpxchg16 should be the
3065	 * same as the old value.
3066	 */
3067	old = entry->irte;
3068	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3069
3070	raw_spin_unlock_irqrestore(&table->lock, flags);
3071
3072	return 0;
3073}
3074
3075static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3076			  struct irte_ga *irte)
3077{
	int ret;
3079
3080	ret = __modify_irte_ga(iommu, devid, index, irte);
3081	if (ret)
3082		return ret;
3083
3084	iommu_flush_irt_and_complete(iommu, devid);
3085
3086	return 0;
3087}
3088
3089static int modify_irte(struct amd_iommu *iommu,
3090		       u16 devid, int index, union irte *irte)
3091{
3092	struct irq_remap_table *table;
3093	unsigned long flags;
3094
3095	table = get_irq_table(iommu, devid);
3096	if (!table)
3097		return -ENOMEM;
3098
3099	raw_spin_lock_irqsave(&table->lock, flags);
3100	table->table[index] = irte->val;
3101	raw_spin_unlock_irqrestore(&table->lock, flags);
3102
3103	iommu_flush_irt_and_complete(iommu, devid);
3104
3105	return 0;
3106}
3107
3108static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3109{
3110	struct irq_remap_table *table;
3111	unsigned long flags;
3112
3113	table = get_irq_table(iommu, devid);
3114	if (!table)
3115		return;
3116
3117	raw_spin_lock_irqsave(&table->lock, flags);
3118	iommu->irte_ops->clear_allocated(table, index);
3119	raw_spin_unlock_irqrestore(&table->lock, flags);
3120
3121	iommu_flush_irt_and_complete(iommu, devid);
3122}
3123
3124static void irte_prepare(void *entry,
3125			 u32 delivery_mode, bool dest_mode,
3126			 u8 vector, u32 dest_apicid, int devid)
3127{
3128	union irte *irte = (union irte *) entry;
3129
3130	irte->val                = 0;
3131	irte->fields.vector      = vector;
3132	irte->fields.int_type    = delivery_mode;
3133	irte->fields.destination = dest_apicid;
3134	irte->fields.dm          = dest_mode;
3135	irte->fields.valid       = 1;
3136}
3137
3138static void irte_ga_prepare(void *entry,
3139			    u32 delivery_mode, bool dest_mode,
3140			    u8 vector, u32 dest_apicid, int devid)
3141{
3142	struct irte_ga *irte = (struct irte_ga *) entry;
3143
3144	irte->lo.val                      = 0;
3145	irte->hi.val                      = 0;
3146	irte->lo.fields_remap.int_type    = delivery_mode;
3147	irte->lo.fields_remap.dm          = dest_mode;
3148	irte->hi.fields.vector            = vector;
3149	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3150	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3151	irte->lo.fields_remap.valid       = 1;
3152}
3153
3154static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3155{
3156	union irte *irte = (union irte *) entry;
3157
3158	irte->fields.valid = 1;
3159	modify_irte(iommu, devid, index, irte);
3160}
3161
3162static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3163{
3164	struct irte_ga *irte = (struct irte_ga *) entry;
3165
3166	irte->lo.fields_remap.valid = 1;
3167	modify_irte_ga(iommu, devid, index, irte);
3168}
3169
3170static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3171{
3172	union irte *irte = (union irte *) entry;
3173
3174	irte->fields.valid = 0;
3175	modify_irte(iommu, devid, index, irte);
3176}
3177
3178static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3179{
3180	struct irte_ga *irte = (struct irte_ga *) entry;
3181
3182	irte->lo.fields_remap.valid = 0;
3183	modify_irte_ga(iommu, devid, index, irte);
3184}
3185
3186static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3187			      u8 vector, u32 dest_apicid)
3188{
3189	union irte *irte = (union irte *) entry;
3190
3191	irte->fields.vector = vector;
3192	irte->fields.destination = dest_apicid;
3193	modify_irte(iommu, devid, index, irte);
3194}
3195
3196static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3197				 u8 vector, u32 dest_apicid)
3198{
3199	struct irte_ga *irte = (struct irte_ga *) entry;
3200
3201	if (!irte->lo.fields_remap.guest_mode) {
3202		irte->hi.fields.vector = vector;
3203		irte->lo.fields_remap.destination =
3204					APICID_TO_IRTE_DEST_LO(dest_apicid);
3205		irte->hi.fields.destination =
3206					APICID_TO_IRTE_DEST_HI(dest_apicid);
3207		modify_irte_ga(iommu, devid, index, irte);
3208	}
3209}
3210
3211#define IRTE_ALLOCATED (~1U)
3212static void irte_set_allocated(struct irq_remap_table *table, int index)
3213{
3214	table->table[index] = IRTE_ALLOCATED;
3215}
3216
3217static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3218{
3219	struct irte_ga *ptr = (struct irte_ga *)table->table;
3220	struct irte_ga *irte = &ptr[index];
3221
3222	memset(&irte->lo.val, 0, sizeof(u64));
3223	memset(&irte->hi.val, 0, sizeof(u64));
3224	irte->hi.fields.vector = 0xff;
3225}
3226
3227static bool irte_is_allocated(struct irq_remap_table *table, int index)
3228{
3229	union irte *ptr = (union irte *)table->table;
3230	union irte *irte = &ptr[index];
3231
3232	return irte->val != 0;
3233}
3234
3235static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3236{
3237	struct irte_ga *ptr = (struct irte_ga *)table->table;
3238	struct irte_ga *irte = &ptr[index];
3239
3240	return irte->hi.fields.vector != 0;
3241}
3242
3243static void irte_clear_allocated(struct irq_remap_table *table, int index)
3244{
3245	table->table[index] = 0;
3246}
3247
3248static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3249{
3250	struct irte_ga *ptr = (struct irte_ga *)table->table;
3251	struct irte_ga *irte = &ptr[index];
3252
3253	memset(&irte->lo.val, 0, sizeof(u64));
3254	memset(&irte->hi.val, 0, sizeof(u64));
3255}
3256
3257static int get_devid(struct irq_alloc_info *info)
3258{
3259	switch (info->type) {
3260	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3261		return get_ioapic_devid(info->devid);
3262	case X86_IRQ_ALLOC_TYPE_HPET:
3263		return get_hpet_devid(info->devid);
3264	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3265	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3266		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3267	default:
3268		WARN_ON_ONCE(1);
3269		return -1;
3270	}
3271}
3272
3273struct irq_remap_ops amd_iommu_irq_ops = {
3274	.prepare		= amd_iommu_prepare,
3275	.enable			= amd_iommu_enable,
3276	.disable		= amd_iommu_disable,
3277	.reenable		= amd_iommu_reenable,
3278	.enable_faulting	= amd_iommu_enable_faulting,
3279};
3280
3281static void fill_msi_msg(struct msi_msg *msg, u32 index)
3282{
3283	msg->data = index;
3284	msg->address_lo = 0;
3285	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3286	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3287}
3288
3289static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3290				       struct irq_cfg *irq_cfg,
3291				       struct irq_alloc_info *info,
3292				       int devid, int index, int sub_handle)
3293{
3294	struct irq_2_irte *irte_info = &data->irq_2_irte;
3295	struct amd_iommu *iommu = data->iommu;
3296
3297	if (!iommu)
3298		return;
3299
3300	data->irq_2_irte.devid = devid;
3301	data->irq_2_irte.index = index + sub_handle;
3302	iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3303				 apic->dest_mode_logical, irq_cfg->vector,
3304				 irq_cfg->dest_apicid, devid);
3305
3306	switch (info->type) {
3307	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3308	case X86_IRQ_ALLOC_TYPE_HPET:
3309	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3310	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3311		fill_msi_msg(&data->msi_entry, irte_info->index);
3312		break;
3313
3314	default:
3315		BUG_ON(1);
3316		break;
3317	}
3318}
3319
3320struct amd_irte_ops irte_32_ops = {
3321	.prepare = irte_prepare,
3322	.activate = irte_activate,
3323	.deactivate = irte_deactivate,
3324	.set_affinity = irte_set_affinity,
3325	.set_allocated = irte_set_allocated,
3326	.is_allocated = irte_is_allocated,
3327	.clear_allocated = irte_clear_allocated,
3328};
3329
3330struct amd_irte_ops irte_128_ops = {
3331	.prepare = irte_ga_prepare,
3332	.activate = irte_ga_activate,
3333	.deactivate = irte_ga_deactivate,
3334	.set_affinity = irte_ga_set_affinity,
3335	.set_allocated = irte_ga_set_allocated,
3336	.is_allocated = irte_ga_is_allocated,
3337	.clear_allocated = irte_ga_clear_allocated,
3338};
3339
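/*
 * irq_domain allocation callback: reserve IRTE slots for @nr_irqs
 * interrupts on the IOMMU that owns the requesting device and set up the
 * per-IRQ amd_ir_data, including the prepared IRTE and MSI message. For
 * IOAPIC allocations the first 32 table indexes are reserved and the pin
 * number is used as the index directly.
 */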
3340static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3341			       unsigned int nr_irqs, void *arg)
3342{
3343	struct irq_alloc_info *info = arg;
3344	struct irq_data *irq_data;
3345	struct amd_ir_data *data = NULL;
3346	struct amd_iommu *iommu;
3347	struct irq_cfg *cfg;
3348	int i, ret, devid, seg, sbdf;
3349	int index;
3350
3351	if (!info)
3352		return -EINVAL;
3353	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3354		return -EINVAL;
3355
3356	sbdf = get_devid(info);
3357	if (sbdf < 0)
3358		return -EINVAL;
3359
3360	seg = PCI_SBDF_TO_SEGID(sbdf);
3361	devid = PCI_SBDF_TO_DEVID(sbdf);
3362	iommu = __rlookup_amd_iommu(seg, devid);
3363	if (!iommu)
3364		return -EINVAL;
3365
3366	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3367	if (ret < 0)
3368		return ret;
3369
3370	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3371		struct irq_remap_table *table;
3372
3373		table = alloc_irq_table(iommu, devid, NULL);
3374		if (table) {
3375			if (!table->min_index) {
3376				/*
3377				 * Keep the first 32 indexes free for IOAPIC
3378				 * interrupts.
3379				 */
3380				table->min_index = 32;
3381				for (i = 0; i < 32; ++i)
3382					iommu->irte_ops->set_allocated(table, i);
3383			}
3384			WARN_ON(table->min_index != 32);
3385			index = info->ioapic.pin;
3386		} else {
3387			index = -ENOMEM;
3388		}
3389	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3390		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3391		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3392
3393		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3394					msi_desc_to_pci_dev(info->desc));
3395	} else {
3396		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3397	}
3398
3399	if (index < 0) {
3400		pr_warn("Failed to allocate IRTE\n");
3401		ret = index;
3402		goto out_free_parent;
3403	}
3404
3405	for (i = 0; i < nr_irqs; i++) {
3406		irq_data = irq_domain_get_irq_data(domain, virq + i);
3407		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3408		if (!cfg) {
3409			ret = -EINVAL;
3410			goto out_free_data;
3411		}
3412
3413		ret = -ENOMEM;
3414		data = kzalloc(sizeof(*data), GFP_KERNEL);
3415		if (!data)
3416			goto out_free_data;
3417
3418		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3419			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3420		else
3421			data->entry = kzalloc(sizeof(struct irte_ga),
3422						     GFP_KERNEL);
3423		if (!data->entry) {
3424			kfree(data);
3425			goto out_free_data;
3426		}
3427
3428		data->iommu = iommu;
3429		irq_data->hwirq = (devid << 16) + i;
3430		irq_data->chip_data = data;
3431		irq_data->chip = &amd_ir_chip;
3432		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3433		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3434	}
3435
3436	return 0;
3437
3438out_free_data:
3439	for (i--; i >= 0; i--) {
3440		irq_data = irq_domain_get_irq_data(domain, virq + i);
3441		if (irq_data)
3442			kfree(irq_data->chip_data);
3443	}
3444	for (i = 0; i < nr_irqs; i++)
3445		free_irte(iommu, devid, index + i);
3446out_free_parent:
3447	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3448	return ret;
3449}
3450
3451static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3452			       unsigned int nr_irqs)
3453{
3454	struct irq_2_irte *irte_info;
3455	struct irq_data *irq_data;
3456	struct amd_ir_data *data;
3457	int i;
3458
3459	for (i = 0; i < nr_irqs; i++) {
3460		irq_data = irq_domain_get_irq_data(domain, virq  + i);
3461		if (irq_data && irq_data->chip_data) {
3462			data = irq_data->chip_data;
3463			irte_info = &data->irq_2_irte;
3464			free_irte(data->iommu, irte_info->devid, irte_info->index);
3465			kfree(data->entry);
3466			kfree(data);
3467		}
3468	}
3469	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3470}
3471
3472static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3473			       struct amd_ir_data *ir_data,
3474			       struct irq_2_irte *irte_info,
3475			       struct irq_cfg *cfg);
3476
3477static int irq_remapping_activate(struct irq_domain *domain,
3478				  struct irq_data *irq_data, bool reserve)
3479{
3480	struct amd_ir_data *data = irq_data->chip_data;
3481	struct irq_2_irte *irte_info = &data->irq_2_irte;
3482	struct amd_iommu *iommu = data->iommu;
3483	struct irq_cfg *cfg = irqd_cfg(irq_data);
3484
3485	if (!iommu)
3486		return 0;
3487
3488	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3489				  irte_info->index);
3490	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3491	return 0;
3492}
3493
3494static void irq_remapping_deactivate(struct irq_domain *domain,
3495				     struct irq_data *irq_data)
3496{
3497	struct amd_ir_data *data = irq_data->chip_data;
3498	struct irq_2_irte *irte_info = &data->irq_2_irte;
3499	struct amd_iommu *iommu = data->iommu;
3500
3501	if (iommu)
3502		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3503					    irte_info->index);
3504}
3505
3506static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3507				enum irq_domain_bus_token bus_token)
3508{
3509	struct amd_iommu *iommu;
3510	int devid = -1;
3511
3512	if (!amd_iommu_irq_remap)
3513		return 0;
3514
3515	if (x86_fwspec_is_ioapic(fwspec))
3516		devid = get_ioapic_devid(fwspec->param[0]);
3517	else if (x86_fwspec_is_hpet(fwspec))
3518		devid = get_hpet_devid(fwspec->param[0]);
3519
3520	if (devid < 0)
3521		return 0;
3522	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3523
3524	return iommu && iommu->ir_domain == d;
3525}
3526
3527static const struct irq_domain_ops amd_ir_domain_ops = {
3528	.select = irq_remapping_select,
3529	.alloc = irq_remapping_alloc,
3530	.free = irq_remapping_free,
3531	.activate = irq_remapping_activate,
3532	.deactivate = irq_remapping_deactivate,
3533};
3534
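/*
 * Switch an IRTE into guest (vAPIC) mode: the entry is rewritten in the
 * fields_vapic layout using the GA root pointer, vector and GA tag
 * provided by the caller, while the valid bit is preserved. Only
 * effective when GA/vAPIC interrupt mode is enabled.
 */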
3535int amd_iommu_activate_guest_mode(void *data)
3536{
3537	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3538	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3539	u64 valid;
3540
3541	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3542		return 0;
3543
3544	valid = entry->lo.fields_vapic.valid;
3545
3546	entry->lo.val = 0;
3547	entry->hi.val = 0;
3548
3549	entry->lo.fields_vapic.valid       = valid;
3550	entry->lo.fields_vapic.guest_mode  = 1;
3551	entry->lo.fields_vapic.ga_log_intr = 1;
3552	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3553	entry->hi.fields.vector            = ir_data->ga_vector;
3554	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3555
3556	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3557			      ir_data->irq_2_irte.index, entry);
3558}
3559EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3560
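/*
 * Switch an IRTE from guest mode back to ordinary remapped mode,
 * restoring the destination and vector from the cached irq_cfg while
 * preserving the valid bit. A no-op if the entry is not currently in
 * guest mode.
 */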
3561int amd_iommu_deactivate_guest_mode(void *data)
3562{
3563	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3564	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3565	struct irq_cfg *cfg = ir_data->cfg;
3566	u64 valid;
3567
3568	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3569	    !entry || !entry->lo.fields_vapic.guest_mode)
3570		return 0;
3571
3572	valid = entry->lo.fields_remap.valid;
3573
3574	entry->lo.val = 0;
3575	entry->hi.val = 0;
3576
3577	entry->lo.fields_remap.valid       = valid;
3578	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3579	entry->lo.fields_remap.int_type    = APIC_DELIVERY_MODE_FIXED;
3580	entry->hi.fields.vector            = cfg->vector;
3581	entry->lo.fields_remap.destination =
3582				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3583	entry->hi.fields.destination =
3584				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3585
3586	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3587			      ir_data->irq_2_irte.index, entry);
3588}
3589EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3590
3591static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3592{
3593	int ret;
3594	struct amd_iommu_pi_data *pi_data = vcpu_info;
3595	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3596	struct amd_ir_data *ir_data = data->chip_data;
3597	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3598	struct iommu_dev_data *dev_data;
3599
3600	if (ir_data->iommu == NULL)
3601		return -EINVAL;
3602
3603	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3604
	/* Note:
	 * This device has never been set up for guest mode.
	 * We should not modify the IRTE.
	 */
3609	if (!dev_data || !dev_data->use_vapic)
3610		return 0;
3611
3612	ir_data->cfg = irqd_cfg(data);
3613	pi_data->ir_data = ir_data;
3614
	/* Note:
	 * SVM tries to set up VAPIC mode, but we are running in
	 * legacy interrupt-remapping mode, so force legacy mode instead.
	 */
3619	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3620		pr_debug("%s: Fall back to using intr legacy remap\n",
3621			 __func__);
3622		pi_data->is_guest_mode = false;
3623	}
3624
3625	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3626	if (pi_data->is_guest_mode) {
3627		ir_data->ga_root_ptr = (pi_data->base >> 12);
3628		ir_data->ga_vector = vcpu_pi_info->vector;
3629		ir_data->ga_tag = pi_data->ga_tag;
3630		ret = amd_iommu_activate_guest_mode(ir_data);
3631		if (!ret)
3632			ir_data->cached_ga_tag = pi_data->ga_tag;
3633	} else {
3634		ret = amd_iommu_deactivate_guest_mode(ir_data);
3635
3636		/*
3637		 * This communicates the ga_tag back to the caller
3638		 * so that it can do all the necessary clean up.
3639		 */
3640		if (!ret)
3641			ir_data->cached_ga_tag = 0;
3642	}
3643
3644	return ret;
3645}
3646
3648static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3649			       struct amd_ir_data *ir_data,
3650			       struct irq_2_irte *irte_info,
3651			       struct irq_cfg *cfg)
3652{
	/*
	 * Atomically update the IRTE with the new destination and vector,
	 * and flush the interrupt entry cache.
	 */
3658	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3659				      irte_info->index, cfg->vector,
3660				      cfg->dest_apicid);
3661}
3662
3663static int amd_ir_set_affinity(struct irq_data *data,
3664			       const struct cpumask *mask, bool force)
3665{
3666	struct amd_ir_data *ir_data = data->chip_data;
3667	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3668	struct irq_cfg *cfg = irqd_cfg(data);
3669	struct irq_data *parent = data->parent_data;
3670	struct amd_iommu *iommu = ir_data->iommu;
3671	int ret;
3672
3673	if (!iommu)
3674		return -ENODEV;
3675
3676	ret = parent->chip->irq_set_affinity(parent, mask, force);
3677	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3678		return ret;
3679
3680	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3681	/*
3682	 * After this point, all the interrupts will start arriving
3683	 * at the new destination. So, time to cleanup the previous
	 * at the new destination. So it is time to clean up the previous
3685	 */
3686	vector_schedule_cleanup(cfg);
3687
3688	return IRQ_SET_MASK_OK_DONE;
3689}
3690
3691static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3692{
3693	struct amd_ir_data *ir_data = irq_data->chip_data;
3694
3695	*msg = ir_data->msi_entry;
3696}
3697
3698static struct irq_chip amd_ir_chip = {
3699	.name			= "AMD-IR",
3700	.irq_ack		= apic_ack_irq,
3701	.irq_set_affinity	= amd_ir_set_affinity,
3702	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3703	.irq_compose_msi_msg	= ir_compose_msi_msg,
3704};
3705
3706static const struct msi_parent_ops amdvi_msi_parent_ops = {
3707	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED |
3708				  MSI_FLAG_MULTI_PCI_MSI |
3709				  MSI_FLAG_PCI_IMS,
3710	.prefix			= "IR-",
3711	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3712};
3713
3714static const struct msi_parent_ops virt_amdvi_msi_parent_ops = {
3715	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED |
3716				  MSI_FLAG_MULTI_PCI_MSI,
3717	.prefix			= "vIR-",
3718	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3719};
3720
3721int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3722{
3723	struct fwnode_handle *fn;
3724
3725	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3726	if (!fn)
3727		return -ENOMEM;
3728	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3729						       fn, &amd_ir_domain_ops, iommu);
3730	if (!iommu->ir_domain) {
3731		irq_domain_free_fwnode(fn);
3732		return -ENOMEM;
3733	}
3734
3735	irq_domain_update_bus_token(iommu->ir_domain,  DOMAIN_BUS_AMDVI);
3736	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3737				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3738
3739	if (amd_iommu_np_cache)
3740		iommu->ir_domain->msi_parent_ops = &virt_amdvi_msi_parent_ops;
3741	else
3742		iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3743
3744	return 0;
3745}
3746
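/*
 * Update the destination CPU and the is_run hint of a guest-mode IRTE.
 * A negative @cpu leaves the destination unchanged. The entry is updated
 * in place without flushing the interrupt table cache.
 */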
3747int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3748{
3749	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3750	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3751
3752	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3753	    !entry || !entry->lo.fields_vapic.guest_mode)
3754		return 0;
3755
3756	if (!ir_data->iommu)
3757		return -ENODEV;
3758
3759	if (cpu >= 0) {
3760		entry->lo.fields_vapic.destination =
3761					APICID_TO_IRTE_DEST_LO(cpu);
3762		entry->hi.fields.destination =
3763					APICID_TO_IRTE_DEST_HI(cpu);
3764	}
3765	entry->lo.fields_vapic.is_run = is_run;
3766
3767	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3768				ir_data->irq_2_irte.index, entry);
3769}
3770EXPORT_SYMBOL(amd_iommu_update_ga);
3771#endif
3772