1// SPDX-License-Identifier: GPL-2.0-only
2/*
 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 *          Ashok Raj <ashok.raj@intel.com>,
7 *          Shaohua Li <shaohua.li@intel.com>,
8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 *          Fenghua Yu <fenghua.yu@intel.com>
10 *          Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt)     "DMAR: " fmt
14#define dev_fmt(fmt)    pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "pasid.h"
31#include "cap_audit.h"
32#include "perfmon.h"
33
34#define ROOT_SIZE		VTD_PAGE_SIZE
35#define CONTEXT_SIZE		VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42#define IOAPIC_RANGE_START	(0xfee00000)
43#define IOAPIC_RANGE_END	(0xfeefffff)
44#define IOVA_START_ADDR		(0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
51/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
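
/*
 * Illustrative arithmetic for the macros above (assuming VTD_PAGE_SHIFT == 12,
 * i.e. the 4KiB VT-d page size used throughout this file):
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1
 *
 * On 64-bit kernels DOMAIN_MAX_PFN(gaw) equals __DOMAIN_MAX_PFN(gaw); on
 * 32-bit kernels it is clamped to ULONG_MAX so that PFNs always fit in an
 * 'unsigned long'.
 */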
56
57/* IO virtual address start page frame number */
58#define IOVA_START_PFN		(1)
59
60#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
61
62static void __init check_tylersburg_isoch(void);
63static int rwbf_quirk;
64
65/*
 * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
 * (used when the kernel is launched with TXT).
68 */
69static int force_on = 0;
70static int intel_iommu_tboot_noforce;
71static int no_platform_optin;
72
73#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
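
/*
 * For reference (illustrative arithmetic only): with a 4KiB VTD_PAGE_SIZE and
 * the 16-byte struct root_entry (two u64 halves, 'lo' and 'hi', used below),
 * ROOT_ENTRY_NR works out to 4096 / 16 = 256 -- one root entry per PCI bus
 * number.
 */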
74
75/*
76 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
77 * if marked present.
78 */
79static phys_addr_t root_entry_lctp(struct root_entry *re)
80{
81	if (!(re->lo & 1))
82		return 0;
83
84	return re->lo & VTD_PAGE_MASK;
85}
86
87/*
88 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
89 * if marked present.
90 */
91static phys_addr_t root_entry_uctp(struct root_entry *re)
92{
93	if (!(re->hi & 1))
94		return 0;
95
96	return re->hi & VTD_PAGE_MASK;
97}
98
99static int device_rid_cmp_key(const void *key, const struct rb_node *node)
100{
101	struct device_domain_info *info =
102		rb_entry(node, struct device_domain_info, node);
103	const u16 *rid_lhs = key;
104
105	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
106		return -1;
107
108	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
109		return 1;
110
111	return 0;
112}
113
114static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
115{
116	struct device_domain_info *info =
117		rb_entry(lhs, struct device_domain_info, node);
118	u16 key = PCI_DEVID(info->bus, info->devfn);
119
120	return device_rid_cmp_key(&key, rhs);
121}
122
123/*
124 * Looks up an IOMMU-probed device using its source ID.
125 *
126 * Returns the pointer to the device if there is a match. Otherwise,
127 * returns NULL.
128 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released while it is still in use, if that is a possibility.
133 */
134struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
135{
136	struct device_domain_info *info = NULL;
137	struct rb_node *node;
138	unsigned long flags;
139
140	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
141	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
142	if (node)
143		info = rb_entry(node, struct device_domain_info, node);
144	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
145
146	return info ? info->dev : NULL;
147}
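
/*
 * A minimal usage sketch (hypothetical caller, not taken from this file):
 * look up the device behind a source ID, remembering that the returned
 * pointer is not reference-counted by this helper:
 *
 *	struct device *dev = device_rbtree_find(iommu, PCI_DEVID(bus, devfn));
 *	if (dev)
 *		dev_warn(dev, "fault observed for this device\n");
 */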
148
149static int device_rbtree_insert(struct intel_iommu *iommu,
150				struct device_domain_info *info)
151{
152	struct rb_node *curr;
153	unsigned long flags;
154
155	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
156	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
157	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
158	if (WARN_ON(curr))
159		return -EEXIST;
160
161	return 0;
162}
163
164static void device_rbtree_remove(struct device_domain_info *info)
165{
166	struct intel_iommu *iommu = info->iommu;
167	unsigned long flags;
168
169	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
170	rb_erase(&info->node, &iommu->device_rbtree);
171	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
172}
173
174/*
 * This domain is a static identity-mapping domain.
 *	1. This domain creates a static 1:1 mapping of all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
179 */
180static struct dmar_domain *si_domain;
181static int hw_pass_through = 1;
182
183struct dmar_rmrr_unit {
184	struct list_head list;		/* list of rmrr units	*/
185	struct acpi_dmar_header *hdr;	/* ACPI header		*/
186	u64	base_address;		/* reserved base address*/
187	u64	end_address;		/* reserved end address */
188	struct dmar_dev_scope *devices;	/* target devices */
189	int	devices_cnt;		/* target device count */
190};
191
192struct dmar_atsr_unit {
193	struct list_head list;		/* list of ATSR units */
194	struct acpi_dmar_header *hdr;	/* ACPI header */
195	struct dmar_dev_scope *devices;	/* target devices */
196	int devices_cnt;		/* target device count */
197	u8 include_all:1;		/* include all ports */
198};
199
200struct dmar_satc_unit {
201	struct list_head list;		/* list of SATC units */
202	struct acpi_dmar_header *hdr;	/* ACPI header */
203	struct dmar_dev_scope *devices;	/* target devices */
204	struct intel_iommu *iommu;	/* the corresponding iommu */
205	int devices_cnt;		/* target device count */
206	u8 atc_required:1;		/* ATS is required */
207};
208
209static LIST_HEAD(dmar_atsr_units);
210static LIST_HEAD(dmar_rmrr_units);
211static LIST_HEAD(dmar_satc_units);
212
213#define for_each_rmrr_units(rmrr) \
214	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
215
216static void intel_iommu_domain_free(struct iommu_domain *domain);
217
218int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
220
221int intel_iommu_enabled = 0;
222EXPORT_SYMBOL_GPL(intel_iommu_enabled);
223
224static int dmar_map_gfx = 1;
225static int intel_iommu_superpage = 1;
226static int iommu_identity_mapping;
227static int iommu_skip_te_disable;
228
229#define IDENTMAP_GFX		2
230#define IDENTMAP_AZALIA		4
231
232const struct iommu_ops intel_iommu_ops;
233static const struct iommu_dirty_ops intel_dirty_ops;
234
235static bool translation_pre_enabled(struct intel_iommu *iommu)
236{
237	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
238}
239
240static void clear_translation_pre_enabled(struct intel_iommu *iommu)
241{
242	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
243}
244
245static void init_translation_status(struct intel_iommu *iommu)
246{
247	u32 gsts;
248
249	gsts = readl(iommu->reg + DMAR_GSTS_REG);
250	if (gsts & DMA_GSTS_TES)
251		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
252}
253
254static int __init intel_iommu_setup(char *str)
255{
256	if (!str)
257		return -EINVAL;
258
259	while (*str) {
260		if (!strncmp(str, "on", 2)) {
261			dmar_disabled = 0;
262			pr_info("IOMMU enabled\n");
263		} else if (!strncmp(str, "off", 3)) {
264			dmar_disabled = 1;
265			no_platform_optin = 1;
266			pr_info("IOMMU disabled\n");
267		} else if (!strncmp(str, "igfx_off", 8)) {
268			dmar_map_gfx = 0;
269			pr_info("Disable GFX device mapping\n");
270		} else if (!strncmp(str, "forcedac", 8)) {
271			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272			iommu_dma_forcedac = true;
273		} else if (!strncmp(str, "strict", 6)) {
274			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275			iommu_set_dma_strict();
276		} else if (!strncmp(str, "sp_off", 6)) {
277			pr_info("Disable supported super page\n");
278			intel_iommu_superpage = 0;
279		} else if (!strncmp(str, "sm_on", 5)) {
280			pr_info("Enable scalable mode if hardware supports\n");
281			intel_iommu_sm = 1;
282		} else if (!strncmp(str, "sm_off", 6)) {
283			pr_info("Scalable mode is disallowed\n");
284			intel_iommu_sm = 0;
285		} else if (!strncmp(str, "tboot_noforce", 13)) {
286			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287			intel_iommu_tboot_noforce = 1;
288		} else {
289			pr_notice("Unknown option - '%s'\n", str);
290		}
291
292		str += strcspn(str, ",");
293		while (*str == ',')
294			str++;
295	}
296
297	return 1;
298}
299__setup("intel_iommu=", intel_iommu_setup);
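
/*
 * Example (illustrative only) of combining several of the options parsed
 * above on the kernel command line; options are comma-separated:
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * This enables the IOMMU, requests scalable mode where the hardware supports
 * it, and disables mapping of the integrated graphics device.
 */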
300
301void *alloc_pgtable_page(int node, gfp_t gfp)
302{
303	struct page *page;
304	void *vaddr = NULL;
305
306	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
307	if (page)
308		vaddr = page_address(page);
309	return vaddr;
310}
311
312void free_pgtable_page(void *vaddr)
313{
314	free_page((unsigned long)vaddr);
315}
316
317static int domain_type_is_si(struct dmar_domain *domain)
318{
319	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
320}
321
322static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
323{
324	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
325
326	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
327}
328
329/*
330 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332 * the returned SAGAW.
333 */
334static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
335{
336	unsigned long fl_sagaw, sl_sagaw;
337
338	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339	sl_sagaw = cap_sagaw(iommu->cap);
340
341	/* Second level only. */
342	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
343		return sl_sagaw;
344
345	/* First level only. */
346	if (!ecap_slts(iommu->ecap))
347		return fl_sagaw;
348
349	return fl_sagaw & sl_sagaw;
350}
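
/*
 * Worked example (illustrative, per the SAGAW encoding referenced above,
 * where bit 2 means 4-level/48-bit and bit 3 means 5-level/57-bit paging):
 * on a scalable-mode IOMMU that supports both translation types, with
 * 5-level first-level paging but, say, only 4-level second-level tables:
 *
 *	fl_sagaw = BIT(2) | BIT(3) = 0xc
 *	sl_sagaw = BIT(2)          = 0x4
 *	result   = fl_sagaw & sl_sagaw = 0x4	(4-level only)
 */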
351
352static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
353{
354	unsigned long sagaw;
355	int agaw;
356
357	sagaw = __iommu_calculate_sagaw(iommu);
358	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
359		if (test_bit(agaw, &sagaw))
360			break;
361	}
362
363	return agaw;
364}
365
366/*
367 * Calculate max SAGAW for each iommu.
368 */
369int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
370{
371	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
372}
373
374/*
 * Calculate the AGAW for each iommu.
 * "SAGAW" may differ across iommus, so use a default AGAW and fall back to
 * a smaller supported AGAW for iommus that don't support the default.
378 */
379int iommu_calculate_agaw(struct intel_iommu *iommu)
380{
381	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
382}
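
/*
 * For illustration (assuming the usual 9-bit-per-level AGAW encoding, agaw 3
 * for 57-bit/5-level and agaw 2 for 48-bit/4-level): with a default domain
 * address width of 57, __iommu_calculate_agaw() starts the search at agaw 3
 * and, if that bit is not set in the IOMMU's SAGAW, falls back to the next
 * supported (smaller) AGAW, typically agaw 2.
 */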
383
384static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
385{
386	return sm_supported(iommu) ?
387			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
388}
389
390static void domain_update_iommu_coherency(struct dmar_domain *domain)
391{
392	struct iommu_domain_info *info;
393	struct dmar_drhd_unit *drhd;
394	struct intel_iommu *iommu;
395	bool found = false;
396	unsigned long i;
397
398	domain->iommu_coherency = true;
399	xa_for_each(&domain->iommu_array, i, info) {
400		found = true;
401		if (!iommu_paging_structure_coherency(info->iommu)) {
402			domain->iommu_coherency = false;
403			break;
404		}
405	}
406	if (found)
407		return;
408
409	/* No hardware attached; use lowest common denominator */
410	rcu_read_lock();
411	for_each_active_iommu(iommu, drhd) {
412		if (!iommu_paging_structure_coherency(iommu)) {
413			domain->iommu_coherency = false;
414			break;
415		}
416	}
417	rcu_read_unlock();
418}
419
420static int domain_update_iommu_superpage(struct dmar_domain *domain,
421					 struct intel_iommu *skip)
422{
423	struct dmar_drhd_unit *drhd;
424	struct intel_iommu *iommu;
425	int mask = 0x3;
426
427	if (!intel_iommu_superpage)
428		return 0;
429
430	/* set iommu_superpage to the smallest common denominator */
431	rcu_read_lock();
432	for_each_active_iommu(iommu, drhd) {
433		if (iommu != skip) {
434			if (domain && domain->use_first_level) {
435				if (!cap_fl1gp_support(iommu->cap))
436					mask = 0x1;
437			} else {
438				mask &= cap_super_page_val(iommu->cap);
439			}
440
441			if (!mask)
442				break;
443		}
444	}
445	rcu_read_unlock();
446
447	return fls(mask);
448}
449
450static int domain_update_device_node(struct dmar_domain *domain)
451{
452	struct device_domain_info *info;
453	int nid = NUMA_NO_NODE;
454	unsigned long flags;
455
456	spin_lock_irqsave(&domain->lock, flags);
457	list_for_each_entry(info, &domain->devices, link) {
458		/*
		 * There could be multiple device NUMA nodes, as devices within
		 * the same domain may sit behind different IOMMUs. There is no
		 * perfect answer in such a situation, so we use a first-come,
		 * first-served policy.
463		 */
464		nid = dev_to_node(info->dev);
465		if (nid != NUMA_NO_NODE)
466			break;
467	}
468	spin_unlock_irqrestore(&domain->lock, flags);
469
470	return nid;
471}
472
473/* Return the super pagesize bitmap if supported. */
474static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
475{
476	unsigned long bitmap = 0;
477
478	/*
479	 * 1-level super page supports page size of 2MiB, 2-level super page
480	 * supports page size of both 2MiB and 1GiB.
481	 */
482	if (domain->iommu_superpage == 1)
483		bitmap |= SZ_2M;
484	else if (domain->iommu_superpage == 2)
485		bitmap |= SZ_2M | SZ_1G;
486
487	return bitmap;
488}
489
490/* Some capabilities may be different across iommus */
491void domain_update_iommu_cap(struct dmar_domain *domain)
492{
493	domain_update_iommu_coherency(domain);
494	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
495
496	/*
	 * If RHSA is missing, we should default to the device's NUMA node
	 * as a fallback.
499	 */
500	if (domain->nid == NUMA_NO_NODE)
501		domain->nid = domain_update_device_node(domain);
502
503	/*
504	 * First-level translation restricts the input-address to a
505	 * canonical address (i.e., address bits 63:N have the same
506	 * value as address bit [N-1], where N is 48-bits with 4-level
507	 * paging and 57-bits with 5-level paging). Hence, skip bit
508	 * [N-1].
509	 */
510	if (domain->use_first_level)
511		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
512	else
513		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
514
515	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516	domain_update_iotlb(domain);
517}
518
519struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
520					 u8 devfn, int alloc)
521{
522	struct root_entry *root = &iommu->root_entry[bus];
523	struct context_entry *context;
524	u64 *entry;
525
526	/*
	 * Unless the caller requested to allocate a new entry, returning
	 * a copied context entry makes no sense.
529	 */
530	if (!alloc && context_copied(iommu, bus, devfn))
531		return NULL;
532
533	entry = &root->lo;
534	if (sm_supported(iommu)) {
535		if (devfn >= 0x80) {
536			devfn -= 0x80;
537			entry = &root->hi;
538		}
539		devfn *= 2;
540	}
541	if (*entry & 1)
542		context = phys_to_virt(*entry & VTD_PAGE_MASK);
543	else {
544		unsigned long phy_addr;
545		if (!alloc)
546			return NULL;
547
548		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
549		if (!context)
550			return NULL;
551
552		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
553		phy_addr = virt_to_phys((void *)context);
554		*entry = phy_addr | 1;
555		__iommu_flush_cache(iommu, entry, sizeof(*entry));
556	}
557	return &context[devfn];
558}
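
/*
 * Worked example of the scalable-mode indexing above (illustrative): for
 * devfn 0x9a the upper half of the root entry is used and the index becomes
 * (0x9a - 0x80) * 2 = 0x34, because each half of a scalable-mode root entry
 * covers 128 devfns and every device consumes two 16-byte context_entry
 * slots (one 32-byte scalable-mode context entry).
 */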
559
560/**
561 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562 *				 sub-hierarchy of a candidate PCI-PCI bridge
563 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564 * @bridge: the candidate PCI-PCI bridge
565 *
566 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
567 */
568static bool
569is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
570{
571	struct pci_dev *pdev, *pbridge;
572
573	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
574		return false;
575
576	pdev = to_pci_dev(dev);
577	pbridge = to_pci_dev(bridge);
578
579	if (pbridge->subordinate &&
580	    pbridge->subordinate->number <= pdev->bus->number &&
581	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
582		return true;
583
584	return false;
585}
586
587static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
588{
589	struct dmar_drhd_unit *drhd;
590	u32 vtbar;
591	int rc;
592
593	/* We know that this device on this chipset has its own IOMMU.
594	 * If we find it under a different IOMMU, then the BIOS is lying
595	 * to us. Hope that the IOMMU for this device is actually
596	 * disabled, and it needs no translation...
597	 */
598	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
599	if (rc) {
600		/* "can't" happen */
601		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
602		return false;
603	}
604	vtbar &= 0xffff0000;
605
	/* we know that this iommu should be at offset 0xa000 from vtbar */
607	drhd = dmar_find_matched_drhd_unit(pdev);
608	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
609		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
610		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
611		return true;
612	}
613
614	return false;
615}
616
617static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
618{
619	if (!iommu || iommu->drhd->ignored)
620		return true;
621
622	if (dev_is_pci(dev)) {
623		struct pci_dev *pdev = to_pci_dev(dev);
624
625		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627		    quirk_ioat_snb_local_iommu(pdev))
628			return true;
629	}
630
631	return false;
632}
633
634static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
635{
636	struct dmar_drhd_unit *drhd = NULL;
637	struct pci_dev *pdev = NULL;
638	struct intel_iommu *iommu;
639	struct device *tmp;
640	u16 segment = 0;
641	int i;
642
643	if (!dev)
644		return NULL;
645
646	if (dev_is_pci(dev)) {
647		struct pci_dev *pf_pdev;
648
649		pdev = pci_real_dma_dev(to_pci_dev(dev));
650
651		/* VFs aren't listed in scope tables; we need to look up
652		 * the PF instead to find the IOMMU. */
653		pf_pdev = pci_physfn(pdev);
654		dev = &pf_pdev->dev;
655		segment = pci_domain_nr(pdev->bus);
656	} else if (has_acpi_companion(dev))
657		dev = &ACPI_COMPANION(dev)->dev;
658
659	rcu_read_lock();
660	for_each_iommu(iommu, drhd) {
661		if (pdev && segment != drhd->segment)
662			continue;
663
664		for_each_active_dev_scope(drhd->devices,
665					  drhd->devices_cnt, i, tmp) {
666			if (tmp == dev) {
667				/* For a VF use its original BDF# not that of the PF
668				 * which we used for the IOMMU lookup. Strictly speaking
669				 * we could do this for all PCI devices; we only need to
670				 * get the BDF# from the scope table for ACPI matches. */
671				if (pdev && pdev->is_virtfn)
672					goto got_pdev;
673
674				if (bus && devfn) {
675					*bus = drhd->devices[i].bus;
676					*devfn = drhd->devices[i].devfn;
677				}
678				goto out;
679			}
680
681			if (is_downstream_to_pci_bridge(dev, tmp))
682				goto got_pdev;
683		}
684
685		if (pdev && drhd->include_all) {
686got_pdev:
687			if (bus && devfn) {
688				*bus = pdev->bus->number;
689				*devfn = pdev->devfn;
690			}
691			goto out;
692		}
693	}
694	iommu = NULL;
695out:
696	if (iommu_is_dummy(iommu, dev))
697		iommu = NULL;
698
699	rcu_read_unlock();
700
701	return iommu;
702}
703
704static void domain_flush_cache(struct dmar_domain *domain,
705			       void *addr, int size)
706{
707	if (!domain->iommu_coherency)
708		clflush_cache_range(addr, size);
709}
710
711static void free_context_table(struct intel_iommu *iommu)
712{
713	struct context_entry *context;
714	int i;
715
716	if (!iommu->root_entry)
717		return;
718
719	for (i = 0; i < ROOT_ENTRY_NR; i++) {
720		context = iommu_context_addr(iommu, i, 0, 0);
721		if (context)
722			free_pgtable_page(context);
723
724		if (!sm_supported(iommu))
725			continue;
726
727		context = iommu_context_addr(iommu, i, 0x80, 0);
728		if (context)
729			free_pgtable_page(context);
730	}
731
732	free_pgtable_page(iommu->root_entry);
733	iommu->root_entry = NULL;
734}
735
736#ifdef CONFIG_DMAR_DEBUG
737static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
739{
740	struct dma_pte *pte;
741	int offset;
742
743	while (1) {
744		offset = pfn_level_offset(pfn, level);
745		pte = &parent[offset];
746		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747			pr_info("PTE not present at level %d\n", level);
748			break;
749		}
750
751		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
752
753		if (level == 1)
754			break;
755
756		parent = phys_to_virt(dma_pte_addr(pte));
757		level--;
758	}
759}
760
761void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762			  unsigned long long addr, u32 pasid)
763{
764	struct pasid_dir_entry *dir, *pde;
765	struct pasid_entry *entries, *pte;
766	struct context_entry *ctx_entry;
767	struct root_entry *rt_entry;
768	int i, dir_index, index, level;
769	u8 devfn = source_id & 0xff;
770	u8 bus = source_id >> 8;
771	struct dma_pte *pgtable;
772
773	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
774
775	/* root entry dump */
776	rt_entry = &iommu->root_entry[bus];
777	if (!rt_entry) {
778		pr_info("root table entry is not present\n");
779		return;
780	}
781
782	if (sm_supported(iommu))
783		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784			rt_entry->hi, rt_entry->lo);
785	else
786		pr_info("root entry: 0x%016llx", rt_entry->lo);
787
788	/* context entry dump */
789	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
790	if (!ctx_entry) {
791		pr_info("context table entry is not present\n");
792		return;
793	}
794
795	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796		ctx_entry->hi, ctx_entry->lo);
797
798	/* legacy mode does not require PASID entries */
799	if (!sm_supported(iommu)) {
800		level = agaw_to_level(ctx_entry->hi & 7);
801		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
802		goto pgtable_walk;
803	}
804
805	/* get the pointer to pasid directory entry */
806	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
807	if (!dir) {
808		pr_info("pasid directory entry is not present\n");
809		return;
810	}
811	/* For request-without-pasid, get the pasid from context entry */
812	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813		pasid = IOMMU_NO_PASID;
814
815	dir_index = pasid >> PASID_PDE_SHIFT;
816	pde = &dir[dir_index];
817	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
818
819	/* get the pointer to the pasid table entry */
820	entries = get_pasid_table_from_pde(pde);
821	if (!entries) {
822		pr_info("pasid table entry is not present\n");
823		return;
824	}
825	index = pasid & PASID_PTE_MASK;
826	pte = &entries[index];
827	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
829
830	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
833	} else {
834		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
835		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
836	}
837
838pgtable_walk:
839	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
840}
841#endif
842
843static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
844				      unsigned long pfn, int *target_level,
845				      gfp_t gfp)
846{
847	struct dma_pte *parent, *pte;
848	int level = agaw_to_level(domain->agaw);
849	int offset;
850
851	if (!domain_pfn_supported(domain, pfn))
852		/* Address beyond IOMMU's addressing capabilities. */
853		return NULL;
854
855	parent = domain->pgd;
856
857	while (1) {
858		void *tmp_page;
859
860		offset = pfn_level_offset(pfn, level);
861		pte = &parent[offset];
862		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
863			break;
864		if (level == *target_level)
865			break;
866
867		if (!dma_pte_present(pte)) {
868			uint64_t pteval;
869
870			tmp_page = alloc_pgtable_page(domain->nid, gfp);
871
872			if (!tmp_page)
873				return NULL;
874
875			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
876			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
877			if (domain->use_first_level)
878				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
879
880			if (cmpxchg64(&pte->val, 0ULL, pteval))
881				/* Someone else set it while we were thinking; use theirs. */
882				free_pgtable_page(tmp_page);
883			else
884				domain_flush_cache(domain, pte, sizeof(*pte));
885		}
886		if (level == 1)
887			break;
888
889		parent = phys_to_virt(dma_pte_addr(pte));
890		level--;
891	}
892
893	if (!*target_level)
894		*target_level = level;
895
896	return pte;
897}
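
/*
 * Page-table walk sketch (illustrative): each level decodes 9 bits of the
 * pfn, so with a 4-level table (48-bit address width) a pfn is split as
 * bits [35:27] -> level 4, [26:18] -> level 3, [17:9] -> level 2 and
 * [8:0] -> level 1, which is what pfn_level_offset() extracts on each
 * iteration of the loop above.
 */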
898
899/* return address's pte at specific level */
900static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
901					 unsigned long pfn,
902					 int level, int *large_page)
903{
904	struct dma_pte *parent, *pte;
905	int total = agaw_to_level(domain->agaw);
906	int offset;
907
908	parent = domain->pgd;
909	while (level <= total) {
910		offset = pfn_level_offset(pfn, total);
911		pte = &parent[offset];
912		if (level == total)
913			return pte;
914
915		if (!dma_pte_present(pte)) {
916			*large_page = total;
917			break;
918		}
919
920		if (dma_pte_superpage(pte)) {
921			*large_page = total;
922			return pte;
923		}
924
925		parent = phys_to_virt(dma_pte_addr(pte));
926		total--;
927	}
928	return NULL;
929}
930
/* clear last level pte, a tlb flush must follow */
932static void dma_pte_clear_range(struct dmar_domain *domain,
933				unsigned long start_pfn,
934				unsigned long last_pfn)
935{
936	unsigned int large_page;
937	struct dma_pte *first_pte, *pte;
938
939	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940	    WARN_ON(start_pfn > last_pfn))
941		return;
942
943	/* we don't need lock here; nobody else touches the iova range */
944	do {
945		large_page = 1;
946		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
947		if (!pte) {
948			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
949			continue;
950		}
951		do {
952			dma_clear_pte(pte);
953			start_pfn += lvl_to_nr_pages(large_page);
954			pte++;
955		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
956
957		domain_flush_cache(domain, first_pte,
958				   (void *)pte - (void *)first_pte);
959
960	} while (start_pfn && start_pfn <= last_pfn);
961}
962
963static void dma_pte_free_level(struct dmar_domain *domain, int level,
964			       int retain_level, struct dma_pte *pte,
965			       unsigned long pfn, unsigned long start_pfn,
966			       unsigned long last_pfn)
967{
968	pfn = max(start_pfn, pfn);
969	pte = &pte[pfn_level_offset(pfn, level)];
970
971	do {
972		unsigned long level_pfn;
973		struct dma_pte *level_pte;
974
975		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
976			goto next;
977
978		level_pfn = pfn & level_mask(level);
979		level_pte = phys_to_virt(dma_pte_addr(pte));
980
981		if (level > 2) {
982			dma_pte_free_level(domain, level - 1, retain_level,
983					   level_pte, level_pfn, start_pfn,
984					   last_pfn);
985		}
986
987		/*
988		 * Free the page table if we're below the level we want to
989		 * retain and the range covers the entire table.
990		 */
991		if (level < retain_level && !(start_pfn > level_pfn ||
992		      last_pfn < level_pfn + level_size(level) - 1)) {
993			dma_clear_pte(pte);
994			domain_flush_cache(domain, pte, sizeof(*pte));
995			free_pgtable_page(level_pte);
996		}
997next:
998		pfn += level_size(level);
999	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1000}
1001
1002/*
1003 * clear last level (leaf) ptes and free page table pages below the
1004 * level we wish to keep intact.
1005 */
1006static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007				   unsigned long start_pfn,
1008				   unsigned long last_pfn,
1009				   int retain_level)
1010{
1011	dma_pte_clear_range(domain, start_pfn, last_pfn);
1012
1013	/* We don't need lock here; nobody else touches the iova range */
1014	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1015			   domain->pgd, 0, start_pfn, last_pfn);
1016
1017	/* free pgd */
1018	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019		free_pgtable_page(domain->pgd);
1020		domain->pgd = NULL;
1021	}
1022}
1023
1024/* When a page at a given level is being unlinked from its parent, we don't
1025   need to *modify* it at all. All we need to do is make a list of all the
1026   pages which can be freed just as soon as we've flushed the IOTLB and we
1027   know the hardware page-walk will no longer touch them.
1028   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1029   be freed. */
1030static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031				    int level, struct dma_pte *pte,
1032				    struct list_head *freelist)
1033{
1034	struct page *pg;
1035
1036	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037	list_add_tail(&pg->lru, freelist);
1038
1039	if (level == 1)
1040		return;
1041
1042	pte = page_address(pg);
1043	do {
1044		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1046		pte++;
1047	} while (!first_pte_in_page(pte));
1048}
1049
1050static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1051				struct dma_pte *pte, unsigned long pfn,
1052				unsigned long start_pfn, unsigned long last_pfn,
1053				struct list_head *freelist)
1054{
1055	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1056
1057	pfn = max(start_pfn, pfn);
1058	pte = &pte[pfn_level_offset(pfn, level)];
1059
1060	do {
1061		unsigned long level_pfn = pfn & level_mask(level);
1062
1063		if (!dma_pte_present(pte))
1064			goto next;
1065
1066		/* If range covers entire pagetable, free it */
1067		if (start_pfn <= level_pfn &&
1068		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
1070			   bother to clear them; we're just going to *free* them. */
1071			if (level > 1 && !dma_pte_superpage(pte))
1072				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1073
1074			dma_clear_pte(pte);
1075			if (!first_pte)
1076				first_pte = pte;
1077			last_pte = pte;
1078		} else if (level > 1) {
1079			/* Recurse down into a level that isn't *entirely* obsolete */
1080			dma_pte_clear_level(domain, level - 1,
1081					    phys_to_virt(dma_pte_addr(pte)),
1082					    level_pfn, start_pfn, last_pfn,
1083					    freelist);
1084		}
1085next:
1086		pfn = level_pfn + level_size(level);
1087	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1088
1089	if (first_pte)
1090		domain_flush_cache(domain, first_pte,
1091				   (void *)++last_pte - (void *)first_pte);
1092}
1093
1094/* We can't just free the pages because the IOMMU may still be walking
1095   the page tables, and may have cached the intermediate levels. The
1096   pages can only be freed after the IOTLB flush has been done. */
1097static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098			 unsigned long last_pfn, struct list_head *freelist)
1099{
1100	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101	    WARN_ON(start_pfn > last_pfn))
1102		return;
1103
1104	/* we don't need lock here; nobody else touches the iova range */
1105	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1106			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1107
1108	/* free pgd */
1109	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110		struct page *pgd_page = virt_to_page(domain->pgd);
1111		list_add_tail(&pgd_page->lru, freelist);
1112		domain->pgd = NULL;
1113	}
1114}
1115
1116/* iommu handling */
1117static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1118{
1119	struct root_entry *root;
1120
1121	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1122	if (!root) {
1123		pr_err("Allocating root entry for %s failed\n",
1124			iommu->name);
1125		return -ENOMEM;
1126	}
1127
1128	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1129	iommu->root_entry = root;
1130
1131	return 0;
1132}
1133
1134static void iommu_set_root_entry(struct intel_iommu *iommu)
1135{
1136	u64 addr;
1137	u32 sts;
1138	unsigned long flag;
1139
1140	addr = virt_to_phys(iommu->root_entry);
1141	if (sm_supported(iommu))
1142		addr |= DMA_RTADDR_SMT;
1143
1144	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1145	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1146
1147	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1148
	/* Make sure hardware completes it */
1150	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1151		      readl, (sts & DMA_GSTS_RTPS), sts);
1152
1153	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1154
1155	/*
1156	 * Hardware invalidates all DMA remapping hardware translation
1157	 * caches as part of SRTP flow.
1158	 */
1159	if (cap_esrtps(iommu->cap))
1160		return;
1161
1162	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1163	if (sm_supported(iommu))
1164		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1165	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1166}
1167
1168void iommu_flush_write_buffer(struct intel_iommu *iommu)
1169{
1170	u32 val;
1171	unsigned long flag;
1172
1173	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1174		return;
1175
1176	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1178
	/* Make sure hardware completes it */
1180	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181		      readl, (!(val & DMA_GSTS_WBFS)), val);
1182
1183	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184}
1185
/* return value determines if we need a write buffer flush */
1187static void __iommu_flush_context(struct intel_iommu *iommu,
1188				  u16 did, u16 source_id, u8 function_mask,
1189				  u64 type)
1190{
1191	u64 val = 0;
1192	unsigned long flag;
1193
1194	switch (type) {
1195	case DMA_CCMD_GLOBAL_INVL:
1196		val = DMA_CCMD_GLOBAL_INVL;
1197		break;
1198	case DMA_CCMD_DOMAIN_INVL:
1199		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1200		break;
1201	case DMA_CCMD_DEVICE_INVL:
1202		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1203			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1204		break;
1205	default:
1206		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1207			iommu->name, type);
1208		return;
1209	}
1210	val |= DMA_CCMD_ICC;
1211
1212	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1214
	/* Make sure hardware completes it */
1216	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1217		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1218
1219	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220}
1221
/* return value determines if we need a write buffer flush */
1223static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1224				u64 addr, unsigned int size_order, u64 type)
1225{
1226	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1227	u64 val = 0, val_iva = 0;
1228	unsigned long flag;
1229
1230	switch (type) {
1231	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1233		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1234		break;
1235	case DMA_TLB_DSI_FLUSH:
1236		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1237		break;
1238	case DMA_TLB_PSI_FLUSH:
1239		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1240		/* IH bit is passed in as part of address */
1241		val_iva = size_order | addr;
1242		break;
1243	default:
1244		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1245			iommu->name, type);
1246		return;
1247	}
1248
1249	if (cap_write_drain(iommu->cap))
1250		val |= DMA_TLB_WRITE_DRAIN;
1251
1252	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253	/* Note: Only uses first TLB reg currently */
1254	if (val_iva)
1255		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1256	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1257
	/* Make sure hardware completes it */
1259	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1260		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1261
1262	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263
1264	/* check IOTLB invalidation granularity */
1265	if (DMA_TLB_IAIG(val) == 0)
1266		pr_err("Flush IOTLB failed\n");
1267	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1268		pr_debug("TLB flush request %Lx, actual %Lx\n",
1269			(unsigned long long)DMA_TLB_IIRG(type),
1270			(unsigned long long)DMA_TLB_IAIG(val));
1271}
1272
1273static struct device_domain_info *
1274domain_lookup_dev_info(struct dmar_domain *domain,
1275		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1276{
1277	struct device_domain_info *info;
1278	unsigned long flags;
1279
1280	spin_lock_irqsave(&domain->lock, flags);
1281	list_for_each_entry(info, &domain->devices, link) {
1282		if (info->iommu == iommu && info->bus == bus &&
1283		    info->devfn == devfn) {
1284			spin_unlock_irqrestore(&domain->lock, flags);
1285			return info;
1286		}
1287	}
1288	spin_unlock_irqrestore(&domain->lock, flags);
1289
1290	return NULL;
1291}
1292
1293void domain_update_iotlb(struct dmar_domain *domain)
1294{
1295	struct dev_pasid_info *dev_pasid;
1296	struct device_domain_info *info;
1297	bool has_iotlb_device = false;
1298	unsigned long flags;
1299
1300	spin_lock_irqsave(&domain->lock, flags);
1301	list_for_each_entry(info, &domain->devices, link) {
1302		if (info->ats_enabled) {
1303			has_iotlb_device = true;
1304			break;
1305		}
1306	}
1307
1308	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309		info = dev_iommu_priv_get(dev_pasid->dev);
1310		if (info->ats_enabled) {
1311			has_iotlb_device = true;
1312			break;
1313		}
1314	}
1315	domain->has_iotlb_device = has_iotlb_device;
1316	spin_unlock_irqrestore(&domain->lock, flags);
1317}
1318
1319/*
1320 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1321 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1322 * check because it applies only to the built-in QAT devices and it doesn't
1323 * grant additional privileges.
1324 */
1325#define BUGGY_QAT_DEVID_MASK 0x4940
1326static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1327{
1328	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1329		return false;
1330
1331	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1332		return false;
1333
1334	return true;
1335}
1336
1337static void iommu_enable_pci_caps(struct device_domain_info *info)
1338{
1339	struct pci_dev *pdev;
1340
1341	if (!dev_is_pci(info->dev))
1342		return;
1343
1344	pdev = to_pci_dev(info->dev);
1345
1346	/* The PCIe spec, in its wisdom, declares that the behaviour of
1347	   the device if you enable PASID support after ATS support is
1348	   undefined. So always enable PASID support on devices which
1349	   have it, even if we can't yet know if we're ever going to
1350	   use it. */
1351	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1352		info->pasid_enabled = 1;
1353
1354	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1355	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1356		info->ats_enabled = 1;
1357		domain_update_iotlb(info->domain);
1358	}
1359}
1360
1361static void iommu_disable_pci_caps(struct device_domain_info *info)
1362{
1363	struct pci_dev *pdev;
1364
1365	if (!dev_is_pci(info->dev))
1366		return;
1367
1368	pdev = to_pci_dev(info->dev);
1369
1370	if (info->ats_enabled) {
1371		pci_disable_ats(pdev);
1372		info->ats_enabled = 0;
1373		domain_update_iotlb(info->domain);
1374	}
1375
1376	if (info->pasid_enabled) {
1377		pci_disable_pasid(pdev);
1378		info->pasid_enabled = 0;
1379	}
1380}
1381
1382static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383				    u64 addr, unsigned int mask)
1384{
1385	u16 sid, qdep;
1386
1387	if (!info || !info->ats_enabled)
1388		return;
1389
1390	sid = info->bus << 8 | info->devfn;
1391	qdep = info->ats_qdep;
1392	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1393			   qdep, addr, mask);
1394	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1395}
1396
1397static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398				  u64 addr, unsigned mask)
1399{
1400	struct dev_pasid_info *dev_pasid;
1401	struct device_domain_info *info;
1402	unsigned long flags;
1403
1404	if (!domain->has_iotlb_device)
1405		return;
1406
1407	spin_lock_irqsave(&domain->lock, flags);
1408	list_for_each_entry(info, &domain->devices, link)
1409		__iommu_flush_dev_iotlb(info, addr, mask);
1410
1411	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412		info = dev_iommu_priv_get(dev_pasid->dev);
1413
1414		if (!info->ats_enabled)
1415			continue;
1416
1417		qi_flush_dev_iotlb_pasid(info->iommu,
1418					 PCI_DEVID(info->bus, info->devfn),
1419					 info->pfsid, dev_pasid->pasid,
1420					 info->ats_qdep, addr,
1421					 mask);
1422	}
1423	spin_unlock_irqrestore(&domain->lock, flags);
1424}
1425
1426static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427				     struct dmar_domain *domain, u64 addr,
1428				     unsigned long npages, bool ih)
1429{
1430	u16 did = domain_id_iommu(domain, iommu);
1431	struct dev_pasid_info *dev_pasid;
1432	unsigned long flags;
1433
1434	spin_lock_irqsave(&domain->lock, flags);
1435	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1437
1438	if (!list_empty(&domain->devices))
1439		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440	spin_unlock_irqrestore(&domain->lock, flags);
1441}
1442
1443static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1444				    unsigned long pfn, unsigned int pages,
1445				    int ih)
1446{
1447	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1448	unsigned long bitmask = aligned_pages - 1;
1449	unsigned int mask = ilog2(aligned_pages);
1450	u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1451
1452	/*
1453	 * PSI masks the low order bits of the base address. If the
1454	 * address isn't aligned to the mask, then compute a mask value
1455	 * needed to ensure the target range is flushed.
1456	 */
1457	if (unlikely(bitmask & pfn)) {
1458		unsigned long end_pfn = pfn + pages - 1, shared_bits;
1459
1460		/*
1461		 * Since end_pfn <= pfn + bitmask, the only way bits
1462		 * higher than bitmask can differ in pfn and end_pfn is
1463		 * by carrying. This means after masking out bitmask,
1464		 * high bits starting with the first set bit in
1465		 * shared_bits are all equal in both pfn and end_pfn.
1466		 */
1467		shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1468		mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1469	}
1470
1471	/*
1472	 * Fallback to domain selective flush if no PSI support or
1473	 * the size is too big.
1474	 */
1475	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1476		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1477					 DMA_TLB_DSI_FLUSH);
1478	else
1479		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1480					 DMA_TLB_PSI_FLUSH);
1481}
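
/*
 * Worked example of the mask adjustment above (illustrative numbers): for
 * pfn 0x1003 and pages = 4, aligned_pages = 4 and bitmask = 0x3, so the base
 * is not aligned. Then end_pfn = 0x1006, pfn ^ end_pfn = 0x5, and masking out
 * bitmask leaves shared_bits = ...fff8, so mask = __ffs(shared_bits) = 3.
 * The resulting PSI covers 8 pages starting at pfn 0x1000, which includes
 * the whole 0x1003..0x1006 range.
 */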
1482
1483static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484				  struct dmar_domain *domain,
1485				  unsigned long pfn, unsigned int pages,
1486				  int ih, int map)
1487{
1488	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1489	unsigned int mask = ilog2(aligned_pages);
1490	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491	u16 did = domain_id_iommu(domain, iommu);
1492
1493	if (WARN_ON(!pages))
1494		return;
1495
1496	if (ih)
1497		ih = 1 << 6;
1498
1499	if (domain->use_first_level)
1500		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1501	else
1502		__iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1503
1504	/*
	 * In caching mode, changes of pages from non-present to present require
	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1507	 */
1508	if (!cap_caching_mode(iommu->cap) || !map)
1509		iommu_flush_dev_iotlb(domain, addr, mask);
1510}
1511
1512/* Notification for newly created mappings */
1513static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514				 unsigned long pfn, unsigned int pages)
1515{
1516	/*
	 * It's a non-present to present mapping. Only flush if the IOMMU is in
	 * caching mode and the domain uses second-level translation.
1519	 */
1520	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1522	else
1523		iommu_flush_write_buffer(iommu);
1524}
1525
1526/*
1527 * Flush the relevant caches in nested translation if the domain
1528 * also serves as a parent
1529 */
1530static void parent_domain_flush(struct dmar_domain *domain,
1531				unsigned long pfn,
1532				unsigned long pages, int ih)
1533{
1534	struct dmar_domain *s1_domain;
1535
1536	spin_lock(&domain->s1_lock);
1537	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1538		struct device_domain_info *device_info;
1539		struct iommu_domain_info *info;
1540		unsigned long flags;
1541		unsigned long i;
1542
1543		xa_for_each(&s1_domain->iommu_array, i, info)
1544			__iommu_flush_iotlb_psi(info->iommu, info->did,
1545						pfn, pages, ih);
1546
1547		if (!s1_domain->has_iotlb_device)
1548			continue;
1549
1550		spin_lock_irqsave(&s1_domain->lock, flags);
1551		list_for_each_entry(device_info, &s1_domain->devices, link)
1552			/*
			 * The address translation cache on the device side caches
			 * the result of nested translation. There is no easy way
1555			 * to identify the exact set of nested translations
1556			 * affected by a change in S2. So just flush the entire
1557			 * device cache.
1558			 */
1559			__iommu_flush_dev_iotlb(device_info, 0,
1560						MAX_AGAW_PFN_WIDTH);
1561		spin_unlock_irqrestore(&s1_domain->lock, flags);
1562	}
1563	spin_unlock(&domain->s1_lock);
1564}
1565
1566static void intel_flush_iotlb_all(struct iommu_domain *domain)
1567{
1568	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1569	struct iommu_domain_info *info;
1570	unsigned long idx;
1571
1572	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573		struct intel_iommu *iommu = info->iommu;
1574		u16 did = domain_id_iommu(dmar_domain, iommu);
1575
1576		if (dmar_domain->use_first_level)
1577			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1578		else
1579			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580						 DMA_TLB_DSI_FLUSH);
1581
1582		if (!cap_caching_mode(iommu->cap))
1583			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1584	}
1585
1586	if (dmar_domain->nested_parent)
1587		parent_domain_flush(dmar_domain, 0, -1, 0);
1588}
1589
1590static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1591{
1592	u32 pmen;
1593	unsigned long flags;
1594
1595	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1596		return;
1597
1598	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1600	pmen &= ~DMA_PMEN_EPM;
1601	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1602
1603	/* wait for the protected region status bit to clear */
1604	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605		readl, !(pmen & DMA_PMEN_PRS), pmen);
1606
1607	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1608}
1609
1610static void iommu_enable_translation(struct intel_iommu *iommu)
1611{
1612	u32 sts;
1613	unsigned long flags;
1614
1615	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616	iommu->gcmd |= DMA_GCMD_TE;
1617	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1618
	/* Make sure hardware completes it */
1620	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621		      readl, (sts & DMA_GSTS_TES), sts);
1622
1623	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1624}
1625
1626static void iommu_disable_translation(struct intel_iommu *iommu)
1627{
1628	u32 sts;
1629	unsigned long flag;
1630
1631	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1633		return;
1634
1635	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636	iommu->gcmd &= ~DMA_GCMD_TE;
1637	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638
	/* Make sure hardware completes it */
1640	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641		      readl, (!(sts & DMA_GSTS_TES)), sts);
1642
1643	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644}
1645
1646static int iommu_init_domains(struct intel_iommu *iommu)
1647{
1648	u32 ndomains;
1649
1650	ndomains = cap_ndoms(iommu->cap);
1651	pr_debug("%s: Number of Domains supported <%d>\n",
1652		 iommu->name, ndomains);
1653
1654	spin_lock_init(&iommu->lock);
1655
1656	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1657	if (!iommu->domain_ids)
1658		return -ENOMEM;
1659
1660	/*
1661	 * If Caching mode is set, then invalid translations are tagged
1662	 * with domain-id 0, hence we need to pre-allocate it. We also
1663	 * use domain-id 0 as a marker for non-allocated domain-id, so
1664	 * make sure it is not used for a real domain.
1665	 */
1666	set_bit(0, iommu->domain_ids);
1667
1668	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1670	 * entry for first-level or pass-through translation modes should
1671	 * be programmed with a domain id different from those used for
1672	 * second-level or nested translation. We reserve a domain id for
1673	 * this purpose.
1674	 */
1675	if (sm_supported(iommu))
1676		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1677
1678	return 0;
1679}
1680
1681static void disable_dmar_iommu(struct intel_iommu *iommu)
1682{
1683	if (!iommu->domain_ids)
1684		return;
1685
1686	/*
1687	 * All iommu domains must have been detached from the devices,
1688	 * hence there should be no domain IDs in use.
1689	 */
1690	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691		    > NUM_RESERVED_DID))
1692		return;
1693
1694	if (iommu->gcmd & DMA_GCMD_TE)
1695		iommu_disable_translation(iommu);
1696}
1697
1698static void free_dmar_iommu(struct intel_iommu *iommu)
1699{
1700	if (iommu->domain_ids) {
1701		bitmap_free(iommu->domain_ids);
1702		iommu->domain_ids = NULL;
1703	}
1704
1705	if (iommu->copied_tables) {
1706		bitmap_free(iommu->copied_tables);
1707		iommu->copied_tables = NULL;
1708	}
1709
1710	/* free context mapping */
1711	free_context_table(iommu);
1712
1713#ifdef CONFIG_INTEL_IOMMU_SVM
1714	if (pasid_supported(iommu)) {
1715		if (ecap_prs(iommu->ecap))
1716			intel_svm_finish_prq(iommu);
1717	}
1718#endif
1719}
1720
1721/*
1722 * Check and return whether first level is used by default for
1723 * DMA translation.
1724 */
1725static bool first_level_by_default(unsigned int type)
1726{
1727	/* Only SL is available in legacy mode */
1728	if (!scalable_mode_support())
1729		return false;
1730
	/* Only one level (either FL or SL) is available, just use it */
1732	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1733		return intel_cap_flts_sanity();
1734
1735	/* Both levels are available, decide it based on domain type */
1736	return type != IOMMU_DOMAIN_UNMANAGED;
1737}
1738
1739static struct dmar_domain *alloc_domain(unsigned int type)
1740{
1741	struct dmar_domain *domain;
1742
1743	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1744	if (!domain)
1745		return NULL;
1746
1747	domain->nid = NUMA_NO_NODE;
1748	if (first_level_by_default(type))
1749		domain->use_first_level = true;
1750	domain->has_iotlb_device = false;
1751	INIT_LIST_HEAD(&domain->devices);
1752	INIT_LIST_HEAD(&domain->dev_pasids);
1753	spin_lock_init(&domain->lock);
1754	xa_init(&domain->iommu_array);
1755
1756	return domain;
1757}
1758
1759int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1760{
1761	struct iommu_domain_info *info, *curr;
1762	unsigned long ndomains;
1763	int num, ret = -ENOSPC;
1764
1765	info = kzalloc(sizeof(*info), GFP_KERNEL);
1766	if (!info)
1767		return -ENOMEM;
1768
1769	spin_lock(&iommu->lock);
1770	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1771	if (curr) {
1772		curr->refcnt++;
1773		spin_unlock(&iommu->lock);
1774		kfree(info);
1775		return 0;
1776	}
1777
1778	ndomains = cap_ndoms(iommu->cap);
1779	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1780	if (num >= ndomains) {
1781		pr_err("%s: No free domain ids\n", iommu->name);
1782		goto err_unlock;
1783	}
1784
1785	set_bit(num, iommu->domain_ids);
1786	info->refcnt	= 1;
1787	info->did	= num;
1788	info->iommu	= iommu;
1789	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1790			  NULL, info, GFP_ATOMIC);
1791	if (curr) {
1792		ret = xa_err(curr) ? : -EBUSY;
1793		goto err_clear;
1794	}
1795	domain_update_iommu_cap(domain);
1796
1797	spin_unlock(&iommu->lock);
1798	return 0;
1799
1800err_clear:
1801	clear_bit(info->did, iommu->domain_ids);
1802err_unlock:
1803	spin_unlock(&iommu->lock);
1804	kfree(info);
1805	return ret;
1806}
1807
1808void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1809{
1810	struct iommu_domain_info *info;
1811
1812	spin_lock(&iommu->lock);
1813	info = xa_load(&domain->iommu_array, iommu->seq_id);
1814	if (--info->refcnt == 0) {
1815		clear_bit(info->did, iommu->domain_ids);
1816		xa_erase(&domain->iommu_array, iommu->seq_id);
1817		domain->nid = NUMA_NO_NODE;
1818		domain_update_iommu_cap(domain);
1819		kfree(info);
1820	}
1821	spin_unlock(&iommu->lock);
1822}
1823
1824static int guestwidth_to_adjustwidth(int gaw)
1825{
1826	int agaw;
1827	int r = (gaw - 12) % 9;
1828
1829	if (r == 0)
1830		agaw = gaw;
1831	else
1832		agaw = gaw + 9 - r;
1833	if (agaw > 64)
1834		agaw = 64;
1835	return agaw;
1836}
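
/*
 * Two illustrative inputs for the adjustment above: gaw = 48 gives r = 0 and
 * thus agaw = 48 (already on a 9-bit level boundary above 12), while gaw = 50
 * gives r = (50 - 12) % 9 = 2 and agaw = 50 + 9 - 2 = 57, i.e. the width is
 * rounded up to the next level boundary and capped at 64.
 */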
1837
1838static void domain_exit(struct dmar_domain *domain)
1839{
1840	if (domain->pgd) {
1841		LIST_HEAD(freelist);
1842
1843		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1844		put_pages_list(&freelist);
1845	}
1846
1847	if (WARN_ON(!list_empty(&domain->devices)))
1848		return;
1849
1850	kfree(domain);
1851}
1852
1853static int domain_context_mapping_one(struct dmar_domain *domain,
1854				      struct intel_iommu *iommu,
1855				      u8 bus, u8 devfn)
1856{
1857	struct device_domain_info *info =
1858			domain_lookup_dev_info(domain, iommu, bus, devfn);
1859	u16 did = domain_id_iommu(domain, iommu);
1860	int translation = CONTEXT_TT_MULTI_LEVEL;
1861	struct dma_pte *pgd = domain->pgd;
1862	struct context_entry *context;
1863	int agaw, ret;
1864
1865	if (hw_pass_through && domain_type_is_si(domain))
1866		translation = CONTEXT_TT_PASS_THROUGH;
1867
1868	pr_debug("Set context mapping for %02x:%02x.%d\n",
1869		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1870
1871	spin_lock(&iommu->lock);
1872	ret = -ENOMEM;
1873	context = iommu_context_addr(iommu, bus, devfn, 1);
1874	if (!context)
1875		goto out_unlock;
1876
1877	ret = 0;
1878	if (context_present(context) && !context_copied(iommu, bus, devfn))
1879		goto out_unlock;
1880
1881	/*
1882	 * For kdump cases, old valid entries may be cached due to the
1883	 * in-flight DMA and copied pgtable, but there is no unmapping
1884	 * behaviour for them, thus we need an explicit cache flush for
1885	 * the newly-mapped device. For kdump, at this point, the device
1886	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry about it
	 * hereafter.
1889	 */
1890	if (context_copied(iommu, bus, devfn)) {
1891		u16 did_old = context_domain_id(context);
1892
1893		if (did_old < cap_ndoms(iommu->cap)) {
1894			iommu->flush.flush_context(iommu, did_old,
1895						   (((u16)bus) << 8) | devfn,
1896						   DMA_CCMD_MASK_NOBIT,
1897						   DMA_CCMD_DEVICE_INVL);
1898			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1899						 DMA_TLB_DSI_FLUSH);
1900		}
1901
1902		clear_context_copied(iommu, bus, devfn);
1903	}
1904
1905	context_clear_entry(context);
1906	context_set_domain_id(context, did);
1907
1908	if (translation != CONTEXT_TT_PASS_THROUGH) {
1909		/*
		 * Skip the top levels of the page table when this IOMMU
		 * supports a smaller AGAW than the domain's default.
		 * Unnecessary for pass-through mode.
1912		 */
1913		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1914			ret = -ENOMEM;
1915			pgd = phys_to_virt(dma_pte_addr(pgd));
1916			if (!dma_pte_present(pgd))
1917				goto out_unlock;
1918		}
1919
1920		if (info && info->ats_supported)
1921			translation = CONTEXT_TT_DEV_IOTLB;
1922		else
1923			translation = CONTEXT_TT_MULTI_LEVEL;
1924
1925		context_set_address_root(context, virt_to_phys(pgd));
1926		context_set_address_width(context, agaw);
1927	} else {
1928		/*
1929		 * In pass through mode, AW must be programmed to
1930		 * indicate the largest AGAW value supported by
1931		 * hardware. And ASR is ignored by hardware.
1932		 */
1933		context_set_address_width(context, iommu->msagaw);
1934	}
1935
1936	context_set_translation_type(context, translation);
1937	context_set_fault_enable(context);
1938	context_set_present(context);
1939	if (!ecap_coherent(iommu->ecap))
1940		clflush_cache_range(context, sizeof(*context));
1941
1942	/*
1943	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
1946	 * domain #0, which we have to flush:
1947	 */
1948	if (cap_caching_mode(iommu->cap)) {
1949		iommu->flush.flush_context(iommu, 0,
1950					   (((u16)bus) << 8) | devfn,
1951					   DMA_CCMD_MASK_NOBIT,
1952					   DMA_CCMD_DEVICE_INVL);
1953		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1954	} else {
1955		iommu_flush_write_buffer(iommu);
1956	}
1957
1958	ret = 0;
1959
1960out_unlock:
1961	spin_unlock(&iommu->lock);
1962
1963	return ret;
1964}
1965
1966static int domain_context_mapping_cb(struct pci_dev *pdev,
1967				     u16 alias, void *opaque)
1968{
1969	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1970	struct intel_iommu *iommu = info->iommu;
1971	struct dmar_domain *domain = opaque;
1972
1973	return domain_context_mapping_one(domain, iommu,
1974					  PCI_BUS_NUM(alias), alias & 0xff);
1975}
1976
1977static int
1978domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1979{
1980	struct device_domain_info *info = dev_iommu_priv_get(dev);
1981	struct intel_iommu *iommu = info->iommu;
1982	u8 bus = info->bus, devfn = info->devfn;
1983
1984	if (!dev_is_pci(dev))
1985		return domain_context_mapping_one(domain, iommu, bus, devfn);
1986
1987	return pci_for_each_dma_alias(to_pci_dev(dev),
1988				      domain_context_mapping_cb, domain);
1989}
1990
1991/* Returns a number of VTD pages, but aligned to MM page size */
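/*
 * For example, with 4KiB pages, host_addr = 0x1234 and size = 0x2000 give an
 * in-page offset of 0x234, PAGE_ALIGN(0x234 + 0x2000) = 0x3000, and thus
 * three VT-d pages.
 */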
1992static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1993{
1994	host_addr &= ~PAGE_MASK;
1995	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1996}
1997
1998/* Return largest possible superpage level for a given mapping */
1999static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000				   unsigned long phy_pfn, unsigned long pages)
2001{
2002	int support, level = 1;
2003	unsigned long pfnmerge;
2004
2005	support = domain->iommu_superpage;
2006
2007	/* To use a large page, the virtual *and* physical addresses
2008	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2009	   of them will mean we have to use smaller pages. So just
2010	   merge them and check both at once. */
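	/*
	 * For example, iov_pfn 0x200 and phy_pfn 0x400 are both 2MiB aligned
	 * (low 9 bits clear), so a mapping of 0x400 pages (4MiB) can use
	 * level-2 (2MiB) superpages when the hardware supports them.
	 */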
2011	pfnmerge = iov_pfn | phy_pfn;
2012
2013	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014		pages >>= VTD_STRIDE_SHIFT;
2015		if (!pages)
2016			break;
2017		pfnmerge >>= VTD_STRIDE_SHIFT;
2018		level++;
2019		support--;
2020	}
2021	return level;
2022}
2023
2024/*
2025 * Ensure that old small page tables are removed to make room for superpage(s).
2026 * We're going to add new large pages, so make sure we don't remove their parent
2027 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2028 */
2029static void switch_to_super_page(struct dmar_domain *domain,
2030				 unsigned long start_pfn,
2031				 unsigned long end_pfn, int level)
2032{
2033	unsigned long lvl_pages = lvl_to_nr_pages(level);
2034	struct iommu_domain_info *info;
2035	struct dma_pte *pte = NULL;
2036	unsigned long i;
2037
2038	while (start_pfn <= end_pfn) {
2039		if (!pte)
2040			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2041					     GFP_ATOMIC);
2042
2043		if (dma_pte_present(pte)) {
2044			dma_pte_free_pagetable(domain, start_pfn,
2045					       start_pfn + lvl_pages - 1,
2046					       level + 1);
2047
2048			xa_for_each(&domain->iommu_array, i, info)
2049				iommu_flush_iotlb_psi(info->iommu, domain,
2050						      start_pfn, lvl_pages,
2051						      0, 0);
2052			if (domain->nested_parent)
2053				parent_domain_flush(domain, start_pfn,
2054						    lvl_pages, 0);
2055		}
2056
2057		pte++;
2058		start_pfn += lvl_pages;
2059		if (first_pte_in_page(pte))
2060			pte = NULL;
2061	}
2062}
2063
2064static int
2065__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2066		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2067		 gfp_t gfp)
2068{
2069	struct dma_pte *first_pte = NULL, *pte = NULL;
2070	unsigned int largepage_lvl = 0;
2071	unsigned long lvl_pages = 0;
2072	phys_addr_t pteval;
2073	u64 attr;
2074
2075	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2076		return -EINVAL;
2077
2078	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2079		return -EINVAL;
2080
2081	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2082		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2083		return -EINVAL;
2084	}
2085
2086	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2087	attr |= DMA_FL_PTE_PRESENT;
2088	if (domain->use_first_level) {
2089		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2090		if (prot & DMA_PTE_WRITE)
2091			attr |= DMA_FL_PTE_DIRTY;
2092	}
2093
2094	domain->has_mappings = true;
2095
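	/*
	 * pteval is the template PTE for the first page: the physical address
	 * plus the attribute bits. It is advanced by lvl_pages * VTD_PAGE_SIZE
	 * after each PTE is written below.
	 */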
2096	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2097
2098	while (nr_pages > 0) {
2099		uint64_t tmp;
2100
2101		if (!pte) {
2102			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2103					phys_pfn, nr_pages);
2104
2105			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2106					     gfp);
2107			if (!pte)
2108				return -ENOMEM;
2109			first_pte = pte;
2110
2111			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2112
			/* It is a large page */
2114			if (largepage_lvl > 1) {
2115				unsigned long end_pfn;
2116				unsigned long pages_to_remove;
2117
2118				pteval |= DMA_PTE_LARGE_PAGE;
2119				pages_to_remove = min_t(unsigned long, nr_pages,
2120							nr_pte_to_next_page(pte) * lvl_pages);
2121				end_pfn = iov_pfn + pages_to_remove - 1;
2122				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2123			} else {
2124				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2125			}
2126
2127		}
		/* We don't need a lock here; nobody else
		 * touches this IOVA range.
2130		 */
2131		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2132		if (tmp) {
2133			static int dumps = 5;
2134			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2135				iov_pfn, tmp, (unsigned long long)pteval);
2136			if (dumps) {
2137				dumps--;
2138				debug_dma_dump_mappings(NULL);
2139			}
2140			WARN_ON(1);
2141		}
2142
2143		nr_pages -= lvl_pages;
2144		iov_pfn += lvl_pages;
2145		phys_pfn += lvl_pages;
2146		pteval += lvl_pages * VTD_PAGE_SIZE;
2147
2148		/* If the next PTE would be the first in a new page, then we
2149		 * need to flush the cache on the entries we've just written.
2150		 * And then we'll need to recalculate 'pte', so clear it and
2151		 * let it get set again in the if (!pte) block above.
2152		 *
2153		 * If we're done (!nr_pages) we need to flush the cache too.
2154		 *
2155		 * Also if we've been setting superpages, we may need to
2156		 * recalculate 'pte' and switch back to smaller pages for the
2157		 * end of the mapping, if the trailing size is not enough to
2158		 * use another superpage (i.e. nr_pages < lvl_pages).
2159		 */
2160		pte++;
2161		if (!nr_pages || first_pte_in_page(pte) ||
2162		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2163			domain_flush_cache(domain, first_pte,
2164					   (void *)pte - (void *)first_pte);
2165			pte = NULL;
2166		}
2167	}
2168
2169	return 0;
2170}
2171
2172static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2173{
2174	struct intel_iommu *iommu = info->iommu;
2175	struct context_entry *context;
2176	u16 did_old;
2177
2178	spin_lock(&iommu->lock);
2179	context = iommu_context_addr(iommu, bus, devfn, 0);
2180	if (!context) {
2181		spin_unlock(&iommu->lock);
2182		return;
2183	}
2184
2185	did_old = context_domain_id(context);
2186
2187	context_clear_entry(context);
2188	__iommu_flush_cache(iommu, context, sizeof(*context));
2189	spin_unlock(&iommu->lock);
2190	iommu->flush.flush_context(iommu,
2191				   did_old,
2192				   (((u16)bus) << 8) | devfn,
2193				   DMA_CCMD_MASK_NOBIT,
2194				   DMA_CCMD_DEVICE_INVL);
2195
2196	iommu->flush.flush_iotlb(iommu,
2197				 did_old,
2198				 0,
2199				 0,
2200				 DMA_TLB_DSI_FLUSH);
2201
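	/*
	 * Also invalidate the whole device TLB: address 0 with the maximum
	 * mask width covers the entire address space.
	 */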
2202	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2203}
2204
2205static int domain_setup_first_level(struct intel_iommu *iommu,
2206				    struct dmar_domain *domain,
2207				    struct device *dev,
2208				    u32 pasid)
2209{
2210	struct dma_pte *pgd = domain->pgd;
2211	int agaw, level;
2212	int flags = 0;
2213
2214	/*
	 * Skip the top levels of the page table when this IOMMU supports
	 * a smaller AGAW than the domain's default. Unnecessary for
	 * pass-through mode.
2217	 */
2218	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219		pgd = phys_to_virt(dma_pte_addr(pgd));
2220		if (!dma_pte_present(pgd))
2221			return -ENOMEM;
2222	}
2223
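	/*
	 * First-stage translation only supports 4-level (48-bit) and
	 * 5-level (57-bit) page tables.
	 */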
2224	level = agaw_to_level(agaw);
2225	if (level != 4 && level != 5)
2226		return -EINVAL;
2227
2228	if (level == 5)
2229		flags |= PASID_FLAG_FL5LP;
2230
2231	if (domain->force_snooping)
2232		flags |= PASID_FLAG_PAGE_SNOOP;
2233
2234	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2235					     domain_id_iommu(domain, iommu),
2236					     flags);
2237}
2238
2239static bool dev_is_real_dma_subdevice(struct device *dev)
2240{
2241	return dev && dev_is_pci(dev) &&
2242	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2243}
2244
2245static int iommu_domain_identity_map(struct dmar_domain *domain,
2246				     unsigned long first_vpfn,
2247				     unsigned long last_vpfn)
2248{
2249	/*
	 * The RMRR range might overlap with the physical memory range;
	 * clear any existing mapping first.
2252	 */
2253	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2254
2255	return __domain_mapping(domain, first_vpfn,
2256				first_vpfn, last_vpfn - first_vpfn + 1,
2257				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2258}
2259
2260static int md_domain_init(struct dmar_domain *domain, int guest_width);
2261
2262static int __init si_domain_init(int hw)
2263{
2264	struct dmar_rmrr_unit *rmrr;
2265	struct device *dev;
2266	int i, nid, ret;
2267
2268	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2269	if (!si_domain)
2270		return -EFAULT;
2271
2272	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273		domain_exit(si_domain);
2274		si_domain = NULL;
2275		return -EFAULT;
2276	}
2277
2278	if (hw)
2279		return 0;
2280
2281	for_each_online_node(nid) {
2282		unsigned long start_pfn, end_pfn;
2283		int i;
2284
2285		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286			ret = iommu_domain_identity_map(si_domain,
2287					mm_to_dma_pfn_start(start_pfn),
2288					mm_to_dma_pfn_end(end_pfn));
2289			if (ret)
2290				return ret;
2291		}
2292	}
2293
2294	/*
	 * Identity map the RMRRs so that devices with RMRRs can also use
2296	 * the si_domain.
2297	 */
2298	for_each_rmrr_units(rmrr) {
2299		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2300					  i, dev) {
2301			unsigned long long start = rmrr->base_address;
2302			unsigned long long end = rmrr->end_address;
2303
2304			if (WARN_ON(end < start ||
2305				    end >> agaw_to_width(si_domain->agaw)))
2306				continue;
2307
2308			ret = iommu_domain_identity_map(si_domain,
2309					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2310					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2311			if (ret)
2312				return ret;
2313		}
2314	}
2315
2316	return 0;
2317}
2318
2319static int dmar_domain_attach_device(struct dmar_domain *domain,
2320				     struct device *dev)
2321{
2322	struct device_domain_info *info = dev_iommu_priv_get(dev);
2323	struct intel_iommu *iommu = info->iommu;
2324	unsigned long flags;
2325	int ret;
2326
2327	ret = domain_attach_iommu(domain, iommu);
2328	if (ret)
2329		return ret;
2330	info->domain = domain;
2331	spin_lock_irqsave(&domain->lock, flags);
2332	list_add(&info->link, &domain->devices);
2333	spin_unlock_irqrestore(&domain->lock, flags);
2334
2335	if (dev_is_real_dma_subdevice(dev))
2336		return 0;
2337
2338	if (!sm_supported(iommu))
2339		ret = domain_context_mapping(domain, dev);
2340	else if (hw_pass_through && domain_type_is_si(domain))
2341		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342	else if (domain->use_first_level)
2343		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2344	else
2345		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2346
2347	if (ret) {
2348		device_block_translation(dev);
2349		return ret;
2350	}
2351
2352	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2353		iommu_enable_pci_caps(info);
2354
2355	return 0;
2356}
2357
2358/**
2359 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2361 * @dev: device handle
2362 *
2363 * We assume that PCI USB devices with RMRRs have them largely
2364 * for historical reasons and that the RMRR space is not actively used post
2365 * boot.  This exclusion may change if vendors begin to abuse it.
2366 *
2367 * The same exception is made for graphics devices, with the requirement that
2368 * any use of the RMRR regions will be torn down before assigning the device
2369 * to a guest.
2370 *
2371 * Return: true if the RMRR is relaxable, false otherwise
2372 */
2373static bool device_rmrr_is_relaxable(struct device *dev)
2374{
2375	struct pci_dev *pdev;
2376
2377	if (!dev_is_pci(dev))
2378		return false;
2379
2380	pdev = to_pci_dev(dev);
2381	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2382		return true;
2383	else
2384		return false;
2385}
2386
2387/*
2388 * Return the required default domain type for a specific device.
2389 *
 * @dev: the device in query
 *
 * Returns:
 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2396 *  - 0: both identity and dynamic domains work for this device
2397 */
2398static int device_def_domain_type(struct device *dev)
2399{
2400	if (dev_is_pci(dev)) {
2401		struct pci_dev *pdev = to_pci_dev(dev);
2402
2403		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404			return IOMMU_DOMAIN_IDENTITY;
2405
2406		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407			return IOMMU_DOMAIN_IDENTITY;
2408	}
2409
2410	return 0;
2411}
2412
2413static void intel_iommu_init_qi(struct intel_iommu *iommu)
2414{
2415	/*
	 * Start from a sane IOMMU hardware state.
	 * If queued invalidation has already been initialized by us
	 * (for example, while enabling interrupt remapping), then
	 * things are already rolling from a sane state.
2420	 */
2421	if (!iommu->qi) {
2422		/*
2423		 * Clear any previous faults.
2424		 */
2425		dmar_fault(-1, iommu);
2426		/*
2427		 * Disable queued invalidation if supported and already enabled
2428		 * before OS handover.
2429		 */
2430		dmar_disable_qi(iommu);
2431	}
2432
2433	if (dmar_enable_qi(iommu)) {
2434		/*
		 * Queued invalidation could not be enabled; fall back to
		 * register-based invalidation.
2436		 */
2437		iommu->flush.flush_context = __iommu_flush_context;
2438		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439		pr_info("%s: Using Register based invalidation\n",
2440			iommu->name);
2441	} else {
2442		iommu->flush.flush_context = qi_flush_context;
2443		iommu->flush.flush_iotlb = qi_flush_iotlb;
2444		pr_info("%s: Using Queued invalidation\n", iommu->name);
2445	}
2446}
2447
2448static int copy_context_table(struct intel_iommu *iommu,
2449			      struct root_entry *old_re,
2450			      struct context_entry **tbl,
2451			      int bus, bool ext)
2452{
2453	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2454	struct context_entry *new_ce = NULL, ce;
2455	struct context_entry *old_ce = NULL;
2456	struct root_entry re;
2457	phys_addr_t old_ce_phys;
2458
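	/*
	 * In extended mode the old context entries are twice as large, so
	 * each bus needs two tables: the lower table (LCTP) covers devfn
	 * 0x00-0x7f and the upper table (UCTP) covers devfn 0x80-0xff.
	 */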
2459	tbl_idx = ext ? bus * 2 : bus;
2460	memcpy(&re, old_re, sizeof(re));
2461
2462	for (devfn = 0; devfn < 256; devfn++) {
2463		/* First calculate the correct index */
2464		idx = (ext ? devfn * 2 : devfn) % 256;
2465
2466		if (idx == 0) {
2467			/* First save what we may have and clean up */
2468			if (new_ce) {
2469				tbl[tbl_idx] = new_ce;
2470				__iommu_flush_cache(iommu, new_ce,
2471						    VTD_PAGE_SIZE);
2472				pos = 1;
2473			}
2474
2475			if (old_ce)
2476				memunmap(old_ce);
2477
2478			ret = 0;
2479			if (devfn < 0x80)
2480				old_ce_phys = root_entry_lctp(&re);
2481			else
2482				old_ce_phys = root_entry_uctp(&re);
2483
2484			if (!old_ce_phys) {
2485				if (ext && devfn == 0) {
2486					/* No LCTP, try UCTP */
2487					devfn = 0x7f;
2488					continue;
2489				} else {
2490					goto out;
2491				}
2492			}
2493
2494			ret = -ENOMEM;
2495			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2496					MEMREMAP_WB);
2497			if (!old_ce)
2498				goto out;
2499
2500			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2501			if (!new_ce)
2502				goto out_unmap;
2503
2504			ret = 0;
2505		}
2506
2507		/* Now copy the context entry */
2508		memcpy(&ce, old_ce + idx, sizeof(ce));
2509
2510		if (!context_present(&ce))
2511			continue;
2512
2513		did = context_domain_id(&ce);
2514		if (did >= 0 && did < cap_ndoms(iommu->cap))
2515			set_bit(did, iommu->domain_ids);
2516
2517		set_context_copied(iommu, bus, devfn);
2518		new_ce[idx] = ce;
2519	}
2520
2521	tbl[tbl_idx + pos] = new_ce;
2522
2523	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2524
2525out_unmap:
2526	memunmap(old_ce);
2527
2528out:
2529	return ret;
2530}
2531
2532static int copy_translation_tables(struct intel_iommu *iommu)
2533{
2534	struct context_entry **ctxt_tbls;
2535	struct root_entry *old_rt;
2536	phys_addr_t old_rt_phys;
2537	int ctxt_table_entries;
2538	u64 rtaddr_reg;
2539	int bus, ret;
2540	bool new_ext, ext;
2541
2542	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544	new_ext    = !!sm_supported(iommu);
2545
2546	/*
2547	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means opening a window for data
2549	 * corruption. So bail out and don't copy anything if we would
2550	 * have to change the bit.
2551	 */
2552	if (new_ext != ext)
2553		return -EINVAL;
2554
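	/*
	 * One bit per possible source-id (bus/devfn pair), 64K in total, to
	 * track which context entries were copied from the old kernel.
	 */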
2555	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556	if (!iommu->copied_tables)
2557		return -ENOMEM;
2558
2559	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2560	if (!old_rt_phys)
2561		return -EINVAL;
2562
2563	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2564	if (!old_rt)
2565		return -ENOMEM;
2566
2567	/* This is too big for the stack - allocate it from slab */
2568	ctxt_table_entries = ext ? 512 : 256;
2569	ret = -ENOMEM;
2570	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2571	if (!ctxt_tbls)
2572		goto out_unmap;
2573
2574	for (bus = 0; bus < 256; bus++) {
2575		ret = copy_context_table(iommu, &old_rt[bus],
2576					 ctxt_tbls, bus, ext);
2577		if (ret) {
2578			pr_err("%s: Failed to copy context table for bus %d\n",
2579				iommu->name, bus);
2580			continue;
2581		}
2582	}
2583
2584	spin_lock(&iommu->lock);
2585
2586	/* Context tables are copied, now write them to the root_entry table */
2587	for (bus = 0; bus < 256; bus++) {
2588		int idx = ext ? bus * 2 : bus;
2589		u64 val;
2590
2591		if (ctxt_tbls[idx]) {
2592			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2593			iommu->root_entry[bus].lo = val;
2594		}
2595
2596		if (!ext || !ctxt_tbls[idx + 1])
2597			continue;
2598
2599		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2600		iommu->root_entry[bus].hi = val;
2601	}
2602
2603	spin_unlock(&iommu->lock);
2604
2605	kfree(ctxt_tbls);
2606
2607	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2608
2609	ret = 0;
2610
2611out_unmap:
2612	memunmap(old_rt);
2613
2614	return ret;
2615}
2616
2617static int __init init_dmars(void)
2618{
2619	struct dmar_drhd_unit *drhd;
2620	struct intel_iommu *iommu;
2621	int ret;
2622
2623	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2624	if (ret)
2625		goto free_iommu;
2626
2627	for_each_iommu(iommu, drhd) {
2628		if (drhd->ignored) {
2629			iommu_disable_translation(iommu);
2630			continue;
2631		}
2632
2633		/*
		 * Find the max PASID size of all IOMMUs in the system.
		 * We need to ensure the system PASID table is no bigger
		 * than the smallest supported size.
2637		 */
2638		if (pasid_supported(iommu)) {
2639			u32 temp = 2 << ecap_pss(iommu->ecap);
2640
2641			intel_pasid_max_id = min_t(u32, temp,
2642						   intel_pasid_max_id);
2643		}
2644
2645		intel_iommu_init_qi(iommu);
2646
2647		ret = iommu_init_domains(iommu);
2648		if (ret)
2649			goto free_iommu;
2650
2651		init_translation_status(iommu);
2652
2653		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2654			iommu_disable_translation(iommu);
2655			clear_translation_pre_enabled(iommu);
2656			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2657				iommu->name);
2658		}
2659
2660		/*
2661		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Needs to be split later.
2664		 */
2665		ret = iommu_alloc_root_entry(iommu);
2666		if (ret)
2667			goto free_iommu;
2668
2669		if (translation_pre_enabled(iommu)) {
2670			pr_info("Translation already enabled - trying to copy translation structures\n");
2671
2672			ret = copy_translation_tables(iommu);
2673			if (ret) {
2674				/*
2675				 * We found the IOMMU with translation
2676				 * enabled - but failed to copy over the
2677				 * old root-entry table. Try to proceed
2678				 * by disabling translation now and
2679				 * allocating a clean root-entry table.
2680				 * This might cause DMAR faults, but
2681				 * probably the dump will still succeed.
2682				 */
2683				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2684				       iommu->name);
2685				iommu_disable_translation(iommu);
2686				clear_translation_pre_enabled(iommu);
2687			} else {
2688				pr_info("Copied translation tables from previous kernel for %s\n",
2689					iommu->name);
2690			}
2691		}
2692
2693		if (!ecap_pass_through(iommu->ecap))
2694			hw_pass_through = 0;
2695		intel_svm_check(iommu);
2696	}
2697
2698	/*
2699	 * Now that qi is enabled on all iommus, set the root entry and flush
2700	 * caches. This is required on some Intel X58 chipsets, otherwise the
2701	 * flush_context function will loop forever and the boot hangs.
2702	 */
2703	for_each_active_iommu(iommu, drhd) {
2704		iommu_flush_write_buffer(iommu);
2705		iommu_set_root_entry(iommu);
2706	}
2707
2708	if (!dmar_map_gfx)
2709		iommu_identity_mapping |= IDENTMAP_GFX;
2710
2711	check_tylersburg_isoch();
2712
2713	ret = si_domain_init(hw_pass_through);
2714	if (ret)
2715		goto free_iommu;
2716
2717	/*
2718	 * for each drhd
2719	 *   enable fault log
2720	 *   global invalidate context cache
2721	 *   global invalidate iotlb
2722	 *   enable translation
2723	 */
2724	for_each_iommu(iommu, drhd) {
2725		if (drhd->ignored) {
2726			/*
2727			 * we always have to disable PMRs or DMA may fail on
2728			 * this device
2729			 */
2730			if (force_on)
2731				iommu_disable_protect_mem_regions(iommu);
2732			continue;
2733		}
2734
2735		iommu_flush_write_buffer(iommu);
2736
2737#ifdef CONFIG_INTEL_IOMMU_SVM
2738		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2739			/*
			 * Calling dmar_alloc_hwirq() with dmar_global_lock
			 * held could cause a lock race, so drop the lock
			 * around intel_svm_enable_prq().
2742			 */
2743			up_write(&dmar_global_lock);
2744			ret = intel_svm_enable_prq(iommu);
2745			down_write(&dmar_global_lock);
2746			if (ret)
2747				goto free_iommu;
2748		}
2749#endif
2750		ret = dmar_set_interrupt(iommu);
2751		if (ret)
2752			goto free_iommu;
2753	}
2754
2755	return 0;
2756
2757free_iommu:
2758	for_each_active_iommu(iommu, drhd) {
2759		disable_dmar_iommu(iommu);
2760		free_dmar_iommu(iommu);
2761	}
2762	if (si_domain) {
2763		domain_exit(si_domain);
2764		si_domain = NULL;
2765	}
2766
2767	return ret;
2768}
2769
2770static void __init init_no_remapping_devices(void)
2771{
2772	struct dmar_drhd_unit *drhd;
2773	struct device *dev;
2774	int i;
2775
2776	for_each_drhd_unit(drhd) {
2777		if (!drhd->include_all) {
2778			for_each_active_dev_scope(drhd->devices,
2779						  drhd->devices_cnt, i, dev)
2780				break;
2781			/* ignore DMAR unit if no devices exist */
2782			if (i == drhd->devices_cnt)
2783				drhd->ignored = 1;
2784		}
2785	}
2786
2787	for_each_active_drhd_unit(drhd) {
2788		if (drhd->include_all)
2789			continue;
2790
2791		for_each_active_dev_scope(drhd->devices,
2792					  drhd->devices_cnt, i, dev)
2793			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2794				break;
2795		if (i < drhd->devices_cnt)
2796			continue;
2797
		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_dedicated flag, as appropriate */
2800		drhd->gfx_dedicated = 1;
2801		if (!dmar_map_gfx)
2802			drhd->ignored = 1;
2803	}
2804}
2805
2806#ifdef CONFIG_SUSPEND
2807static int init_iommu_hw(void)
2808{
2809	struct dmar_drhd_unit *drhd;
2810	struct intel_iommu *iommu = NULL;
2811	int ret;
2812
2813	for_each_active_iommu(iommu, drhd) {
2814		if (iommu->qi) {
2815			ret = dmar_reenable_qi(iommu);
2816			if (ret)
2817				return ret;
2818		}
2819	}
2820
2821	for_each_iommu(iommu, drhd) {
2822		if (drhd->ignored) {
2823			/*
2824			 * we always have to disable PMRs or DMA may fail on
2825			 * this device
2826			 */
2827			if (force_on)
2828				iommu_disable_protect_mem_regions(iommu);
2829			continue;
2830		}
2831
2832		iommu_flush_write_buffer(iommu);
2833		iommu_set_root_entry(iommu);
2834		iommu_enable_translation(iommu);
2835		iommu_disable_protect_mem_regions(iommu);
2836	}
2837
2838	return 0;
2839}
2840
2841static void iommu_flush_all(void)
2842{
2843	struct dmar_drhd_unit *drhd;
2844	struct intel_iommu *iommu;
2845
2846	for_each_active_iommu(iommu, drhd) {
2847		iommu->flush.flush_context(iommu, 0, 0, 0,
2848					   DMA_CCMD_GLOBAL_INVL);
2849		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850					 DMA_TLB_GLOBAL_FLUSH);
2851	}
2852}
2853
2854static int iommu_suspend(void)
2855{
2856	struct dmar_drhd_unit *drhd;
2857	struct intel_iommu *iommu = NULL;
2858	unsigned long flag;
2859
2860	iommu_flush_all();
2861
2862	for_each_active_iommu(iommu, drhd) {
2863		iommu_disable_translation(iommu);
2864
2865		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2866
2867		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868			readl(iommu->reg + DMAR_FECTL_REG);
2869		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870			readl(iommu->reg + DMAR_FEDATA_REG);
2871		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872			readl(iommu->reg + DMAR_FEADDR_REG);
2873		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874			readl(iommu->reg + DMAR_FEUADDR_REG);
2875
2876		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2877	}
2878	return 0;
2879}
2880
2881static void iommu_resume(void)
2882{
2883	struct dmar_drhd_unit *drhd;
2884	struct intel_iommu *iommu = NULL;
2885	unsigned long flag;
2886
2887	if (init_iommu_hw()) {
2888		if (force_on)
2889			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2890		else
2891			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2892		return;
2893	}
2894
2895	for_each_active_iommu(iommu, drhd) {
2896
2897		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2898
2899		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2900			iommu->reg + DMAR_FECTL_REG);
2901		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902			iommu->reg + DMAR_FEDATA_REG);
2903		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904			iommu->reg + DMAR_FEADDR_REG);
2905		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906			iommu->reg + DMAR_FEUADDR_REG);
2907
2908		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2909	}
2910}
2911
2912static struct syscore_ops iommu_syscore_ops = {
2913	.resume		= iommu_resume,
2914	.suspend	= iommu_suspend,
2915};
2916
2917static void __init init_iommu_pm_ops(void)
2918{
2919	register_syscore_ops(&iommu_syscore_ops);
2920}
2921
2922#else
2923static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2925
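/*
 * An RMRR must describe a page-aligned, non-empty range. The end address is
 * inclusive, so end_address + 1 (rather than end_address itself) must be
 * page aligned; e.g. [0xd0000000, 0xd07fffff] is valid.
 */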
2926static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2927{
2928	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930	    rmrr->end_address <= rmrr->base_address ||
2931	    arch_rmrr_sanity_check(rmrr))
2932		return -EINVAL;
2933
2934	return 0;
2935}
2936
2937int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2938{
2939	struct acpi_dmar_reserved_memory *rmrr;
2940	struct dmar_rmrr_unit *rmrru;
2941
2942	rmrr = (struct acpi_dmar_reserved_memory *)header;
2943	if (rmrr_sanity_check(rmrr)) {
2944		pr_warn(FW_BUG
2945			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947			   rmrr->base_address, rmrr->end_address,
2948			   dmi_get_system_info(DMI_BIOS_VENDOR),
2949			   dmi_get_system_info(DMI_BIOS_VERSION),
2950			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2951		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2952	}
2953
2954	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2955	if (!rmrru)
2956		goto out;
2957
2958	rmrru->hdr = header;
2959
2960	rmrru->base_address = rmrr->base_address;
2961	rmrru->end_address = rmrr->end_address;
2962
2963	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2964				((void *)rmrr) + rmrr->header.length,
2965				&rmrru->devices_cnt);
2966	if (rmrru->devices_cnt && rmrru->devices == NULL)
2967		goto free_rmrru;
2968
2969	list_add(&rmrru->list, &dmar_rmrr_units);
2970
2971	return 0;
2972free_rmrru:
2973	kfree(rmrru);
2974out:
2975	return -ENOMEM;
2976}
2977
2978static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2979{
2980	struct dmar_atsr_unit *atsru;
2981	struct acpi_dmar_atsr *tmp;
2982
2983	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2984				dmar_rcu_check()) {
2985		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986		if (atsr->segment != tmp->segment)
2987			continue;
2988		if (atsr->header.length != tmp->header.length)
2989			continue;
2990		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2991			return atsru;
2992	}
2993
2994	return NULL;
2995}
2996
2997int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2998{
2999	struct acpi_dmar_atsr *atsr;
3000	struct dmar_atsr_unit *atsru;
3001
3002	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3003		return 0;
3004
3005	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006	atsru = dmar_find_atsr(atsr);
3007	if (atsru)
3008		return 0;
3009
3010	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3011	if (!atsru)
3012		return -ENOMEM;
3013
3014	/*
3015	 * If memory is allocated from slab by ACPI _DSM method, we need to
3016	 * copy the memory content because the memory buffer will be freed
3017	 * on return.
3018	 */
3019	atsru->hdr = (void *)(atsru + 1);
3020	memcpy(atsru->hdr, hdr, hdr->length);
3021	atsru->include_all = atsr->flags & 0x1;
3022	if (!atsru->include_all) {
3023		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3024				(void *)atsr + atsr->header.length,
3025				&atsru->devices_cnt);
3026		if (atsru->devices_cnt && atsru->devices == NULL) {
3027			kfree(atsru);
3028			return -ENOMEM;
3029		}
3030	}
3031
3032	list_add_rcu(&atsru->list, &dmar_atsr_units);
3033
3034	return 0;
3035}
3036
3037static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3038{
3039	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3040	kfree(atsru);
3041}
3042
3043int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3044{
3045	struct acpi_dmar_atsr *atsr;
3046	struct dmar_atsr_unit *atsru;
3047
3048	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049	atsru = dmar_find_atsr(atsr);
3050	if (atsru) {
3051		list_del_rcu(&atsru->list);
3052		synchronize_rcu();
3053		intel_iommu_free_atsr(atsru);
3054	}
3055
3056	return 0;
3057}
3058
3059int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3060{
3061	int i;
3062	struct device *dev;
3063	struct acpi_dmar_atsr *atsr;
3064	struct dmar_atsr_unit *atsru;
3065
3066	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067	atsru = dmar_find_atsr(atsr);
3068	if (!atsru)
3069		return 0;
3070
3071	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3073					  i, dev)
3074			return -EBUSY;
3075	}
3076
3077	return 0;
3078}
3079
3080static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3081{
3082	struct dmar_satc_unit *satcu;
3083	struct acpi_dmar_satc *tmp;
3084
3085	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3086				dmar_rcu_check()) {
3087		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088		if (satc->segment != tmp->segment)
3089			continue;
3090		if (satc->header.length != tmp->header.length)
3091			continue;
3092		if (memcmp(satc, tmp, satc->header.length) == 0)
3093			return satcu;
3094	}
3095
3096	return NULL;
3097}
3098
3099int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3100{
3101	struct acpi_dmar_satc *satc;
3102	struct dmar_satc_unit *satcu;
3103
3104	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3105		return 0;
3106
3107	satc = container_of(hdr, struct acpi_dmar_satc, header);
3108	satcu = dmar_find_satc(satc);
3109	if (satcu)
3110		return 0;
3111
3112	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3113	if (!satcu)
3114		return -ENOMEM;
3115
3116	satcu->hdr = (void *)(satcu + 1);
3117	memcpy(satcu->hdr, hdr, hdr->length);
3118	satcu->atc_required = satc->flags & 0x1;
3119	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3120					      (void *)satc + satc->header.length,
3121					      &satcu->devices_cnt);
3122	if (satcu->devices_cnt && !satcu->devices) {
3123		kfree(satcu);
3124		return -ENOMEM;
3125	}
3126	list_add_rcu(&satcu->list, &dmar_satc_units);
3127
3128	return 0;
3129}
3130
3131static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3132{
3133	int sp, ret;
3134	struct intel_iommu *iommu = dmaru->iommu;
3135
3136	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3137	if (ret)
3138		goto out;
3139
3140	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3141		pr_warn("%s: Doesn't support hardware pass through.\n",
3142			iommu->name);
3143		return -ENXIO;
3144	}
3145
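	/*
	 * sp is the largest superpage level assumed so far (in the SLLPS
	 * capability field bit 0 is 2MiB, bit 1 is 1GiB); the hot-added
	 * IOMMU must support it as well.
	 */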
3146	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3147	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3148		pr_warn("%s: Doesn't support large page.\n",
3149			iommu->name);
3150		return -ENXIO;
3151	}
3152
3153	/*
3154	 * Disable translation if already enabled prior to OS handover.
3155	 */
3156	if (iommu->gcmd & DMA_GCMD_TE)
3157		iommu_disable_translation(iommu);
3158
3159	ret = iommu_init_domains(iommu);
3160	if (ret == 0)
3161		ret = iommu_alloc_root_entry(iommu);
3162	if (ret)
3163		goto out;
3164
3165	intel_svm_check(iommu);
3166
3167	if (dmaru->ignored) {
3168		/*
3169		 * we always have to disable PMRs or DMA may fail on this device
3170		 */
3171		if (force_on)
3172			iommu_disable_protect_mem_regions(iommu);
3173		return 0;
3174	}
3175
3176	intel_iommu_init_qi(iommu);
3177	iommu_flush_write_buffer(iommu);
3178
3179#ifdef CONFIG_INTEL_IOMMU_SVM
3180	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3181		ret = intel_svm_enable_prq(iommu);
3182		if (ret)
3183			goto disable_iommu;
3184	}
3185#endif
3186	ret = dmar_set_interrupt(iommu);
3187	if (ret)
3188		goto disable_iommu;
3189
3190	iommu_set_root_entry(iommu);
3191	iommu_enable_translation(iommu);
3192
3193	iommu_disable_protect_mem_regions(iommu);
3194	return 0;
3195
3196disable_iommu:
3197	disable_dmar_iommu(iommu);
3198out:
3199	free_dmar_iommu(iommu);
3200	return ret;
3201}
3202
3203int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3204{
3205	int ret = 0;
3206	struct intel_iommu *iommu = dmaru->iommu;
3207
3208	if (!intel_iommu_enabled)
3209		return 0;
3210	if (iommu == NULL)
3211		return -EINVAL;
3212
3213	if (insert) {
3214		ret = intel_iommu_add(dmaru);
3215	} else {
3216		disable_dmar_iommu(iommu);
3217		free_dmar_iommu(iommu);
3218	}
3219
3220	return ret;
3221}
3222
3223static void intel_iommu_free_dmars(void)
3224{
3225	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226	struct dmar_atsr_unit *atsru, *atsr_n;
3227	struct dmar_satc_unit *satcu, *satc_n;
3228
3229	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230		list_del(&rmrru->list);
3231		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3232		kfree(rmrru);
3233	}
3234
3235	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236		list_del(&atsru->list);
3237		intel_iommu_free_atsr(atsru);
3238	}
3239	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240		list_del(&satcu->list);
3241		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3242		kfree(satcu);
3243	}
3244}
3245
3246static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3247{
3248	struct dmar_satc_unit *satcu;
3249	struct acpi_dmar_satc *satc;
3250	struct device *tmp;
3251	int i;
3252
3253	dev = pci_physfn(dev);
3254	rcu_read_lock();
3255
3256	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258		if (satc->segment != pci_domain_nr(dev->bus))
3259			continue;
3260		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261			if (to_pci_dev(tmp) == dev)
3262				goto out;
3263	}
3264	satcu = NULL;
3265out:
3266	rcu_read_unlock();
3267	return satcu;
3268}
3269
3270static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3271{
3272	int i, ret = 1;
3273	struct pci_bus *bus;
3274	struct pci_dev *bridge = NULL;
3275	struct device *tmp;
3276	struct acpi_dmar_atsr *atsr;
3277	struct dmar_atsr_unit *atsru;
3278	struct dmar_satc_unit *satcu;
3279
3280	dev = pci_physfn(dev);
3281	satcu = dmar_find_matched_satc_unit(dev);
3282	if (satcu)
3283		/*
		 * This device supports ATS as it is listed in the SATC table.
		 * When the IOMMU is in legacy mode, the hardware enables ATS
		 * automatically for devices that require it, so the OS should
		 * not enable ATS on this device, to avoid duplicate TLB
		 * invalidations.
3289		 */
3290		return !(satcu->atc_required && !sm_supported(iommu));
3291
3292	for (bus = dev->bus; bus; bus = bus->parent) {
3293		bridge = bus->self;
3294		/* If it's an integrated device, allow ATS */
3295		if (!bridge)
3296			return 1;
3297		/* Connected via non-PCIe: no ATS */
3298		if (!pci_is_pcie(bridge) ||
3299		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3300			return 0;
3301		/* If we found the root port, look it up in the ATSR */
3302		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3303			break;
3304	}
3305
3306	rcu_read_lock();
3307	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309		if (atsr->segment != pci_domain_nr(dev->bus))
3310			continue;
3311
3312		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313			if (tmp == &bridge->dev)
3314				goto out;
3315
3316		if (atsru->include_all)
3317			goto out;
3318	}
3319	ret = 0;
3320out:
3321	rcu_read_unlock();
3322
3323	return ret;
3324}
3325
3326int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3327{
3328	int ret;
3329	struct dmar_rmrr_unit *rmrru;
3330	struct dmar_atsr_unit *atsru;
3331	struct dmar_satc_unit *satcu;
3332	struct acpi_dmar_atsr *atsr;
3333	struct acpi_dmar_reserved_memory *rmrr;
3334	struct acpi_dmar_satc *satc;
3335
3336	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3337		return 0;
3338
3339	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340		rmrr = container_of(rmrru->hdr,
3341				    struct acpi_dmar_reserved_memory, header);
3342		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3344				((void *)rmrr) + rmrr->header.length,
3345				rmrr->segment, rmrru->devices,
3346				rmrru->devices_cnt);
3347			if (ret < 0)
3348				return ret;
3349		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350			dmar_remove_dev_scope(info, rmrr->segment,
3351				rmrru->devices, rmrru->devices_cnt);
3352		}
3353	}
3354
3355	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356		if (atsru->include_all)
3357			continue;
3358
3359		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3362					(void *)atsr + atsr->header.length,
3363					atsr->segment, atsru->devices,
3364					atsru->devices_cnt);
3365			if (ret > 0)
3366				break;
3367			else if (ret < 0)
3368				return ret;
3369		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370			if (dmar_remove_dev_scope(info, atsr->segment,
3371					atsru->devices, atsru->devices_cnt))
3372				break;
3373		}
3374	}
3375	list_for_each_entry(satcu, &dmar_satc_units, list) {
3376		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3379					(void *)satc + satc->header.length,
3380					satc->segment, satcu->devices,
3381					satcu->devices_cnt);
3382			if (ret > 0)
3383				break;
3384			else if (ret < 0)
3385				return ret;
3386		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387			if (dmar_remove_dev_scope(info, satc->segment,
3388					satcu->devices, satcu->devices_cnt))
3389				break;
3390		}
3391	}
3392
3393	return 0;
3394}
3395
3396static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397				       unsigned long val, void *v)
3398{
3399	struct memory_notify *mhp = v;
3400	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3401	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3402			mhp->nr_pages - 1);
3403
3404	switch (val) {
3405	case MEM_GOING_ONLINE:
3406		if (iommu_domain_identity_map(si_domain,
3407					      start_vpfn, last_vpfn)) {
3408			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409				start_vpfn, last_vpfn);
3410			return NOTIFY_BAD;
3411		}
3412		break;
3413
3414	case MEM_OFFLINE:
3415	case MEM_CANCEL_ONLINE:
3416		{
3417			struct dmar_drhd_unit *drhd;
3418			struct intel_iommu *iommu;
3419			LIST_HEAD(freelist);
3420
3421			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3422
3423			rcu_read_lock();
3424			for_each_active_iommu(iommu, drhd)
3425				iommu_flush_iotlb_psi(iommu, si_domain,
3426					start_vpfn, mhp->nr_pages,
3427					list_empty(&freelist), 0);
3428			rcu_read_unlock();
3429			put_pages_list(&freelist);
3430		}
3431		break;
3432	}
3433
3434	return NOTIFY_OK;
3435}
3436
3437static struct notifier_block intel_iommu_memory_nb = {
3438	.notifier_call = intel_iommu_memory_notifier,
3439	.priority = 0
3440};
3441
3442static void intel_disable_iommus(void)
3443{
3444	struct intel_iommu *iommu = NULL;
3445	struct dmar_drhd_unit *drhd;
3446
3447	for_each_iommu(iommu, drhd)
3448		iommu_disable_translation(iommu);
3449}
3450
3451void intel_iommu_shutdown(void)
3452{
3453	struct dmar_drhd_unit *drhd;
3454	struct intel_iommu *iommu = NULL;
3455
3456	if (no_iommu || dmar_disabled)
3457		return;
3458
3459	down_write(&dmar_global_lock);
3460
3461	/* Disable PMRs explicitly here. */
3462	for_each_iommu(iommu, drhd)
3463		iommu_disable_protect_mem_regions(iommu);
3464
3465	/* Make sure the IOMMUs are switched off */
3466	intel_disable_iommus();
3467
3468	up_write(&dmar_global_lock);
3469}
3470
3471static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3472{
3473	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3474
3475	return container_of(iommu_dev, struct intel_iommu, iommu);
3476}
3477
3478static ssize_t version_show(struct device *dev,
3479			    struct device_attribute *attr, char *buf)
3480{
3481	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3483	return sysfs_emit(buf, "%d:%d\n",
3484			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3485}
3486static DEVICE_ATTR_RO(version);
3487
3488static ssize_t address_show(struct device *dev,
3489			    struct device_attribute *attr, char *buf)
3490{
3491	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3493}
3494static DEVICE_ATTR_RO(address);
3495
3496static ssize_t cap_show(struct device *dev,
3497			struct device_attribute *attr, char *buf)
3498{
3499	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500	return sysfs_emit(buf, "%llx\n", iommu->cap);
3501}
3502static DEVICE_ATTR_RO(cap);
3503
3504static ssize_t ecap_show(struct device *dev,
3505			 struct device_attribute *attr, char *buf)
3506{
3507	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3509}
3510static DEVICE_ATTR_RO(ecap);
3511
3512static ssize_t domains_supported_show(struct device *dev,
3513				      struct device_attribute *attr, char *buf)
3514{
3515	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3517}
3518static DEVICE_ATTR_RO(domains_supported);
3519
3520static ssize_t domains_used_show(struct device *dev,
3521				 struct device_attribute *attr, char *buf)
3522{
3523	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524	return sysfs_emit(buf, "%d\n",
3525			  bitmap_weight(iommu->domain_ids,
3526					cap_ndoms(iommu->cap)));
3527}
3528static DEVICE_ATTR_RO(domains_used);
3529
3530static struct attribute *intel_iommu_attrs[] = {
3531	&dev_attr_version.attr,
3532	&dev_attr_address.attr,
3533	&dev_attr_cap.attr,
3534	&dev_attr_ecap.attr,
3535	&dev_attr_domains_supported.attr,
3536	&dev_attr_domains_used.attr,
3537	NULL,
3538};
3539
3540static struct attribute_group intel_iommu_group = {
3541	.name = "intel-iommu",
3542	.attrs = intel_iommu_attrs,
3543};
3544
3545const struct attribute_group *intel_iommu_groups[] = {
3546	&intel_iommu_group,
3547	NULL,
3548};
3549
3550static bool has_external_pci(void)
3551{
3552	struct pci_dev *pdev = NULL;
3553
3554	for_each_pci_dev(pdev)
3555		if (pdev->external_facing) {
3556			pci_dev_put(pdev);
3557			return true;
3558		}
3559
3560	return false;
3561}
3562
3563static int __init platform_optin_force_iommu(void)
3564{
3565	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3566		return 0;
3567
3568	if (no_iommu || dmar_disabled)
3569		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3570
3571	/*
3572	 * If Intel-IOMMU is disabled by default, we will apply identity
3573	 * map for all devices except those marked as being untrusted.
3574	 */
3575	if (dmar_disabled)
3576		iommu_set_default_passthrough(false);
3577
3578	dmar_disabled = 0;
3579	no_iommu = 0;
3580
3581	return 1;
3582}
3583
3584static int __init probe_acpi_namespace_devices(void)
3585{
3586	struct dmar_drhd_unit *drhd;
3587	/* To avoid a -Wunused-but-set-variable warning. */
3588	struct intel_iommu *iommu __maybe_unused;
3589	struct device *dev;
3590	int i, ret = 0;
3591
3592	for_each_active_iommu(iommu, drhd) {
3593		for_each_active_dev_scope(drhd->devices,
3594					  drhd->devices_cnt, i, dev) {
3595			struct acpi_device_physical_node *pn;
3596			struct acpi_device *adev;
3597
3598			if (dev->bus != &acpi_bus_type)
3599				continue;
3600
3601			adev = to_acpi_device(dev);
3602			mutex_lock(&adev->physical_node_lock);
3603			list_for_each_entry(pn,
3604					    &adev->physical_node_list, node) {
3605				ret = iommu_probe_device(pn->dev);
3606				if (ret)
3607					break;
3608			}
3609			mutex_unlock(&adev->physical_node_lock);
3610
3611			if (ret)
3612				return ret;
3613		}
3614	}
3615
3616	return 0;
3617}
3618
3619static __init int tboot_force_iommu(void)
3620{
3621	if (!tboot_enabled())
3622		return 0;
3623
3624	if (no_iommu || dmar_disabled)
3625		pr_warn("Forcing Intel-IOMMU to enabled\n");
3626
3627	dmar_disabled = 0;
3628	no_iommu = 0;
3629
3630	return 1;
3631}
3632
3633int __init intel_iommu_init(void)
3634{
3635	int ret = -ENODEV;
3636	struct dmar_drhd_unit *drhd;
3637	struct intel_iommu *iommu;
3638
3639	/*
3640	 * Intel IOMMU is required for a TXT/tboot launch or platform
3641	 * opt in, so enforce that.
3642	 */
3643	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3644		    platform_optin_force_iommu();
3645
3646	down_write(&dmar_global_lock);
3647	if (dmar_table_init()) {
3648		if (force_on)
3649			panic("tboot: Failed to initialize DMAR table\n");
3650		goto out_free_dmar;
3651	}
3652
3653	if (dmar_dev_scope_init() < 0) {
3654		if (force_on)
3655			panic("tboot: Failed to initialize DMAR device scope\n");
3656		goto out_free_dmar;
3657	}
3658
3659	up_write(&dmar_global_lock);
3660
3661	/*
3662	 * The bus notifier takes the dmar_global_lock, so lockdep will
3663	 * complain later when we register it under the lock.
3664	 */
3665	dmar_register_bus_notifier();
3666
3667	down_write(&dmar_global_lock);
3668
3669	if (!no_iommu)
3670		intel_iommu_debugfs_init();
3671
3672	if (no_iommu || dmar_disabled) {
3673		/*
		 * We exit the function here to ensure the IOMMUs' remapping and
		 * mempool aren't set up, which means that the IOMMUs' PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * them explicitly here. The PMRs were set up by tboot prior to
3678		 * calling SENTER, but the kernel is expected to reset/tear
3679		 * down the PMRs.
3680		 */
3681		if (intel_iommu_tboot_noforce) {
3682			for_each_iommu(iommu, drhd)
3683				iommu_disable_protect_mem_regions(iommu);
3684		}
3685
3686		/*
3687		 * Make sure the IOMMUs are switched off, even when we
3688		 * boot into a kexec kernel and the previous kernel left
3689		 * them enabled
3690		 */
3691		intel_disable_iommus();
3692		goto out_free_dmar;
3693	}
3694
3695	if (list_empty(&dmar_rmrr_units))
3696		pr_info("No RMRR found\n");
3697
3698	if (list_empty(&dmar_atsr_units))
3699		pr_info("No ATSR found\n");
3700
3701	if (list_empty(&dmar_satc_units))
3702		pr_info("No SATC found\n");
3703
3704	init_no_remapping_devices();
3705
3706	ret = init_dmars();
3707	if (ret) {
3708		if (force_on)
3709			panic("tboot: Failed to initialize DMARs\n");
3710		pr_err("Initialization failed\n");
3711		goto out_free_dmar;
3712	}
3713	up_write(&dmar_global_lock);
3714
3715	init_iommu_pm_ops();
3716
3717	down_read(&dmar_global_lock);
3718	for_each_active_iommu(iommu, drhd) {
3719		/*
3720		 * The flush queue implementation does not perform
3721		 * page-selective invalidations that are required for efficient
3722		 * TLB flushes in virtual environments.  The benefit of batching
3723		 * is likely to be much lower than the overhead of synchronizing
3724		 * the virtual and physical IOMMU page-tables.
3725		 */
3726		if (cap_caching_mode(iommu->cap) &&
3727		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3728			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3729			iommu_set_dma_strict();
3730		}
3731		iommu_device_sysfs_add(&iommu->iommu, NULL,
3732				       intel_iommu_groups,
3733				       "%s", iommu->name);
3734		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3735
3736		iommu_pmu_register(iommu);
3737	}
3738	up_read(&dmar_global_lock);
3739
3740	if (si_domain && !hw_pass_through)
3741		register_memory_notifier(&intel_iommu_memory_nb);
3742
3743	down_read(&dmar_global_lock);
3744	if (probe_acpi_namespace_devices())
3745		pr_warn("ACPI name space devices didn't probe correctly\n");
3746
3747	/* Finally, we enable the DMA remapping hardware. */
3748	for_each_iommu(iommu, drhd) {
3749		if (!drhd->ignored && !translation_pre_enabled(iommu))
3750			iommu_enable_translation(iommu);
3751
3752		iommu_disable_protect_mem_regions(iommu);
3753	}
3754	up_read(&dmar_global_lock);
3755
3756	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3757
3758	intel_iommu_enabled = 1;
3759
3760	return 0;
3761
3762out_free_dmar:
3763	intel_iommu_free_dmars();
3764	up_write(&dmar_global_lock);
3765	return ret;
3766}
3767
3768static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3769{
3770	struct device_domain_info *info = opaque;
3771
3772	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3773	return 0;
3774}
3775
3776/*
3777 * NB - intel-iommu lacks any sort of reference counting for the users of
3778 * dependent devices.  If multiple endpoints have intersecting dependent
3779 * devices, unbinding the driver from any one of them will possibly leave
3780 * the others unable to operate.
3781 */
3782static void domain_context_clear(struct device_domain_info *info)
3783{
	if (!dev_is_pci(info->dev)) {
		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}

	pci_for_each_dma_alias(to_pci_dev(info->dev),
			       &domain_context_clear_one_cb, info);
3789}
3790
3791/*
3792 * Clear the page table pointer in context or pasid table entries so that
3793 * all DMA requests without PASID from the device are blocked. If the page
3794 * table has been set, clean up the data structures.
3795 */
3796void device_block_translation(struct device *dev)
3797{
3798	struct device_domain_info *info = dev_iommu_priv_get(dev);
3799	struct intel_iommu *iommu = info->iommu;
3800	unsigned long flags;
3801
3802	iommu_disable_pci_caps(info);
3803	if (!dev_is_real_dma_subdevice(dev)) {
3804		if (sm_supported(iommu))
3805			intel_pasid_tear_down_entry(iommu, dev,
3806						    IOMMU_NO_PASID, false);
3807		else
3808			domain_context_clear(info);
3809	}
3810
3811	if (!info->domain)
3812		return;
3813
3814	spin_lock_irqsave(&info->domain->lock, flags);
3815	list_del(&info->link);
3816	spin_unlock_irqrestore(&info->domain->lock, flags);
3817
3818	domain_detach_iommu(info->domain, iommu);
3819	info->domain = NULL;
3820}
3821
3822static int md_domain_init(struct dmar_domain *domain, int guest_width)
3823{
3824	int adjust_width;
3825
3826	/* calculate AGAW */
3827	domain->gaw = guest_width;
3828	adjust_width = guestwidth_to_adjustwidth(guest_width);
3829	domain->agaw = width_to_agaw(adjust_width);
3830
3831	domain->iommu_coherency = false;
3832	domain->iommu_superpage = 0;
3833	domain->max_addr = 0;
3834
3835	/* always allocate the top pgd */
3836	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3837	if (!domain->pgd)
3838		return -ENOMEM;
3839	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3840	return 0;
3841}
3842
3843static int blocking_domain_attach_dev(struct iommu_domain *domain,
3844				      struct device *dev)
3845{
3846	device_block_translation(dev);
3847	return 0;
3848}
3849
3850static struct iommu_domain blocking_domain = {
3851	.type = IOMMU_DOMAIN_BLOCKED,
3852	.ops = &(const struct iommu_domain_ops) {
3853		.attach_dev	= blocking_domain_attach_dev,
3854	}
3855};
3856
3857static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3858{
3859	struct dmar_domain *dmar_domain;
3860	struct iommu_domain *domain;
3861
3862	switch (type) {
3863	case IOMMU_DOMAIN_DMA:
3864	case IOMMU_DOMAIN_UNMANAGED:
3865		dmar_domain = alloc_domain(type);
3866		if (!dmar_domain) {
3867			pr_err("Can't allocate dmar_domain\n");
3868			return NULL;
3869		}
3870		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871			pr_err("Domain initialization failed\n");
3872			domain_exit(dmar_domain);
3873			return NULL;
3874		}
3875
3876		domain = &dmar_domain->domain;
3877		domain->geometry.aperture_start = 0;
3878		domain->geometry.aperture_end   =
3879				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880		domain->geometry.force_aperture = true;
3881
3882		return domain;
3883	case IOMMU_DOMAIN_IDENTITY:
3884		return &si_domain->domain;
3885	case IOMMU_DOMAIN_SVA:
3886		return intel_svm_domain_alloc();
3887	default:
3888		return NULL;
3889	}
3890
3891	return NULL;
3892}
3893
3894static struct iommu_domain *
3895intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896			      struct iommu_domain *parent,
3897			      const struct iommu_user_data *user_data)
3898{
3899	struct device_domain_info *info = dev_iommu_priv_get(dev);
3900	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902	struct intel_iommu *iommu = info->iommu;
3903	struct dmar_domain *dmar_domain;
3904	struct iommu_domain *domain;
3905
3906	/* Must be NESTING domain */
3907	if (parent) {
3908		if (!nested_supported(iommu) || flags)
3909			return ERR_PTR(-EOPNOTSUPP);
3910		return intel_nested_domain_alloc(parent, user_data);
3911	}
3912
3913	if (flags &
3914	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915		return ERR_PTR(-EOPNOTSUPP);
3916	if (nested_parent && !nested_supported(iommu))
3917		return ERR_PTR(-EOPNOTSUPP);
3918	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919		return ERR_PTR(-EOPNOTSUPP);
3920
	/*
	 * The domain_alloc_user op needs to fully initialize a domain
	 * before returning, so use iommu_domain_alloc() here for
	 * simplicity.
	 */
3925	domain = iommu_domain_alloc(dev->bus);
3926	if (!domain)
3927		return ERR_PTR(-ENOMEM);
3928
3929	dmar_domain = to_dmar_domain(domain);
3930
3931	if (nested_parent) {
3932		dmar_domain->nested_parent = true;
3933		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3934		spin_lock_init(&dmar_domain->s1_lock);
3935	}
3936
3937	if (dirty_tracking) {
3938		if (dmar_domain->use_first_level) {
3939			iommu_domain_free(domain);
3940			return ERR_PTR(-EOPNOTSUPP);
3941		}
3942		domain->dirty_ops = &intel_dirty_ops;
3943	}
3944
3945	return domain;
3946}
3947
3948static void intel_iommu_domain_free(struct iommu_domain *domain)
3949{
3950	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3951
3952	WARN_ON(dmar_domain->nested_parent &&
3953		!list_empty(&dmar_domain->s1_domains));
3954	if (domain != &si_domain->domain)
3955		domain_exit(dmar_domain);
3956}
3957
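/*
 * Check that the IOMMU serving @dev is compatible with @domain (snoop
 * control, dirty tracking and address width) and, if necessary, shrink
 * the domain's page table to fit the IOMMU's supported AGAW.
 */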
3958int prepare_domain_attach_device(struct iommu_domain *domain,
3959				 struct device *dev)
3960{
3961	struct device_domain_info *info = dev_iommu_priv_get(dev);
3962	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963	struct intel_iommu *iommu = info->iommu;
3964	int addr_width;
3965
3966	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3967		return -EINVAL;
3968
3969	if (domain->dirty_ops && !ssads_supported(iommu))
3970		return -EINVAL;
3971
3972	/* check if this iommu agaw is sufficient for max mapped address */
3973	addr_width = agaw_to_width(iommu->agaw);
3974	if (addr_width > cap_mgaw(iommu->cap))
3975		addr_width = cap_mgaw(iommu->cap);
3976
3977	if (dmar_domain->max_addr > (1LL << addr_width))
3978		return -EINVAL;
3979	dmar_domain->gaw = addr_width;
3980
	/* Knock out extra levels of page tables if necessary. */
3984	while (iommu->agaw < dmar_domain->agaw) {
3985		struct dma_pte *pte;
3986
3987		pte = dmar_domain->pgd;
3988		if (dma_pte_present(pte)) {
3989			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3990			free_pgtable_page(pte);
3991		}
3992		dmar_domain->agaw--;
3993	}
3994
3995	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996	    context_copied(iommu, info->bus, info->devfn))
3997		return intel_pasid_setup_sm_context(dev);
3998
3999	return 0;
4000}
4001
4002static int intel_iommu_attach_device(struct iommu_domain *domain,
4003				     struct device *dev)
4004{
4005	struct device_domain_info *info = dev_iommu_priv_get(dev);
4006	int ret;
4007
4008	if (info->domain)
4009		device_block_translation(dev);
4010
4011	ret = prepare_domain_attach_device(domain, dev);
4012	if (ret)
4013		return ret;
4014
4015	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4016}
4017
4018static int intel_iommu_map(struct iommu_domain *domain,
4019			   unsigned long iova, phys_addr_t hpa,
4020			   size_t size, int iommu_prot, gfp_t gfp)
4021{
4022	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4023	u64 max_addr;
4024	int prot = 0;
4025
4026	if (iommu_prot & IOMMU_READ)
4027		prot |= DMA_PTE_READ;
4028	if (iommu_prot & IOMMU_WRITE)
4029		prot |= DMA_PTE_WRITE;
4030	if (dmar_domain->set_pte_snp)
4031		prot |= DMA_PTE_SNP;
4032
4033	max_addr = iova + size;
4034	if (dmar_domain->max_addr < max_addr) {
4035		u64 end;
4036
4037		/* check if minimum agaw is sufficient for mapped address */
4038		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
4043			return -EFAULT;
4044		}
4045		dmar_domain->max_addr = max_addr;
4046	}
	/*
	 * Round up the size to the next multiple of PAGE_SIZE if it and
	 * the low bits of hpa would take us onto the next page.
	 */
4049	size = aligned_nrpages(hpa, size);
4050	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4051				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4052}
4053
4054static int intel_iommu_map_pages(struct iommu_domain *domain,
4055				 unsigned long iova, phys_addr_t paddr,
4056				 size_t pgsize, size_t pgcount,
4057				 int prot, gfp_t gfp, size_t *mapped)
4058{
4059	unsigned long pgshift = __ffs(pgsize);
4060	size_t size = pgcount << pgshift;
4061	int ret;
4062
4063	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4064		return -EINVAL;
4065
4066	if (!IS_ALIGNED(iova | paddr, pgsize))
4067		return -EINVAL;
4068
4069	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4070	if (!ret && mapped)
4071		*mapped = size;
4072
4073	return ret;
4074}
4075
4076static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077				unsigned long iova, size_t size,
4078				struct iommu_iotlb_gather *gather)
4079{
4080	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4081	unsigned long start_pfn, last_pfn;
4082	int level = 0;
4083
	/*
	 * Cope with horrid API which requires us to unmap more than the
	 * size argument if it happens to be a large-page mapping.
	 */
4086	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087				     &level, GFP_ATOMIC)))
4088		return 0;
4089
4090	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4092
4093	start_pfn = iova >> VTD_PAGE_SHIFT;
4094	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4095
4096	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4097
4098	if (dmar_domain->max_addr == iova + size)
4099		dmar_domain->max_addr = iova;
4100
	/*
	 * We do not use page-selective IOTLB invalidation in the flush
	 * queue, so there is no need to track pages and sync the iotlb.
	 */
4105	if (!iommu_iotlb_gather_queued(gather))
4106		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4107
4108	return size;
4109}
4110
4111static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4112				      unsigned long iova,
4113				      size_t pgsize, size_t pgcount,
4114				      struct iommu_iotlb_gather *gather)
4115{
4116	unsigned long pgshift = __ffs(pgsize);
4117	size_t size = pgcount << pgshift;
4118
4119	return intel_iommu_unmap(domain, iova, size, gather);
4120}
4121
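/*
 * Flush the IOTLB of every IOMMU this domain is attached to for the
 * range collected in @gather, then free the page-table pages gathered
 * during unmap. If the domain is a nesting parent, the flush is also
 * propagated to its first-stage child domains.
 */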
4122static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123				 struct iommu_iotlb_gather *gather)
4124{
4125	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4126	unsigned long iova_pfn = IOVA_PFN(gather->start);
4127	size_t size = gather->end - gather->start;
4128	struct iommu_domain_info *info;
4129	unsigned long start_pfn;
4130	unsigned long nrpages;
4131	unsigned long i;
4132
4133	nrpages = aligned_nrpages(gather->start, size);
4134	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4135
4136	xa_for_each(&dmar_domain->iommu_array, i, info)
4137		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4138				      start_pfn, nrpages,
4139				      list_empty(&gather->freelist), 0);
4140
4141	if (dmar_domain->nested_parent)
4142		parent_domain_flush(dmar_domain, start_pfn, nrpages,
4143				    list_empty(&gather->freelist));
4144	put_pages_list(&gather->freelist);
4145}
4146
4147static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4148					    dma_addr_t iova)
4149{
4150	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4151	struct dma_pte *pte;
4152	int level = 0;
4153	u64 phys = 0;
4154
4155	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4156			     GFP_ATOMIC);
4157	if (pte && dma_pte_present(pte))
4158		phys = dma_pte_addr(pte) +
4159			(iova & (BIT_MASK(level_to_offset_bits(level) +
4160						VTD_PAGE_SHIFT) - 1));
4161
4162	return phys;
4163}
4164
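/*
 * Return true only if every IOMMU serving a device in this domain
 * supports snoop control. The caller must hold domain->lock.
 */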
4165static bool domain_support_force_snooping(struct dmar_domain *domain)
4166{
4167	struct device_domain_info *info;
4168	bool support = true;
4169
4170	assert_spin_locked(&domain->lock);
4171	list_for_each_entry(info, &domain->devices, link) {
4172		if (!ecap_sc_support(info->iommu->ecap)) {
4173			support = false;
4174			break;
4175		}
4176	}
4177
4178	return support;
4179}
4180
4181static void domain_set_force_snooping(struct dmar_domain *domain)
4182{
4183	struct device_domain_info *info;
4184
4185	assert_spin_locked(&domain->lock);
4186	/*
4187	 * Second level page table supports per-PTE snoop control. The
4188	 * iommu_map() interface will handle this by setting SNP bit.
4189	 */
4190	if (!domain->use_first_level) {
4191		domain->set_pte_snp = true;
4192		return;
4193	}
4194
4195	list_for_each_entry(info, &domain->devices, link)
4196		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4197						     IOMMU_NO_PASID);
4198}
4199
4200static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4201{
4202	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203	unsigned long flags;
4204
4205	if (dmar_domain->force_snooping)
4206		return true;
4207
4208	spin_lock_irqsave(&dmar_domain->lock, flags);
4209	if (!domain_support_force_snooping(dmar_domain) ||
4210	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4212		return false;
4213	}
4214
4215	domain_set_force_snooping(dmar_domain);
4216	dmar_domain->force_snooping = true;
4217	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4218
4219	return true;
4220}
4221
4222static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4223{
4224	struct device_domain_info *info = dev_iommu_priv_get(dev);
4225
4226	switch (cap) {
4227	case IOMMU_CAP_CACHE_COHERENCY:
4228	case IOMMU_CAP_DEFERRED_FLUSH:
4229		return true;
4230	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231		return dmar_platform_optin();
4232	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233		return ecap_sc_support(info->iommu->ecap);
4234	case IOMMU_CAP_DIRTY_TRACKING:
4235		return ssads_supported(info->iommu);
4236	default:
4237		return false;
4238	}
4239}
4240
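/*
 * Allocate and initialize the per-device IOMMU private data, probing the
 * device's ATS, PASID and PRI capabilities and setting up its PASID
 * table when the IOMMU is operating in scalable mode.
 */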
4241static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4242{
4243	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244	struct device_domain_info *info;
4245	struct intel_iommu *iommu;
4246	u8 bus, devfn;
4247	int ret;
4248
4249	iommu = device_lookup_iommu(dev, &bus, &devfn);
4250	if (!iommu || !iommu->iommu.ops)
4251		return ERR_PTR(-ENODEV);
4252
4253	info = kzalloc(sizeof(*info), GFP_KERNEL);
4254	if (!info)
4255		return ERR_PTR(-ENOMEM);
4256
4257	if (dev_is_real_dma_subdevice(dev)) {
4258		info->bus = pdev->bus->number;
4259		info->devfn = pdev->devfn;
4260		info->segment = pci_domain_nr(pdev->bus);
4261	} else {
4262		info->bus = bus;
4263		info->devfn = devfn;
4264		info->segment = iommu->segment;
4265	}
4266
4267	info->dev = dev;
4268	info->iommu = iommu;
4269	if (dev_is_pci(dev)) {
4270		if (ecap_dev_iotlb_support(iommu->ecap) &&
4271		    pci_ats_supported(pdev) &&
4272		    dmar_ats_supported(pdev, iommu)) {
4273			info->ats_supported = 1;
4274			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4275
			/*
			 * For IOMMUs that support device IOTLB throttling
			 * (DIT), we assign the PFSID to the invalidation
			 * descriptor of a VF so that the IOMMU hardware can
			 * gauge queue depth at the PF level. If DIT is not
			 * set, the PFSID field is treated as reserved and
			 * should be set to 0.
			 */
4283			if (ecap_dit(iommu->ecap))
4284				info->pfsid = pci_dev_id(pci_physfn(pdev));
4285			info->ats_qdep = pci_ats_queue_depth(pdev);
4286		}
4287		if (sm_supported(iommu)) {
4288			if (pasid_supported(iommu)) {
4289				int features = pci_pasid_features(pdev);
4290
4291				if (features >= 0)
4292					info->pasid_supported = features | 1;
4293			}
4294
4295			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296			    pci_pri_supported(pdev))
4297				info->pri_supported = 1;
4298		}
4299	}
4300
4301	dev_iommu_priv_set(dev, info);
4302	if (pdev && pci_ats_supported(pdev)) {
4303		ret = device_rbtree_insert(iommu, info);
4304		if (ret)
4305			goto free;
4306	}
4307
4308	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4309		ret = intel_pasid_alloc_table(dev);
4310		if (ret) {
4311			dev_err(dev, "PASID table allocation failed\n");
4312			goto clear_rbtree;
4313		}
4314
4315		if (!context_copied(iommu, info->bus, info->devfn)) {
4316			ret = intel_pasid_setup_sm_context(dev);
4317			if (ret)
4318				goto free_table;
4319		}
4320	}
4321
4322	intel_iommu_debugfs_create_dev(info);
4323
4324	return &iommu->iommu;
4325free_table:
4326	intel_pasid_free_table(dev);
4327clear_rbtree:
4328	device_rbtree_remove(info);
4329free:
4330	kfree(info);
4331
4332	return ERR_PTR(ret);
4333}
4334
4335static void intel_iommu_release_device(struct device *dev)
4336{
4337	struct device_domain_info *info = dev_iommu_priv_get(dev);
4338	struct intel_iommu *iommu = info->iommu;
4339
4340	mutex_lock(&iommu->iopf_lock);
4341	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4342		device_rbtree_remove(info);
4343	mutex_unlock(&iommu->iopf_lock);
4344
4345	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4346	    !context_copied(iommu, info->bus, info->devfn))
4347		intel_pasid_teardown_sm_context(dev);
4348
4349	intel_pasid_free_table(dev);
4350	intel_iommu_debugfs_remove_dev(info);
4351	kfree(info);
4352	set_dma_ops(dev, NULL);
4353}
4354
4355static void intel_iommu_probe_finalize(struct device *dev)
4356{
4357	set_dma_ops(dev, NULL);
4358	iommu_setup_dma_ops(dev, 0, U64_MAX);
4359}
4360
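/*
 * Report the reserved regions for @device: RMRR ranges that target it
 * (directly or through a PCI bridge), the legacy ISA range when the
 * floppy workaround is enabled, and the MSI window covering the IOAPIC
 * range.
 */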
4361static void intel_iommu_get_resv_regions(struct device *device,
4362					 struct list_head *head)
4363{
4364	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4365	struct iommu_resv_region *reg;
4366	struct dmar_rmrr_unit *rmrr;
4367	struct device *i_dev;
4368	int i;
4369
4370	rcu_read_lock();
4371	for_each_rmrr_units(rmrr) {
4372		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4373					  i, i_dev) {
4374			struct iommu_resv_region *resv;
4375			enum iommu_resv_type type;
4376			size_t length;
4377
4378			if (i_dev != device &&
4379			    !is_downstream_to_pci_bridge(device, i_dev))
4380				continue;
4381
4382			length = rmrr->end_address - rmrr->base_address + 1;
4383
4384			type = device_rmrr_is_relaxable(device) ?
4385				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4386
4387			resv = iommu_alloc_resv_region(rmrr->base_address,
4388						       length, prot, type,
4389						       GFP_ATOMIC);
4390			if (!resv)
4391				break;
4392
4393			list_add_tail(&resv->list, head);
4394		}
4395	}
4396	rcu_read_unlock();
4397
4398#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4399	if (dev_is_pci(device)) {
4400		struct pci_dev *pdev = to_pci_dev(device);
4401
4402		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4403			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4404					IOMMU_RESV_DIRECT_RELAXABLE,
4405					GFP_KERNEL);
4406			if (reg)
4407				list_add_tail(&reg->list, head);
4408		}
4409	}
4410#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4411
4412	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4413				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4414				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4415	if (!reg)
4416		return;
4417	list_add_tail(&reg->list, head);
4418}
4419
4420static struct iommu_group *intel_iommu_device_group(struct device *dev)
4421{
4422	if (dev_is_pci(dev))
4423		return pci_device_group(dev);
4424	return generic_device_group(dev);
4425}
4426
4427static int intel_iommu_enable_sva(struct device *dev)
4428{
4429	struct device_domain_info *info = dev_iommu_priv_get(dev);
4430	struct intel_iommu *iommu;
4431
4432	if (!info || dmar_disabled)
4433		return -EINVAL;
4434
4435	iommu = info->iommu;
4436	if (!iommu)
4437		return -EINVAL;
4438
4439	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4440		return -ENODEV;
4441
4442	if (!info->pasid_enabled || !info->ats_enabled)
4443		return -EINVAL;
4444
	/*
	 * Devices that have device-specific I/O fault handling should
	 * not support PCI PRI. The IOMMU side has no means to check the
	 * capability of device-specific IOPF, so it can only assume that
	 * if the device driver enables SVA on a non-PRI device, the
	 * driver will handle IOPF in its own way.
	 */
4452	if (!info->pri_supported)
4453		return 0;
4454
4455	/* Devices supporting PRI should have it enabled. */
4456	if (!info->pri_enabled)
4457		return -EINVAL;
4458
4459	return 0;
4460}
4461
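/*
 * Enable I/O page fault handling for @dev: add it to the IOMMU's IOPF
 * queue and enable PCI PRI on the device.
 */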
4462static int intel_iommu_enable_iopf(struct device *dev)
4463{
4464	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4465	struct device_domain_info *info = dev_iommu_priv_get(dev);
4466	struct intel_iommu *iommu;
4467	int ret;
4468
4469	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4470		return -ENODEV;
4471
4472	if (info->pri_enabled)
4473		return -EBUSY;
4474
4475	iommu = info->iommu;
4476	if (!iommu)
4477		return -EINVAL;
4478
4479	/* PASID is required in PRG Response Message. */
4480	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4481		return -EINVAL;
4482
4483	ret = pci_reset_pri(pdev);
4484	if (ret)
4485		return ret;
4486
4487	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4488	if (ret)
4489		return ret;
4490
4491	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4492	if (ret) {
4493		iopf_queue_remove_device(iommu->iopf_queue, dev);
4494		return ret;
4495	}
4496
4497	info->pri_enabled = 1;
4498
4499	return 0;
4500}
4501
4502static int intel_iommu_disable_iopf(struct device *dev)
4503{
4504	struct device_domain_info *info = dev_iommu_priv_get(dev);
4505	struct intel_iommu *iommu = info->iommu;
4506
4507	if (!info->pri_enabled)
4508		return -EINVAL;
4509
	/*
	 * The PCIe spec states that after the PRI enable bit is cleared,
	 * the Page Request Interface will not issue new page requests,
	 * but may still have outstanding page requests that have been
	 * transmitted or are queued for transmission. This is supposed to
	 * be called after the device driver has stopped DMA, all PASIDs
	 * have been unbound and the outstanding PRQs have been drained.
	 */
4518	pci_disable_pri(to_pci_dev(dev));
4519	info->pri_enabled = 0;
4520	iopf_queue_remove_device(iommu->iopf_queue, dev);
4521
4522	return 0;
4523}
4524
4525static int
4526intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4527{
4528	switch (feat) {
4529	case IOMMU_DEV_FEAT_IOPF:
4530		return intel_iommu_enable_iopf(dev);
4531
4532	case IOMMU_DEV_FEAT_SVA:
4533		return intel_iommu_enable_sva(dev);
4534
4535	default:
4536		return -ENODEV;
4537	}
4538}
4539
4540static int
4541intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4542{
4543	switch (feat) {
4544	case IOMMU_DEV_FEAT_IOPF:
4545		return intel_iommu_disable_iopf(dev);
4546
4547	case IOMMU_DEV_FEAT_SVA:
4548		return 0;
4549
4550	default:
4551		return -ENODEV;
4552	}
4553}
4554
4555static bool intel_iommu_is_attach_deferred(struct device *dev)
4556{
4557	struct device_domain_info *info = dev_iommu_priv_get(dev);
4558
4559	return translation_pre_enabled(info->iommu) && !info->domain;
4560}
4561
4562/*
4563 * Check that the device does not live on an external facing PCI port that is
4564 * marked as untrusted. Such devices should not be able to apply quirks and
4565 * thus not be able to bypass the IOMMU restrictions.
4566 */
4567static bool risky_device(struct pci_dev *pdev)
4568{
4569	if (pdev->untrusted) {
4570		pci_info(pdev,
4571			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4572			 pdev->vendor, pdev->device);
4573		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4574		return true;
4575	}
4576	return false;
4577}
4578
4579static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4580				      unsigned long iova, size_t size)
4581{
4582	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4583	unsigned long pages = aligned_nrpages(iova, size);
4584	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4585	struct iommu_domain_info *info;
4586	unsigned long i;
4587
4588	xa_for_each(&dmar_domain->iommu_array, i, info)
4589		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4590	return 0;
4591}
4592
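/*
 * Detach the domain attached to @pasid of @dev, tear down the PASID
 * table entry and drain any page requests still pending for the PASID.
 */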
4593static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4594{
4595	struct device_domain_info *info = dev_iommu_priv_get(dev);
4596	struct dev_pasid_info *curr, *dev_pasid = NULL;
4597	struct intel_iommu *iommu = info->iommu;
4598	struct dmar_domain *dmar_domain;
4599	struct iommu_domain *domain;
4600	unsigned long flags;
4601
4602	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4603	if (WARN_ON_ONCE(!domain))
4604		goto out_tear_down;
4605
	/*
	 * The SVA implementation needs to handle its own work, such as
	 * mm notifications. Before that code is consolidated into the
	 * iommu core, let the intel sva code handle it.
	 */
4611	if (domain->type == IOMMU_DOMAIN_SVA) {
4612		intel_svm_remove_dev_pasid(dev, pasid);
4613		goto out_tear_down;
4614	}
4615
4616	dmar_domain = to_dmar_domain(domain);
4617	spin_lock_irqsave(&dmar_domain->lock, flags);
4618	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4619		if (curr->dev == dev && curr->pasid == pasid) {
4620			list_del(&curr->link_domain);
4621			dev_pasid = curr;
4622			break;
4623		}
4624	}
4625	WARN_ON_ONCE(!dev_pasid);
4626	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4627
4628	domain_detach_iommu(dmar_domain, iommu);
4629	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4630	kfree(dev_pasid);
4631out_tear_down:
4632	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4633	intel_drain_pasid_prq(dev, pasid);
4634}
4635
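/*
 * Attach @domain to @pasid of @dev by setting up the matching PASID
 * table entry (pass-through, first-level or second-level translation).
 */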
4636static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4637				     struct device *dev, ioasid_t pasid)
4638{
4639	struct device_domain_info *info = dev_iommu_priv_get(dev);
4640	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4641	struct intel_iommu *iommu = info->iommu;
4642	struct dev_pasid_info *dev_pasid;
4643	unsigned long flags;
4644	int ret;
4645
4646	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4647		return -EOPNOTSUPP;
4648
4649	if (domain->dirty_ops)
4650		return -EINVAL;
4651
4652	if (context_copied(iommu, info->bus, info->devfn))
4653		return -EBUSY;
4654
4655	ret = prepare_domain_attach_device(domain, dev);
4656	if (ret)
4657		return ret;
4658
4659	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4660	if (!dev_pasid)
4661		return -ENOMEM;
4662
4663	ret = domain_attach_iommu(dmar_domain, iommu);
4664	if (ret)
4665		goto out_free;
4666
4667	if (domain_type_is_si(dmar_domain))
4668		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4669	else if (dmar_domain->use_first_level)
4670		ret = domain_setup_first_level(iommu, dmar_domain,
4671					       dev, pasid);
4672	else
4673		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4674						     dev, pasid);
4675	if (ret)
4676		goto out_detach_iommu;
4677
4678	dev_pasid->dev = dev;
4679	dev_pasid->pasid = pasid;
4680	spin_lock_irqsave(&dmar_domain->lock, flags);
4681	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4682	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4683
4684	if (domain->type & __IOMMU_DOMAIN_PAGING)
4685		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4686
4687	return 0;
4688out_detach_iommu:
4689	domain_detach_iommu(dmar_domain, iommu);
4690out_free:
4691	kfree(dev_pasid);
4692	return ret;
4693}
4694
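/*
 * Report VT-d hardware information (the capability and extended
 * capability registers) for the IOMMUFD hw_info interface.
 */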
4695static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4696{
4697	struct device_domain_info *info = dev_iommu_priv_get(dev);
4698	struct intel_iommu *iommu = info->iommu;
4699	struct iommu_hw_info_vtd *vtd;
4700
4701	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4702	if (!vtd)
4703		return ERR_PTR(-ENOMEM);
4704
4705	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4706	vtd->cap_reg = iommu->cap;
4707	vtd->ecap_reg = iommu->ecap;
4708	*length = sizeof(*vtd);
4709	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4710	return vtd;
4711}
4712
4713/*
4714 * Set dirty tracking for the device list of a domain. The caller must
4715 * hold the domain->lock when calling it.
4716 */
4717static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4718{
4719	struct device_domain_info *info;
4720	int ret = 0;
4721
4722	list_for_each_entry(info, devices, link) {
4723		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4724						       IOMMU_NO_PASID, enable);
4725		if (ret)
4726			break;
4727	}
4728
4729	return ret;
4730}
4731
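/*
 * Propagate a dirty tracking change to all first-stage domains nested on
 * this parent domain, restoring the previous state on failure.
 */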
4732static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4733					    bool enable)
4734{
4735	struct dmar_domain *s1_domain;
4736	unsigned long flags;
4737	int ret;
4738
4739	spin_lock(&domain->s1_lock);
4740	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4741		spin_lock_irqsave(&s1_domain->lock, flags);
4742		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4743		spin_unlock_irqrestore(&s1_domain->lock, flags);
4744		if (ret)
4745			goto err_unwind;
4746	}
4747	spin_unlock(&domain->s1_lock);
4748	return 0;
4749
4750err_unwind:
4751	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4752		spin_lock_irqsave(&s1_domain->lock, flags);
4753		device_set_dirty_tracking(&s1_domain->devices,
4754					  domain->dirty_tracking);
4755		spin_unlock_irqrestore(&s1_domain->lock, flags);
4756	}
4757	spin_unlock(&domain->s1_lock);
4758	return ret;
4759}
4760
4761static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4762					  bool enable)
4763{
4764	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4765	int ret;
4766
4767	spin_lock(&dmar_domain->lock);
4768	if (dmar_domain->dirty_tracking == enable)
4769		goto out_unlock;
4770
4771	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4772	if (ret)
4773		goto err_unwind;
4774
4775	if (dmar_domain->nested_parent) {
4776		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4777		if (ret)
4778			goto err_unwind;
4779	}
4780
4781	dmar_domain->dirty_tracking = enable;
4782out_unlock:
4783	spin_unlock(&dmar_domain->lock);
4784
4785	return 0;
4786
4787err_unwind:
4788	device_set_dirty_tracking(&dmar_domain->devices,
4789				  dmar_domain->dirty_tracking);
4790	spin_unlock(&dmar_domain->lock);
4791	return ret;
4792}
4793
4794static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4795					    unsigned long iova, size_t size,
4796					    unsigned long flags,
4797					    struct iommu_dirty_bitmap *dirty)
4798{
4799	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4800	unsigned long end = iova + size - 1;
4801	unsigned long pgsize;
4802
4803	/*
4804	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4805	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4806	 * have occurred when we stopped dirty tracking. This ensures that we
4807	 * never inherit dirtied bits from a previous cycle.
4808	 */
4809	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4810		return -EINVAL;
4811
4812	do {
4813		struct dma_pte *pte;
4814		int lvl = 0;
4815
4816		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4817				     GFP_ATOMIC);
4818		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4819		if (!pte || !dma_pte_present(pte)) {
4820			iova += pgsize;
4821			continue;
4822		}
4823
4824		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4825			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4826		iova += pgsize;
4827	} while (iova < end);
4828
4829	return 0;
4830}
4831
4832static const struct iommu_dirty_ops intel_dirty_ops = {
4833	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4834	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4835};
4836
4837const struct iommu_ops intel_iommu_ops = {
4838	.blocked_domain		= &blocking_domain,
4839	.release_domain		= &blocking_domain,
4840	.capable		= intel_iommu_capable,
4841	.hw_info		= intel_iommu_hw_info,
4842	.domain_alloc		= intel_iommu_domain_alloc,
4843	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4844	.probe_device		= intel_iommu_probe_device,
4845	.probe_finalize		= intel_iommu_probe_finalize,
4846	.release_device		= intel_iommu_release_device,
4847	.get_resv_regions	= intel_iommu_get_resv_regions,
4848	.device_group		= intel_iommu_device_group,
4849	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4850	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4851	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4852	.def_domain_type	= device_def_domain_type,
4853	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4854	.pgsize_bitmap		= SZ_4K,
4855#ifdef CONFIG_INTEL_IOMMU_SVM
4856	.page_response		= intel_svm_page_response,
4857#endif
4858	.default_domain_ops = &(const struct iommu_domain_ops) {
4859		.attach_dev		= intel_iommu_attach_device,
4860		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4861		.map_pages		= intel_iommu_map_pages,
4862		.unmap_pages		= intel_iommu_unmap_pages,
4863		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4864		.flush_iotlb_all        = intel_flush_iotlb_all,
4865		.iotlb_sync		= intel_iommu_tlb_sync,
4866		.iova_to_phys		= intel_iommu_iova_to_phys,
4867		.free			= intel_iommu_domain_free,
4868		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4869	}
4870};
4871
4872static void quirk_iommu_igfx(struct pci_dev *dev)
4873{
4874	if (risky_device(dev))
4875		return;
4876
4877	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4878	dmar_map_gfx = 0;
4879}
4880
4881/* G4x/GM45 integrated gfx dmar support is totally busted. */
4882DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4883DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4884DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4885DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4886DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4889
4890/* Broadwell igfx malfunctions with dmar */
4891DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4892DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4893DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4894DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4895DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4896DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4897DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4898DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4899DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4900DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4901DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4902DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4903DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4904DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4905DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4906DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4907DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4908DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4909DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4910DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4912DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4913DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4914DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4915
4916static void quirk_iommu_rwbf(struct pci_dev *dev)
4917{
4918	if (risky_device(dev))
4919		return;
4920
4921	/*
4922	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4923	 * but needs it. Same seems to hold for the desktop versions.
4924	 */
4925	pci_info(dev, "Forcing write-buffer flush capability\n");
4926	rwbf_quirk = 1;
4927}
4928
4929DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4930DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4932DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4933DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4934DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4935DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4936
4937#define GGC 0x52
4938#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4939#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4940#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4941#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4942#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4943#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4944#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4945#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4946
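/*
 * If the BIOS has not allocated VT-d enabled memory in the GGC register
 * (i.e. no shadow GTT), disable the IOMMU for graphics; otherwise force
 * strict IOTLB flushing, since the graphics device must be idle before
 * its mappings are flushed.
 */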
4947static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4948{
4949	unsigned short ggc;
4950
4951	if (risky_device(dev))
4952		return;
4953
4954	if (pci_read_config_word(dev, GGC, &ggc))
4955		return;
4956
4957	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4958		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4959		dmar_map_gfx = 0;
4960	} else if (dmar_map_gfx) {
4961		/* we have to ensure the gfx device is idle before we flush */
4962		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4963		iommu_set_dma_strict();
4964	}
4965}
4966DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4967DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4968DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4969DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4970
4971static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4972{
4973	unsigned short ver;
4974
4975	if (!IS_GFX_DEVICE(dev))
4976		return;
4977
4978	ver = (dev->device >> 8) & 0xff;
4979	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4980	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4981	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4982		return;
4983
4984	if (risky_device(dev))
4985		return;
4986
4987	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4988	iommu_skip_te_disable = 1;
4989}
4990DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4991
/*
 * On Tylersburg chipsets, some BIOSes have been known to enable the
 * ISOCH DMAR unit for the Azalia sound device, but not give it any
 * TLB entries, which causes it to deadlock. Check for that. We do
 * this in a function called from init_dmars(), instead of in a PCI
 * quirk, because we don't want to print the obnoxious "BIOS broken"
 * message if VT-d is actually disabled.
 */
4999static void __init check_tylersburg_isoch(void)
5000{
5001	struct pci_dev *pdev;
5002	uint32_t vtisochctrl;
5003
5004	/* If there's no Azalia in the system anyway, forget it. */
5005	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5006	if (!pdev)
5007		return;
5008
5009	if (risky_device(pdev)) {
5010		pci_dev_put(pdev);
5011		return;
5012	}
5013
5014	pci_dev_put(pdev);
5015
	/*
	 * System Management Registers. Might be hidden, in which case
	 * we can't do the sanity check. But that's OK, because the
	 * known-broken BIOSes _don't_ actually hide it, so far.
	 */
5019	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5020	if (!pdev)
5021		return;
5022
5023	if (risky_device(pdev)) {
5024		pci_dev_put(pdev);
5025		return;
5026	}
5027
5028	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5029		pci_dev_put(pdev);
5030		return;
5031	}
5032
5033	pci_dev_put(pdev);
5034
5035	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5036	if (vtisochctrl & 1)
5037		return;
5038
5039	/* Drop all bits other than the number of TLB entries */
5040	vtisochctrl &= 0x1c;
5041
5042	/* If we have the recommended number of TLB entries (16), fine. */
5043	if (vtisochctrl == 0x10)
5044		return;
5045
5046	/* Zero TLB entries? You get to ride the short bus to school. */
5047	if (!vtisochctrl) {
5048		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5049		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5050		     dmi_get_system_info(DMI_BIOS_VENDOR),
5051		     dmi_get_system_info(DMI_BIOS_VERSION),
5052		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5053		iommu_identity_mapping |= IDENTMAP_AZALIA;
5054		return;
5055	}
5056
5057	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5058	       vtisochctrl);
5059}
5060
/*
 * Here we deal with a device TLB defect where a device may inadvertently
 * issue an ATS invalidation completion before posted writes that were
 * initiated with a translated address matching the invalidation address
 * range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before
 * unmap is vulnerable to this defect. In other words, any dTLB
 * invalidation that is not initiated under the control of the
 * trusted/privileged host device driver must use this quirk.
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after the PASID cache is flushed, e.g. process
 *    exit_mmap() due to a crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
 *    the VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use
 * this quirk. The dTLB invalidation after the PASID cache flush does not
 * need this quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 */
5087void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5088			       unsigned long address, unsigned long mask,
5089			       u32 pasid, u16 qdep)
5090{
5091	u16 sid;
5092
5093	if (likely(!info->dtlb_extra_inval))
5094		return;
5095
5096	sid = PCI_DEVID(info->bus, info->devfn);
5097	if (pasid == IOMMU_NO_PASID) {
5098		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5099				   qdep, address, mask);
5100	} else {
5101		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5102					 pasid, qdep, address, mask);
5103	}
5104}
5105
5106#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5107
5108/*
5109 * Function to submit a command to the enhanced command interface. The
5110 * valid enhanced command descriptions are defined in Table 47 of the
5111 * VT-d spec. The VT-d hardware implementation may support some but not
5112 * all commands, which can be determined by checking the Enhanced
5113 * Command Capability Register.
5114 *
5115 * Return values:
5116 *  - 0: Command successful without any error;
5117 *  - Negative: software error value;
5118 *  - Nonzero positive: failure status code defined in Table 48.
5119 */
5120int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5121{
5122	unsigned long flags;
5123	u64 res;
5124	int ret;
5125
5126	if (!cap_ecmds(iommu->cap))
5127		return -ENODEV;
5128
5129	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5130
5131	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5132	if (res & DMA_ECMD_ECRSP_IP) {
5133		ret = -EBUSY;
5134		goto err;
5135	}
5136
	/*
	 * Unconditionally write operand B because:
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, even though we set the register to some value.
	 * - It's not invoked in any critical path, so the extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
5144	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5145	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5146
5147	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5148		      !(res & DMA_ECMD_ECRSP_IP), res);
5149
5150	if (res & DMA_ECMD_ECRSP_IP) {
5151		ret = -ETIMEDOUT;
5152		goto err;
5153	}
5154
5155	ret = ecmd_get_status_code(res);
5156err:
5157	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5158
5159	return ret;
5160}
5161