// SPDX-License-Identifier: GPL-2.0-only
/*
 * A fairly generic DMA-API to IOMMU-API glue layer.
 *
 * Copyright (C) 2014-2015 ARM Ltd.
 *
 * based in part on arch/arm/mm/dma-mapping.c:
 * Copyright (C) 2000-2004 Russell King
 */

#include <linux/acpi_iort.h>
#include <linux/atomic.h>
#include <linux/crash_dump.h>
#include <linux/device.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/iommu.h>
#include <linux/iova.h>
#include <linux/irq.h>
#include <linux/list_sort.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/of_iommu.h>
#include <linux/pci.h>
#include <linux/scatterlist.h>
#include <linux/spinlock.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <trace/events/swiotlb.h>

#include "dma-iommu.h"

struct iommu_dma_msi_page {
	struct list_head	list;
	dma_addr_t		iova;
	phys_addr_t		phys;
};

enum iommu_dma_cookie_type {
	IOMMU_DMA_IOVA_COOKIE,
	IOMMU_DMA_MSI_COOKIE,
};

enum iommu_dma_queue_type {
	IOMMU_DMA_OPTS_PER_CPU_QUEUE,
	IOMMU_DMA_OPTS_SINGLE_QUEUE,
};

struct iommu_dma_options {
	enum iommu_dma_queue_type qt;
	size_t		fq_size;
	unsigned int	fq_timeout;
};

struct iommu_dma_cookie {
	enum iommu_dma_cookie_type	type;
	union {
		/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
		struct {
			struct iova_domain	iovad;
			/* Flush queue */
			union {
				struct iova_fq	*single_fq;
				struct iova_fq	__percpu *percpu_fq;
			};
			/* Number of TLB flushes that have been started */
			atomic64_t		fq_flush_start_cnt;
			/* Number of TLB flushes that have been finished */
			atomic64_t		fq_flush_finish_cnt;
			/* Timer to regularly empty the flush queues */
			struct timer_list	fq_timer;
			/* 1 when timer is active, 0 when not */
			atomic_t		fq_timer_on;
		};
		/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
		dma_addr_t		msi_iova;
	};
	struct list_head		msi_page_list;

	/* Domain for flush queue callback; NULL if flush queue not in use */
	struct iommu_domain		*fq_domain;
	/* Options for dma-iommu use */
	struct iommu_dma_options	options;
	struct mutex			mutex;
};

static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled);
bool iommu_dma_forcedac __read_mostly;

static int __init iommu_dma_forcedac_setup(char *str)
{
	int ret = kstrtobool(str, &iommu_dma_forcedac);

	if (!ret && iommu_dma_forcedac)
		pr_info("Forcing DAC for PCI devices\n");
	return ret;
}
early_param("iommu.forcedac", iommu_dma_forcedac_setup);

/* Number of entries per flush queue */
#define IOVA_DEFAULT_FQ_SIZE	256
#define IOVA_SINGLE_FQ_SIZE	32768

/* Timeout (in ms) after which entries are flushed from the queue */
#define IOVA_DEFAULT_FQ_TIMEOUT	10
#define IOVA_SINGLE_FQ_TIMEOUT	1000

/* Flush queue entry for deferred flushing */
struct iova_fq_entry {
	unsigned long iova_pfn;
	unsigned long pages;
	struct list_head freelist;
	u64 counter; /* Flush counter when this entry was added */
};

/* Per-CPU flush queue structure */
struct iova_fq {
	spinlock_t lock;
	unsigned int head, tail;
	unsigned int mod_mask;
	struct iova_fq_entry entries[];
};

#define fq_ring_for_each(i, fq) \
	for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) & (fq)->mod_mask)

static inline bool fq_full(struct iova_fq *fq)
{
	assert_spin_locked(&fq->lock);
	return (((fq->tail + 1) & fq->mod_mask) == fq->head);
}

static inline unsigned int fq_ring_add(struct iova_fq *fq)
{
	unsigned int idx = fq->tail;

	assert_spin_locked(&fq->lock);

	fq->tail = (idx + 1) & fq->mod_mask;

	return idx;
}

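/*
 * Reclaim all queue entries whose IOTLB flush has already completed: free
 * their deferred pages and return their IOVA ranges to the allocator.
 * Caller must hold fq->lock.
 */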
static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
{
	u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
	unsigned int idx;

	assert_spin_locked(&fq->lock);

	fq_ring_for_each(idx, fq) {

		if (fq->entries[idx].counter >= counter)
			break;

		put_pages_list(&fq->entries[idx].freelist);
		free_iova_fast(&cookie->iovad,
			       fq->entries[idx].iova_pfn,
			       fq->entries[idx].pages);

		fq->head = (fq->head + 1) & fq->mod_mask;
	}
}

static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
{
	unsigned long flags;

	spin_lock_irqsave(&fq->lock, flags);
	fq_ring_free_locked(cookie, fq);
	spin_unlock_irqrestore(&fq->lock, flags);
}

static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
{
	atomic64_inc(&cookie->fq_flush_start_cnt);
	cookie->fq_domain->ops->flush_iotlb_all(cookie->fq_domain);
	atomic64_inc(&cookie->fq_flush_finish_cnt);
}

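/* Timer callback: flush the IOTLB once, then drain every flush queue. */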
static void fq_flush_timeout(struct timer_list *t)
{
	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
	int cpu;

	atomic_set(&cookie->fq_timer_on, 0);
	fq_flush_iotlb(cookie);

	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
		fq_ring_free(cookie, cookie->single_fq);
	} else {
		for_each_possible_cpu(cpu)
			fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
	}
}

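/*
 * Defer freeing of an IOVA range: stash it (and the pagetable pages freed
 * by the unmap) in a flush queue entry so it is only reused once a later
 * IOTLB flush has completed, and make sure the flush timer is running.
 */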
static void queue_iova(struct iommu_dma_cookie *cookie,
		unsigned long pfn, unsigned long pages,
		struct list_head *freelist)
{
	struct iova_fq *fq;
	unsigned long flags;
	unsigned int idx;

	/*
	 * Order against the IOMMU driver's pagetable update from unmapping
	 * @pte, to guarantee that fq_flush_iotlb() observes that if called
	 * from a different CPU before we release the lock below. Full barrier
	 * so it also pairs with iommu_dma_init_fq() to avoid seeing partially
	 * written fq state here.
	 */
	smp_mb();

	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
		fq = cookie->single_fq;
	else
		fq = raw_cpu_ptr(cookie->percpu_fq);

	spin_lock_irqsave(&fq->lock, flags);

	/*
	 * First remove all entries from the flush queue that have already been
	 * flushed out on another CPU. This makes the fq_full() check below less
	 * likely to be true.
	 */
	fq_ring_free_locked(cookie, fq);

	if (fq_full(fq)) {
		fq_flush_iotlb(cookie);
		fq_ring_free_locked(cookie, fq);
	}

	idx = fq_ring_add(fq);

	fq->entries[idx].iova_pfn = pfn;
	fq->entries[idx].pages    = pages;
	fq->entries[idx].counter  = atomic64_read(&cookie->fq_flush_start_cnt);
	list_splice(freelist, &fq->entries[idx].freelist);

	spin_unlock_irqrestore(&fq->lock, flags);

	/* Avoid false sharing as much as possible. */
	if (!atomic_read(&cookie->fq_timer_on) &&
	    !atomic_xchg(&cookie->fq_timer_on, 1))
		mod_timer(&cookie->fq_timer,
			  jiffies + msecs_to_jiffies(cookie->options.fq_timeout));
}

static void iommu_dma_free_fq_single(struct iova_fq *fq)
{
	int idx;

	fq_ring_for_each(idx, fq)
		put_pages_list(&fq->entries[idx].freelist);
	vfree(fq);
}

static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq)
{
	int cpu, idx;

	/* The IOVAs will be torn down separately, so just free our queued pages */
	for_each_possible_cpu(cpu) {
		struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu);

		fq_ring_for_each(idx, fq)
			put_pages_list(&fq->entries[idx].freelist);
	}

	free_percpu(percpu_fq);
}

static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
{
	if (!cookie->fq_domain)
		return;

	del_timer_sync(&cookie->fq_timer);
	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
		iommu_dma_free_fq_single(cookie->single_fq);
	else
		iommu_dma_free_fq_percpu(cookie->percpu_fq);
}

static void iommu_dma_init_one_fq(struct iova_fq *fq, size_t fq_size)
{
	int i;

	fq->head = 0;
	fq->tail = 0;
	fq->mod_mask = fq_size - 1;

	spin_lock_init(&fq->lock);

	for (i = 0; i < fq_size; i++)
		INIT_LIST_HEAD(&fq->entries[i].freelist);
}

static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie)
{
	size_t fq_size = cookie->options.fq_size;
	struct iova_fq *queue;

	queue = vmalloc(struct_size(queue, entries, fq_size));
	if (!queue)
		return -ENOMEM;
	iommu_dma_init_one_fq(queue, fq_size);
	cookie->single_fq = queue;

	return 0;
}

static int iommu_dma_init_fq_percpu(struct iommu_dma_cookie *cookie)
{
	size_t fq_size = cookie->options.fq_size;
	struct iova_fq __percpu *queue;
	int cpu;

	queue = __alloc_percpu(struct_size(queue, entries, fq_size),
			       __alignof__(*queue));
	if (!queue)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		iommu_dma_init_one_fq(per_cpu_ptr(queue, cpu), fq_size);
	cookie->percpu_fq = queue;
	return 0;
}

/* sysfs updates are serialised by the mutex of the group owning @domain */
int iommu_dma_init_fq(struct iommu_domain *domain)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	int rc;

	if (cookie->fq_domain)
		return 0;

	atomic64_set(&cookie->fq_flush_start_cnt,  0);
	atomic64_set(&cookie->fq_flush_finish_cnt, 0);

	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
		rc = iommu_dma_init_fq_single(cookie);
	else
		rc = iommu_dma_init_fq_percpu(cookie);

	if (rc) {
		pr_warn("iova flush queue initialization failed\n");
		return -ENOMEM;
	}

	timer_setup(&cookie->fq_timer, fq_flush_timeout, 0);
	atomic_set(&cookie->fq_timer_on, 0);
	/*
	 * Prevent incomplete fq state being observable. Pairs with path from
	 * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova()
	 */
	smp_wmb();
	WRITE_ONCE(cookie->fq_domain, domain);
	return 0;
}

static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
{
	if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
		return cookie->iovad.granule;
	return PAGE_SIZE;
}

static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
{
	struct iommu_dma_cookie *cookie;

	cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
	if (cookie) {
		INIT_LIST_HEAD(&cookie->msi_page_list);
		cookie->type = type;
	}
	return cookie;
}

/**
 * iommu_get_dma_cookie - Acquire DMA-API resources for a domain
 * @domain: IOMMU domain to prepare for DMA-API usage
 */
int iommu_get_dma_cookie(struct iommu_domain *domain)
{
	if (domain->iova_cookie)
		return -EEXIST;

	domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE);
	if (!domain->iova_cookie)
		return -ENOMEM;

	mutex_init(&domain->iova_cookie->mutex);
	return 0;
}

/**
 * iommu_get_msi_cookie - Acquire just MSI remapping resources
 * @domain: IOMMU domain to prepare
 * @base: Start address of IOVA region for MSI mappings
 *
 * Users who manage their own IOVA allocation and do not want DMA API support,
 * but would still like to take advantage of automatic MSI remapping, can use
 * this to initialise their own domain appropriately. Users should reserve a
 * contiguous IOVA region, starting at @base, large enough to accommodate the
 * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
 * used by the devices attached to @domain.
 */
int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
{
	struct iommu_dma_cookie *cookie;

	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	if (domain->iova_cookie)
		return -EEXIST;

	cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
	if (!cookie)
		return -ENOMEM;

	cookie->msi_iova = base;
	domain->iova_cookie = cookie;
	return 0;
}
EXPORT_SYMBOL(iommu_get_msi_cookie);

/**
 * iommu_put_dma_cookie - Release a domain's DMA mapping resources
 * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or
 *          iommu_get_msi_cookie()
 */
void iommu_put_dma_cookie(struct iommu_domain *domain)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iommu_dma_msi_page *msi, *tmp;

	if (!cookie)
		return;

	if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule) {
		iommu_dma_free_fq(cookie);
		put_iova_domain(&cookie->iovad);
	}

	list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
		list_del(&msi->list);
		kfree(msi);
	}
	kfree(cookie);
	domain->iova_cookie = NULL;
}

/**
 * iommu_dma_get_resv_regions - Reserved region driver helper
 * @dev: Device from iommu_get_resv_regions()
 * @list: Reserved region list from iommu_get_resv_regions()
 *
 * IOMMU drivers can use this to implement their .get_resv_regions callback
 * for general non-IOMMU-specific reservations. Currently, this covers GICv3
 * ITS region reservation on ACPI based ARM platforms that may require HW MSI
 * reservation.
 */
void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
{
	if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode))
		iort_iommu_get_resv_regions(dev, list);

	if (dev->of_node)
		of_iommu_get_resv_regions(dev, list);
}
EXPORT_SYMBOL(iommu_dma_get_resv_regions);

static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
		phys_addr_t start, phys_addr_t end)
{
	struct iova_domain *iovad = &cookie->iovad;
	struct iommu_dma_msi_page *msi_page;
	int i, num_pages;

	start -= iova_offset(iovad, start);
	num_pages = iova_align(iovad, end - start) >> iova_shift(iovad);

	for (i = 0; i < num_pages; i++) {
		msi_page = kmalloc(sizeof(*msi_page), GFP_KERNEL);
		if (!msi_page)
			return -ENOMEM;

		msi_page->phys = start;
		msi_page->iova = start;
		INIT_LIST_HEAD(&msi_page->list);
		list_add(&msi_page->list, &cookie->msi_page_list);
		start += iovad->granule;
	}

	return 0;
}

static int iommu_dma_ranges_sort(void *priv, const struct list_head *a,
		const struct list_head *b)
{
	struct resource_entry *res_a = list_entry(a, typeof(*res_a), node);
	struct resource_entry *res_b = list_entry(b, typeof(*res_b), node);

	return res_a->res->start > res_b->res->start;
}

static int iova_reserve_pci_windows(struct pci_dev *dev,
		struct iova_domain *iovad)
{
	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
	struct resource_entry *window;
	unsigned long lo, hi;
	phys_addr_t start = 0, end;

	resource_list_for_each_entry(window, &bridge->windows) {
		if (resource_type(window->res) != IORESOURCE_MEM)
			continue;

		lo = iova_pfn(iovad, window->res->start - window->offset);
		hi = iova_pfn(iovad, window->res->end - window->offset);
		reserve_iova(iovad, lo, hi);
	}

	/* Get reserved DMA windows from host bridge */
	list_sort(NULL, &bridge->dma_ranges, iommu_dma_ranges_sort);
	resource_list_for_each_entry(window, &bridge->dma_ranges) {
		end = window->res->start - window->offset;
resv_iova:
		if (end > start) {
			lo = iova_pfn(iovad, start);
			hi = iova_pfn(iovad, end);
			reserve_iova(iovad, lo, hi);
		} else if (end < start) {
			/* DMA ranges should be non-overlapping */
			dev_err(&dev->dev,
				"Failed to reserve IOVA [%pa-%pa]\n",
				&start, &end);
			return -EINVAL;
		}

		start = window->res->end - window->offset + 1;
		/* If window is last entry */
		if (window->node.next == &bridge->dma_ranges &&
		    end != ~(phys_addr_t)0) {
			end = ~(phys_addr_t)0;
			goto resv_iova;
		}
	}

	return 0;
}

static int iova_reserve_iommu_regions(struct device *dev,
		struct iommu_domain *domain)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);
	int ret = 0;

	if (dev_is_pci(dev)) {
		ret = iova_reserve_pci_windows(to_pci_dev(dev), iovad);
		if (ret)
			return ret;
	}

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list) {
		unsigned long lo, hi;

		/* We ARE the software that manages these! */
		if (region->type == IOMMU_RESV_SW_MSI)
			continue;

		lo = iova_pfn(iovad, region->start);
		hi = iova_pfn(iovad, region->start + region->length - 1);
		reserve_iova(iovad, lo, hi);

		if (region->type == IOMMU_RESV_MSI)
			ret = cookie_init_hw_msi_region(cookie, region->start,
					region->start + region->length);
		if (ret)
			break;
	}
	iommu_put_resv_regions(dev, &resv_regions);

	return ret;
}

static bool dev_is_untrusted(struct device *dev)
{
	return dev_is_pci(dev) && to_pci_dev(dev)->untrusted;
}

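/*
 * Bounce buffering is needed either for untrusted devices or when a
 * kmalloc()ed buffer is not DMA-safe for this device and direction.
 */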
static bool dev_use_swiotlb(struct device *dev, size_t size,
			    enum dma_data_direction dir)
{
	return IS_ENABLED(CONFIG_SWIOTLB) &&
		(dev_is_untrusted(dev) ||
		 dma_kmalloc_needs_bounce(dev, size, dir));
}

static bool dev_use_sg_swiotlb(struct device *dev, struct scatterlist *sg,
			       int nents, enum dma_data_direction dir)
{
	struct scatterlist *s;
	int i;

	if (!IS_ENABLED(CONFIG_SWIOTLB))
		return false;

	if (dev_is_untrusted(dev))
		return true;

	/*
	 * If kmalloc() buffers are not DMA-safe for this device and
	 * direction, check the individual lengths in the sg list. If any
	 * element is deemed unsafe, use the swiotlb for bouncing.
	 */
	if (!dma_kmalloc_safe(dev, dir)) {
		for_each_sg(sg, s, nents, i)
			if (!dma_kmalloc_size_aligned(s->length))
				return true;
	}

	return false;
}

/**
 * iommu_dma_init_options - Initialize dma-iommu options
 * @options: The options to be initialized
 * @dev: Device the options are set for
 *
 * This allows tuning dma-iommu specific to device properties
 */
static void iommu_dma_init_options(struct iommu_dma_options *options,
				   struct device *dev)
{
	/* Shadowing IOTLB flushes do better with a single large queue */
	if (dev->iommu->shadow_on_flush) {
		options->qt = IOMMU_DMA_OPTS_SINGLE_QUEUE;
		options->fq_timeout = IOVA_SINGLE_FQ_TIMEOUT;
		options->fq_size = IOVA_SINGLE_FQ_SIZE;
	} else {
		options->qt = IOMMU_DMA_OPTS_PER_CPU_QUEUE;
		options->fq_size = IOVA_DEFAULT_FQ_SIZE;
		options->fq_timeout = IOVA_DEFAULT_FQ_TIMEOUT;
	}
}

/**
 * iommu_dma_init_domain - Initialise a DMA mapping domain
 * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
 * @base: IOVA at which the mappable address space starts
 * @limit: Last address of the IOVA space
 * @dev: Device the domain is being initialised for
 *
 * @base and @limit + 1 should be exact multiples of IOMMU page granularity to
 * avoid rounding surprises. If necessary, we reserve the page at address 0
 * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
 * any change which could make prior IOVAs invalid will fail.
 */
static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
				 dma_addr_t limit, struct device *dev)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	unsigned long order, base_pfn;
	struct iova_domain *iovad;
	int ret;

	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
		return -EINVAL;

	iovad = &cookie->iovad;

	/* Use the smallest supported page size for IOVA granularity */
	order = __ffs(domain->pgsize_bitmap);
	base_pfn = max_t(unsigned long, 1, base >> order);

	/* Check the domain allows at least some access to the device... */
	if (domain->geometry.force_aperture) {
		if (base > domain->geometry.aperture_end ||
		    limit < domain->geometry.aperture_start) {
			pr_warn("specified DMA range outside IOMMU capability\n");
			return -EFAULT;
		}
		/* ...then finally give it a kicking to make sure it fits */
		base_pfn = max_t(unsigned long, base_pfn,
				domain->geometry.aperture_start >> order);
	}

	/* start_pfn is always nonzero for an already-initialised domain */
	mutex_lock(&cookie->mutex);
	if (iovad->start_pfn) {
		if (1UL << order != iovad->granule ||
		    base_pfn != iovad->start_pfn) {
			pr_warn("Incompatible range for DMA domain\n");
			ret = -EFAULT;
			goto done_unlock;
		}

		ret = 0;
		goto done_unlock;
	}

	init_iova_domain(iovad, 1UL << order, base_pfn);
	ret = iova_domain_init_rcaches(iovad);
	if (ret)
		goto done_unlock;

	iommu_dma_init_options(&cookie->options, dev);

	/* If the FQ fails we can simply fall back to strict mode */
	if (domain->type == IOMMU_DOMAIN_DMA_FQ &&
	    (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain)))
		domain->type = IOMMU_DOMAIN_DMA;

	ret = iova_reserve_iommu_regions(dev, domain);

done_unlock:
	mutex_unlock(&cookie->mutex);
	return ret;
}

/**
 * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
 *                    page flags.
 * @dir: Direction of DMA transfer
 * @coherent: Is the DMA master cache-coherent?
 * @attrs: DMA attributes for the mapping
 *
 * Return: corresponding IOMMU API page protection flags
 */
static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
		     unsigned long attrs)
{
	int prot = coherent ? IOMMU_CACHE : 0;

	if (attrs & DMA_ATTR_PRIVILEGED)
		prot |= IOMMU_PRIV;

	switch (dir) {
	case DMA_BIDIRECTIONAL:
		return prot | IOMMU_READ | IOMMU_WRITE;
	case DMA_TO_DEVICE:
		return prot | IOMMU_READ;
	case DMA_FROM_DEVICE:
		return prot | IOMMU_WRITE;
	default:
		return 0;
	}
}

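/*
 * Allocate IOVA space for @size bytes below @dma_limit. MSI cookies use a
 * trivial linear allocator; otherwise allocate from the IOVA domain,
 * optionally trying 32-bit addresses first (see below).
 */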
static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
		size_t size, u64 dma_limit, struct device *dev)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	unsigned long shift, iova_len, iova;

	if (cookie->type == IOMMU_DMA_MSI_COOKIE) {
		cookie->msi_iova += size;
		return cookie->msi_iova - size;
	}

	shift = iova_shift(iovad);
	iova_len = size >> shift;

	dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit);

	if (domain->geometry.force_aperture)
		dma_limit = min(dma_limit, (u64)domain->geometry.aperture_end);

	/*
	 * Try to use all the 32-bit PCI addresses first. The original SAC vs.
	 * DAC reasoning loses relevance with PCIe, but enough hardware and
	 * firmware bugs are still lurking out there that it's safest not to
	 * venture into the 64-bit space until necessary.
	 *
	 * If your device goes wrong after seeing the notice then likely either
	 * its driver is not setting DMA masks accurately, the hardware has
	 * some inherent bug in handling >32-bit addresses, or not all the
	 * expected address bits are wired up between the device and the IOMMU.
	 */
	if (dma_limit > DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
		iova = alloc_iova_fast(iovad, iova_len,
				       DMA_BIT_MASK(32) >> shift, false);
		if (iova)
			goto done;

		dev->iommu->pci_32bit_workaround = false;
		dev_notice(dev, "Using %d-bit DMA addresses\n", bits_per(dma_limit));
	}

	iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true);
done:
	return (dma_addr_t)iova << shift;
}

static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
		dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather)
{
	struct iova_domain *iovad = &cookie->iovad;

	/* The MSI case is only ever cleaning up its most recent allocation */
	if (cookie->type == IOMMU_DMA_MSI_COOKIE)
		cookie->msi_iova -= size;
	else if (gather && gather->queued)
		queue_iova(cookie, iova_pfn(iovad, iova),
				size >> iova_shift(iovad),
				&gather->freelist);
	else
		free_iova_fast(iovad, iova_pfn(iovad, iova),
				size >> iova_shift(iovad));
}

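/*
 * Tear down a DMA mapping: unmap the IOVA range from the domain, then
 * either queue the IOVA for deferred freeing (flush queue in use) or
 * sync the IOTLB and free it immediately.
 */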
static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
		size_t size)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	size_t iova_off = iova_offset(iovad, dma_addr);
	struct iommu_iotlb_gather iotlb_gather;
	size_t unmapped;

	dma_addr -= iova_off;
	size = iova_align(iovad, size + iova_off);
	iommu_iotlb_gather_init(&iotlb_gather);
	iotlb_gather.queued = READ_ONCE(cookie->fq_domain);

	unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather);
	WARN_ON(unmapped != size);

	if (!iotlb_gather.queued)
		iommu_iotlb_sync(domain, &iotlb_gather);
	iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
}

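/*
 * Common helper to map @phys for DMA: allocate a granule-aligned IOVA
 * range within @dma_mask and create the IOMMU mapping with @prot.
 * Returns the DMA address, or DMA_MAPPING_ERROR on failure.
 */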
static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
		size_t size, int prot, u64 dma_mask)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	size_t iova_off = iova_offset(iovad, phys);
	dma_addr_t iova;

	if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
	    iommu_deferred_attach(dev, domain))
		return DMA_MAPPING_ERROR;

	/* If anyone ever wants this we'd need support in the IOVA allocator */
	if (dev_WARN_ONCE(dev, dma_get_min_align_mask(dev) > iova_mask(iovad),
	    "Unsupported alignment constraint\n"))
		return DMA_MAPPING_ERROR;

	size = iova_align(iovad, size + iova_off);

	iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev);
	if (!iova)
		return DMA_MAPPING_ERROR;

	if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) {
		iommu_dma_free_iova(cookie, iova, size, NULL);
		return DMA_MAPPING_ERROR;
	}
	return iova + iova_off;
}

static void __iommu_dma_free_pages(struct page **pages, int count)
{
	while (count--)
		__free_page(pages[count]);
	kvfree(pages);
}

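/*
 * Allocate @count pages for a non-contiguous buffer, opportunistically
 * using the largest orders allowed by @order_mask and splitting them so
 * the result can be handled as an array of order-0 pages.
 */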
static struct page **__iommu_dma_alloc_pages(struct device *dev,
		unsigned int count, unsigned long order_mask, gfp_t gfp)
{
	struct page **pages;
	unsigned int i = 0, nid = dev_to_node(dev);

	order_mask &= GENMASK(MAX_PAGE_ORDER, 0);
	if (!order_mask)
		return NULL;

	pages = kvcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	/* IOMMU can map any pages, so highmem can also be used here */
	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;

	while (count) {
		struct page *page = NULL;
		unsigned int order_size;

		/*
		 * Higher-order allocations are a convenience rather
		 * than a necessity, hence using __GFP_NORETRY until
		 * falling back to minimum-order allocations.
		 */
		for (order_mask &= GENMASK(__fls(count), 0);
		     order_mask; order_mask &= ~order_size) {
			unsigned int order = __fls(order_mask);
			gfp_t alloc_flags = gfp;

			order_size = 1U << order;
			if (order_mask > order_size)
				alloc_flags |= __GFP_NORETRY;
			page = alloc_pages_node(nid, alloc_flags, order);
			if (!page)
				continue;
			if (order)
				split_page(page, order);
			break;
		}
		if (!page) {
			__iommu_dma_free_pages(pages, i);
			return NULL;
		}
		count -= order_size;
		while (order_size--)
			pages[i++] = page++;
	}
	return pages;
}

/*
 * If size is less than PAGE_SIZE, then a full CPU page will be allocated,
 * but an IOMMU which supports smaller pages might not map the whole thing.
 */
static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
		size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot,
		unsigned long attrs)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	bool coherent = dev_is_dma_coherent(dev);
	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
	unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
	struct page **pages;
	dma_addr_t iova;
	ssize_t ret;

	if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
	    iommu_deferred_attach(dev, domain))
		return NULL;

	min_size = alloc_sizes & -alloc_sizes;
	if (min_size < PAGE_SIZE) {
		min_size = PAGE_SIZE;
		alloc_sizes |= PAGE_SIZE;
	} else {
		size = ALIGN(size, min_size);
	}
	if (attrs & DMA_ATTR_ALLOC_SINGLE_PAGES)
		alloc_sizes = min_size;

	count = PAGE_ALIGN(size) >> PAGE_SHIFT;
	pages = __iommu_dma_alloc_pages(dev, count, alloc_sizes >> PAGE_SHIFT,
					gfp);
	if (!pages)
		return NULL;

	size = iova_align(iovad, size);
	iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
	if (!iova)
		goto out_free_pages;

	/*
	 * Remove the zone/policy flags from the GFP - these are applied to the
	 * __iommu_dma_alloc_pages() but are not used for the supporting
	 * internal allocations that follow.
	 */
	gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM | __GFP_COMP);

	if (sg_alloc_table_from_pages(sgt, pages, count, 0, size, gfp))
		goto out_free_iova;

	if (!(ioprot & IOMMU_CACHE)) {
		struct scatterlist *sg;
		int i;

		for_each_sg(sgt->sgl, sg, sgt->orig_nents, i)
			arch_dma_prep_coherent(sg_page(sg), sg->length);
	}

	ret = iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, ioprot,
			   gfp);
	if (ret < 0 || ret < size)
		goto out_free_sg;

	sgt->sgl->dma_address = iova;
	sgt->sgl->dma_length = size;
	return pages;

out_free_sg:
	sg_free_table(sgt);
out_free_iova:
	iommu_dma_free_iova(cookie, iova, size, NULL);
out_free_pages:
	__iommu_dma_free_pages(pages, count);
	return NULL;
}

static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
		dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
		unsigned long attrs)
{
	struct page **pages;
	struct sg_table sgt;
	void *vaddr;

	pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot,
						attrs);
	if (!pages)
		return NULL;
	*dma_handle = sgt.sgl->dma_address;
	sg_free_table(&sgt);
	vaddr = dma_common_pages_remap(pages, size, prot,
			__builtin_return_address(0));
	if (!vaddr)
		goto out_unmap;
	return vaddr;

out_unmap:
	__iommu_dma_unmap(dev, *dma_handle, size);
	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
	return NULL;
}

static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev,
		size_t size, enum dma_data_direction dir, gfp_t gfp,
		unsigned long attrs)
{
	struct dma_sgt_handle *sh;

	sh = kmalloc(sizeof(*sh), gfp);
	if (!sh)
		return NULL;

	sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp,
						    PAGE_KERNEL, attrs);
	if (!sh->pages) {
		kfree(sh);
		return NULL;
	}
	return &sh->sgt;
}

static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
		struct sg_table *sgt, enum dma_data_direction dir)
{
	struct dma_sgt_handle *sh = sgt_handle(sgt);

	__iommu_dma_unmap(dev, sgt->sgl->dma_address, size);
	__iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
	sg_free_table(&sh->sgt);
	kfree(sh);
}

static void iommu_dma_sync_single_for_cpu(struct device *dev,
		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
{
	phys_addr_t phys;

	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev, size, dir))
		return;

	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
	if (!dev_is_dma_coherent(dev))
		arch_sync_dma_for_cpu(phys, size, dir);

	if (is_swiotlb_buffer(dev, phys))
		swiotlb_sync_single_for_cpu(dev, phys, size, dir);
}

static void iommu_dma_sync_single_for_device(struct device *dev,
		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
{
	phys_addr_t phys;

	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev, size, dir))
		return;

	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
	if (is_swiotlb_buffer(dev, phys))
		swiotlb_sync_single_for_device(dev, phys, size, dir);

	if (!dev_is_dma_coherent(dev))
		arch_sync_dma_for_device(phys, size, dir);
}

static void iommu_dma_sync_sg_for_cpu(struct device *dev,
		struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	if (sg_dma_is_swiotlb(sgl))
		for_each_sg(sgl, sg, nelems, i)
			iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
						      sg->length, dir);
	else if (!dev_is_dma_coherent(dev))
		for_each_sg(sgl, sg, nelems, i)
			arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
}

static void iommu_dma_sync_sg_for_device(struct device *dev,
		struct scatterlist *sgl, int nelems,
		enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	if (sg_dma_is_swiotlb(sgl))
		for_each_sg(sgl, sg, nelems, i)
			iommu_dma_sync_single_for_device(dev,
							 sg_dma_address(sg),
							 sg->length, dir);
	else if (!dev_is_dma_coherent(dev))
		for_each_sg(sgl, sg, nelems, i)
			arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
}

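/*
 * Map a single page for streaming DMA. Buffers which need bouncing
 * (untrusted device or DMA-unsafe kmalloc() buffer) and are not granule
 * aligned are first copied into a swiotlb slot, with the padding zeroed
 * so no stale data is exposed to the device.
 */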
static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
		unsigned long offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	phys_addr_t phys = page_to_phys(page) + offset;
	bool coherent = dev_is_dma_coherent(dev);
	int prot = dma_info_to_prot(dir, coherent, attrs);
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	dma_addr_t iova, dma_mask = dma_get_mask(dev);

	/*
	 * If both the physical buffer start address and size are
	 * page aligned, we don't need to use a bounce page.
	 */
	if (dev_use_swiotlb(dev, size, dir) &&
	    iova_offset(iovad, phys | size)) {
		void *padding_start;
		size_t padding_size, aligned_size;

		if (!is_swiotlb_active(dev)) {
			dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n");
			return DMA_MAPPING_ERROR;
		}

		trace_swiotlb_bounced(dev, phys, size);

		aligned_size = iova_align(iovad, size);
		phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size,
					      iova_mask(iovad), dir, attrs);

		if (phys == DMA_MAPPING_ERROR)
			return DMA_MAPPING_ERROR;

		/* Cleanup the padding area. */
		padding_start = phys_to_virt(phys);
		padding_size = aligned_size;

		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
		    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) {
			padding_start += size;
			padding_size -= size;
		}

		memset(padding_start, 0, padding_size);
	}

	if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		arch_sync_dma_for_device(phys, size, dir);

	iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
	if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
	return iova;
}

static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	phys_addr_t phys;

	phys = iommu_iova_to_phys(domain, dma_handle);
	if (WARN_ON(!phys))
		return;

	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev))
		arch_sync_dma_for_cpu(phys, size, dir);

	__iommu_dma_unmap(dev, dma_handle, size);

	if (unlikely(is_swiotlb_buffer(dev, phys)))
		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}

/*
 * Prepare a successfully-mapped scatterlist to give back to the caller.
 *
 * At this point the segments are already laid out by iommu_dma_map_sg() to
 * avoid individually crossing any boundaries, so we merely need to check a
 * segment's start address to avoid concatenating across one.
 */
static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
		dma_addr_t dma_addr)
{
	struct scatterlist *s, *cur = sg;
	unsigned long seg_mask = dma_get_seg_boundary(dev);
	unsigned int cur_len = 0, max_len = dma_get_max_seg_size(dev);
	int i, count = 0;

	for_each_sg(sg, s, nents, i) {
		/* Restore this segment's original unaligned fields first */
		dma_addr_t s_dma_addr = sg_dma_address(s);
		unsigned int s_iova_off = sg_dma_address(s);
		unsigned int s_length = sg_dma_len(s);
		unsigned int s_iova_len = s->length;

		sg_dma_address(s) = DMA_MAPPING_ERROR;
		sg_dma_len(s) = 0;

		if (sg_dma_is_bus_address(s)) {
			if (i > 0)
				cur = sg_next(cur);

			sg_dma_unmark_bus_address(s);
			sg_dma_address(cur) = s_dma_addr;
			sg_dma_len(cur) = s_length;
			sg_dma_mark_bus_address(cur);
			count++;
			cur_len = 0;
			continue;
		}

		s->offset += s_iova_off;
		s->length = s_length;

		/*
		 * Now fill in the real DMA data. If...
		 * - there is a valid output segment to append to
		 * - and this segment starts on an IOVA page boundary
		 * - but doesn't fall at a segment boundary
		 * - and wouldn't make the resulting output segment too long
		 */
		if (cur_len && !s_iova_off && (dma_addr & seg_mask) &&
		    (max_len - cur_len >= s_length)) {
			/* ...then concatenate it with the previous one */
			cur_len += s_length;
		} else {
			/* Otherwise start the next output segment */
			if (i > 0)
				cur = sg_next(cur);
			cur_len = s_length;
			count++;

			sg_dma_address(cur) = dma_addr + s_iova_off;
		}

		sg_dma_len(cur) = cur_len;
		dma_addr += s_iova_len;

		if (s_length + s_iova_off < s_iova_len)
			cur_len = 0;
	}
	return count;
}

/*
 * If mapping failed, then just restore the original list,
 * but making sure the DMA fields are invalidated.
 */
static void __invalidate_sg(struct scatterlist *sg, int nents)
{
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i) {
		if (sg_dma_is_bus_address(s)) {
			sg_dma_unmark_bus_address(s);
		} else {
			if (sg_dma_address(s) != DMA_MAPPING_ERROR)
				s->offset += sg_dma_address(s);
			if (sg_dma_len(s))
				s->length = sg_dma_len(s);
		}
		sg_dma_address(s) = DMA_MAPPING_ERROR;
		sg_dma_len(s) = 0;
	}
}

static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *sg,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i)
		iommu_dma_unmap_page(dev, sg_dma_address(s),
				sg_dma_len(s), dir, attrs);
}

static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	struct scatterlist *s;
	int i;

	sg_dma_mark_swiotlb(sg);

	for_each_sg(sg, s, nents, i) {
		sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s),
				s->offset, s->length, dir, attrs);
		if (sg_dma_address(s) == DMA_MAPPING_ERROR)
			goto out_unmap;
		sg_dma_len(s) = s->length;
	}

	return nents;

out_unmap:
	iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
	return -EIO;
}

/*
 * The DMA API client is passing in a scatterlist which could describe
 * any old buffer layout, but the IOMMU API requires everything to be
 * aligned to IOMMU pages. Hence the need for this complicated bit of
 * impedance-matching, to be able to hand off a suitably-aligned list,
 * but still preserve the original offsets and sizes for the caller.
 */
static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
	struct scatterlist *s, *prev = NULL;
	int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
	struct pci_p2pdma_map_state p2pdma_state = {};
	enum pci_p2pdma_map_type map;
	dma_addr_t iova;
	size_t iova_len = 0;
	unsigned long mask = dma_get_seg_boundary(dev);
	ssize_t ret;
	int i;

	if (static_branch_unlikely(&iommu_deferred_attach_enabled)) {
		ret = iommu_deferred_attach(dev, domain);
		if (ret)
			goto out;
	}

	if (dev_use_sg_swiotlb(dev, sg, nents, dir))
		return iommu_dma_map_sg_swiotlb(dev, sg, nents, dir, attrs);

	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		iommu_dma_sync_sg_for_device(dev, sg, nents, dir);

	/*
	 * Work out how much IOVA space we need, and align the segments to
	 * IOVA granules for the IOMMU driver to handle. With some clever
	 * trickery we can modify the list in-place, but reversibly, by
	 * stashing the unaligned parts in the as-yet-unused DMA fields.
	 */
	for_each_sg(sg, s, nents, i) {
		size_t s_iova_off = iova_offset(iovad, s->offset);
		size_t s_length = s->length;
		size_t pad_len = (mask - iova_len + 1) & mask;

		if (is_pci_p2pdma_page(sg_page(s))) {
			map = pci_p2pdma_map_segment(&p2pdma_state, dev, s);
			switch (map) {
			case PCI_P2PDMA_MAP_BUS_ADDR:
				/*
				 * iommu_map_sg() will skip this segment as
				 * it is marked as a bus address,
				 * __finalise_sg() will copy the dma address
				 * into the output segment.
				 */
				continue;
			case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
				/*
				 * P2P transfers through the host bridge are
				 * mapped with regular IOVAs, so do nothing
				 * here and continue below.
				 */
				break;
			default:
				ret = -EREMOTEIO;
				goto out_restore_sg;
			}
		}

		sg_dma_address(s) = s_iova_off;
		sg_dma_len(s) = s_length;
		s->offset -= s_iova_off;
		s_length = iova_align(iovad, s_length + s_iova_off);
		s->length = s_length;

		/*
		 * Due to the alignment of our single IOVA allocation, we can
		 * depend on these assumptions about the segment boundary mask:
		 * - If mask size >= IOVA size, then the IOVA range cannot
		 *   possibly fall across a boundary, so we don't care.
		 * - If mask size < IOVA size, then the IOVA range must start
		 *   exactly on a boundary, therefore we can lay things out
		 *   based purely on segment lengths without needing to know
		 *   the actual addresses beforehand.
		 * - The mask must be a power of 2, so pad_len == 0 if
		 *   iova_len == 0, thus we cannot dereference prev the first
		 *   time through here (i.e. before it has a meaningful value).
		 */
		if (pad_len && pad_len < s_length - 1) {
			prev->length += pad_len;
			iova_len += pad_len;
		}

		iova_len += s_length;
		prev = s;
	}

	if (!iova_len)
		return __finalise_sg(dev, sg, nents, 0);

	iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
	if (!iova) {
		ret = -ENOMEM;
		goto out_restore_sg;
	}

	/*
	 * We'll leave any physical concatenation to the IOMMU driver's
	 * implementation - it knows better than we do.
	 */
	ret = iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
	if (ret < 0 || ret < iova_len)
		goto out_free_iova;

	return __finalise_sg(dev, sg, nents, iova);

out_free_iova:
	iommu_dma_free_iova(cookie, iova, iova_len, NULL);
out_restore_sg:
	__invalidate_sg(sg, nents);
out:
	if (ret != -ENOMEM && ret != -EREMOTEIO)
		return -EINVAL;
	return ret;
}

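/*
 * Unmap a scatterlist previously mapped by iommu_dma_map_sg(): walk the
 * list to recover the start and end of the single contiguous IOVA
 * allocation (skipping PCI P2P bus-address segments) and unmap it.
 */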
static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	dma_addr_t end = 0, start;
	struct scatterlist *tmp;
	int i;

	if (sg_dma_is_swiotlb(sg)) {
		iommu_dma_unmap_sg_swiotlb(dev, sg, nents, dir, attrs);
		return;
	}

	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		iommu_dma_sync_sg_for_cpu(dev, sg, nents, dir);

	/*
	 * The scatterlist segments are mapped into a single
	 * contiguous IOVA allocation, the start and end points
	 * just have to be determined.
	 */
	for_each_sg(sg, tmp, nents, i) {
		if (sg_dma_is_bus_address(tmp)) {
			sg_dma_unmark_bus_address(tmp);
			continue;
		}

		if (sg_dma_len(tmp) == 0)
			break;

		start = sg_dma_address(tmp);
		break;
	}

	nents -= i;
	for_each_sg(tmp, tmp, nents, i) {
		if (sg_dma_is_bus_address(tmp)) {
			sg_dma_unmark_bus_address(tmp);
			continue;
		}

		if (sg_dma_len(tmp) == 0)
			break;

		end = sg_dma_address(tmp) + sg_dma_len(tmp);
	}

	if (end)
		__iommu_dma_unmap(dev, start, end - start);
}

static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	return __iommu_dma_map(dev, phys, size,
			dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO,
			dma_get_mask(dev));
}

static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	__iommu_dma_unmap(dev, handle, size);
}

static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
{
	size_t alloc_size = PAGE_ALIGN(size);
	int count = alloc_size >> PAGE_SHIFT;
	struct page *page = NULL, **pages = NULL;

	/* Non-coherent atomic allocation? Easy */
	if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
	    dma_free_from_pool(dev, cpu_addr, alloc_size))
		return;

	if (is_vmalloc_addr(cpu_addr)) {
		/*
		 * If the address is remapped, then it's either non-coherent
		 * or highmem CMA, or an iommu_dma_alloc_remap() construction.
		 */
		pages = dma_common_find_pages(cpu_addr);
		if (!pages)
			page = vmalloc_to_page(cpu_addr);
		dma_common_free_remap(cpu_addr, alloc_size);
	} else {
		/* Lowmem means a coherent atomic or CMA allocation */
		page = virt_to_page(cpu_addr);
	}

	if (pages)
		__iommu_dma_free_pages(pages, count);
	if (page)
		dma_free_contiguous(dev, page, alloc_size);
}

static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
		dma_addr_t handle, unsigned long attrs)
{
	__iommu_dma_unmap(dev, handle, size);
	__iommu_dma_free(dev, size, cpu_addr);
}

static void *iommu_dma_alloc_pages(struct device *dev, size_t size,
		struct page **pagep, gfp_t gfp, unsigned long attrs)
{
	bool coherent = dev_is_dma_coherent(dev);
	size_t alloc_size = PAGE_ALIGN(size);
	int node = dev_to_node(dev);
	struct page *page = NULL;
	void *cpu_addr;

	page = dma_alloc_contiguous(dev, alloc_size, gfp);
	if (!page)
		page = alloc_pages_node(node, gfp, get_order(alloc_size));
	if (!page)
		return NULL;

	if (!coherent || PageHighMem(page)) {
		pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);

		cpu_addr = dma_common_contiguous_remap(page, alloc_size,
				prot, __builtin_return_address(0));
		if (!cpu_addr)
			goto out_free_pages;

		if (!coherent)
			arch_dma_prep_coherent(page, size);
	} else {
		cpu_addr = page_address(page);
	}

	*pagep = page;
	memset(cpu_addr, 0, alloc_size);
	return cpu_addr;
out_free_pages:
	dma_free_contiguous(dev, page, alloc_size);
	return NULL;
}

static void *iommu_dma_alloc(struct device *dev, size_t size,
		dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
{
	bool coherent = dev_is_dma_coherent(dev);
	int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
	struct page *page = NULL;
	void *cpu_addr;

	gfp |= __GFP_ZERO;

	if (gfpflags_allow_blocking(gfp) &&
	    !(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) {
		return iommu_dma_alloc_remap(dev, size, handle, gfp,
				dma_pgprot(dev, PAGE_KERNEL, attrs), attrs);
	}

	if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
	    !gfpflags_allow_blocking(gfp) && !coherent)
		page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr,
					       gfp, NULL);
	else
		cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
	if (!cpu_addr)
		return NULL;

	*handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot,
			dev->coherent_dma_mask);
	if (*handle == DMA_MAPPING_ERROR) {
		__iommu_dma_free(dev, size, cpu_addr);
		return NULL;
	}

	return cpu_addr;
}

static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
		void *cpu_addr, dma_addr_t dma_addr, size_t size,
		unsigned long attrs)
{
	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	unsigned long pfn, off = vma->vm_pgoff;
	int ret;

	vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);

	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
		return ret;

	if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
		return -ENXIO;

	if (is_vmalloc_addr(cpu_addr)) {
		struct page **pages = dma_common_find_pages(cpu_addr);

		if (pages)
			return vm_map_pages(vma, pages, nr_pages);
		pfn = vmalloc_to_pfn(cpu_addr);
	} else {
		pfn = page_to_pfn(virt_to_page(cpu_addr));
	}

	return remap_pfn_range(vma, vma->vm_start, pfn + off,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}

static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
		void *cpu_addr, dma_addr_t dma_addr, size_t size,
		unsigned long attrs)
{
	struct page *page;
	int ret;

	if (is_vmalloc_addr(cpu_addr)) {
		struct page **pages = dma_common_find_pages(cpu_addr);

		if (pages) {
			return sg_alloc_table_from_pages(sgt, pages,
					PAGE_ALIGN(size) >> PAGE_SHIFT,
					0, size, GFP_KERNEL);
		}

		page = vmalloc_to_page(cpu_addr);
	} else {
		page = virt_to_page(cpu_addr);
	}

	ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
	if (!ret)
		sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
	return ret;
}

static unsigned long iommu_dma_get_merge_boundary(struct device *dev)
{
	struct iommu_domain *domain = iommu_get_dma_domain(dev);

	return (1UL << __ffs(domain->pgsize_bitmap)) - 1;
}

static size_t iommu_dma_opt_mapping_size(void)
{
	return iova_rcache_range();
}

static size_t iommu_dma_max_mapping_size(struct device *dev)
{
	if (dev_is_untrusted(dev))
		return swiotlb_max_mapping_size(dev);

	return SIZE_MAX;
}

static const struct dma_map_ops iommu_dma_ops = {
	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
	.alloc			= iommu_dma_alloc,
	.free			= iommu_dma_free,
	.alloc_pages		= dma_common_alloc_pages,
	.free_pages		= dma_common_free_pages,
	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
	.free_noncontiguous	= iommu_dma_free_noncontiguous,
	.mmap			= iommu_dma_mmap,
	.get_sgtable		= iommu_dma_get_sgtable,
	.map_page		= iommu_dma_map_page,
	.unmap_page		= iommu_dma_unmap_page,
	.map_sg			= iommu_dma_map_sg,
	.unmap_sg		= iommu_dma_unmap_sg,
	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
	.sync_single_for_device	= iommu_dma_sync_single_for_device,
	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
	.map_resource		= iommu_dma_map_resource,
	.unmap_resource		= iommu_dma_unmap_resource,
	.get_merge_boundary	= iommu_dma_get_merge_boundary,
	.opt_mapping_size	= iommu_dma_opt_mapping_size,
	.max_mapping_size       = iommu_dma_max_mapping_size,
};

/*
 * The IOMMU core code allocates the default DMA domain, which the underlying
 * IOMMU driver needs to support via the dma-iommu layer.
 */
void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
{
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

	if (!domain)
		goto out_err;

	/*
	 * The IOMMU core code allocates the default DMA domain, which the
	 * underlying IOMMU driver needs to support via the dma-iommu layer.
	 */
	if (iommu_is_dma_domain(domain)) {
		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
			goto out_err;
		dev->dma_ops = &iommu_dma_ops;
	}

	return;
out_err:
	pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
		dev_name(dev));
}
EXPORT_SYMBOL_GPL(iommu_setup_dma_ops);

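/*
 * Find or create the MSI page mapping the doorbell at @msi_addr, mapping
 * one granule of IOVA space for it on first use.
 */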
static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
		phys_addr_t msi_addr, struct iommu_domain *domain)
{
	struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iommu_dma_msi_page *msi_page;
	dma_addr_t iova;
	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
	size_t size = cookie_msi_granule(cookie);

	msi_addr &= ~(phys_addr_t)(size - 1);
	list_for_each_entry(msi_page, &cookie->msi_page_list, list)
		if (msi_page->phys == msi_addr)
			return msi_page;

	msi_page = kzalloc(sizeof(*msi_page), GFP_KERNEL);
	if (!msi_page)
		return NULL;

	iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
	if (!iova)
		goto out_free_page;

	if (iommu_map(domain, iova, msi_addr, size, prot, GFP_KERNEL))
		goto out_free_iova;

	INIT_LIST_HEAD(&msi_page->list);
	msi_page->phys = msi_addr;
	msi_page->iova = iova;
	list_add(&msi_page->list, &cookie->msi_page_list);
	return msi_page;

out_free_iova:
	iommu_dma_free_iova(cookie, iova, size, NULL);
out_free_page:
	kfree(msi_page);
	return NULL;
}

/**
 * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
 * @desc: MSI descriptor, will store the MSI page
 * @msi_addr: MSI target address to be mapped
 *
 * Return: 0 on success or negative error code if the mapping failed.
 */
int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
{
	struct device *dev = msi_desc_to_dev(desc);
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	struct iommu_dma_msi_page *msi_page;
	static DEFINE_MUTEX(msi_prepare_lock); /* see below */

	if (!domain || !domain->iova_cookie) {
		desc->iommu_cookie = NULL;
		return 0;
	}

	/*
	 * In fact the whole prepare operation should already be serialised by
	 * irq_domain_mutex further up the callchain, but that's pretty subtle
	 * on its own, so consider this locking as failsafe documentation...
	 */
	mutex_lock(&msi_prepare_lock);
	msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
	mutex_unlock(&msi_prepare_lock);

	msi_desc_set_iommu_cookie(desc, msi_page);

	if (!msi_page)
		return -ENOMEM;
	return 0;
}

/**
 * iommu_dma_compose_msi_msg() - Apply translation to an MSI message
 * @desc: MSI descriptor prepared by iommu_dma_prepare_msi()
 * @msg: MSI message containing target physical address
 */
void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
{
	struct device *dev = msi_desc_to_dev(desc);
	const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	const struct iommu_dma_msi_page *msi_page;

	msi_page = msi_desc_get_iommu_cookie(desc);

	if (!domain || !domain->iova_cookie || WARN_ON(!msi_page))
		return;

	msg->address_hi = upper_32_bits(msi_page->iova);
	msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1;
	msg->address_lo += lower_32_bits(msi_page->iova);
}

static int iommu_dma_init(void)
{
	if (is_kdump_kernel())
		static_branch_enable(&iommu_deferred_attach_enabled);

	return iova_cache_get();
}
arch_initcall(iommu_dma_init);