/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into the
 * kernel's virtual memory space, while scattered ABD data chunks are allocated
 * as physical pages and then mapped in only while they are actually being
 * accessed through one of the abd_* library functions. Using scattered ABDs
 * provides several benefits:
 *
 *  (1) They avoid use of kmem_*, preventing performance problems where running
 *      kmem_reap on very large memory systems never finishes and causes
 *      constant TLB shootdowns.
 *
 *  (2) Fragmentation is less of an issue since when we are at the limit of
 *      allocatable space, we won't have to search around for a long free
 *      hole in the VA space for large ARC allocations. Each chunk is mapped in
 *      individually, so even if we are using HIGHMEM (see next point) we
 *      wouldn't need to worry about finding a contiguous address range.
 *
 *  (3) If we are not using HIGHMEM, then all physical memory is always
 *      mapped into the kernel's address space, so we also avoid the map /
 *      unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space.  See abd_alloc_chunks() for details.
 */
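/*
 * For illustration only, a rough sketch of a typical consumer of this
 * interface (the public entry points live in abd.c and abd.h; the 8K size
 * and the "src_buf" name below are arbitrary).  Whether the allocation ends
 * up linear or scattered is decided by the policies described above and is
 * invisible to the caller:
 *
 *	abd_t *abd = abd_alloc(8192, B_FALSE);
 *	abd_copy_from_buf(abd, src_buf, 8192);
 *	data = abd_borrow_buf_copy(abd, 8192);
 *	... operate on the flat buffer "data" ...
 *	abd_return_buf_copy(abd, data, 8192);
 *	abd_free(abd);
 */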

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/scatterlist.h>
#else
#define	MAX_ORDER	1
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order.  These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
};

#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's.  Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page).  Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations.  This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * abd_zero_page will be an allocated, zero'd PAGESIZE buffer, which is
 * used to set each of the pages of abd_zero_scatter.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}
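/*
 * As a worked example (assuming 4K pages), abd_chunkcnt_for_bytes(5000)
 * above rounds the request up to 8192 bytes and returns 2, while an exact
 * multiple such as a 128K request returns 32.
 */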

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
	 */
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM		__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone.  Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction.  When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
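/*
 * As a concrete example (assuming 4K pages and the default
 * zfs_abd_scatter_max_order), a 128K request covers 32 pages, so the first
 * attempt is a single order-5 compound allocation (32 contiguous pages).
 * If that fails, max_order is lowered and the loop falls back to a mix of
 * smaller compound pages, ultimately degrading to 32 individual order-0
 * pages allocated with reclaim allowed.
 */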
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	while (alloc_pages < nr_pages) {
		unsigned chunk_pages;
		int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer.  All single-page (4K) ABD's can be
		 * represented this way.  Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages).  This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD.  This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces.  It's designed for maximum compatibility.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		abd_unmark_zfs_page(page);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}
	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

#define	zfs_kmap_atomic(chunk)		((void *)chunk)
#define	zfs_kunmap_atomic(addr)		do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		for (int j = 0; j < sg->length; j += PAGESIZE) {
			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
			umem_free(p, PAGESIZE);
		}
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	zfs_refcount_create(&abd_zero_scatter->abd_children);
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

boolean_t
abd_size_alloc_linear(size_t size)
{
	return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	size_t n;
	int i = 0;
	struct scatterlist *sg = NULL;

	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);
	n = ABD_SCATTER(abd).abd_nents;
	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#else
	umem_free(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);

	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, then we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages thereby eliminating
 * the copy.  But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
{
	int i = 0;
	struct scatterlist *sg = NULL;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;

	if (abd == NULL)
		abd = abd_alloc_struct(0);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
		if (new_offset < sg->length)
			break;
		new_offset -= sg->length;
	}

	ABD_SCATTER(abd).abd_sgl = sg;
	ABD_SCATTER(abd).abd_offset = new_offset;
	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
	aiter->iter_pos = 0;
	if (abd_is_linear(abd)) {
		aiter->iter_offset = 0;
		aiter->iter_sg = NULL;
	} else {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter has already been
 * exhausted, in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}
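/*
 * A minimal sketch of the iteration pattern these helpers support (this is
 * roughly what abd_iterate_func() in abd.c does; error handling omitted).
 * The chunk must be unmapped before advancing, since abd_iter_advance()
 * asserts that no mapping is outstanding:
 *
 *	struct abd_iter aiter;
 *	abd_iter_init(&aiter, abd);
 *	while (!abd_iter_at_end(&aiter)) {
 *		abd_iter_map(&aiter);
 *		size_t len = aiter.iter_mapsize;
 *		... process len bytes at aiter.iter_mapaddr ...
 *		abd_iter_unmap(&aiter);
 *		abd_iter_advance(&aiter, len);
 *	}
 */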

void
abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)
/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_gang(abd)) {
		unsigned long count = 0;

		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
		    cabd != NULL && size != 0;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT3U(off, <, cabd->abd_size);
			int mysize = MIN(size, cabd->abd_size - off);
			count += abd_nr_pages_off(cabd, mysize, off);
			size -= mysize;
			off = 0;
		}
		return (count);
	}

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = ABD_SCATTER(abd).abd_offset + off;

	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT));
}

static unsigned int
bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(buf_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(buf_ptr))
			page = vmalloc_to_page(buf_ptr);
		else
			page = virt_to_page(buf_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		buf_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

/*
 * bio_map for gang ABD.
 */
static unsigned int
abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	ASSERT(abd_is_gang(abd));

	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
	    cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		ASSERT3U(off, <, cabd->abd_size);
		int size = MIN(io_size, cabd->abd_size - off);
		int remainder = abd_bio_map_off(bio, cabd, size, off);
		io_size -= (size - remainder);
		if (io_size == 0 || remainder > 0)
			return (io_size);
		off = 0;
	}
	ASSERT0(io_size);
	return (io_size);
}

/*
 * bio_map for ABD.
 * @off is the offset in @abd
 * Remaining IO size is returned
 */
unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	struct abd_iter aiter;

	ASSERT3U(io_size, <=, abd->abd_size - off);
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));

	ASSERT(!abd_is_linear(abd));
	if (abd_is_gang(abd))
		return (abd_gang_bio_map_off(bio, abd, io_size, off));

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	for (int i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}
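/*
 * A nonzero return value means the bio ran out of vecs before io_size was
 * fully mapped.  Callers (e.g. the vdev_disk code) typically size each bio
 * with abd_nr_pages_off() and submit additional bios for the remainder,
 * roughly along these lines (bio allocation and submission details vary):
 *
 *	size_t remaining = io_size;
 *	while (remaining > 0) {
 *		bio = (allocate a bio with
 *		    abd_nr_pages_off(abd, remaining, off) vecs);
 *		size_t left = abd_bio_map_off(bio, abd, remaining, off);
 *		off += remaining - left;	(bytes actually mapped)
 *		remaining = left;
 *		(submit the bio)
 *	}
 */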

/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");
#endif