/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the ARC buffered data (ABD).
 *
 * Using a large proportion of scattered ABDs decreases ARC fragmentation
 * since, when we are at the limit of allocatable space, using equal-size
 * chunks allows us to quickly reclaim enough space for a new large
 * allocation (assuming it is also scattered).
 *
 * ABDs are allocated scattered by default unless the caller uses
 * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
};

/*
 * The size of the chunks ABD allocates. Because the sizes allocated from the
 * kmem_cache can't change, this tunable can only be modified at boot. Changing
 * it at runtime would cause ABD iteration to work incorrectly for ABDs which
 * were allocated with the old size, so a safeguard has been put in place which
 * will cause the machine to panic if you change it and try to access the data
 * within a scattered ABD.
 */
size_t zfs_abd_chunk_size = 4096;

#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);

SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
	&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks all point
 * to a single zero'd buffer of zfs_abd_chunk_size bytes. This allows us
 * to conserve memory by only using a single zero buffer for the scatter
 * chunks.
 */
abd_t *abd_zero_scatter = NULL;
static char *abd_zero_buf = NULL;

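/* Return a single scatter chunk to the chunk kmem cache. */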
static void
abd_free_chunk(void *c)
{
	kmem_cache_free(abd_chunk_cache, c);
}

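/*
 * Number of chunks needed to hold `size` bytes, rounding up to whole
 * chunks; e.g. with the default 4096-byte chunk size, a 6000-byte
 * request needs two chunks.
 */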
static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}

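/*
 * Number of chunks backing a scatter ABD, counting the chunk space
 * consumed by the ABD's starting offset within its first chunk.
 */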
static inline uint_t
abd_scatter_chunkcnt(abd_t *abd)
{
	ASSERT(!abd_is_linear(abd));
	return (abd_chunkcnt_for_bytes(
	    ABD_SCATTER(abd).abd_offset + abd->abd_size));
}

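/*
 * Report whether an allocation of `size` bytes should be linear: only
 * requests that fit within a single chunk are allocated linear.
 */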
boolean_t
abd_size_alloc_linear(size_t size)
{
	return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
}

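/*
 * Update the scatter kstats (and the ARC chunk-waste accounting) when a
 * scatter ABD is allocated (ABDSTAT_INCR) or freed (ABDSTAT_DECR). The
 * waste is the unused space at the end of the last chunk.
 */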
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	uint_t n = abd_scatter_chunkcnt(abd);
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = n * zfs_abd_chunk_size - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

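/*
 * Update the linear kstats when a linear ABD is allocated (ABDSTAT_INCR)
 * or freed (ABDSTAT_DECR).
 */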
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	uint_t i, n;

	/*
	 * There are no scatter linear pages in FreeBSD, so it is an error
	 * if the ABD has been marked as a linear page.
	 */
	ASSERT(!abd_is_linear_page(abd));
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    zfs_abd_chunk_size);
	n = abd_scatter_chunkcnt(abd);
	for (i = 0; i < n; i++) {
		ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
	}
}

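/*
 * Allocate the backing chunks for a scatter ABD large enough to hold
 * `size` bytes, pulling each chunk from the abd_chunk kmem cache.
 */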
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	uint_t i, n;

	n = abd_chunkcnt_for_bytes(size);
	for (i = 0; i < n; i++) {
		void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
		ASSERT3P(c, !=, NULL);
		ABD_SCATTER(abd).abd_chunks[i] = c;
	}
	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
}

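/* Release every chunk backing a scatter ABD back to the chunk cache. */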
void
abd_free_chunks(abd_t *abd)
{
	uint_t i, n;

	n = abd_scatter_chunkcnt(abd);
	for (i = 0; i < n; i++) {
		abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
	}
}

abd_t *
abd_alloc_struct_impl(size_t size)
{
	uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
	/*
	 * In the event we are allocating a gang ABD, the size passed in
	 * will be 0. We must make sure to set abd_size to the size of an
	 * ABD struct as opposed to an ABD scatter with 0 chunks. The gang
	 * ABD struct allocation accounts for an additional 24 bytes over
	 * a scatter ABD with 0 chunks.
	 */
	size_t abd_size = MAX(sizeof (abd_t),
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, abd_size);

	return (abd);
}

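/*
 * Free an abd_t struct, undoing the struct_size accounting done in
 * abd_alloc_struct_impl(). The struct size must be recomputed the same
 * way it was computed at allocation time.
 */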
void
abd_free_struct_impl(abd_t *abd)
{
	uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
	    abd_scatter_chunkcnt(abd);
	ssize_t size = MAX(sizeof (abd_t),
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
	kmem_free(abd, size);
	ABDSTAT_INCR(abdstat_struct_size, -size);
}

/*
 * Allocate a scatter ABD of size SPA_MAXBLOCKSIZE, where
 * each chunk in the scatterlist will be set to abd_zero_buf.
 */
static void
abd_alloc_zero_scatter(void)
{
	uint_t i, n;

	n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);

	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;

	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
	    zfs_abd_chunk_size;

	for (i = 0; i < n; i++) {
		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
		    abd_zero_buf;
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
}

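/* Tear down abd_zero_scatter and release the shared zero'd chunk. */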
static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);

	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	kmem_free(abd_zero_buf, zfs_abd_chunk_size);
}

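/*
 * Create the chunk kmem cache, register the abdstats kstat, and set up
 * the shared zero'd scatter ABD.
 */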
void
abd_init(void)
{
	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

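/* Undo abd_init() in the reverse order of setup. */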
void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	kmem_cache_destroy(abd_chunk_cache);
	abd_chunk_cache = NULL;
}

void
abd_free_linear_page(abd_t *abd)
{
	/*
	 * FreeBSD does not have scatter linear pages,
	 * so calling this is an error.
	 */
	VERIFY(0);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
 * using a scatter/gather list we should switch to that and replace this call
 * with vanilla abd_alloc().
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc_linear(size, is_metadata));
}

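/*
 * Build a scatter ABD that references `sabd` starting at `off` bytes,
 * sharing sabd's chunks rather than copying the data. If a preallocated
 * abd_t is passed in and is large enough it is reused; otherwise a new
 * struct is allocated.
 */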
abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
{
	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
	uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
	    (new_offset / zfs_abd_chunk_size);

	/*
	 * If an abd struct is provided, it is only the minimum size.  If we
	 * need additional chunks, we need to allocate a new struct.
	 */
	if (abd != NULL &&
	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
	    sizeof (abd_t)) {
		abd = NULL;
	}

	if (abd == NULL)
		abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;

	/* Copy the scatterlist starting at the correct offset */
	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
	    &ABD_SCATTER(sabd).abd_chunks[new_offset /
	    zfs_abd_chunk_size],
	    chunkcnt * sizeof (void *));

	return (abd);
}

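/* Byte offset of the iterator's current position within its chunk. */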
static inline size_t
abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
	    aiter->iter_pos) % zfs_abd_chunk_size);
}

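/* Index of the chunk containing the iterator's current position. */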
static inline size_t
abd_iter_scatter_chunk_index(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
	    aiter->iter_pos) / zfs_abd_chunk_size);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter is already exhausted, in
 * which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * is already exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* Panic if someone has changed zfs_abd_chunk_size */
	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
	    ABD_SCATTER(aiter->iter_abd).abd_chunk_size);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		offset = aiter->iter_pos;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		size_t index = abd_iter_scatter_chunk_index(aiter);
		offset = abd_iter_scatter_chunk_offset(aiter);
		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);
		paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
	}
	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * is already exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

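/* Ask the kmem subsystem to reap the abd_chunk cache soon. */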
void
abd_cache_reap_now(void)
{
	kmem_cache_reap_soon(abd_chunk_cache);
}