1/*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source.  A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15/*
16 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
17 */
18
19#include <sys/zfs_context.h>
20#include <sys/multilist.h>
21#include <sys/trace_zfs.h>
22
23/* needed for spa_get_random() */
24#include <sys/spa.h>
25
/*
 * Tunable: the number of sublists to create in each multilist_t.
 *
 * A value of zero or less (0 is the default) leaves the decision to
 * multilist_create(), which then sizes the multilist from the CPU count.
 */
int zfs_multilist_num_sublists = 0;
31
/*
 * Translate a pointer to an object kept on the multilist into a pointer
 * to the multilist_node_t embedded inside that object. The node lives
 * ml->ml_offset bytes past the start of the object.
 *
 * Only built when ZFS_DEBUG is defined; its callers in this file are
 * ASSERT() expressions.
 */
#ifdef ZFS_DEBUG
static multilist_node_t *
multilist_d2l(multilist_t *ml, void *obj)
{
	char *base = obj;

	return ((multilist_node_t *)(base + ml->ml_offset));
}
#endif
43
44/*
45 * Initialize a new mutlilist using the parameters specified.
46 *
47 *  - 'size' denotes the size of the structure containing the
48 *     multilist_node_t.
49 *  - 'offset' denotes the byte offset of the mutlilist_node_t within
50 *     the structure that contains it.
51 *  - 'num' specifies the number of internal sublists to create.
52 *  - 'index_func' is used to determine which sublist to insert into
53 *     when the multilist_insert() function is called; as well as which
54 *     sublist to remove from when multilist_remove() is called. The
55 *     requirements this function must meet, are the following:
56 *
57 *      - It must always return the same value when called on the same
58 *        object (to ensure the object is removed from the list it was
59 *        inserted into).
60 *
61 *      - It must return a value in the range [0, number of sublists).
62 *        The multilist_get_num_sublists() function may be used to
63 *        determine the number of sublists in the multilist.
64 *
65 *     Also, in order to reduce internal contention between the sublists
66 *     during insertion and removal, this function should choose evenly
67 *     between all available sublists when inserting. This isn't a hard
68 *     requirement, but a general rule of thumb in order to garner the
69 *     best multi-threaded performance out of the data structure.
70 */
71static void
72multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
73    unsigned int num, multilist_sublist_index_func_t *index_func)
74{
75	ASSERT3U(size, >, 0);
76	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
77	ASSERT3U(num, >, 0);
78	ASSERT3P(index_func, !=, NULL);
79
80	ml->ml_offset = offset;
81	ml->ml_num_sublists = num;
82	ml->ml_index_func = index_func;
83
84	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
85	    ml->ml_num_sublists, KM_SLEEP);
86
87	ASSERT3P(ml->ml_sublists, !=, NULL);
88
89	for (int i = 0; i < ml->ml_num_sublists; i++) {
90		multilist_sublist_t *mls = &ml->ml_sublists[i];
91		mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL);
92		list_create(&mls->mls_list, size, offset);
93	}
94}
95
96/*
97 * Allocate a new multilist, using the default number of sublists (the number
98 * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
99 * that the multilists do not expand if more CPUs are hot-added. In that case,
100 * we will have less fanout than boot_ncpus, but we don't want to always
101 * reserve the RAM necessary to create the extra slots for additional CPUs up
102 * front, and dynamically adding them is a complex task.
103 */
104void
105multilist_create(multilist_t *ml, size_t size, size_t offset,
106    multilist_sublist_index_func_t *index_func)
107{
108	int num_sublists;
109
110	if (zfs_multilist_num_sublists > 0) {
111		num_sublists = zfs_multilist_num_sublists;
112	} else {
113		num_sublists = MAX(boot_ncpus, 4);
114	}
115
116	multilist_create_impl(ml, size, offset, num_sublists, index_func);
117}
118
119/*
120 * Destroy the given multilist object, and free up any memory it holds.
121 */
122void
123multilist_destroy(multilist_t *ml)
124{
125	ASSERT(multilist_is_empty(ml));
126
127	for (int i = 0; i < ml->ml_num_sublists; i++) {
128		multilist_sublist_t *mls = &ml->ml_sublists[i];
129
130		ASSERT(list_is_empty(&mls->mls_list));
131
132		list_destroy(&mls->mls_list);
133		mutex_destroy(&mls->mls_lock);
134	}
135
136	ASSERT3P(ml->ml_sublists, !=, NULL);
137	kmem_free(ml->ml_sublists,
138	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
139
140	ml->ml_num_sublists = 0;
141	ml->ml_offset = 0;
142	ml->ml_sublists = NULL;
143}
144
145/*
146 * Insert the given object into the multilist.
147 *
148 * This function will insert the object specified into the sublist
149 * determined using the function given at multilist creation time.
150 *
151 * The sublist locks are automatically acquired if not already held, to
152 * ensure consistency when inserting and removing from multiple threads.
153 */
154void
155multilist_insert(multilist_t *ml, void *obj)
156{
157	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
158	multilist_sublist_t *mls;
159	boolean_t need_lock;
160
161	DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
162	    unsigned int, sublist_idx, void *, obj);
163
164	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
165
166	mls = &ml->ml_sublists[sublist_idx];
167
168	/*
169	 * Note: Callers may already hold the sublist lock by calling
170	 * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
171	 * returning TRUE if and only if the current thread holds the
172	 * lock.  While it's a little ugly to make the lock recursive in
173	 * this way, it works and allows the calling code to be much
174	 * simpler -- otherwise it would have to pass around a flag
175	 * indicating that it already has the lock.
176	 */
177	need_lock = !MUTEX_HELD(&mls->mls_lock);
178
179	if (need_lock)
180		mutex_enter(&mls->mls_lock);
181
182	ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
183
184	multilist_sublist_insert_head(mls, obj);
185
186	if (need_lock)
187		mutex_exit(&mls->mls_lock);
188}
189
190/*
191 * Remove the given object from the multilist.
192 *
193 * This function will remove the object specified from the sublist
194 * determined using the function given at multilist creation time.
195 *
196 * The necessary sublist locks are automatically acquired, to ensure
197 * consistency when inserting and removing from multiple threads.
198 */
199void
200multilist_remove(multilist_t *ml, void *obj)
201{
202	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
203	multilist_sublist_t *mls;
204	boolean_t need_lock;
205
206	DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
207	    unsigned int, sublist_idx, void *, obj);
208
209	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
210
211	mls = &ml->ml_sublists[sublist_idx];
212	/* See comment in multilist_insert(). */
213	need_lock = !MUTEX_HELD(&mls->mls_lock);
214
215	if (need_lock)
216		mutex_enter(&mls->mls_lock);
217
218	ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
219
220	multilist_sublist_remove(mls, obj);
221
222	if (need_lock)
223		mutex_exit(&mls->mls_lock);
224}
225
226/*
227 * Check to see if this multilist object is empty.
228 *
229 * This will return TRUE if it finds all of the sublists of this
230 * multilist to be empty, and FALSE otherwise. Each sublist lock will be
231 * automatically acquired as necessary.
232 *
233 * If concurrent insertions and removals are occurring, the semantics
234 * of this function become a little fuzzy. Instead of locking all
235 * sublists for the entire call time of the function, each sublist is
236 * only locked as it is individually checked for emptiness. Thus, it's
237 * possible for this function to return TRUE with non-empty sublists at
238 * the time the function returns. This would be due to another thread
239 * inserting into a given sublist, after that specific sublist was check
240 * and deemed empty, but before all sublists have been checked.
241 */
242int
243multilist_is_empty(multilist_t *ml)
244{
245	for (int i = 0; i < ml->ml_num_sublists; i++) {
246		multilist_sublist_t *mls = &ml->ml_sublists[i];
247		/* See comment in multilist_insert(). */
248		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
249
250		if (need_lock)
251			mutex_enter(&mls->mls_lock);
252
253		if (!list_is_empty(&mls->mls_list)) {
254			if (need_lock)
255				mutex_exit(&mls->mls_lock);
256
257			return (FALSE);
258		}
259
260		if (need_lock)
261			mutex_exit(&mls->mls_lock);
262	}
263
264	return (TRUE);
265}
266
267/* Return the number of sublists composing this multilist */
268unsigned int
269multilist_get_num_sublists(multilist_t *ml)
270{
271	return (ml->ml_num_sublists);
272}
273
274/* Return a randomly selected, valid sublist index for this multilist */
275unsigned int
276multilist_get_random_index(multilist_t *ml)
277{
278	return (spa_get_random(ml->ml_num_sublists));
279}
280
281/* Lock and return the sublist specified at the given index */
282multilist_sublist_t *
283multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
284{
285	multilist_sublist_t *mls;
286
287	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
288	mls = &ml->ml_sublists[sublist_idx];
289	mutex_enter(&mls->mls_lock);
290
291	return (mls);
292}
293
294/* Lock and return the sublist that would be used to store the specified obj */
295multilist_sublist_t *
296multilist_sublist_lock_obj(multilist_t *ml, void *obj)
297{
298	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
299}
300
301void
302multilist_sublist_unlock(multilist_sublist_t *mls)
303{
304	mutex_exit(&mls->mls_lock);
305}
306
307/*
308 * We're allowing any object to be inserted into this specific sublist,
309 * but this can lead to trouble if multilist_remove() is called to
310 * remove this object. Specifically, if calling ml_index_func on this
311 * object returns an index for sublist different than what is passed as
312 * a parameter here, any call to multilist_remove() with this newly
313 * inserted object is undefined! (the call to multilist_remove() will
314 * remove the object from a list that it isn't contained in)
315 */
316void
317multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
318{
319	ASSERT(MUTEX_HELD(&mls->mls_lock));
320	list_insert_head(&mls->mls_list, obj);
321}
322
323/* please see comment above multilist_sublist_insert_head */
324void
325multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
326{
327	ASSERT(MUTEX_HELD(&mls->mls_lock));
328	list_insert_tail(&mls->mls_list, obj);
329}
330
331/*
332 * Move the object one element forward in the list.
333 *
334 * This function will move the given object forward in the list (towards
335 * the head) by one object. So, in essence, it will swap its position in
336 * the list with its "prev" pointer. If the given object is already at the
337 * head of the list, it cannot be moved forward any more than it already
338 * is, so no action is taken.
339 *
340 * NOTE: This function **must not** remove any object from the list other
341 *       than the object given as the parameter. This is relied upon in
342 *       arc_evict_state_impl().
343 */
344void
345multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
346{
347	void *prev = list_prev(&mls->mls_list, obj);
348
349	ASSERT(MUTEX_HELD(&mls->mls_lock));
350	ASSERT(!list_is_empty(&mls->mls_list));
351
352	/* 'obj' must be at the head of the list, nothing to do */
353	if (prev == NULL)
354		return;
355
356	list_remove(&mls->mls_list, obj);
357	list_insert_before(&mls->mls_list, prev, obj);
358}
359
360void
361multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
362{
363	ASSERT(MUTEX_HELD(&mls->mls_lock));
364	list_remove(&mls->mls_list, obj);
365}
366
367int
368multilist_sublist_is_empty(multilist_sublist_t *mls)
369{
370	ASSERT(MUTEX_HELD(&mls->mls_lock));
371	return (list_is_empty(&mls->mls_list));
372}
373
374int
375multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
376{
377	multilist_sublist_t *mls;
378	int empty;
379
380	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
381	mls = &ml->ml_sublists[sublist_idx];
382	ASSERT(!MUTEX_HELD(&mls->mls_lock));
383	mutex_enter(&mls->mls_lock);
384	empty = list_is_empty(&mls->mls_list);
385	mutex_exit(&mls->mls_lock);
386	return (empty);
387}
388
389void *
390multilist_sublist_head(multilist_sublist_t *mls)
391{
392	ASSERT(MUTEX_HELD(&mls->mls_lock));
393	return (list_head(&mls->mls_list));
394}
395
396void *
397multilist_sublist_tail(multilist_sublist_t *mls)
398{
399	ASSERT(MUTEX_HELD(&mls->mls_lock));
400	return (list_tail(&mls->mls_list));
401}
402
403void *
404multilist_sublist_next(multilist_sublist_t *mls, void *obj)
405{
406	ASSERT(MUTEX_HELD(&mls->mls_lock));
407	return (list_next(&mls->mls_list, obj));
408}
409
410void *
411multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
412{
413	ASSERT(MUTEX_HELD(&mls->mls_lock));
414	return (list_prev(&mls->mls_list, obj));
415}
416
417void
418multilist_link_init(multilist_node_t *link)
419{
420	list_link_init(link);
421}
422
423int
424multilist_link_active(multilist_node_t *link)
425{
426	return (list_link_active(link));
427}
428
429/* BEGIN CSTYLED */
430ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW,
431	"Number of sublists used in each multilist");
432/* END CSTYLED */
433