1/*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source.  A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15/*
16 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
17 */
18
19#include <sys/zfs_context.h>
20#include <sys/multilist.h>
21
22/* needed for spa_get_random() */
23#include <sys/spa.h>
24
/*
 * Tunable controlling how many sublists multilist_create() gives to each
 * new multilist_t. When this is greater than zero, that exact value is
 * used. Otherwise (zero or negative) multilist_create() falls back to
 * MAX(max_ncpus, 4).
 */
int zfs_multilist_num_sublists = 0;
30
31/*
32 * Given the object contained on the list, return a pointer to the
33 * object's multilist_node_t structure it contains.
34 */
35static multilist_node_t *
36multilist_d2l(multilist_t *ml, void *obj)
37{
38	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
39}
40
41/*
42 * Initialize a new mutlilist using the parameters specified.
43 *
44 *  - 'size' denotes the size of the structure containing the
45 *     multilist_node_t.
46 *  - 'offset' denotes the byte offset of the mutlilist_node_t within
47 *     the structure that contains it.
48 *  - 'num' specifies the number of internal sublists to create.
49 *  - 'index_func' is used to determine which sublist to insert into
50 *     when the multilist_insert() function is called; as well as which
51 *     sublist to remove from when multilist_remove() is called. The
52 *     requirements this function must meet, are the following:
53 *
54 *      - It must always return the same value when called on the same
55 *        object (to ensure the object is removed from the list it was
56 *        inserted into).
57 *
58 *      - It must return a value in the range [0, number of sublists).
59 *        The multilist_get_num_sublists() function may be used to
60 *        determine the number of sublists in the multilist.
61 *
62 *     Also, in order to reduce internal contention between the sublists
63 *     during insertion and removal, this function should choose evenly
64 *     between all available sublists when inserting. This isn't a hard
65 *     requirement, but a general rule of thumb in order to garner the
66 *     best multi-threaded performance out of the data structure.
67 */
68static multilist_t *
69multilist_create_impl(size_t size, size_t offset,
70    unsigned int num, multilist_sublist_index_func_t *index_func)
71{
72	ASSERT3U(size, >, 0);
73	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
74	ASSERT3U(num, >, 0);
75	ASSERT3P(index_func, !=, NULL);
76
77	multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
78	ml->ml_offset = offset;
79	ml->ml_num_sublists = num;
80	ml->ml_index_func = index_func;
81
82	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
83	    ml->ml_num_sublists, KM_SLEEP);
84
85	ASSERT3P(ml->ml_sublists, !=, NULL);
86
87	for (int i = 0; i < ml->ml_num_sublists; i++) {
88		multilist_sublist_t *mls = &ml->ml_sublists[i];
89		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
90		list_create(&mls->mls_list, size, offset);
91	}
92	return (ml);
93}
94
95/*
96 * Allocate a new multilist, using the default number of sublists
97 * (the number of CPUs, or at least 4, or the tunable
98 * zfs_multilist_num_sublists).
99 */
100multilist_t *
101multilist_create(size_t size, size_t offset,
102    multilist_sublist_index_func_t *index_func)
103{
104	int num_sublists;
105
106	if (zfs_multilist_num_sublists > 0) {
107		num_sublists = zfs_multilist_num_sublists;
108	} else {
109		num_sublists = MAX(max_ncpus, 4);
110	}
111
112	return (multilist_create_impl(size, offset, num_sublists, index_func));
113}
114
115/*
116 * Destroy the given multilist object, and free up any memory it holds.
117 */
118void
119multilist_destroy(multilist_t *ml)
120{
121	ASSERT(multilist_is_empty(ml));
122
123	for (int i = 0; i < ml->ml_num_sublists; i++) {
124		multilist_sublist_t *mls = &ml->ml_sublists[i];
125
126		ASSERT(list_is_empty(&mls->mls_list));
127
128		list_destroy(&mls->mls_list);
129		mutex_destroy(&mls->mls_lock);
130	}
131
132	ASSERT3P(ml->ml_sublists, !=, NULL);
133	kmem_free(ml->ml_sublists,
134	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
135
136	ml->ml_num_sublists = 0;
137	ml->ml_offset = 0;
138	kmem_free(ml, sizeof (multilist_t));
139}
140
141/*
142 * Insert the given object into the multilist.
143 *
144 * This function will insert the object specified into the sublist
145 * determined using the function given at multilist creation time.
146 *
147 * The sublist locks are automatically acquired if not already held, to
148 * ensure consistency when inserting and removing from multiple threads.
149 */
150void
151multilist_insert(multilist_t *ml, void *obj)
152{
153	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
154	multilist_sublist_t *mls;
155	boolean_t need_lock;
156
157	DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
158	    unsigned int, sublist_idx, void *, obj);
159
160	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
161
162	mls = &ml->ml_sublists[sublist_idx];
163
164	/*
165	 * Note: Callers may already hold the sublist lock by calling
166	 * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
167	 * returning TRUE if and only if the current thread holds the
168	 * lock.  While it's a little ugly to make the lock recursive in
169	 * this way, it works and allows the calling code to be much
170	 * simpler -- otherwise it would have to pass around a flag
171	 * indicating that it already has the lock.
172	 */
173	need_lock = !MUTEX_HELD(&mls->mls_lock);
174
175	if (need_lock)
176		mutex_enter(&mls->mls_lock);
177
178	ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
179
180	multilist_sublist_insert_head(mls, obj);
181
182	if (need_lock)
183		mutex_exit(&mls->mls_lock);
184}
185
186/*
187 * Remove the given object from the multilist.
188 *
189 * This function will remove the object specified from the sublist
190 * determined using the function given at multilist creation time.
191 *
192 * The necessary sublist locks are automatically acquired, to ensure
193 * consistency when inserting and removing from multiple threads.
194 */
195void
196multilist_remove(multilist_t *ml, void *obj)
197{
198	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
199	multilist_sublist_t *mls;
200	boolean_t need_lock;
201
202	DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
203	    unsigned int, sublist_idx, void *, obj);
204
205	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
206
207	mls = &ml->ml_sublists[sublist_idx];
208	/* See comment in multilist_insert(). */
209	need_lock = !MUTEX_HELD(&mls->mls_lock);
210
211	if (need_lock)
212		mutex_enter(&mls->mls_lock);
213
214	ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
215
216	multilist_sublist_remove(mls, obj);
217
218	if (need_lock)
219		mutex_exit(&mls->mls_lock);
220}
221
222/*
223 * Check to see if this multilist object is empty.
224 *
225 * This will return TRUE if it finds all of the sublists of this
226 * multilist to be empty, and FALSE otherwise. Each sublist lock will be
227 * automatically acquired as necessary.
228 *
229 * If concurrent insertions and removals are occurring, the semantics
230 * of this function become a little fuzzy. Instead of locking all
231 * sublists for the entire call time of the function, each sublist is
232 * only locked as it is individually checked for emptiness. Thus, it's
233 * possible for this function to return TRUE with non-empty sublists at
234 * the time the function returns. This would be due to another thread
235 * inserting into a given sublist, after that specific sublist was check
236 * and deemed empty, but before all sublists have been checked.
237 */
238int
239multilist_is_empty(multilist_t *ml)
240{
241	for (int i = 0; i < ml->ml_num_sublists; i++) {
242		multilist_sublist_t *mls = &ml->ml_sublists[i];
243		/* See comment in multilist_insert(). */
244		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
245
246		if (need_lock)
247			mutex_enter(&mls->mls_lock);
248
249		if (!list_is_empty(&mls->mls_list)) {
250			if (need_lock)
251				mutex_exit(&mls->mls_lock);
252
253			return (FALSE);
254		}
255
256		if (need_lock)
257			mutex_exit(&mls->mls_lock);
258	}
259
260	return (TRUE);
261}
262
263/* Return the number of sublists composing this multilist */
264unsigned int
265multilist_get_num_sublists(multilist_t *ml)
266{
267	return (ml->ml_num_sublists);
268}
269
270/* Return a randomly selected, valid sublist index for this multilist */
271unsigned int
272multilist_get_random_index(multilist_t *ml)
273{
274	return (spa_get_random(ml->ml_num_sublists));
275}
276
277/* Lock and return the sublist specified at the given index */
278multilist_sublist_t *
279multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
280{
281	multilist_sublist_t *mls;
282
283	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
284	mls = &ml->ml_sublists[sublist_idx];
285	mutex_enter(&mls->mls_lock);
286
287	return (mls);
288}
289
290/* Lock and return the sublist that would be used to store the specified obj */
291multilist_sublist_t *
292multilist_sublist_lock_obj(multilist_t *ml, void *obj)
293{
294	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
295}
296
297void
298multilist_sublist_unlock(multilist_sublist_t *mls)
299{
300	mutex_exit(&mls->mls_lock);
301}
302
303/*
304 * We're allowing any object to be inserted into this specific sublist,
305 * but this can lead to trouble if multilist_remove() is called to
306 * remove this object. Specifically, if calling ml_index_func on this
307 * object returns an index for sublist different than what is passed as
308 * a parameter here, any call to multilist_remove() with this newly
309 * inserted object is undefined! (the call to multilist_remove() will
310 * remove the object from a list that it isn't contained in)
311 */
312void
313multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
314{
315	ASSERT(MUTEX_HELD(&mls->mls_lock));
316	list_insert_head(&mls->mls_list, obj);
317}
318
319/* please see comment above multilist_sublist_insert_head */
320void
321multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
322{
323	ASSERT(MUTEX_HELD(&mls->mls_lock));
324	list_insert_tail(&mls->mls_list, obj);
325}
326
327/*
328 * Move the object one element forward in the list.
329 *
330 * This function will move the given object forward in the list (towards
331 * the head) by one object. So, in essence, it will swap its position in
332 * the list with its "prev" pointer. If the given object is already at the
333 * head of the list, it cannot be moved forward any more than it already
334 * is, so no action is taken.
335 *
336 * NOTE: This function **must not** remove any object from the list other
337 *       than the object given as the parameter. This is relied upon in
338 *       arc_evict_state_impl().
339 */
340void
341multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
342{
343	void *prev = list_prev(&mls->mls_list, obj);
344
345	ASSERT(MUTEX_HELD(&mls->mls_lock));
346	ASSERT(!list_is_empty(&mls->mls_list));
347
348	/* 'obj' must be at the head of the list, nothing to do */
349	if (prev == NULL)
350		return;
351
352	list_remove(&mls->mls_list, obj);
353	list_insert_before(&mls->mls_list, prev, obj);
354}
355
356void
357multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
358{
359	ASSERT(MUTEX_HELD(&mls->mls_lock));
360	list_remove(&mls->mls_list, obj);
361}
362
363int
364multilist_sublist_is_empty(multilist_sublist_t *mls)
365{
366	ASSERT(MUTEX_HELD(&mls->mls_lock));
367	return (list_is_empty(&mls->mls_list));
368}
369
370int
371multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
372{
373	multilist_sublist_t *mls;
374	int empty;
375
376	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
377	mls = &ml->ml_sublists[sublist_idx];
378	ASSERT(!MUTEX_HELD(&mls->mls_lock));
379	mutex_enter(&mls->mls_lock);
380	empty = list_is_empty(&mls->mls_list);
381	mutex_exit(&mls->mls_lock);
382	return (empty);
383}
384
385void *
386multilist_sublist_head(multilist_sublist_t *mls)
387{
388	ASSERT(MUTEX_HELD(&mls->mls_lock));
389	return (list_head(&mls->mls_list));
390}
391
392void *
393multilist_sublist_tail(multilist_sublist_t *mls)
394{
395	ASSERT(MUTEX_HELD(&mls->mls_lock));
396	return (list_tail(&mls->mls_list));
397}
398
399void *
400multilist_sublist_next(multilist_sublist_t *mls, void *obj)
401{
402	ASSERT(MUTEX_HELD(&mls->mls_lock));
403	return (list_next(&mls->mls_list, obj));
404}
405
406void *
407multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
408{
409	ASSERT(MUTEX_HELD(&mls->mls_lock));
410	return (list_prev(&mls->mls_list, obj));
411}
412
413void
414multilist_link_init(multilist_node_t *link)
415{
416	list_link_init(link);
417}
418
419int
420multilist_link_active(multilist_node_t *link)
421{
422	return (list_link_active(link));
423}
424