1/*	$NetBSD: nfs_fha.c,v 1.2 2016/12/13 22:41:46 pgoyette Exp $	*/
2/*-
3 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28/* __FBSDID("FreeBSD: head/sys/nfs/nfs_fha.c 267479 2014-06-14 12:26:12Z mav "); */
29__RCSID("$NetBSD: nfs_fha.c,v 1.2 2016/12/13 22:41:46 pgoyette Exp $");
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/sysproto.h>
34#include <sys/kernel.h>
35#include <sys/sysctl.h>
36#include <sys/vnode.h>
37#include <sys/malloc.h>
38#include <sys/mount.h>
39#include <sys/mbuf.h>
40#include <sys/sbuf.h>
41
42#include <rpc/rpc.h>
43#include <fs/nfs/common/nfs_fha.h>
44
45static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
46
47/*
48 * XXX need to commonize definitions between old and new NFS code.  Define
49 * this here so we don't include one nfsproto.h over the other.
50 */
51#define	NFS_PROG		100003
52
53void
54fha_init(struct fha_params *softc)
55{
56	char tmpstr[128];
57	int i;
58
59	for (i = 0; i < FHA_HASH_SIZE; i++)
60		mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
61
62	/*
63	 * Set the default tuning parameters.
64	 */
65	softc->ctls.enable = FHA_DEF_ENABLE;
66	softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
67	softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
68	softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
69
70	/*
71	 * Allow the user to override the defaults at boot time with
72	 * tunables.
73	 */
74	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.enable",
75	    softc->server_name);
76	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.enable);
77	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.bin_shift",
78	    softc->server_name);
79	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.bin_shift);
80	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_nfsds_per_fh",
81	    softc->server_name);
82	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_nfsds_per_fh);
83	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_reqs_per_nfsd",
84	    softc->server_name);
85	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_reqs_per_nfsd);
86
87	/*
88	 * Add sysctls so the user can change the tuning parameters at
89	 * runtime.
90	 */
91	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
92	    OID_AUTO, "enable", CTLFLAG_RW,
93	    &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
94
95	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
96	    OID_AUTO, "bin_shift", CTLFLAG_RW,
97	    &softc->ctls.bin_shift, 0, "For FHA reads, no two requests will "
98	    "contend if they're 2^(bin_shift) bytes apart");
99
100	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
101	    OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RW,
102	    &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
103	    "should be working on requests for the same file handle");
104
105	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
106	    OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RW,
107	    &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
108	    "single nfsd thread should be working on at any time");
109
110	SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
111	    OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
112	    softc->callbacks.fhe_stats_sysctl, "A", "");
113
114}
115
116void
117fha_uninit(struct fha_params *softc)
118{
119	int i;
120
121	sysctl_ctx_free(&softc->sysctl_ctx);
122	for (i = 0; i < FHA_HASH_SIZE; i++)
123		mtx_destroy(&softc->fha_hash[i].mtx);
124}
125
126/*
127 * This just specifies that offsets should obey affinity when within
128 * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
129 */
130static void
131fha_extract_info(struct svc_req *req, struct fha_info *i,
132    struct fha_callbacks *cb)
133{
134	struct mbuf *md;
135	caddr_t dpos;
136	static u_int64_t random_fh = 0;
137	int error;
138	int v3 = (req->rq_vers == 3);
139	rpcproc_t procnum;
140
141	/*
142	 * We start off with a random fh.  If we get a reasonable
143	 * procnum, we set the fh.  If there's a concept of offset
144	 * that we're interested in, we set that.
145	 */
146	i->fh = ++random_fh;
147	i->offset = 0;
148	i->locktype = LK_EXCLUSIVE;
149
150	/*
151	 * Extract the procnum and convert to v3 form if necessary,
152	 * taking care to deal with out-of-range procnums.  Caller will
153	 * ensure that rq_vers is either 2 or 3.
154	 */
155	procnum = req->rq_proc;
156	if (!v3) {
157		rpcproc_t tmp_procnum;
158
159		tmp_procnum = cb->get_procnum(procnum);
160		if (tmp_procnum == -1)
161			goto out;
162		procnum = tmp_procnum;
163	}
164
165	/*
166	 * We do affinity for most.  However, we divide a realm of affinity
167	 * by file offset so as to allow for concurrent random access.  We
168	 * only do this for reads today, but this may change when IFS supports
169	 * efficient concurrent writes.
170	 */
171	if (cb->no_offset(procnum))
172		goto out;
173
174	error = cb->realign(&req->rq_args, M_NOWAIT);
175	if (error)
176		goto out;
177	md = req->rq_args;
178	dpos = mtod(md, caddr_t);
179
180	/* Grab the filehandle. */
181	error = cb->get_fh(&i->fh, v3, &md, &dpos);
182	if (error)
183		goto out;
184
185	/* Content ourselves with zero offset for all but reads. */
186	if (cb->is_read(procnum) || cb->is_write(procnum))
187		cb->get_offset(&md, &dpos, v3, i);
188
189out:
190	cb->set_locktype(procnum, i);
191}
192
193static struct fha_hash_entry *
194fha_hash_entry_new(u_int64_t fh)
195{
196	struct fha_hash_entry *e;
197
198	e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
199	e->fh = fh;
200	e->num_rw = 0;
201	e->num_exclusive = 0;
202	e->num_threads = 0;
203	LIST_INIT(&e->threads);
204
205	return (e);
206}
207
208static void
209fha_hash_entry_destroy(struct fha_hash_entry *e)
210{
211
212	mtx_assert(e->mtx, MA_OWNED);
213	KASSERT(e->num_rw == 0,
214	    ("%d reqs on destroyed fhe %p", e->num_rw, e));
215	KASSERT(e->num_exclusive == 0,
216	    ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
217	KASSERT(e->num_threads == 0,
218	    ("%d threads on destroyed fhe %p", e->num_threads, e));
219	free(e, M_NFS_FHA);
220}
221
222static void
223fha_hash_entry_remove(struct fha_hash_entry *e)
224{
225
226	mtx_assert(e->mtx, MA_OWNED);
227	LIST_REMOVE(e, link);
228	fha_hash_entry_destroy(e);
229}
230
231static struct fha_hash_entry *
232fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
233{
234	SVCPOOL *pool;
235	struct fha_hash_slot *fhs;
236	struct fha_hash_entry *fhe, *new_fhe;
237
238	pool = *softc->pool;
239	fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
240	new_fhe = fha_hash_entry_new(fh);
241	new_fhe->mtx = &fhs->mtx;
242	mtx_lock(&fhs->mtx);
243	LIST_FOREACH(fhe, &fhs->list, link)
244		if (fhe->fh == fh)
245			break;
246	if (!fhe) {
247		fhe = new_fhe;
248		LIST_INSERT_HEAD(&fhs->list, fhe, link);
249	} else
250		fha_hash_entry_destroy(new_fhe);
251	return (fhe);
252}
253
254static void
255fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
256{
257
258	mtx_assert(fhe->mtx, MA_OWNED);
259	thread->st_p2 = 0;
260	LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
261	fhe->num_threads++;
262}
263
264static void
265fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
266{
267
268	mtx_assert(fhe->mtx, MA_OWNED);
269	KASSERT(thread->st_p2 == 0,
270	    ("%d reqs on removed thread %p", thread->st_p2, thread));
271	LIST_REMOVE(thread, st_alink);
272	fhe->num_threads--;
273}
274
275/*
276 * Account for an ongoing operation associated with this file.
277 */
278static void
279fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
280{
281
282	mtx_assert(fhe->mtx, MA_OWNED);
283	if (LK_EXCLUSIVE == locktype)
284		fhe->num_exclusive += count;
285	else
286		fhe->num_rw += count;
287}
288
289/*
290 * Get the service thread currently associated with the fhe that is
291 * appropriate to handle this operation.
292 */
293static SVCTHREAD *
294fha_hash_entry_choose_thread(struct fha_params *softc,
295    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
296{
297	SVCTHREAD *thread, *min_thread = NULL;
298	SVCPOOL *pool;
299	int req_count, min_count = 0;
300	off_t offset1, offset2;
301
302	pool = *softc->pool;
303
304	LIST_FOREACH(thread, &fhe->threads, st_alink) {
305		req_count = thread->st_p2;
306
307		/* If there are any writes in progress, use the first thread. */
308		if (fhe->num_exclusive) {
309#if 0
310			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
311			    "fha: %p(%d)w", thread, req_count);
312#endif
313			return (thread);
314		}
315
316		/*
317		 * Check for read locality, making sure that we won't
318		 * exceed our per-thread load limit in the process.
319		 */
320		offset1 = i->offset;
321		offset2 = thread->st_p3;
322
323		if (((offset1 >= offset2)
324		  && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
325		 || ((offset2 > offset1)
326		  && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
327			if ((softc->ctls.max_reqs_per_nfsd == 0) ||
328			    (req_count < softc->ctls.max_reqs_per_nfsd)) {
329#if 0
330				ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
331				    "fha: %p(%d)r", thread, req_count);
332#endif
333				return (thread);
334			}
335		}
336
337		/*
338		 * We don't have a locality match, so skip this thread,
339		 * but keep track of the most attractive thread in case
340		 * we need to come back to it later.
341		 */
342#if 0
343		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
344		    "fha: %p(%d)s off1 %llu off2 %llu", thread,
345		    req_count, offset1, offset2);
346#endif
347		if ((min_thread == NULL) || (req_count < min_count)) {
348			min_count = req_count;
349			min_thread = thread;
350		}
351	}
352
353	/*
354	 * We didn't find a good match yet.  See if we can add
355	 * a new thread to this file handle entry's thread list.
356	 */
357	if ((softc->ctls.max_nfsds_per_fh == 0) ||
358	    (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
359		thread = this_thread;
360#if 0
361		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
362		    "fha: %p(%d)t", thread, thread->st_p2);
363#endif
364		fha_hash_entry_add_thread(fhe, thread);
365	} else {
366		/*
367		 * We don't want to use any more threads for this file, so
368		 * go back to the most attractive nfsd we're already using.
369		 */
370		thread = min_thread;
371	}
372
373	return (thread);
374}
375
376/*
377 * After getting a request, try to assign it to some thread.  Usually we
378 * handle it ourselves.
379 */
380SVCTHREAD *
381fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
382    struct fha_params *softc)
383{
384	SVCTHREAD *thread;
385	struct fha_info i;
386	struct fha_hash_entry *fhe;
387	struct fha_callbacks *cb;
388
389	cb = &softc->callbacks;
390
391	/* Check to see whether we're enabled. */
392	if (softc->ctls.enable == 0)
393		goto thist;
394
395	/*
396	 * Only do placement if this is an NFS request.
397	 */
398	if (req->rq_prog != NFS_PROG)
399		goto thist;
400
401	if (req->rq_vers != 2 && req->rq_vers != 3)
402		goto thist;
403
404	fha_extract_info(req, &i, cb);
405
406	/*
407	 * We save the offset associated with this request for later
408	 * nfsd matching.
409	 */
410	fhe = fha_hash_entry_lookup(softc, i.fh);
411	req->rq_p1 = fhe;
412	req->rq_p2 = i.locktype;
413	req->rq_p3 = i.offset;
414
415	/*
416	 * Choose a thread, taking into consideration locality, thread load,
417	 * and the number of threads already working on this file.
418	 */
419	thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
420	KASSERT(thread, ("fha_assign: NULL thread!"));
421	fha_hash_entry_add_op(fhe, i.locktype, 1);
422	thread->st_p2++;
423	thread->st_p3 = i.offset;
424
425	/*
426	 * Grab the pool lock here to not let chosen thread go away before
427	 * the new request inserted to its queue while we drop fhe lock.
428	 */
429	mtx_lock(&thread->st_lock);
430	mtx_unlock(fhe->mtx);
431
432	return (thread);
433thist:
434	req->rq_p1 = NULL;
435	mtx_lock(&this_thread->st_lock);
436	return (this_thread);
437}
438
439/*
440 * Called when we're done with an operation.  The request has already
441 * been de-queued.
442 */
443void
444fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
445{
446	struct fha_hash_entry *fhe = req->rq_p1;
447	struct mtx *mtx;
448
449	/*
450	 * This may be called for reqs that didn't go through
451	 * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
452	 */
453	if (!fhe)
454		return;
455
456	mtx = fhe->mtx;
457	mtx_lock(mtx);
458	fha_hash_entry_add_op(fhe, req->rq_p2, -1);
459	thread->st_p2--;
460	KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
461	    thread->st_p2, thread));
462	if (thread->st_p2 == 0) {
463		fha_hash_entry_remove_thread(fhe, thread);
464		if (0 == fhe->num_rw + fhe->num_exclusive)
465			fha_hash_entry_remove(fhe);
466	}
467	mtx_unlock(mtx);
468}
469
470int
471fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
472{
473	int error, i;
474	struct sbuf sb;
475	struct fha_hash_entry *fhe;
476	bool_t first, hfirst;
477	SVCTHREAD *thread;
478	SVCPOOL *pool;
479
480	sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
481
482	pool = NULL;
483
484	if (!*softc->pool) {
485		sbuf_printf(&sb, "NFSD not running\n");
486		goto out;
487	}
488	pool = *softc->pool;
489
490	for (i = 0; i < FHA_HASH_SIZE; i++)
491		if (!LIST_EMPTY(&softc->fha_hash[i].list))
492			break;
493
494	if (i == FHA_HASH_SIZE) {
495		sbuf_printf(&sb, "No file handle entries.\n");
496		goto out;
497	}
498
499	hfirst = TRUE;
500	for (; i < FHA_HASH_SIZE; i++) {
501		mtx_lock(&softc->fha_hash[i].mtx);
502		if (LIST_EMPTY(&softc->fha_hash[i].list)) {
503			mtx_unlock(&softc->fha_hash[i].mtx);
504			continue;
505		}
506		sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
507		first = TRUE;
508		LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
509			sbuf_printf(&sb, "%sfhe %p: {\n", first ? "  " : ", ", fhe);
510
511			sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
512			sbuf_printf(&sb, "    num_rw/exclusive: %d/%d\n",
513			    fhe->num_rw, fhe->num_exclusive);
514			sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
515
516			LIST_FOREACH(thread, &fhe->threads, st_alink) {
517				sbuf_printf(&sb, "      thread %p offset %ju "
518				    "reqs %d\n", thread,
519				    thread->st_p3, thread->st_p2);
520			}
521
522			sbuf_printf(&sb, "  }");
523			first = FALSE;
524		}
525		sbuf_printf(&sb, "\n}");
526		mtx_unlock(&softc->fha_hash[i].mtx);
527		hfirst = FALSE;
528	}
529
530 out:
531	sbuf_trim(&sb);
532	sbuf_finish(&sb);
533	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
534	sbuf_delete(&sb);
535	return (error);
536}
537