/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/fs/nfsserver/nfs_nfsdcache.c 330897 2018-03-14 03:19:51Z eadler $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
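 *
 * (For reference: the "checksum on first N bytes" above is computed by
 * nfsrc_getlenandcksum() below, which runs in_cksum() over at most
 * NFSRVCACHE_CHECKLEN bytes of the NFS XDR request.)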
 */
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
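/*
 * Sysctl handler for vfs.nfsd.tcphighwater.  When the new high water mark
 * reaches the current flood level, the flood level is raised to 20% above
 * the new mark so that saved TCP replies are not refused before trimming
 * can catch up.
 */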
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

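/*
 * Hash the xid by adding in its high-order byte and taking the result
 * modulo NFSRVCACHE_HASHSIZE.  For example, xid 0x12345678 selects bucket
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE.
 */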
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
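 * The return value is RC_DROPIT (drop the request without replying),
 * RC_REPLY (a reply built from the cache is in nd->nd_mreq) or RC_DOIT
 * (execute the RPC, with nd->nd_rp pointing at the new in-progress entry).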
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
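			/* Requeue the entry at the tail of the UDP LRU. */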
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
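 * A non-NULL return value is a locked TCP entry whose reply has been saved;
 * the caller must pass it to nfsrvd_sentcache() once the reply is sent.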
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				nfsstatsv1.srvcache_tcppeak)
				nfsstatsv1.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
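			/*
			 * Drop the mutex across m_copym(), since an
			 * M_WAITOK allocation may sleep.
			 */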
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
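 *   All matches are first moved onto a temporary list; it is only a hit
 *   when exactly one entry matches and that entry is not referenced by a
 *   seqid# Op (rc_refcnt > 0).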
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
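 * When the number of saved TCP replies gets close to nfsrc_tcphighwater,
 * a histogram of the entries' remaining lifetimes (HISTSIZE buckets
 * spanning nfsrc_tcptimeout seconds) is built and a second pass frees the
 * entries in the oldest buckets, effectively shortening their timeout so
 * that roughly nfsrc_tcphighwater / 4 entries are discarded.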
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

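	/*
	 * When a sockref is given, use the acknowledged TCP sequence number
	 * (snd_una) to mark saved replies on that connection as acked, or as
	 * nacked when the connection is being torn down (final != 0).
	 */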
	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
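	/* Currently a placeholder; retries on the same connection are not acted upon. */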
}