/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are freed a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
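
/*
 * Rough sketch of how the nfsd request processing code is expected to use
 * these routines (illustrative only; the actual caller lives elsewhere in
 * the NFS server sources):
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:
 *		(perform the RPC)
 *		rp = nfsrvd_updatecache(nd);
 *		(send the reply; then, if rp != NULL,)
 *		nfsrvd_sentcache(rp, ...);
 *		break;
 *	case RC_REPLY:
 *		(send the cached reply left in nd->nd_mreq)
 *		break;
 *	case RC_DROPIT:
 *		(discard the request without replying)
 *		break;
 *	}
 *
 * nfsrc_trimcache() is expected to be called periodically, and with TCP
 * acknowledgment information, to discard entries that have expired or
 * whose replies are known to have reached the client.
 */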
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

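/*
 * The TCP high water mark can be changed at run time via the
 * vfs.nfsd.tcphighwater sysctl handled below.  If a new setting reaches
 * the current nfsrc_floodlevel, the flood level is raised to 20% above
 * the new high water mark so the hard limit stays above the point where
 * trimming begins.
 */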
static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The mapping from generic procedure numbers back to Version 2 procedure
 * numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

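/*
 * Hash on the low bits of the xid after folding in its high byte, and
 * select the appropriate table: the UDP hash table, the TCP hash table,
 * or the table of TCP replies awaiting acknowledgment, which is keyed on
 * the socket reference (see nfsrvd_sentcache() and nfsrc_trimcache()).
 */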
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Get the address family (IPv4 or IPv6) recorded for this cache entry.
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
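
/*
 * Locking note: all UDP cache entries are serialized by the single
 * nfsrc_udpmtx, while TCP entries are protected by the mutex in their
 * xid hash bucket (nfsrchash_table[]).  A separate mutex in
 * nfsrcahash_table[] protects the lists of replies awaiting TCP
 * acknowledgment.
 */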

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
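
/*
 * Note: a non-NULL nd_nam2 is treated as a datagram (UDP) request above,
 * so the entry is flagged RC_UDP and handled by nfsrc_getudp(); otherwise
 * nfsrc_gettcp() is used.  Both return RC_DOIT (perform the RPC), RC_REPLY
 * (a cached reply has been placed in the descriptor) or RC_DROPIT (an
 * identical request is already in progress, so drop this one).
 */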

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAIT);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAIT);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAIT);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it.  The first argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
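
/*
 * Once a TCP sequence number has been recorded here, nfsrc_trimcache()
 * can mark the entry RC_ACK when the peer acknowledges data up to that
 * sequence number (see its snd_una handling), which lets the saved reply
 * be freed early instead of waiting out the full nfsrc_tcptimeout.
 */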

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAIT);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when the nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
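/*
 * The sockref, snd_una and final arguments carry TCP acknowledgment state:
 * when sockref is non-zero, each entry on that socket's ack list whose
 * recorded TCP sequence number is covered by snd_una is marked RC_ACK;
 * when final is set (no further acknowledgments are expected) the
 * remaining entries are marked RC_NACK and dropped from the ack list.
 * Entries marked RC_ACK can then have their saved replies freed early.
 */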
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum of at most the
 * first NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
