/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/fs/nfsserver/nfs_nfsdcache.c 331722 2018-03-29 02:50:57Z eadler $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
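/*
 * A rough map from the notes above to the code below: nfsrc_gettcp()
 * implements the TCP algorithm, nfsrc_getudp() the UDP one,
 * nfsrvd_updatecache() and nfsrvd_sentcache() do the reply saving and
 * nfsrc_trimcache() does the "free when acked or very old" step.
 */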
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
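/*
 * Raising the high water mark to or above the current flood level also
 * bumps nfsrc_floodlevel to 120% of the new value; e.g., setting
 * vfs.nfsd.tcphighwater to 100000 yields a flood level of 120000.
 */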
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
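/*
 * nfsrc_hash() folds the high-order byte of the xid into the low-order
 * bits before taking the modulus; e.g., xid 0x12345678 maps to bucket
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE.
 */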
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
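/*
 * A rough sketch of the expected calling sequence (the actual callers
 * live in the nfsd RPC service code): RC_DROPIT - discard the request;
 * RC_REPLY - send nd_mreq; RC_DOIT - perform the RPC, then call
 * nfsrvd_updatecache() and, if it returns non-NULL, nfsrvd_sentcache()
 * once the reply has been sent.
 */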

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
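/*
 * Note the effect of each return value above: an RC_INPROG hit returns
 * RC_DROPIT so the retry is silently discarded while the original is
 * still executing; completed entries resend the cached status (v2) or
 * the cached reply mbufs.
 */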

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				nfsstatsv1.srvcache_tcppeak)
				nfsstatsv1.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
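/*
 * The non-NULL return value above is a TCP entry left locked so that
 * the caller can hand it to nfsrvd_sentcache() with the reply's TCP
 * sequence number; seqid# referenced entries (rc_refcnt > 0) are
 * unlocked here and freed later via nfsrvd_derefcache().
 */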

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
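/*
 * The rc_tcpseq recorded here is what nfsrc_trimcache() compares
 * against the connection's snd_una; once the client acks past the end
 * of the reply the entry is marked RC_ACK and becomes trimmable.
 */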

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}
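/*
 * Note that nfsrc_freecache() is always entered with the mutex returned
 * by nfsrc_cachemutex() held, so the unhash and free need no further
 * locking; only the ack hash bucket has its own mutex.
 */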

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

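	/* Only one thread sweeps the cache at a time; others just return. */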
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
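					/*
					 * For example, with HISTSIZE 16, if
					 * tto were 600 then an entry due to
					 * expire 300 seconds from now would
					 * land in bucket 300 * 16 / 600 == 8.
					 */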
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
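/*
 * Since only the first NFSRVCACHE_CHECKLEN (100) bytes are checksummed,
 * different requests can share a checksum; the total length returned
 * here is compared as well (rc_reqlen) to reduce false hits.
 */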

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}