/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
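 * A worked example of the seqid handling above (illustrative, not part
 * of the original notes): suppose an Openowner's referenced cache entry
 * holds an Open reply done with seqid 7. A retried Open with seqid 7 is
 * answered from that referenced entry; an Open with seqid 8 frees it,
 * and the new entry becomes the referenced one with its seqid_refcnt
 * incremented.
 *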
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference (for the UDP case): Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
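
/*
 * Usage sketch (illustrative): an administrator raises the mark with
 *	sysctl vfs.nfsd.tcphighwater=100000
 * and, if the new mark is at or above the current flood level, the
 * handler above also raises nfsrc_floodlevel to the new mark plus 20%
 * (newhighwater + newhighwater / 5), keeping the flood level above the
 * trim target.
 */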

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to NFS Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
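
/*
 * Worked example (illustrative, with an assumed table size of 500):
 * xid 0x12345678 hashes via nfsrc_hash() to
 *	(0x12345678 + (0x12345678 >> 24)) % 500
 *	= (305419896 + 18) % 500 = 414
 * Folding the high-order byte into the low bits keeps xids that differ
 * only in their upper byte from piling into the same bucket.
 */
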
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;

	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
		    MTX_DEF);
		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
		    MTX_DEF);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
	}
	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
	NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
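
/*
 * Sketch of the caller's dispatch on the value returned above (the
 * real dispatch lives in the nfsd socket handling code):
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:	execute the RPC, then nfsrvd_updatecache(nd);
 *			break;
 *	case RC_REPLY:	send the cached reply in nd->nd_mreq without
 *			re-executing the RPC;
 *			break;
 *	case RC_DROPIT:	discard the request (a reply for the same
 *			request is already in progress);
 *			break;
 *	}
 */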

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
	NFSD_VNET(nfsrc_udpcachesize)++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save the reply.
	 * For UDP, save it if ND_SAVEREPLY is set.
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent are
	 * set and the number of saved replies has not exceeded the flood
	 * level.
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
				1);
			    if (NFSD_VNET(nfsrc_tcpsavedreplies) >
				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
				    NFSD_VNET(nfsrc_tcpsavedreplies);
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
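
/*
 * Note that a non-NULL return from nfsrvd_updatecache() is a TCP entry
 * that is left locked; once the reply has been sent, the caller is
 * expected to hand it to nfsrvd_sentcache() below, so that the TCP
 * sequence# of the reply can be recorded for the ack-based free done
 * by nfsrc_trimcache().
 */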

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
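
/*
 * Summary of the rc_acked lifecycle (derived from nfsrvd_sentcache()
 * above plus nfsrc_freecache() and nfsrc_trimcache() below):
 *	RC_NO_SEQ -> RC_NO_ACK	in nfsrvd_sentcache(), when the reply's
 *				TCP sequence# is recorded and the entry
 *				goes on the ack hash list
 *	RC_NO_ACK -> RC_ACK	in nfsrc_trimcache(), once snd_una shows
 *				that the client has acked the reply
 *	RC_NO_ACK -> RC_NACK	in nfsrc_trimcache(), if the connection
 *				is torn down first (final != 0)
 * Entries marked RC_ACK are then eligible to be freed early by the
 * trimming code, without waiting for the full DRC timeout.
 */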

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 * For NFSv4, an entry cannot match if the request was received on
	 * the same TCP socket as the cached request, or on a connection
	 * created before the entry was cached, since an NFSv4 client only
	 * retries after establishing a new connection.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) == 0 ||
		     (newrp->rc_sockref != rp->rc_sockref &&
		      newrp->rc_cachetime >= rp->rc_cachetime))
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

	/*
	 * For TCP, multiple entries for the same key are allowed, so the
	 * new entry can be chained into the hash table right away, marked
	 * in progress.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wake up anyone waiting for the entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		NFSD_VNET(nfsrc_udpcachesize)--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when the nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
		    rc_hash, nextrp)
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
		    nextrp) {
			nfsrc_freecache(rp);
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
		    nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 NFSD_VNET(nfsrc_udpcachesize) >
				 nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
		    nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			LIST_FOREACH_SAFE(rp,
			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
		}
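		/*
		 * Illustrative numbers (not taken from the source): with
		 * HISTSIZE of 16 and tto of 3600 seconds, an entry due to
		 * expire 900 seconds from now lands in bin
		 * 900 * 16 / 3600 == 4.  Below, the smallest i whose
		 * cumulative bin count exceeds "force" sets the cutoff
		 * thisstamp = tcp_lasttrim + tto * (i + 1) / HISTSIZE, so
		 * that roughly "force" entries get the shortened timeout.
		 */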
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
				LIST_FOREACH_SAFE(rp,
				    &NFSD_VNET(nfsrchash_table)[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum over the first
 * (up to) NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
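
/*
 * Note that the checksum covers at most the first NFSRVCACHE_CHECKLEN
 * (100) bytes of the request; requests that differ only beyond that
 * point must also match in total length (rc_reqlen) before they can be
 * confused with each other by nfsrc_gettcp().
 */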

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
1048