/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
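 * A worked example of the seqid handling above (illustrative, not part
 * of the original notes): suppose an Openowner's referenced cache entry
 * holds an Open reply done with seqid 7. A retried Open with seqid 7 is
 * answered from that referenced entry; an Open with seqid 8 frees it,
 * and the new entry becomes the referenced one with its seqid_refcnt
 * incremented.
 *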
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference (for the UDP case): Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
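
/*
 * Usage sketch (illustrative): an administrator raises the mark with
 *	sysctl vfs.nfsd.tcphighwater=100000
 * and, if the new mark is at or above the current flood level, the
 * handler above also raises nfsrc_floodlevel to the new mark plus 20%
 * (newhighwater + newhighwater / 5), keeping the flood level above the
 * trim target.
 */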

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to NFS Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
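
/*
 * Worked example (illustrative, with an assumed table size of 500):
 * xid 0x12345678 hashes via nfsrc_hash() to
 *	(0x12345678 + (0x12345678 >> 24)) % 500
 *	= (305419896 + 18) % 500 = 414
 * Folding the high-order byte into the low bits keeps xids that differ
 * only in their upper byte from piling into the same bucket.
 */
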
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;

	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
		    MTX_DEF);
		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
		    MTX_DEF);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
	}
	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
	NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
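
/*
 * Sketch of the caller's dispatch on the value returned above (the
 * real dispatch lives in the nfsd socket handling code):
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:	execute the RPC, then nfsrvd_updatecache(nd);
 *			break;
 *	case RC_REPLY:	send the cached reply in nd->nd_mreq without
 *			re-executing the RPC;
 *			break;
 *	case RC_DROPIT:	discard the request (a reply for the same
 *			request is already in progress);
 *			break;
 *	}
 */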

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
	NFSD_VNET(nfsrc_udpcachesize)++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save the reply.
	 * For UDP, save it if ND_SAVEREPLY is set.
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent are
	 * set and the number of saved replies has not exceeded the flood
	 * level.
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
				1);
			    if (NFSD_VNET(nfsrc_tcpsavedreplies) >
				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
				    NFSD_VNET(nfsrc_tcpsavedreplies);
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
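
/*
 * Note that a non-NULL return from nfsrvd_updatecache() is a TCP entry
 * that is left locked; once the reply has been sent, the caller is
 * expected to hand it to nfsrvd_sentcache() below, so that the TCP
 * sequence# of the reply can be recorded for the ack-based free done
 * by nfsrc_trimcache().
 */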

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
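
/*
 * Summary of the rc_acked lifecycle (derived from nfsrvd_sentcache()
 * above plus nfsrc_freecache() and nfsrc_trimcache() below):
 *	RC_NO_SEQ -> RC_NO_ACK	in nfsrvd_sentcache(), when the reply's
 *				TCP sequence# is recorded and the entry
 *				goes on the ack hash list
 *	RC_NO_ACK -> RC_ACK	in nfsrc_trimcache(), once snd_una shows
 *				that the client has acked the reply
 *	RC_NO_ACK -> RC_NACK	in nfsrc_trimcache(), if the connection
 *				is torn down first (final != 0)
 * Entries marked RC_ACK are then eligible to be freed early by the
 * trimming code, without waiting for the full DRC timeout.
 */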

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 * For NFSv4, an entry cannot match if the request was received on
	 * the same TCP socket as the cached request, or on a connection
	 * created before the entry was cached, since an NFSv4 client only
	 * retries after establishing a new connection.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) == 0 ||
		     (newrp->rc_sockref != rp->rc_sockref &&
		      newrp->rc_cachetime >= rp->rc_cachetime))
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

	/*
	 * For TCP, multiple entries for the same key are allowed, so the
	 * new entry can be chained into the hash table right away, marked
	 * in progress.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wake up anyone waiting for the entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		NFSD_VNET(nfsrc_udpcachesize)--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when the nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
		    rc_hash, nextrp)
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
		    nextrp) {
			nfsrc_freecache(rp);
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
		    nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 NFSD_VNET(nfsrc_udpcachesize) >
				 nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
		    nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			LIST_FOREACH_SAFE(rp,
			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
		}
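		/*
		 * Illustrative numbers (not taken from the source): with
		 * HISTSIZE of 16 and tto of 3600 seconds, an entry due to
		 * expire 900 seconds from now lands in bin
		 * 900 * 16 / 3600 == 4.  Below, the smallest i whose
		 * cumulative bin count exceeds "force" sets the cutoff
		 * thisstamp = tcp_lasttrim + tto * (i + 1) / HISTSIZE, so
		 * that roughly "force" entries get the shortened timeout.
		 */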
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
				LIST_FOREACH_SAFE(rp,
				    &NFSD_VNET(nfsrchash_table)[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum over the first
 * (up to) NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
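
/*
 * Note that the checksum covers at most the first NFSRVCACHE_CHECKLEN
 * (100) bytes of the request; requests that differ only beyond that
 * point must also match in total length (rc_reqlen) before they can be
 * confused with each other by nfsrc_gettcp().
 */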

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
1048