1/*
2 * linux/net/sunrpc/svcsock.c
3 *
4 * These are the RPC server socket internals.
5 *
6 * The server scheduling algorithm does not always distribute the load
7 * evenly when servicing a single client. May need to modify the
8 * svc_xprt_enqueue procedure...
9 *
10 * TCP support is largely untested and may be a little slow. The problem
11 * is that we currently do two separate recvfrom's, one for the 4-byte
12 * record length, and the second for the actual record. This could possibly
13 * be improved by always reading a minimum size of around 100 bytes and
14 * tucking any superfluous bytes away in a temporary store. Still, that
15 * leaves write requests out in the rain. An alternative may be to peek at
16 * the first skb in the queue, and if it matches the next TCP sequence
17 * number, to extract the record marker. Yuck.
18 *
19 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
20 */
21
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/errno.h>
25#include <linux/fcntl.h>
26#include <linux/net.h>
27#include <linux/in.h>
28#include <linux/inet.h>
29#include <linux/udp.h>
30#include <linux/tcp.h>
31#include <linux/unistd.h>
32#include <linux/slab.h>
33#include <linux/netdevice.h>
34#include <linux/skbuff.h>
35#include <linux/file.h>
36#include <linux/freezer.h>
37#include <net/sock.h>
38#include <net/checksum.h>
39#include <net/ip.h>
40#include <net/ipv6.h>
41#include <net/tcp.h>
42#include <net/tcp_states.h>
43#include <asm/uaccess.h>
44#include <asm/ioctls.h>
45
46#include <linux/sunrpc/types.h>
47#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/msg_prot.h>
50#include <linux/sunrpc/svcsock.h>
51#include <linux/sunrpc/stats.h>
52#include <linux/sunrpc/xprt.h>
53
54#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
55
56
/* Forward declarations for svc_xprt_ops callbacks and helpers defined below. */
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
					 int *errp, int flags);
static void		svc_udp_data_ready(struct sock *, int);
static int		svc_udp_recvfrom(struct svc_rqst *);
static int		svc_udp_sendto(struct svc_rqst *);
static void		svc_sock_detach(struct svc_xprt *);
static void		svc_tcp_sock_detach(struct svc_xprt *);
static void		svc_sock_free(struct svc_xprt *);

static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
					  struct sockaddr *, int, int);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Lockdep class keys, one slot per address family (0 = AF_INET,
 * 1 = AF_INET6), so NFSD server-socket locks form their own lock
 * classes, separate from ordinary sockets of the same family.
 */
static struct lock_class_key svc_key[2];
static struct lock_class_key svc_slock_key[2];

/*
 * Re-key the socket's sk_lock and xpt_lock lockdep classes for NFSD
 * use.  Only AF_INET and AF_INET6 sockets are ever handed to the
 * server code, hence the BUG() default.
 */
static void svc_reclassify_socket(struct socket *sock)
{
	struct sock *sk = sock->sk;
	/* Reclassifying while a user context holds the lock would race. */
	BUG_ON(sock_owned_by_user(sk));
	switch (sk->sk_family) {
	case AF_INET:
		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
					      &svc_slock_key[0],
					      "sk_xprt.xpt_lock-AF_INET-NFSD",
					      &svc_key[0]);
		break;

	case AF_INET6:
		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
					      &svc_slock_key[1],
					      "sk_xprt.xpt_lock-AF_INET6-NFSD",
					      &svc_key[1]);
		break;

	default:
		BUG();
	}
}
#else
/* Without lockdep there is nothing to reclassify. */
static void svc_reclassify_socket(struct socket *sock)
{
}
#endif
100
101/*
102 * Release an skbuff after use
103 */
104static void svc_release_skb(struct svc_rqst *rqstp)
105{
106	struct sk_buff *skb = rqstp->rq_xprt_ctxt;
107
108	if (skb) {
109		struct svc_sock *svsk =
110			container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
111		rqstp->rq_xprt_ctxt = NULL;
112
113		dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
114		skb_free_datagram_locked(svsk->sk_sk, skb);
115	}
116}
117
/*
 * Scratch space large enough for either an IPv4 or an IPv6 pktinfo
 * control message; SVC_PKTINFO_SPACE sizes the on-stack cmsg buffers
 * used by the send and receive paths.
 */
union svc_pktinfo_u {
	struct in_pktinfo pkti;
	struct in6_pktinfo pkti6;
};
#define SVC_PKTINFO_SPACE \
	CMSG_SPACE(sizeof(union svc_pktinfo_u))
124
/*
 * Fill in a pktinfo control message so a UDP reply is transmitted from
 * the same local address the request arrived on (matters on multihomed
 * hosts).  That address was saved in rqstp->rq_daddr by the receive
 * path; the address family comes from the underlying socket.
 */
static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
{
	struct svc_sock *svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	switch (svsk->sk_sk->sk_family) {
	case AF_INET: {
			struct in_pktinfo *pki = CMSG_DATA(cmh);

			cmh->cmsg_level = SOL_IP;
			cmh->cmsg_type = IP_PKTINFO;
			pki->ipi_ifindex = 0;	/* let routing pick the device */
			pki->ipi_spec_dst.s_addr = rqstp->rq_daddr.addr.s_addr;
			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		}
		break;

	case AF_INET6: {
			struct in6_pktinfo *pki = CMSG_DATA(cmh);

			cmh->cmsg_level = SOL_IPV6;
			cmh->cmsg_type = IPV6_PKTINFO;
			pki->ipi6_ifindex = 0;	/* let routing pick the device */
			ipv6_addr_copy(&pki->ipi6_addr,
					&rqstp->rq_daddr.addr6);
			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		}
		break;
	}
}
154
/*
 * send routine intended to be shared by the fore- and back-channel
 *
 * Transmits @xdr zero-copy via kernel_sendpage(): first the head kvec
 * (taken from @headpage at @headoffset), then the page array, then the
 * tail kvec (from @tailpage at @tailoffset).  MSG_MORE is kept set on
 * every fragment except the last one so the network stack may coalesce
 * them into fewer segments.
 *
 * Returns the number of bytes actually queued; a value shorter than
 * xdr->len indicates a failed or partial send part-way through.
 */
int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
		    struct page *headpage, unsigned long headoffset,
		    struct page *tailpage, unsigned long tailoffset)
{
	int		result;
	int		size;
	struct page	**ppage = xdr->pages;
	size_t		base = xdr->page_base;
	unsigned int	pglen = xdr->page_len;
	unsigned int	flags = MSG_MORE;
	int		slen;
	int		len = 0;

	/* slen tracks how many bytes of the message remain unsent */
	slen = xdr->len;

	/* send head */
	if (slen == xdr->head[0].iov_len)
		flags = 0;	/* head is the whole message: last fragment */
	len = kernel_sendpage(sock, headpage, headoffset,
				  xdr->head[0].iov_len, flags);
	if (len != xdr->head[0].iov_len)
		goto out;
	slen -= xdr->head[0].iov_len;
	if (slen == 0)
		goto out;

	/* send page data */
	/* first page may start at a non-zero offset (page_base) */
	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
	while (pglen > 0) {
		if (slen == size)
			flags = 0;	/* this page ends the message */
		result = kernel_sendpage(sock, *ppage, base, size, flags);
		if (result > 0)
			len += result;
		if (result != size)
			goto out;
		slen -= size;
		pglen -= size;
		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
		base = 0;	/* subsequent pages start at offset 0 */
		ppage++;
	}

	/* send tail */
	if (xdr->tail[0].iov_len) {
		result = kernel_sendpage(sock, tailpage, tailoffset,
				   xdr->tail[0].iov_len, 0);
		if (result > 0)
			len += result;
	}

out:
	return len;
}
212
213
/*
 * Generic sendto routine
 *
 * For UDP, first issue a zero-length sendmsg carrying only a pktinfo
 * control message (with MSG_MORE set) so the reply is sourced from the
 * address the request arrived on; the actual payload then follows via
 * svc_send_common().  Returns bytes sent, or a short/negative count on
 * failure.
 */
static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
{
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct socket	*sock = svsk->sk_sock;
	union {
		struct cmsghdr	hdr;
		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
	} buffer;	/* stack space for the pktinfo cmsg, long-aligned */
	struct cmsghdr *cmh = &buffer.hdr;
	int		len = 0;
	unsigned long tailoff;
	unsigned long headoff;
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);

	if (rqstp->rq_prot == IPPROTO_UDP) {
		struct msghdr msg = {
			.msg_name	= &rqstp->rq_addr,
			.msg_namelen	= rqstp->rq_addrlen,
			.msg_control	= cmh,
			.msg_controllen	= sizeof(buffer),
			.msg_flags	= MSG_MORE,
		};

		svc_set_cmsg_data(rqstp, cmh);

		/* control-message-only send; payload comes next */
		if (sock_sendmsg(sock, &msg, 0) < 0)
			goto out;
	}

	/* the head always starts at offset 0 of the first response page */
	tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
	headoff = 0;
	len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
			       rqstp->rq_respages[0], tailoff);

out:
	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));

	return len;
}
259
/*
 * Report socket names for nfsdfs
 *
 * Formats one listener as "<family> <proto> <address> <port>\n" into
 * @buf.  Returns the string length, or -ENAMETOOLONG (with @buf set to
 * the empty string) when @remaining is too small to hold it.
 */
static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
{
	const struct sock *sk = svsk->sk_sk;
	const char *proto_name = sk->sk_protocol == IPPROTO_UDP ?
							"udp" : "tcp";
	int len;

	switch (sk->sk_family) {
	case PF_INET:
		len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n",
				proto_name,
				&inet_sk(sk)->inet_rcv_saddr,
				inet_sk(sk)->inet_num);
		break;
	case PF_INET6:
		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
				proto_name,
				&inet6_sk(sk)->rcv_saddr,
				inet_sk(sk)->inet_num);
		break;
	default:
		len = snprintf(buf, remaining, "*unknown-%d*\n",
				sk->sk_family);
	}

	/* snprintf returns the would-be length, so >= means truncation */
	if (len >= remaining) {
		*buf = '\0';
		return -ENAMETOOLONG;
	}
	return len;
}
294
/**
 * svc_sock_names - construct a list of listener names in a string
 * @serv: pointer to RPC service
 * @buf: pointer to a buffer to fill in with socket names
 * @buflen: size of the buffer to be filled
 * @toclose: pointer to '\0'-terminated C string containing the name
 *		of a listener to be closed
 *
 * Fills in @buf with a '\n'-separated list of names of listener
 * sockets.  If @toclose is not NULL, the socket named by @toclose
 * is closed, and is not included in the output list.
 *
 * Returns positive length of the socket name string, or a negative
 * errno value on error.
 */
int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen,
		   const char *toclose)
{
	struct svc_sock *svsk, *closesk = NULL;
	int len = 0;

	if (!serv)
		return 0;

	spin_lock_bh(&serv->sv_lock);
	list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
		int onelen = svc_one_sock_name(svsk, buf + len, buflen - len);
		if (onelen < 0) {
			/* buffer exhausted: report -ENAMETOOLONG */
			len = onelen;
			break;
		}
		/* a matching socket is remembered for closing and excluded
		 * from the list: len is not advanced past its name */
		if (toclose && strcmp(toclose, buf + len) == 0)
			closesk = svsk;
		else
			len += onelen;
	}
	/* close outside the lock; svc_close_xprt() may sleep/reschedule */
	spin_unlock_bh(&serv->sv_lock);

	if (closesk)
		/* Should unregister with portmap, but you cannot
		 * unregister just one protocol...
		 */
		svc_close_xprt(&closesk->sk_xprt);
	else if (toclose)
		return -ENOENT;
	return len;
}
EXPORT_SYMBOL_GPL(svc_sock_names);
343
344/*
345 * Check input queue length
346 */
347static int svc_recv_available(struct svc_sock *svsk)
348{
349	struct socket	*sock = svsk->sk_sock;
350	int		avail, err;
351
352	err = kernel_sock_ioctl(sock, TIOCINQ, (unsigned long) &avail);
353
354	return (err >= 0)? avail : err;
355}
356
/*
 * Generic recvfrom routine.
 *
 * Non-blocking receive of up to @buflen bytes into the @nr-element
 * kvec array @iov.  Returns the number of bytes received, or a
 * negative errno (-EAGAIN when no data is queued).
 */
static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
			int buflen)
{
	struct svc_sock *svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct msghdr msg = {
		.msg_flags	= MSG_DONTWAIT,
	};
	int len;

	/* socket transports have no transport-private header */
	rqstp->rq_xprt_hlen = 0;

	len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
				msg.msg_flags);

	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
		svsk, iov[0].iov_base, iov[0].iov_len, len);
	return len;
}
379
/*
 * Set socket snd and rcv buffer lengths
 *
 * Writes sk_sndbuf/sk_rcvbuf directly (doubled, mirroring what
 * sock_setsockopt() does internally) and sets the userlocks bits so
 * auto-tuning will not shrink them again.
 */
static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
				unsigned int rcv)
{
	/* sock_setsockopt limits use to sysctl_?mem_max,
	 * which isn't acceptable.  Until that is made conditional
	 * on not having CAP_SYS_RESOURCE or similar, we go direct...
	 * DaveM said I could!
	 */
	lock_sock(sock->sk);
	sock->sk->sk_sndbuf = snd * 2;
	sock->sk->sk_rcvbuf = rcv * 2;
	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
	/* kick anyone waiting for send space under the old limit */
	sock->sk->sk_write_space(sock->sk);
	release_sock(sock->sk);
}
/*
 * INET callback when data has been received on the socket.
 *
 * Marks the transport as having data and queues it for a server
 * thread.  svsk may be NULL while the socket is being torn down.
 * Also wakes any process sleeping directly on the socket's waitqueue.
 */
static void svc_udp_data_ready(struct sock *sk, int count)
{
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;

	if (svsk) {
		dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
			svsk, sk, count,
			test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
		svc_xprt_enqueue(&svsk->sk_xprt);
	}
	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
		wake_up_interruptible(sk_sleep(sk));
}
415
/*
 * INET callback when space is newly available on the socket.
 *
 * Requeues the transport so a thread blocked on xpt_reserved space can
 * make progress, and wakes any sleeper on the socket's waitqueue.
 */
static void svc_write_space(struct sock *sk)
{
	struct svc_sock	*svsk = (struct svc_sock *)(sk->sk_user_data);

	if (svsk) {
		dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
			svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
		svc_xprt_enqueue(&svsk->sk_xprt);
	}

	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) {
		dprintk("RPC svc_write_space: someone sleeping on %p\n",
		       svsk);
		wake_up_interruptible(sk_sleep(sk));
	}
}
435
436static void svc_tcp_write_space(struct sock *sk)
437{
438	struct socket *sock = sk->sk_socket;
439
440	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock)
441		clear_bit(SOCK_NOSPACE, &sock->flags);
442	svc_write_space(sk);
443}
444
445/*
446 * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
447 */
448static int svc_udp_get_dest_address4(struct svc_rqst *rqstp,
449				     struct cmsghdr *cmh)
450{
451	struct in_pktinfo *pki = CMSG_DATA(cmh);
452	if (cmh->cmsg_type != IP_PKTINFO)
453		return 0;
454	rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
455	return 1;
456}
457
458/*
459 * See net/ipv6/datagram.c : datagram_recv_ctl
460 */
461static int svc_udp_get_dest_address6(struct svc_rqst *rqstp,
462				     struct cmsghdr *cmh)
463{
464	struct in6_pktinfo *pki = CMSG_DATA(cmh);
465	if (cmh->cmsg_type != IPV6_PKTINFO)
466		return 0;
467	ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr);
468	return 1;
469}
470
471/*
472 * Copy the UDP datagram's destination address to the rqstp structure.
473 * The 'destination' address in this case is the address to which the
474 * peer sent the datagram, i.e. our local address. For multihomed
475 * hosts, this can change from msg to msg. Note that only the IP
476 * address changes, the port number should remain the same.
477 */
478static int svc_udp_get_dest_address(struct svc_rqst *rqstp,
479				    struct cmsghdr *cmh)
480{
481	switch (cmh->cmsg_level) {
482	case SOL_IP:
483		return svc_udp_get_dest_address4(rqstp, cmh);
484	case SOL_IPV6:
485		return svc_udp_get_dest_address6(rqstp, cmh);
486	}
487
488	return 0;
489}
490
/*
 * Receive a datagram from a UDP socket.
 *
 * Returns the RPC payload length on success, 0 when the datagram must
 * be dropped (checksum error or unrecognized control message), or
 * -EAGAIN when no datagram is ready.
 */
static int svc_udp_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
	struct sk_buff	*skb;
	union {
		struct cmsghdr	hdr;
		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
	} buffer;
	struct cmsghdr *cmh = &buffer.hdr;
	struct msghdr msg = {
		.msg_name = svc_addr(rqstp),
		.msg_control = cmh,
		.msg_controllen = sizeof(buffer),
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len;
	int err;

	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
	    /* udp sockets need large rcvbuf as all pending
	     * requests are still in that buffer.  sndbuf must
	     * also be large enough that there is enough space
	     * for one reply per thread.  We count all threads
	     * rather than threads in a particular pool, which
	     * provides an upper bound on the number of threads
	     * which will access the socket.
	     */
	    svc_sock_setbufsize(svsk->sk_sock,
				(serv->sv_nrthreads+3) * serv->sv_max_mesg,
				(serv->sv_nrthreads+3) * serv->sv_max_mesg);

	/* clear before reading so a racing data_ready re-sets it */
	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
	skb = NULL;
	/* First peek with recvmsg to collect the sender address and the
	 * pktinfo control message (skb_recv_datagram() cannot return
	 * cmsgs), then dequeue the skb itself below.
	 */
	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
			     0, 0, MSG_PEEK | MSG_DONTWAIT);
	if (err >= 0)
		skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);

	if (skb == NULL) {
		if (err != -EAGAIN) {
			/* possibly an icmp error */
			dprintk("svc: recvfrom returned error %d\n", -err);
			set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
		}
		return -EAGAIN;
	}
	len = svc_addr_len(svc_addr(rqstp));
	if (len == 0)
		return -EAFNOSUPPORT;
	rqstp->rq_addrlen = len;
	if (skb->tstamp.tv64 == 0) {
		skb->tstamp = ktime_get_real();
		/* Don't enable netstamp, sunrpc doesn't
		   need that much accuracy */
	}
	svsk->sk_sk->sk_stamp = skb->tstamp;
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */

	/* payload length excludes the UDP header */
	len  = skb->len - sizeof(struct udphdr);
	rqstp->rq_arg.len = len;

	rqstp->rq_prot = IPPROTO_UDP;

	if (!svc_udp_get_dest_address(rqstp, cmh)) {
		if (net_ratelimit())
			printk(KERN_WARNING
				"svc: received unknown control message %d/%d; "
				"dropping RPC reply datagram\n",
					cmh->cmsg_level, cmh->cmsg_type);
		skb_free_datagram_locked(svsk->sk_sk, skb);
		return 0;
	}

	if (skb_is_nonlinear(skb)) {
		/* we have to copy */
		local_bh_disable();
		if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
			local_bh_enable();
			/* checksum error */
			skb_free_datagram_locked(svsk->sk_sk, skb);
			return 0;
		}
		local_bh_enable();
		skb_free_datagram_locked(svsk->sk_sk, skb);
	} else {
		/* we can use it in-place */
		rqstp->rq_arg.head[0].iov_base = skb->data +
			sizeof(struct udphdr);
		rqstp->rq_arg.head[0].iov_len = len;
		if (skb_checksum_complete(skb)) {
			skb_free_datagram_locked(svsk->sk_sk, skb);
			return 0;
		}
		/* skb is held until svc_release_skb() frees it */
		rqstp->rq_xprt_ctxt = skb;
	}

	/* split the payload between the head kvec and the page vector */
	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
		rqstp->rq_respages = rqstp->rq_pages+1;
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
		rqstp->rq_respages = rqstp->rq_pages + 1 +
			DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
	}

	if (serv->sv_stats)
		serv->sv_stats->netudpcnt++;

	return len;
}
608
/*
 * Send a UDP reply.  If the first attempt fails with ECONNREFUSED, an
 * ICMP error from an earlier request was pending on the socket, so
 * retry once.
 */
static int
svc_udp_sendto(struct svc_rqst *rqstp)
{
	int err = svc_sendto(rqstp, &rqstp->rq_res);

	if (err == -ECONNREFUSED)
		/* ICMP error on earlier request. */
		err = svc_sendto(rqstp, &rqstp->rq_res);

	return err;
}
621
/* UDP has no record marker, so there is no reply header to prepare. */
static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
{
}
625
/*
 * Return 1 if the socket has enough send space for the already-reserved
 * replies plus one maximum-size message, else 0.  The factor of two
 * matches svc_sock_setbufsize() doubling sk_sndbuf.
 */
static int svc_udp_has_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
	struct svc_serv	*serv = xprt->xpt_server;
	unsigned long required;

	/*
	 * Set the SOCK_NOSPACE flag before checking the available
	 * sock space.
	 */
	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
	if (required*2 > sock_wspace(svsk->sk_sk))
		return 0;
	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	return 1;
}
643
/* UDP is connectionless: accept must never be called on a UDP xprt. */
static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
{
	BUG();
	return NULL;
}
649
/* xpo_create callback: create and register a UDP listener socket. */
static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
				       struct sockaddr *sa, int salen,
				       int flags)
{
	return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
}
656
/* Transport operations vector for the UDP server transport. */
static struct svc_xprt_ops svc_udp_ops = {
	.xpo_create = svc_udp_create,
	.xpo_recvfrom = svc_udp_recvfrom,
	.xpo_sendto = svc_udp_sendto,
	.xpo_release_rqst = svc_release_skb,
	.xpo_detach = svc_sock_detach,
	.xpo_free = svc_sock_free,
	.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
	.xpo_has_wspace = svc_udp_has_wspace,
	.xpo_accept = svc_udp_accept,
};

/* Registration record for the "udp" transport class. */
static struct svc_xprt_class svc_udp_class = {
	.xcl_name = "udp",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_udp_ops,
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
};
675
/*
 * Initialize a freshly adopted UDP socket as a server transport:
 * install the data-ready/write-space callbacks, size the socket
 * buffers, and enable pktinfo so the receive path learns each
 * datagram's destination address.
 */
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
{
	int err, level, optname, one = 1;

	svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
	/* UDP requests are not authenticated per-connection */
	clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
	svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
	svsk->sk_sk->sk_write_space = svc_write_space;

	/* initialise setting must have enough space to
	 * receive and respond to one request.
	 * svc_udp_recvfrom will re-adjust if necessary
	 */
	svc_sock_setbufsize(svsk->sk_sock,
			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);

	/* data might have come in before data_ready set up */
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
	set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);

	/* make sure we get destination address info */
	switch (svsk->sk_sk->sk_family) {
	case AF_INET:
		level = SOL_IP;
		optname = IP_PKTINFO;
		break;
	case AF_INET6:
		level = SOL_IPV6;
		optname = IPV6_RECVPKTINFO;
		break;
	default:
		BUG();
	}
	err = kernel_setsockopt(svsk->sk_sock, level, optname,
					(char *)&one, sizeof(one));
	dprintk("svc: kernel_setsockopt returned %d\n", err);
}
714
/*
 * A data_ready event on a listening socket means there's a connection
 * pending. Do not use state_change as a substitute for it.
 */
static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
{
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;

	dprintk("svc: socket %p TCP (listen) state change %d\n",
		sk, sk->sk_state);

	/*
	 * This callback may called twice when a new connection
	 * is established as a child socket inherits everything
	 * from a parent LISTEN socket.
	 * 1) data_ready method of the parent socket will be called
	 *    when one of child sockets become ESTABLISHED.
	 * 2) data_ready method of the child socket may be called
	 *    when it receives data before the socket is accepted.
	 * In case of 2, we should ignore it silently.
	 */
	if (sk->sk_state == TCP_LISTEN) {
		if (svsk) {
			/* queue the listener so a thread runs accept() */
			set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
			svc_xprt_enqueue(&svsk->sk_xprt);
		} else
			printk("svc: socket %p: no user data\n", sk);
	}

	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
		wake_up_interruptible_all(sk_sleep(sk));
}
747
/*
 * A state change on a connected socket means it's dying or dead.
 *
 * Mark the transport for closing and queue it so a server thread
 * performs the actual teardown.
 */
static void svc_tcp_state_change(struct sock *sk)
{
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;

	dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
		sk, sk->sk_state, sk->sk_user_data);

	if (!svsk)
		printk("svc: socket %p: no user data\n", sk);
	else {
		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
		svc_xprt_enqueue(&svsk->sk_xprt);
	}
	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
		wake_up_interruptible_all(sk_sleep(sk));
}
767
/*
 * Data-ready callback for an established TCP connection: flag the
 * transport as readable and queue it for a server thread.
 */
static void svc_tcp_data_ready(struct sock *sk, int count)
{
	struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;

	dprintk("svc: socket %p TCP data ready (svsk %p)\n",
		sk, sk->sk_user_data);
	if (svsk) {
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
		svc_xprt_enqueue(&svsk->sk_xprt);
	}
	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
		wake_up_interruptible(sk_sleep(sk));
}
781
/*
 * Accept a TCP connection
 *
 * Runs in server-thread context when the listener has XPT_CONN set.
 * Accepts one pending connection, wraps it in a temporary svc_sock,
 * and records its peer and local addresses.  Returns the new transport
 * or NULL if nothing could be accepted.
 */
static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
	struct sockaddr_storage addr;
	struct sockaddr	*sin = (struct sockaddr *) &addr;
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
	struct socket	*sock = svsk->sk_sock;
	struct socket	*newsock;
	struct svc_sock	*newsvsk;
	int		err, slen;
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);

	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
	if (!sock)
		return NULL;

	clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
	err = kernel_accept(sock, &newsock, O_NONBLOCK);
	if (err < 0) {
		if (err == -ENOMEM)
			printk(KERN_WARNING "%s: no more sockets!\n",
			       serv->sv_name);
		else if (err != -EAGAIN && net_ratelimit())
			printk(KERN_WARNING "%s: accept failed (err %d)!\n",
				   serv->sv_name, -err);
		return NULL;
	}
	/* more connections may be pending; re-flag so we get called again */
	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);

	err = kernel_getpeername(newsock, sin, &slen);
	if (err < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "%s: peername failed (err %d)!\n",
				   serv->sv_name, -err);
		goto failed;		/* aborted connection or whatever */
	}

	/* Ideally, we would want to reject connections from unauthorized
	 * hosts here, but when we get encryption, the IP of the host won't
	 * tell us anything.  For now just warn about unpriv connections.
	 */
	if (!svc_port_is_privileged(sin)) {
		dprintk(KERN_WARNING
			"%s: connect from unprivileged port: %s\n",
			serv->sv_name,
			__svc_print_addr(sin, buf, sizeof(buf)));
	}
	dprintk("%s: connect from %s\n", serv->sv_name,
		__svc_print_addr(sin, buf, sizeof(buf)));

	/* make sure that a write doesn't block forever when
	 * low on memory
	 */
	newsock->sk->sk_sndtimeo = HZ*30;

	if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
				 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
		goto failed;
	svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
	err = kernel_getsockname(newsock, sin, &slen);
	if (unlikely(err < 0)) {
		dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
		slen = offsetof(struct sockaddr, sa_data);
	}
	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);

	if (serv->sv_stats)
		serv->sv_stats->nettcpconn++;

	return &newsvsk->sk_xprt;

failed:
	sock_release(newsock);
	return NULL;
}
860
/*
 * Receive data.
 * If we haven't gotten the record length yet, get the next four bytes.
 * Otherwise try to gobble up as much as possible up to the complete
 * record length.
 *
 * Returns the record length when a full record is queued, or -EAGAIN
 * when the header or body is still incomplete (progress is kept in
 * svsk->sk_tcplen / svsk->sk_reclen across calls).
 */
static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
	int len;

	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
		/* sndbuf needs to have room for one request
		 * per thread, otherwise we can stall even when the
		 * network isn't a bottleneck.
		 *
		 * We count all threads rather than threads in a
		 * particular pool, which provides an upper bound
		 * on the number of threads which will access the socket.
		 *
		 * rcvbuf just needs to be able to hold a few requests.
		 * Normally they will be removed from the queue
		 * as soon as a complete request arrives.
		 */
		svc_sock_setbufsize(svsk->sk_sock,
				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
				    3 * serv->sv_max_mesg);

	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);

	/* sk_tcplen < header size means the 4-byte record marker is
	 * still (partially) unread */
	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
		int		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
		struct kvec	iov;

		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
		iov.iov_len  = want;
		if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
			goto error;
		svsk->sk_tcplen += len;

		if (len < want) {
			dprintk("svc: short recvfrom while reading record "
				"length (%d of %d)\n", len, want);
			goto err_again; /* record header not complete */
		}

		/* header complete: convert once; later calls skip this block */
		svsk->sk_reclen = ntohl(svsk->sk_reclen);
		if (!(svsk->sk_reclen & RPC_LAST_STREAM_FRAGMENT)) {
			if (net_ratelimit())
				printk(KERN_NOTICE "RPC: multiple fragments "
					"per record not supported\n");
			goto err_delete;
		}

		svsk->sk_reclen &= RPC_FRAGMENT_SIZE_MASK;
		dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
		if (svsk->sk_reclen > serv->sv_max_mesg) {
			if (net_ratelimit())
				printk(KERN_NOTICE "RPC: "
					"fragment too large: 0x%08lx\n",
					(unsigned long)svsk->sk_reclen);
			goto err_delete;
		}
	}

	/* Check whether enough data is available */
	len = svc_recv_available(svsk);
	if (len < 0)
		goto error;

	if (len < svsk->sk_reclen) {
		dprintk("svc: incomplete TCP record (%d of %d)\n",
			len, svsk->sk_reclen);
		goto err_again;	/* record not complete */
	}
	len = svsk->sk_reclen;
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);

	return len;
 error:
	if (len == -EAGAIN)
		dprintk("RPC: TCP recv_record got EAGAIN\n");
	return len;
 err_delete:
	/* unrecoverable framing error: close the connection */
	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
 err_again:
	return -EAGAIN;
}
949
/*
 * Read the first 8 bytes of a TCP record (XID and call direction) and
 * decide whether it is a forward-channel CALL or a REPLY to a
 * backchannel request we sent.  For a REPLY, look up the matching
 * rpc_rqst (returned via *reqpp) and point vec[0] at its receive
 * buffer; for a CALL (or an unmatched reply) keep receiving into
 * rqstp's own buffer.  On success, vec[0] is advanced past the 8
 * bytes already consumed and the remaining record length is returned;
 * on receive failure the negative errno is returned unchanged.
 */
static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp,
			       struct rpc_rqst **reqpp, struct kvec *vec)
{
	struct rpc_rqst *req = NULL;
	u32 *p;
	u32 xid;
	u32 calldir;
	int len;

	len = svc_recvfrom(rqstp, vec, 1, 8);
	if (len < 0)
		goto error;

	p = (u32 *)rqstp->rq_arg.head[0].iov_base;
	xid = *p++;
	calldir = *p;

	if (calldir == 0) {
		/* REQUEST is the most common case */
		vec[0] = rqstp->rq_arg.head[0];
	} else {
		/* REPLY */
		if (svsk->sk_bc_xprt)
			req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid);

		if (!req) {
			printk(KERN_NOTICE
				"%s: Got unrecognized reply: "
				"calldir 0x%x sk_bc_xprt %p xid %08x\n",
				__func__, ntohl(calldir),
				svsk->sk_bc_xprt, xid);
			vec[0] = rqstp->rq_arg.head[0];
			goto out;
		}

		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
		       sizeof(struct xdr_buf));
		/* copy the xid and call direction */
		memcpy(req->rq_private_buf.head[0].iov_base,
		       rqstp->rq_arg.head[0].iov_base, 8);
		vec[0] = req->rq_private_buf.head[0];
	}
 out:
	/* skip past the xid/calldir words already read */
	vec[0].iov_base += 8;
	vec[0].iov_len -= 8;
	len = svsk->sk_reclen - 8;
 error:
	*reqpp = req;
	return len;
}
1000
/*
 * Receive data from a TCP socket.
 *
 * Waits for a complete record (via svc_tcp_recv_record), classifies it
 * as a call or a backchannel reply, reads the payload into the request
 * (or the matched rpc_rqst's) buffer, and fills in rq_arg.  Returns
 * the record length, 0 for a consumed backchannel reply, or -EAGAIN
 * on incomplete/errored reads (fatal errors also set XPT_CLOSE).
 */
static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
	int		len;
	struct kvec *vec;
	int pnum, vlen;
	struct rpc_rqst *req = NULL;

	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
		test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
		test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));

	len = svc_tcp_recv_record(svsk, rqstp);
	if (len < 0)
		goto error;

	vec = rqstp->rq_vec;
	vec[0] = rqstp->rq_arg.head[0];
	vlen = PAGE_SIZE;

	/*
	 * We have enough data for the whole tcp record. Let's try and read the
	 * first 8 bytes to get the xid and the call direction. We can use this
	 * to figure out if this is a call or a reply to a callback. If
	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
	 * In that case, don't bother with the calldir and just read the data.
	 * It will be rejected in svc_process.
	 */
	if (len >= 8) {
		len = svc_process_calldir(svsk, rqstp, &req, vec);
		if (len < 0)
			goto err_again;
		vlen -= 8;
	}

	/* build an iovec covering enough pages for the remaining bytes;
	 * backchannel replies land in the rpc_rqst's own pages */
	pnum = 1;
	while (vlen < len) {
		vec[pnum].iov_base = (req) ?
			page_address(req->rq_private_buf.pages[pnum - 1]) :
			page_address(rqstp->rq_pages[pnum]);
		vec[pnum].iov_len = PAGE_SIZE;
		pnum++;
		vlen += PAGE_SIZE;
	}
	rqstp->rq_respages = &rqstp->rq_pages[pnum];

	/* Now receive data */
	len = svc_recvfrom(rqstp, vec, pnum, len);
	if (len < 0)
		goto err_again;

	/*
	 * Account for the 8 bytes we read earlier
	 */
	len += 8;

	if (req) {
		/* backchannel reply: hand it to the waiting RPC task */
		xprt_complete_rqst(req->rq_task, len);
		len = 0;
		goto out;
	}
	dprintk("svc: TCP complete record (%d bytes)\n", len);
	rqstp->rq_arg.len = len;
	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
	}

	rqstp->rq_xprt_ctxt   = NULL;
	rqstp->rq_prot	      = IPPROTO_TCP;

out:
	/* Reset TCP read info */
	svsk->sk_reclen = 0;
	svsk->sk_tcplen = 0;

	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
	if (serv->sv_stats)
		serv->sv_stats->nettcpcnt++;

	return len;

err_again:
	if (len == -EAGAIN) {
		dprintk("RPC: TCP recvfrom got EAGAIN\n");
		return len;
	}
error:
	if (len != -EAGAIN) {
		printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
		       svsk->sk_xprt.xpt_server->sv_name, -len);
		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
	}
	return -EAGAIN;
}
1105
1106/*
1107 * Send out data on TCP socket.
1108 */
1109static int svc_tcp_sendto(struct svc_rqst *rqstp)
1110{
1111	struct xdr_buf	*xbufp = &rqstp->rq_res;
1112	int sent;
1113	__be32 reclen;
1114
1115	/* Set up the first element of the reply kvec.
1116	 * Any other kvecs that may be in use have been taken
1117	 * care of by the server implementation itself.
1118	 */
1119	reclen = htonl(0x80000000|((xbufp->len ) - 4));
1120	memcpy(xbufp->head[0].iov_base, &reclen, 4);
1121
1122	if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
1123		return -ENOTCONN;
1124
1125	sent = svc_sendto(rqstp, &rqstp->rq_res);
1126	if (sent != xbufp->len) {
1127		printk(KERN_NOTICE
1128		       "rpc-srv/tcp: %s: %s %d when sending %d bytes "
1129		       "- shutting down socket\n",
1130		       rqstp->rq_xprt->xpt_server->sv_name,
1131		       (sent<0)?"got error":"sent only",
1132		       sent, xbufp->len);
1133		set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
1134		svc_xprt_enqueue(rqstp->rq_xprt);
1135		sent = -EAGAIN;
1136	}
1137	return sent;
1138}
1139
1140/*
1141 * Setup response header. TCP has a 4B record length field.
1142 */
1143static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
1144{
1145	struct kvec *resv = &rqstp->rq_res.head[0];
1146
1147	/* tcp needs a space for the record length... */
1148	svc_putnl(resv, 0);
1149}
1150
1151static int svc_tcp_has_wspace(struct svc_xprt *xprt)
1152{
1153	struct svc_sock *svsk =	container_of(xprt, struct svc_sock, sk_xprt);
1154	struct svc_serv *serv = svsk->sk_xprt.xpt_server;
1155	int required;
1156
1157	if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
1158		return 1;
1159	required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
1160	if (sk_stream_wspace(svsk->sk_sk) >= required)
1161		return 1;
1162	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
1163	return 0;
1164}
1165
/* xpo_create method: delegate to the generic socket creator with the
 * protocol fixed to TCP. */
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
				       struct sockaddr *sa, int salen,
				       int flags)
{
	return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
}
1172
/* Operations vector tying the generic svc_xprt layer to the
 * TCP-specific implementations in this file. */
static struct svc_xprt_ops svc_tcp_ops = {
	.xpo_create = svc_tcp_create,
	.xpo_recvfrom = svc_tcp_recvfrom,
	.xpo_sendto = svc_tcp_sendto,
	.xpo_release_rqst = svc_release_skb,
	.xpo_detach = svc_tcp_sock_detach,
	.xpo_free = svc_sock_free,
	.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
	.xpo_has_wspace = svc_tcp_has_wspace,
	.xpo_accept = svc_tcp_accept,
};
1184
/* Transport class registered under the name "tcp"; services look it
 * up by name when creating listeners. */
static struct svc_xprt_class svc_tcp_class = {
	.xcl_name = "tcp",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_tcp_ops,
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
};
1191
/* Register the socket-based transport classes (TCP and UDP) with the
 * generic svc_xprt layer. */
void svc_init_xprt_sock(void)
{
	svc_reg_xprt_class(&svc_tcp_class);
	svc_reg_xprt_class(&svc_udp_class);
}
1197
/* Unregister the socket-based transport classes; the inverse of
 * svc_init_xprt_sock(). */
void svc_cleanup_xprt_sock(void)
{
	svc_unreg_xprt_class(&svc_tcp_class);
	svc_unreg_xprt_class(&svc_udp_class);
}
1203
/*
 * Initialize the svc_sock for a TCP socket.  A socket in TCP_LISTEN
 * state becomes a listener transport; anything else is treated as a
 * connected data socket.
 */
static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
{
	struct sock	*sk = svsk->sk_sk;

	svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
	if (sk->sk_state == TCP_LISTEN) {
		dprintk("setting up TCP socket for listening\n");
		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
		/* New connections are signalled through sk_data_ready. */
		sk->sk_data_ready = svc_tcp_listen_data_ready;
		set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
	} else {
		dprintk("setting up TCP socket for reading\n");
		sk->sk_state_change = svc_tcp_state_change;
		sk->sk_data_ready = svc_tcp_data_ready;
		sk->sk_write_space = svc_tcp_write_space;

		/* No record marker has been read yet. */
		svsk->sk_reclen = 0;
		svsk->sk_tcplen = 0;

		/* Disable Nagle so small replies are not delayed. */
		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;

		/* initialise setting must have enough space to
		 * receive and respond to one request.
		 * svc_tcp_recvfrom will re-adjust if necessary
		 */
		svc_sock_setbufsize(svsk->sk_sock,
				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);

		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
		/* If the peer already disconnected, make sure the
		 * transport gets closed rather than serviced. */
		if (sk->sk_state != TCP_ESTABLISHED)
			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
	}
}
1240
1241void svc_sock_update_bufs(struct svc_serv *serv)
1242{
1243	/*
1244	 * The number of server threads has changed. Update
1245	 * rcvbuf and sndbuf accordingly on all sockets
1246	 */
1247	struct list_head *le;
1248
1249	spin_lock_bh(&serv->sv_lock);
1250	list_for_each(le, &serv->sv_permsocks) {
1251		struct svc_sock *svsk =
1252			list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1253		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1254	}
1255	list_for_each(le, &serv->sv_tempsocks) {
1256		struct svc_sock *svsk =
1257			list_entry(le, struct svc_sock, sk_xprt.xpt_list);
1258		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
1259	}
1260	spin_unlock_bh(&serv->sv_lock);
1261}
1262EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
1263
/*
 * Wrap a socket in a new svc_sock and (unless SVC_SOCK_ANONYMOUS)
 * register it with the local rpcbind/portmapper.
 *
 * *errp is both input and output: a caller-supplied negative value
 * short-circuits registration.  On failure *errp holds a negative
 * errno and NULL is returned; on success the svc_sock has saved and
 * taken over the socket's callback pointers.
 */
static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
						struct socket *sock,
						int *errp, int flags)
{
	struct svc_sock	*svsk;
	struct sock	*inet;
	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);

	dprintk("svc: svc_setup_socket %p\n", sock);
	if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
		*errp = -ENOMEM;
		return NULL;
	}

	inet = sock->sk;

	/* Register socket with portmapper */
	if (*errp >= 0 && pmap_register)
		*errp = svc_register(serv, inet->sk_family, inet->sk_protocol,
				     ntohs(inet_sk(inet)->inet_sport));

	if (*errp < 0) {
		kfree(svsk);
		return NULL;
	}

	/* Take over the socket callbacks, saving the originals so
	 * svc_sock_detach() can restore them later. */
	inet->sk_user_data = svsk;
	svsk->sk_sock = sock;
	svsk->sk_sk = inet;
	svsk->sk_ostate = inet->sk_state_change;
	svsk->sk_odata = inet->sk_data_ready;
	svsk->sk_owspace = inet->sk_write_space;

	/* Initialize the socket */
	if (sock->type == SOCK_DGRAM)
		svc_udp_init(svsk, serv);
	else
		svc_tcp_init(svsk, serv);

	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
				svsk, svsk->sk_sk);

	return svsk;
}
1308
1309/**
1310 * svc_addsock - add a listener socket to an RPC service
1311 * @serv: pointer to RPC service to which to add a new listener
1312 * @fd: file descriptor of the new listener
1313 * @name_return: pointer to buffer to fill in with name of listener
1314 * @len: size of the buffer
1315 *
1316 * Fills in socket name and returns positive length of name if successful.
1317 * Name is terminated with '\n'.  On error, returns a negative errno
1318 * value.
1319 */
int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
		const size_t len)
{
	int err = 0;
	struct socket *so = sockfd_lookup(fd, &err);
	struct svc_sock *svsk = NULL;

	if (!so)
		return err;
	/* Only unconnected IPv4/IPv6 TCP or UDP sockets are accepted. */
	if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6))
		err =  -EAFNOSUPPORT;
	else if (so->sk->sk_protocol != IPPROTO_TCP &&
	    so->sk->sk_protocol != IPPROTO_UDP)
		err =  -EPROTONOSUPPORT;
	else if (so->state > SS_UNCONNECTED)
		err = -EISCONN;
	else {
		/* Pin this module while the listener exists. */
		if (!try_module_get(THIS_MODULE))
			err = -ENOENT;
		else
			svsk = svc_setup_socket(serv, so, &err,
						SVC_SOCK_DEFAULTS);
		if (svsk) {
			/* Record the socket's bound local address. */
			struct sockaddr_storage addr;
			struct sockaddr *sin = (struct sockaddr *)&addr;
			int salen;
			if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
				svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
			/* Permanent listener, not a per-connection socket. */
			clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
			spin_lock_bh(&serv->sv_lock);
			list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
			spin_unlock_bh(&serv->sv_lock);
			svc_xprt_received(&svsk->sk_xprt);
			err = 0;
		} else
			module_put(THIS_MODULE);
	}
	if (err) {
		/* Drop the reference taken by sockfd_lookup(). */
		sockfd_put(so);
		return err;
	}
	return svc_one_sock_name(svsk, name_return, len);
}
EXPORT_SYMBOL_GPL(svc_addsock);
1364
1365/*
1366 * Create socket for RPC service.
1367 */
1368static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1369					  int protocol,
1370					  struct sockaddr *sin, int len,
1371					  int flags)
1372{
1373	struct svc_sock	*svsk;
1374	struct socket	*sock;
1375	int		error;
1376	int		type;
1377	struct sockaddr_storage addr;
1378	struct sockaddr *newsin = (struct sockaddr *)&addr;
1379	int		newlen;
1380	int		family;
1381	int		val;
1382	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
1383
1384	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
1385			serv->sv_program->pg_name, protocol,
1386			__svc_print_addr(sin, buf, sizeof(buf)));
1387
1388	if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
1389		printk(KERN_WARNING "svc: only UDP and TCP "
1390				"sockets supported\n");
1391		return ERR_PTR(-EINVAL);
1392	}
1393
1394	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1395	switch (sin->sa_family) {
1396	case AF_INET6:
1397		family = PF_INET6;
1398		break;
1399	case AF_INET:
1400		family = PF_INET;
1401		break;
1402	default:
1403		return ERR_PTR(-EINVAL);
1404	}
1405
1406	error = sock_create_kern(family, type, protocol, &sock);
1407	if (error < 0)
1408		return ERR_PTR(error);
1409
1410	svc_reclassify_socket(sock);
1411
1412	/*
1413	 * If this is an PF_INET6 listener, we want to avoid
1414	 * getting requests from IPv4 remotes.  Those should
1415	 * be shunted to a PF_INET listener via rpcbind.
1416	 */
1417	val = 1;
1418	if (family == PF_INET6)
1419		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
1420					(char *)&val, sizeof(val));
1421
1422	if (type == SOCK_STREAM)
1423		sock->sk->sk_reuse = 1;		/* allow address reuse */
1424	error = kernel_bind(sock, sin, len);
1425	if (error < 0)
1426		goto bummer;
1427
1428	newlen = len;
1429	error = kernel_getsockname(sock, newsin, &newlen);
1430	if (error < 0)
1431		goto bummer;
1432
1433	if (protocol == IPPROTO_TCP) {
1434		if ((error = kernel_listen(sock, 64)) < 0)
1435			goto bummer;
1436	}
1437
1438	if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
1439		svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
1440		return (struct svc_xprt *)svsk;
1441	}
1442
1443bummer:
1444	dprintk("svc: svc_create_socket error = %d\n", -error);
1445	sock_release(sock);
1446	return ERR_PTR(error);
1447}
1448
1449/*
1450 * Detach the svc_sock from the socket so that no
1451 * more callbacks occur.
1452 */
1453static void svc_sock_detach(struct svc_xprt *xprt)
1454{
1455	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1456	struct sock *sk = svsk->sk_sk;
1457
1458	dprintk("svc: svc_sock_detach(%p)\n", svsk);
1459
1460	/* put back the old socket callbacks */
1461	sk->sk_state_change = svsk->sk_ostate;
1462	sk->sk_data_ready = svsk->sk_odata;
1463	sk->sk_write_space = svsk->sk_owspace;
1464
1465	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
1466		wake_up_interruptible(sk_sleep(sk));
1467}
1468
1469/*
1470 * Disconnect the socket, and reset the callbacks
1471 */
1472static void svc_tcp_sock_detach(struct svc_xprt *xprt)
1473{
1474	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1475
1476	dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk);
1477
1478	svc_sock_detach(xprt);
1479
1480	if (!test_bit(XPT_LISTENER, &xprt->xpt_flags))
1481		kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR);
1482}
1483
1484/*
1485 * Free the svc_sock's socket resources and the svc_sock itself.
1486 */
1487static void svc_sock_free(struct svc_xprt *xprt)
1488{
1489	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
1490	dprintk("svc: svc_sock_free(%p)\n", svsk);
1491
1492	if (svsk->sk_sock->file)
1493		sockfd_put(svsk->sk_sock);
1494	else
1495		sock_release(svsk->sk_sock);
1496	kfree(svsk);
1497}
1498
1499/*
1500 * Create a svc_xprt.
1501 *
1502 * For internal use only (e.g. nfsv4.1 backchannel).
1503 * Callers should typically use the xpo_create() method.
1504 */
1505struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot)
1506{
1507	struct svc_sock *svsk;
1508	struct svc_xprt *xprt = NULL;
1509
1510	dprintk("svc: %s\n", __func__);
1511	svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
1512	if (!svsk)
1513		goto out;
1514
1515	xprt = &svsk->sk_xprt;
1516	if (prot == IPPROTO_TCP)
1517		svc_xprt_init(&svc_tcp_class, xprt, serv);
1518	else if (prot == IPPROTO_UDP)
1519		svc_xprt_init(&svc_udp_class, xprt, serv);
1520	else
1521		BUG();
1522out:
1523	dprintk("svc: %s return %p\n", __func__, xprt);
1524	return xprt;
1525}
1526EXPORT_SYMBOL_GPL(svc_sock_create);
1527
1528/*
1529 * Destroy a svc_sock.
1530 */
1531void svc_sock_destroy(struct svc_xprt *xprt)
1532{
1533	if (xprt)
1534		kfree(container_of(xprt, struct svc_sock, sk_xprt));
1535}
1536EXPORT_SYMBOL_GPL(svc_sock_destroy);
1537