1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 *	The Regents of the University of California.
4 * Copyright (c) 2008 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Robert N. M. Watson under
9 * contract to Juniper Networks, Inc.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD$");
40
41#include "opt_ipfw.h"
42#include "opt_inet.h"
43#include "opt_inet6.h"
44#include "opt_ipsec.h"
45
46#include <sys/param.h>
47#include <sys/domain.h>
48#include <sys/eventhandler.h>
49#include <sys/jail.h>
50#include <sys/kernel.h>
51#include <sys/lock.h>
52#include <sys/malloc.h>
53#include <sys/mbuf.h>
54#include <sys/priv.h>
55#include <sys/proc.h>
56#include <sys/protosw.h>
57#include <sys/signalvar.h>
58#include <sys/socket.h>
59#include <sys/socketvar.h>
60#include <sys/sx.h>
61#include <sys/sysctl.h>
62#include <sys/syslog.h>
63#include <sys/systm.h>
64
65#include <vm/uma.h>
66
67#include <net/if.h>
68#include <net/route.h>
69
70#include <netinet/in.h>
71#include <netinet/in_pcb.h>
72#include <netinet/in_systm.h>
73#include <netinet/in_var.h>
74#include <netinet/ip.h>
75#ifdef INET6
76#include <netinet/ip6.h>
77#endif
78#include <netinet/ip_icmp.h>
79#include <netinet/icmp_var.h>
80#include <netinet/ip_var.h>
81#include <netinet/ip_options.h>
82#ifdef INET6
83#include <netinet6/ip6_var.h>
84#endif
85#include <netinet/udp.h>
86#include <netinet/udp_var.h>
87
88#ifdef IPSEC
89#include <netipsec/ipsec.h>
90#include <netipsec/esp.h>
91#endif
92
93#include <machine/in_cksum.h>
94
95#include <security/mac/mac_framework.h>
96
97/*
98 * UDP protocol implementation.
99 * Per RFC 768, August, 1980.
100 */
101
/*
 * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
 * removes the only data integrity mechanism for packets, so malformed
 * packets that would otherwise be discarded due to bad checksums are
 * accepted, which may cause problems (especially for NFS data blocks).
 */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/* When non-zero, udp_input() logs datagrams that match no pcb. */
int	udp_log_in_vain = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &udp_log_in_vain, 0, "Log all incoming UDP packets");

/*
 * When non-zero, udp_input() silently drops datagrams that match no pcb
 * instead of answering with an ICMP port unreachable.
 */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");

u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Default receive space: room for 40 1K datagrams, each accompanied by a
 * source address (sockaddr_in6 when INET6 is configured, sockaddr_in
 * otherwise).
 */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
/* UMA zone from which udp_newudpcb() allocates struct udpcb. */
static VNET_DEFINE(uma_zone_t, udpcb_zone);
#define	V_udpcb_zone			VNET(udpcb_zone)

/* Hash size handed to in_pcbinfo_init() by udp_init(); overridable. */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	128
#endif

VNET_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
    &VNET_NAME(udpstat), udpstat,
    "UDP statistics (struct udpstat, netinet/udp_var.h)");

/* Forward declarations for IPv4-only routines defined later in this file. */
#ifdef INET
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
		    struct mbuf *, struct thread *);
#endif

#ifdef IPSEC
#ifdef IPSEC_NAT_T
/* Mask of both ESP-in-UDP encapsulation flags kept in udpcb u_flags. */
#define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
#ifdef INET
static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
#endif
#endif /* IPSEC_NAT_T */
#endif /* IPSEC */
165
166static void
167udp_zone_change(void *tag)
168{
169
170	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
171	uma_zone_set_max(V_udpcb_zone, maxsockets);
172}
173
/*
 * Zone init callback passed to in_pcbinfo_init(): set up the lock of the
 * inpcb stored at mem.  The size and flags arguments are ignored.
 * Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *inp = mem;

	INP_LOCK_INIT(inp, "inp", "udpinp");
	return (0);
}
183
184void
185udp_init(void)
186{
187
188	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
189	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
190	    IPI_HASHFIELDS_2TUPLE);
191	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
192	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
193	uma_zone_set_max(V_udpcb_zone, maxsockets);
194	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
195	    EVENTHANDLER_PRI_ANY);
196}
197
198/*
199 * Kernel module interface for updating udpstat.  The argument is an index
200 * into udpstat treated as an array of u_long.  While this encodes the
201 * general layout of udpstat into the caller, it doesn't encode its location,
202 * so that future changes to add, for example, per-CPU stats support won't
203 * cause binary compatibility problems for kernel modules.
204 */
205void
206kmod_udpstat_inc(int statnum)
207{
208
209	(*((u_long *)&V_udpstat + statnum))++;
210}
211
212int
213udp_newudpcb(struct inpcb *inp)
214{
215	struct udpcb *up;
216
217	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
218	if (up == NULL)
219		return (ENOBUFS);
220	inp->inp_ppcb = up;
221	return (0);
222}
223
224void
225udp_discardcb(struct udpcb *up)
226{
227
228	uma_zfree(V_udpcb_zone, up);
229}
230
#ifdef VIMAGE
/*
 * Tear down what udp_init() created: the UDP pcbinfo first, then the
 * udpcb zone.  Only built for VIMAGE kernels.
 */
void
udp_destroy(void)
{

	in_pcbinfo_destroy(&V_udbinfo);
	uma_zdestroy(V_udpcb_zone);
}
#endif /* VIMAGE */
240
241#ifdef INET
242/*
243 * Subroutine of udp_input(), which appends the provided mbuf chain to the
244 * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
245 * contains the source address.  If the socket ends up being an IPv6 socket,
246 * udp_append() will convert to a sockaddr_in6 before passing the address
247 * into the socket code.
248 */
249static void
250udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
251    struct sockaddr_in *udp_in)
252{
253	struct sockaddr *append_sa;
254	struct socket *so;
255	struct mbuf *opts = 0;
256#ifdef INET6
257	struct sockaddr_in6 udp_in6;
258#endif
259	struct udpcb *up;
260
261	INP_LOCK_ASSERT(inp);
262
263	/*
264	 * Engage the tunneling protocol.
265	 */
266	up = intoudpcb(inp);
267	if (up->u_tun_func != NULL) {
268		(*up->u_tun_func)(n, off, inp);
269		return;
270	}
271
272	if (n == NULL)
273		return;
274
275	off += sizeof(struct udphdr);
276
277#ifdef IPSEC
278	/* Check AH/ESP integrity. */
279	if (ipsec4_in_reject(n, inp)) {
280		m_freem(n);
281		IPSECSTAT_INC(in_polvio);
282		return;
283	}
284#ifdef IPSEC_NAT_T
285	up = intoudpcb(inp);
286	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
287	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
288		n = udp4_espdecap(inp, n, off);
289		if (n == NULL)				/* Consumed. */
290			return;
291	}
292#endif /* IPSEC_NAT_T */
293#endif /* IPSEC */
294#ifdef MAC
295	if (mac_inpcb_check_deliver(inp, n) != 0) {
296		m_freem(n);
297		return;
298	}
299#endif /* MAC */
300	if (inp->inp_flags & INP_CONTROLOPTS ||
301	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
302#ifdef INET6
303		if (inp->inp_vflag & INP_IPV6)
304			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
305		else
306#endif /* INET6 */
307			ip_savecontrol(inp, &opts, ip, n);
308	}
309#ifdef INET6
310	if (inp->inp_vflag & INP_IPV6) {
311		bzero(&udp_in6, sizeof(udp_in6));
312		udp_in6.sin6_len = sizeof(udp_in6);
313		udp_in6.sin6_family = AF_INET6;
314		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
315		append_sa = (struct sockaddr *)&udp_in6;
316	} else
317#endif /* INET6 */
318		append_sa = (struct sockaddr *)udp_in;
319	m_adj(n, off);
320
321	so = inp->inp_socket;
322	SOCKBUF_LOCK(&so->so_rcv);
323	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
324		SOCKBUF_UNLOCK(&so->so_rcv);
325		m_freem(n);
326		if (opts)
327			m_freem(opts);
328		UDPSTAT_INC(udps_fullsock);
329	} else
330		sorwakeup_locked(so);
331}
332
333void
334udp_input(struct mbuf *m, int off)
335{
336	int iphlen = off;
337	struct ip *ip;
338	struct udphdr *uh;
339	struct ifnet *ifp;
340	struct inpcb *inp;
341	int len;
342	struct ip save_ip;
343	struct sockaddr_in udp_in;
344	struct m_tag *fwd_tag;
345
346	ifp = m->m_pkthdr.rcvif;
347	UDPSTAT_INC(udps_ipackets);
348
349	/*
350	 * Strip IP options, if any; should skip this, make available to
351	 * user, and use on returned packets, but we don't yet have a way to
352	 * check the checksum with options still present.
353	 */
354	if (iphlen > sizeof (struct ip)) {
355		ip_stripoptions(m, (struct mbuf *)0);
356		iphlen = sizeof(struct ip);
357	}
358
359	/*
360	 * Get IP and UDP header together in first mbuf.
361	 */
362	ip = mtod(m, struct ip *);
363	if (m->m_len < iphlen + sizeof(struct udphdr)) {
364		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
365			UDPSTAT_INC(udps_hdrops);
366			return;
367		}
368		ip = mtod(m, struct ip *);
369	}
370	uh = (struct udphdr *)((caddr_t)ip + iphlen);
371
372	/*
373	 * Destination port of 0 is illegal, based on RFC768.
374	 */
375	if (uh->uh_dport == 0)
376		goto badunlocked;
377
378	/*
379	 * Construct sockaddr format source address.  Stuff source address
380	 * and datagram in user buffer.
381	 */
382	bzero(&udp_in, sizeof(udp_in));
383	udp_in.sin_len = sizeof(udp_in);
384	udp_in.sin_family = AF_INET;
385	udp_in.sin_port = uh->uh_sport;
386	udp_in.sin_addr = ip->ip_src;
387
388	/*
389	 * Make mbuf data length reflect UDP length.  If not enough data to
390	 * reflect UDP length, drop.
391	 */
392	len = ntohs((u_short)uh->uh_ulen);
393	if (ip->ip_len != len) {
394		if (len > ip->ip_len || len < sizeof(struct udphdr)) {
395			UDPSTAT_INC(udps_badlen);
396			goto badunlocked;
397		}
398		m_adj(m, len - ip->ip_len);
399		/* ip->ip_len = len; */
400	}
401
402	/*
403	 * Save a copy of the IP header in case we want restore it for
404	 * sending an ICMP error message in response.
405	 */
406	if (!V_udp_blackhole)
407		save_ip = *ip;
408	else
409		memset(&save_ip, 0, sizeof(save_ip));
410
411	/*
412	 * Checksum extended UDP header and data.
413	 */
414	if (uh->uh_sum) {
415		u_short uh_sum;
416
417		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
418			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
419				uh_sum = m->m_pkthdr.csum_data;
420			else
421				uh_sum = in_pseudo(ip->ip_src.s_addr,
422				    ip->ip_dst.s_addr, htonl((u_short)len +
423				    m->m_pkthdr.csum_data + IPPROTO_UDP));
424			uh_sum ^= 0xffff;
425		} else {
426			char b[9];
427
428			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
429			bzero(((struct ipovly *)ip)->ih_x1, 9);
430			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
431			uh_sum = in_cksum(m, len + sizeof (struct ip));
432			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
433		}
434		if (uh_sum) {
435			UDPSTAT_INC(udps_badsum);
436			m_freem(m);
437			return;
438		}
439	} else
440		UDPSTAT_INC(udps_nosum);
441
442	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
443	    in_broadcast(ip->ip_dst, ifp)) {
444		struct inpcb *last;
445		struct ip_moptions *imo;
446
447		INP_INFO_RLOCK(&V_udbinfo);
448		last = NULL;
449		LIST_FOREACH(inp, &V_udb, inp_list) {
450			if (inp->inp_lport != uh->uh_dport)
451				continue;
452#ifdef INET6
453			if ((inp->inp_vflag & INP_IPV4) == 0)
454				continue;
455#endif
456			if (inp->inp_laddr.s_addr != INADDR_ANY &&
457			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
458				continue;
459			if (inp->inp_faddr.s_addr != INADDR_ANY &&
460			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
461				continue;
462			if (inp->inp_fport != 0 &&
463			    inp->inp_fport != uh->uh_sport)
464				continue;
465
466			INP_RLOCK(inp);
467
468			/*
469			 * XXXRW: Because we weren't holding either the inpcb
470			 * or the hash lock when we checked for a match
471			 * before, we should probably recheck now that the
472			 * inpcb lock is held.
473			 */
474
475			/*
476			 * Handle socket delivery policy for any-source
477			 * and source-specific multicast. [RFC3678]
478			 */
479			imo = inp->inp_moptions;
480			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
481				struct sockaddr_in	 group;
482				int			 blocked;
483				if (imo == NULL) {
484					INP_RUNLOCK(inp);
485					continue;
486				}
487				bzero(&group, sizeof(struct sockaddr_in));
488				group.sin_len = sizeof(struct sockaddr_in);
489				group.sin_family = AF_INET;
490				group.sin_addr = ip->ip_dst;
491
492				blocked = imo_multi_filter(imo, ifp,
493					(struct sockaddr *)&group,
494					(struct sockaddr *)&udp_in);
495				if (blocked != MCAST_PASS) {
496					if (blocked == MCAST_NOTGMEMBER)
497						IPSTAT_INC(ips_notmember);
498					if (blocked == MCAST_NOTSMEMBER ||
499					    blocked == MCAST_MUTED)
500						UDPSTAT_INC(udps_filtermcast);
501					INP_RUNLOCK(inp);
502					continue;
503				}
504			}
505			if (last != NULL) {
506				struct mbuf *n;
507
508				n = m_copy(m, 0, M_COPYALL);
509				udp_append(last, ip, n, iphlen, &udp_in);
510				INP_RUNLOCK(last);
511			}
512			last = inp;
513			/*
514			 * Don't look for additional matches if this one does
515			 * not have either the SO_REUSEPORT or SO_REUSEADDR
516			 * socket options set.  This heuristic avoids
517			 * searching through all pcbs in the common case of a
518			 * non-shared port.  It assumes that an application
519			 * will never clear these options after setting them.
520			 */
521			if ((last->inp_socket->so_options &
522			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
523				break;
524		}
525
526		if (last == NULL) {
527			/*
528			 * No matching pcb found; discard datagram.  (No need
529			 * to send an ICMP Port Unreachable for a broadcast
530			 * or multicast datgram.)
531			 */
532			UDPSTAT_INC(udps_noportbcast);
533			if (inp)
534				INP_RUNLOCK(inp);
535			INP_INFO_RUNLOCK(&V_udbinfo);
536			goto badunlocked;
537		}
538		udp_append(last, ip, m, iphlen, &udp_in);
539		INP_RUNLOCK(last);
540		INP_INFO_RUNLOCK(&V_udbinfo);
541		return;
542	}
543
544	/*
545	 * Locate pcb for datagram.
546	 */
547
548	/*
549	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
550	 */
551	if ((m->m_flags & M_IP_NEXTHOP) &&
552	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
553		struct sockaddr_in *next_hop;
554
555		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
556
557		/*
558		 * Transparently forwarded. Pretend to be the destination.
559		 * Already got one like this?
560		 */
561		inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
562		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
563		if (!inp) {
564			/*
565			 * It's new.  Try to find the ambushing socket.
566			 * Because we've rewritten the destination address,
567			 * any hardware-generated hash is ignored.
568			 */
569			inp = in_pcblookup(&V_udbinfo, ip->ip_src,
570			    uh->uh_sport, next_hop->sin_addr,
571			    next_hop->sin_port ? htons(next_hop->sin_port) :
572			    uh->uh_dport, INPLOOKUP_WILDCARD |
573			    INPLOOKUP_RLOCKPCB, ifp);
574		}
575		/* Remove the tag from the packet. We don't need it anymore. */
576		m_tag_delete(m, fwd_tag);
577		m->m_flags &= ~M_IP_NEXTHOP;
578	} else
579		inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
580		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
581		    INPLOOKUP_RLOCKPCB, ifp, m);
582	if (inp == NULL) {
583		if (udp_log_in_vain) {
584			char buf[4*sizeof "123"];
585
586			strcpy(buf, inet_ntoa(ip->ip_dst));
587			log(LOG_INFO,
588			    "Connection attempt to UDP %s:%d from %s:%d\n",
589			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
590			    ntohs(uh->uh_sport));
591		}
592		UDPSTAT_INC(udps_noport);
593		if (m->m_flags & (M_BCAST | M_MCAST)) {
594			UDPSTAT_INC(udps_noportbcast);
595			goto badunlocked;
596		}
597		if (V_udp_blackhole)
598			goto badunlocked;
599		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
600			goto badunlocked;
601		*ip = save_ip;
602		ip->ip_len += iphlen;
603		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
604		return;
605	}
606
607	/*
608	 * Check the minimum TTL for socket.
609	 */
610	INP_RLOCK_ASSERT(inp);
611	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
612		INP_RUNLOCK(inp);
613		m_freem(m);
614		return;
615	}
616	udp_append(inp, ip, m, iphlen, &udp_in);
617	INP_RUNLOCK(inp);
618	return;
619
620badunlocked:
621	m_freem(m);
622}
623#endif /* INET */
624
625/*
626 * Notify a udp user of an asynchronous error; just wake up so that they can
627 * collect error status.
628 */
629struct inpcb *
630udp_notify(struct inpcb *inp, int errno)
631{
632
633	/*
634	 * While udp_ctlinput() always calls udp_notify() with a read lock
635	 * when invoking it directly, in_pcbnotifyall() currently uses write
636	 * locks due to sharing code with TCP.  For now, accept either a read
637	 * or a write lock, but a read lock is sufficient.
638	 */
639	INP_LOCK_ASSERT(inp);
640
641	inp->inp_socket->so_error = errno;
642	sorwakeup(inp->inp_socket);
643	sowwakeup(inp->inp_socket);
644	return (inp);
645}
646
#ifdef INET
/*
 * Control input for IPv4 UDP.
 *
 * cmd	PRC_* command code.
 * sa	address the notification concerns; ignored unless it is a
 *	non-wildcard AF_INET address.
 * vip	IP header of the datagram the notification refers to, or NULL.
 *
 * With an IP header available, the single matching pcb is looked up and
 * notified through udp_notify(); otherwise (including PRC_HOSTDEAD, which
 * discards the header) every pcb talking to the address is notified.
 */
void
udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct ip *ip = vip;
	struct udphdr *uh;
	struct in_addr faddr;
	struct inpcb *inp;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	/*
	 * Redirects don't need to be handled up here.
	 */
	if (PRC_IS_REDIRECT(cmd))
		return;

	/*
	 * Hostdead is ugly because it goes linearly through all PCBs.
	 *
	 * XXX: We never get this from ICMP, otherwise it makes an excellent
	 * DoS attack on machines with many connections.
	 */
	if (cmd == PRC_HOSTDEAD)
		ip = NULL;
	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;

	/* No header to match against: notify every pcb using faddr. */
	if (ip == NULL) {
		in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
		    udp_notify);
		return;
	}

	/* Find the one pcb the quoted datagram was sent from. */
	uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
	inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport,
	    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
	if (inp == NULL)
		return;

	INP_RLOCK_ASSERT(inp);
	if (inp->inp_socket != NULL)
		udp_notify(inp, inetctlerrmap[cmd]);
	INP_RUNLOCK(inp);
}
#endif /* INET */
692
693static int
694udp_pcblist(SYSCTL_HANDLER_ARGS)
695{
696	int error, i, n;
697	struct inpcb *inp, **inp_list;
698	inp_gen_t gencnt;
699	struct xinpgen xig;
700
701	/*
702	 * The process of preparing the PCB list is too time-consuming and
703	 * resource-intensive to repeat twice on every request.
704	 */
705	if (req->oldptr == 0) {
706		n = V_udbinfo.ipi_count;
707		n += imax(n / 8, 10);
708		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
709		return (0);
710	}
711
712	if (req->newptr != 0)
713		return (EPERM);
714
715	/*
716	 * OK, now we're committed to doing something.
717	 */
718	INP_INFO_RLOCK(&V_udbinfo);
719	gencnt = V_udbinfo.ipi_gencnt;
720	n = V_udbinfo.ipi_count;
721	INP_INFO_RUNLOCK(&V_udbinfo);
722
723	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
724		+ n * sizeof(struct xinpcb));
725	if (error != 0)
726		return (error);
727
728	xig.xig_len = sizeof xig;
729	xig.xig_count = n;
730	xig.xig_gen = gencnt;
731	xig.xig_sogen = so_gencnt;
732	error = SYSCTL_OUT(req, &xig, sizeof xig);
733	if (error)
734		return (error);
735
736	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
737	if (inp_list == 0)
738		return (ENOMEM);
739
740	INP_INFO_RLOCK(&V_udbinfo);
741	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
742	     inp = LIST_NEXT(inp, inp_list)) {
743		INP_WLOCK(inp);
744		if (inp->inp_gencnt <= gencnt &&
745		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
746			in_pcbref(inp);
747			inp_list[i++] = inp;
748		}
749		INP_WUNLOCK(inp);
750	}
751	INP_INFO_RUNLOCK(&V_udbinfo);
752	n = i;
753
754	error = 0;
755	for (i = 0; i < n; i++) {
756		inp = inp_list[i];
757		INP_RLOCK(inp);
758		if (inp->inp_gencnt <= gencnt) {
759			struct xinpcb xi;
760
761			bzero(&xi, sizeof(xi));
762			xi.xi_len = sizeof xi;
763			/* XXX should avoid extra copy */
764			bcopy(inp, &xi.xi_inp, sizeof *inp);
765			if (inp->inp_socket)
766				sotoxsocket(inp->inp_socket, &xi.xi_socket);
767			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
768			INP_RUNLOCK(inp);
769			error = SYSCTL_OUT(req, &xi, sizeof xi);
770		} else
771			INP_RUNLOCK(inp);
772	}
773	INP_INFO_WLOCK(&V_udbinfo);
774	for (i = 0; i < n; i++) {
775		inp = inp_list[i];
776		INP_RLOCK(inp);
777		if (!in_pcbrele_rlocked(inp))
778			INP_RUNLOCK(inp);
779	}
780	INP_INFO_WUNLOCK(&V_udbinfo);
781
782	if (!error) {
783		/*
784		 * Give the user an updated idea of our state.  If the
785		 * generation differs from what we told her before, she knows
786		 * that something happened while we were processing this
787		 * request, and it might be necessary to retry.
788		 */
789		INP_INFO_RLOCK(&V_udbinfo);
790		xig.xig_gen = V_udbinfo.ipi_gencnt;
791		xig.xig_sogen = so_gencnt;
792		xig.xig_count = V_udbinfo.ipi_count;
793		INP_INFO_RUNLOCK(&V_udbinfo);
794		error = SYSCTL_OUT(req, &xig, sizeof xig);
795	}
796	free(inp_list, M_TEMP);
797	return (error);
798}
799
800SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
801    CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
802    udp_pcblist, "S,xinpcb", "List of active UDP sockets");
803
#ifdef INET
/*
 * Sysctl handler returning the credentials (struct xucred) of the pcb that
 * owns a UDP association.  The caller writes two sockaddr_in structures:
 * addrs[0] is used as the local and addrs[1] as the foreign endpoint of
 * the lookup.
 *
 * Requires PRIV_NETINET_GETCRED.  Returns ENOENT when no pcb matches or
 * the pcb has no socket, or whatever error the privilege check, copy-in,
 * visibility check or copy-out reports.
 */
static int
udp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error;

	error = priv_check(req->td, PRIV_NETINET_GETCRED);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);

	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port,
	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
	if (inp == NULL)
		return (ENOENT);

	/* Fill in xuc only while the pcb is still locked. */
	INP_RLOCK_ASSERT(inp);
	if (inp->inp_socket == NULL)
		error = ENOENT;
	else
		error = cr_canseeinpcb(req->td->td_ucred, inp);
	if (error == 0)
		cru2x(inp->inp_cred, &xuc);
	INP_RUNLOCK(inp);

	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
#endif /* INET */
842
/*
 * Socket option handler for UDP sockets.
 *
 * Options at any level other than IPPROTO_UDP are passed on to
 * ip6_ctloutput() or ip_ctloutput().  At the UDP level only UDP_ENCAP is
 * recognized; any other option name yields ENOPROTOOPT.
 *
 * Returns 0 on success or an errno value.  The inpcb write lock taken on
 * entry is released on every path handled below before returning.
 */
int
udp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error = 0, optval;
	struct inpcb *inp;
#ifdef IPSEC_NAT_T
	struct udpcb *up;
#endif

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	INP_WLOCK(inp);
	if (sopt->sopt_level != IPPROTO_UDP) {
		/*
		 * Not ours: drop the lock and defer to the IP layer.  The
		 * preprocessor blocks below assemble an if/else whose shape
		 * depends on which of INET and INET6 are configured.
		 */
#ifdef INET6
		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
			INP_WUNLOCK(inp);
			error = ip6_ctloutput(so, sopt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			INP_WUNLOCK(inp);
			error = ip_ctloutput(so, sopt);
		}
#endif
		return (error);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case UDP_ENCAP:
			/*
			 * The lock is dropped around the copy-in and retaken
			 * afterwards, so the pcb is fetched again.
			 */
			INP_WUNLOCK(inp);
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				break;
			inp = sotoinpcb(so);
			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
			INP_WLOCK(inp);
#ifdef IPSEC_NAT_T
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
#endif
			switch (optval) {
			case 0:
				/* Clear all UDP encap. */
#ifdef IPSEC_NAT_T
				up->u_flags &= ~UF_ESPINUDP_ALL;
#endif
				break;
#ifdef IPSEC_NAT_T
			case UDP_ENCAP_ESPINUDP:
			case UDP_ENCAP_ESPINUDP_NON_IKE:
				/* Exactly one encapsulation flag is left set. */
				up->u_flags &= ~UF_ESPINUDP_ALL;
				if (optval == UDP_ENCAP_ESPINUDP)
					up->u_flags |= UF_ESPINUDP;
				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
					up->u_flags |= UF_ESPINUDP_NON_IKE;
				break;
#endif
			default:
				error = EINVAL;
				break;
			}
			INP_WUNLOCK(inp);
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
#ifdef IPSEC_NAT_T
		case UDP_ENCAP:
			/* Report the raw encapsulation flag bits. */
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			optval = up->u_flags & UF_ESPINUDP_ALL;
			INP_WUNLOCK(inp);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
#endif
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
939
940#ifdef INET
941#define	UH_WLOCKED	2
942#define	UH_RLOCKED	1
943#define	UH_UNLOCKED	0
944static int
945udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
946    struct mbuf *control, struct thread *td)
947{
948	struct udpiphdr *ui;
949	int len = m->m_pkthdr.len;
950	struct in_addr faddr, laddr;
951	struct cmsghdr *cm;
952	struct sockaddr_in *sin, src;
953	int error = 0;
954	int ipflags;
955	u_short fport, lport;
956	int unlock_udbinfo;
957	u_char tos;
958
959	/*
960	 * udp_output() may need to temporarily bind or connect the current
961	 * inpcb.  As such, we don't know up front whether we will need the
962	 * pcbinfo lock or not.  Do any work to decide what is needed up
963	 * front before acquiring any locks.
964	 */
965	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
966		if (control)
967			m_freem(control);
968		m_freem(m);
969		return (EMSGSIZE);
970	}
971
972	src.sin_family = 0;
973	INP_RLOCK(inp);
974	tos = inp->inp_ip_tos;
975	if (control != NULL) {
976		/*
977		 * XXX: Currently, we assume all the optional information is
978		 * stored in a single mbuf.
979		 */
980		if (control->m_next) {
981			INP_RUNLOCK(inp);
982			m_freem(control);
983			m_freem(m);
984			return (EINVAL);
985		}
986		for (; control->m_len > 0;
987		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
988		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
989			cm = mtod(control, struct cmsghdr *);
990			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
991			    || cm->cmsg_len > control->m_len) {
992				error = EINVAL;
993				break;
994			}
995			if (cm->cmsg_level != IPPROTO_IP)
996				continue;
997
998			switch (cm->cmsg_type) {
999			case IP_SENDSRCADDR:
1000				if (cm->cmsg_len !=
1001				    CMSG_LEN(sizeof(struct in_addr))) {
1002					error = EINVAL;
1003					break;
1004				}
1005				bzero(&src, sizeof(src));
1006				src.sin_family = AF_INET;
1007				src.sin_len = sizeof(src);
1008				src.sin_port = inp->inp_lport;
1009				src.sin_addr =
1010				    *(struct in_addr *)CMSG_DATA(cm);
1011				break;
1012
1013			case IP_TOS:
1014				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1015					error = EINVAL;
1016					break;
1017				}
1018				tos = *(u_char *)CMSG_DATA(cm);
1019				break;
1020
1021			default:
1022				error = ENOPROTOOPT;
1023				break;
1024			}
1025			if (error)
1026				break;
1027		}
1028		m_freem(control);
1029	}
1030	if (error) {
1031		INP_RUNLOCK(inp);
1032		m_freem(m);
1033		return (error);
1034	}
1035
1036	/*
1037	 * Depending on whether or not the application has bound or connected
1038	 * the socket, we may have to do varying levels of work.  The optimal
1039	 * case is for a connected UDP socket, as a global lock isn't
1040	 * required at all.
1041	 *
1042	 * In order to decide which we need, we require stability of the
1043	 * inpcb binding, which we ensure by acquiring a read lock on the
1044	 * inpcb.  This doesn't strictly follow the lock order, so we play
1045	 * the trylock and retry game; note that we may end up with more
1046	 * conservative locks than required the second time around, so later
1047	 * assertions have to accept that.  Further analysis of the number of
1048	 * misses under contention is required.
1049	 *
1050	 * XXXRW: Check that hash locking update here is correct.
1051	 */
1052	sin = (struct sockaddr_in *)addr;
1053	if (sin != NULL &&
1054	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
1055		INP_RUNLOCK(inp);
1056		INP_WLOCK(inp);
1057		INP_HASH_WLOCK(&V_udbinfo);
1058		unlock_udbinfo = UH_WLOCKED;
1059	} else if ((sin != NULL && (
1060	    (sin->sin_addr.s_addr == INADDR_ANY) ||
1061	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
1062	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
1063	    (inp->inp_lport == 0))) ||
1064	    (src.sin_family == AF_INET)) {
1065		INP_HASH_RLOCK(&V_udbinfo);
1066		unlock_udbinfo = UH_RLOCKED;
1067	} else
1068		unlock_udbinfo = UH_UNLOCKED;
1069
1070	/*
1071	 * If the IP_SENDSRCADDR control message was specified, override the
1072	 * source address for this datagram.  Its use is invalidated if the
1073	 * address thus specified is incomplete or clobbers other inpcbs.
1074	 */
1075	laddr = inp->inp_laddr;
1076	lport = inp->inp_lport;
1077	if (src.sin_family == AF_INET) {
1078		INP_HASH_LOCK_ASSERT(&V_udbinfo);
1079		if ((lport == 0) ||
1080		    (laddr.s_addr == INADDR_ANY &&
1081		     src.sin_addr.s_addr == INADDR_ANY)) {
1082			error = EINVAL;
1083			goto release;
1084		}
1085		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1086		    &laddr.s_addr, &lport, td->td_ucred);
1087		if (error)
1088			goto release;
1089	}
1090
1091	/*
1092	 * If a UDP socket has been connected, then a local address/port will
1093	 * have been selected and bound.
1094	 *
1095	 * If a UDP socket has not been connected to, then an explicit
1096	 * destination address must be used, in which case a local
1097	 * address/port may not have been selected and bound.
1098	 */
1099	if (sin != NULL) {
1100		INP_LOCK_ASSERT(inp);
1101		if (inp->inp_faddr.s_addr != INADDR_ANY) {
1102			error = EISCONN;
1103			goto release;
1104		}
1105
1106		/*
1107		 * Jail may rewrite the destination address, so let it do
1108		 * that before we use it.
1109		 */
1110		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1111		if (error)
1112			goto release;
1113
1114		/*
1115		 * If a local address or port hasn't yet been selected, or if
1116		 * the destination address needs to be rewritten due to using
1117		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
1118		 * to do the heavy lifting.  Once a port is selected, we
1119		 * commit the binding back to the socket; we also commit the
1120		 * binding of the address if in jail.
1121		 *
1122		 * If we already have a valid binding and we're not
1123		 * requesting a destination address rewrite, use a fast path.
1124		 */
1125		if (inp->inp_laddr.s_addr == INADDR_ANY ||
1126		    inp->inp_lport == 0 ||
1127		    sin->sin_addr.s_addr == INADDR_ANY ||
1128		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
1129			INP_HASH_LOCK_ASSERT(&V_udbinfo);
1130			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1131			    &lport, &faddr.s_addr, &fport, NULL,
1132			    td->td_ucred);
1133			if (error)
1134				goto release;
1135
1136			/*
1137			 * XXXRW: Why not commit the port if the address is
1138			 * !INADDR_ANY?
1139			 */
1140			/* Commit the local port if newly assigned. */
1141			if (inp->inp_laddr.s_addr == INADDR_ANY &&
1142			    inp->inp_lport == 0) {
1143				INP_WLOCK_ASSERT(inp);
1144				INP_HASH_WLOCK_ASSERT(&V_udbinfo);
1145				/*
1146				 * Remember addr if jailed, to prevent
1147				 * rebinding.
1148				 */
1149				if (prison_flag(td->td_ucred, PR_IP4))
1150					inp->inp_laddr = laddr;
1151				inp->inp_lport = lport;
1152				if (in_pcbinshash(inp) != 0) {
1153					inp->inp_lport = 0;
1154					error = EAGAIN;
1155					goto release;
1156				}
1157				inp->inp_flags |= INP_ANONPORT;
1158			}
1159		} else {
1160			faddr = sin->sin_addr;
1161			fport = sin->sin_port;
1162		}
1163	} else {
1164		INP_LOCK_ASSERT(inp);
1165		faddr = inp->inp_faddr;
1166		fport = inp->inp_fport;
1167		if (faddr.s_addr == INADDR_ANY) {
1168			error = ENOTCONN;
1169			goto release;
1170		}
1171	}
1172
1173	/*
1174	 * Calculate data length and get a mbuf for UDP, IP, and possible
1175	 * link-layer headers.  Immediate slide the data pointer back forward
1176	 * since we won't use that space at this layer.
1177	 */
1178	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
1179	if (m == NULL) {
1180		error = ENOBUFS;
1181		goto release;
1182	}
1183	m->m_data += max_linkhdr;
1184	m->m_len -= max_linkhdr;
1185	m->m_pkthdr.len -= max_linkhdr;
1186
1187	/*
1188	 * Fill in mbuf with extended UDP header and addresses and length put
1189	 * into network format.
1190	 */
1191	ui = mtod(m, struct udpiphdr *);
1192	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
1193	ui->ui_pr = IPPROTO_UDP;
1194	ui->ui_src = laddr;
1195	ui->ui_dst = faddr;
1196	ui->ui_sport = lport;
1197	ui->ui_dport = fport;
1198	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1199
1200	/*
1201	 * Set the Don't Fragment bit in the IP header.
1202	 */
1203	if (inp->inp_flags & INP_DONTFRAG) {
1204		struct ip *ip;
1205
1206		ip = (struct ip *)&ui->ui_i;
1207		ip->ip_off |= IP_DF;
1208	}
1209
1210	ipflags = 0;
1211	if (inp->inp_socket->so_options & SO_DONTROUTE)
1212		ipflags |= IP_ROUTETOIF;
1213	if (inp->inp_socket->so_options & SO_BROADCAST)
1214		ipflags |= IP_ALLOWBROADCAST;
1215	if (inp->inp_flags & INP_ONESBCAST)
1216		ipflags |= IP_SENDONES;
1217
1218#ifdef MAC
1219	mac_inpcb_create_mbuf(inp, m);
1220#endif
1221
1222	/*
1223	 * Set up checksum and output datagram.
1224	 */
1225	if (V_udp_cksum) {
1226		if (inp->inp_flags & INP_ONESBCAST)
1227			faddr.s_addr = INADDR_BROADCAST;
1228		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1229		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
1230		m->m_pkthdr.csum_flags = CSUM_UDP;
1231		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1232	} else
1233		ui->ui_sum = 0;
1234	((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
1235	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
1236	((struct ip *)ui)->ip_tos = tos;		/* XXX */
1237	UDPSTAT_INC(udps_opackets);
1238
1239	if (unlock_udbinfo == UH_WLOCKED)
1240		INP_HASH_WUNLOCK(&V_udbinfo);
1241	else if (unlock_udbinfo == UH_RLOCKED)
1242		INP_HASH_RUNLOCK(&V_udbinfo);
1243	error = ip_output(m, inp->inp_options, NULL, ipflags,
1244	    inp->inp_moptions, inp);
1245	if (unlock_udbinfo == UH_WLOCKED)
1246		INP_WUNLOCK(inp);
1247	else
1248		INP_RUNLOCK(inp);
1249	return (error);
1250
1251release:
1252	if (unlock_udbinfo == UH_WLOCKED) {
1253		INP_HASH_WUNLOCK(&V_udbinfo);
1254		INP_WUNLOCK(inp);
1255	} else if (unlock_udbinfo == UH_RLOCKED) {
1256		INP_HASH_RUNLOCK(&V_udbinfo);
1257		INP_RUNLOCK(inp);
1258	} else
1259		INP_RUNLOCK(inp);
1260	m_freem(m);
1261	return (error);
1262}
1263
1264
1265#if defined(IPSEC) && defined(IPSEC_NAT_T)
1266/*
1267 * Potentially decap ESP in UDP frame.  Check for an ESP header
1268 * and optional marker; if present, strip the UDP header and
1269 * push the result through IPSec.
1270 *
1271 * Returns mbuf to be processed (potentially re-allocated) or
1272 * NULL if consumed and/or processed.
1273 */
1274static struct mbuf *
1275udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
1276{
1277	size_t minlen, payload, skip, iphlen;
1278	caddr_t data;
1279	struct udpcb *up;
1280	struct m_tag *tag;
1281	struct udphdr *udphdr;
1282	struct ip *ip;
1283
1284	INP_RLOCK_ASSERT(inp);
1285
1286	/*
1287	 * Pull up data so the longest case is contiguous:
1288	 *    IP/UDP hdr + non ESP marker + ESP hdr.
1289	 */
1290	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
1291	if (minlen > m->m_pkthdr.len)
1292		minlen = m->m_pkthdr.len;
1293	if ((m = m_pullup(m, minlen)) == NULL) {
1294		IPSECSTAT_INC(in_inval);
1295		return (NULL);		/* Bypass caller processing. */
1296	}
1297	data = mtod(m, caddr_t);	/* Points to ip header. */
1298	payload = m->m_len - off;	/* Size of payload. */
1299
1300	if (payload == 1 && data[off] == '\xff')
1301		return (m);		/* NB: keepalive packet, no decap. */
1302
1303	up = intoudpcb(inp);
1304	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
1305	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
1306	    ("u_flags 0x%x", up->u_flags));
1307
1308	/*
1309	 * Check that the payload is large enough to hold an
1310	 * ESP header and compute the amount of data to remove.
1311	 *
1312	 * NB: the caller has already done a pullup for us.
1313	 * XXX can we assume alignment and eliminate bcopys?
1314	 */
1315	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
1316		/*
1317		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
1318		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
1319		 * possible AH mode non-IKE marker+non-ESP marker
1320		 * from draft-ietf-ipsec-udp-encaps-00.txt.
1321		 */
1322		uint64_t marker;
1323
1324		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
1325			return (m);	/* NB: no decap. */
1326		bcopy(data + off, &marker, sizeof(uint64_t));
1327		if (marker != 0)	/* Non-IKE marker. */
1328			return (m);	/* NB: no decap. */
1329		skip = sizeof(uint64_t) + sizeof(struct udphdr);
1330	} else {
1331		uint32_t spi;
1332
1333		if (payload <= sizeof(struct esp)) {
1334			IPSECSTAT_INC(in_inval);
1335			m_freem(m);
1336			return (NULL);	/* Discard. */
1337		}
1338		bcopy(data + off, &spi, sizeof(uint32_t));
1339		if (spi == 0)		/* Non-ESP marker. */
1340			return (m);	/* NB: no decap. */
1341		skip = sizeof(struct udphdr);
1342	}
1343
1344	/*
1345	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember
1346	 * the UDP ports. This is required if we want to select
1347	 * the right SPD for multiple hosts behind same NAT.
1348	 *
1349	 * NB: ports are maintained in network byte order everywhere
1350	 *     in the NAT-T code.
1351	 */
1352	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
1353		2 * sizeof(uint16_t), M_NOWAIT);
1354	if (tag == NULL) {
1355		IPSECSTAT_INC(in_nomem);
1356		m_freem(m);
1357		return (NULL);		/* Discard. */
1358	}
1359	iphlen = off - sizeof(struct udphdr);
1360	udphdr = (struct udphdr *)(data + iphlen);
1361	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
1362	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
1363	m_tag_prepend(m, tag);
1364
1365	/*
1366	 * Remove the UDP header (and possibly the non ESP marker)
1367	 * IP header length is iphlen
1368	 * Before:
1369	 *   <--- off --->
1370	 *   +----+------+-----+
1371	 *   | IP |  UDP | ESP |
1372	 *   +----+------+-----+
1373	 *        <-skip->
1374	 * After:
1375	 *          +----+-----+
1376	 *          | IP | ESP |
1377	 *          +----+-----+
1378	 *   <-skip->
1379	 */
1380	ovbcopy(data, data + skip, iphlen);
1381	m_adj(m, skip);
1382
1383	ip = mtod(m, struct ip *);
1384	ip->ip_len -= skip;
1385	ip->ip_p = IPPROTO_ESP;
1386
1387	/*
1388	 * We cannot yet update the cksums so clear any
1389	 * h/w cksum flags as they are no longer valid.
1390	 */
1391	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
1392		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1393
1394	(void) ipsec4_common_input(m, iphlen, ip->ip_p);
1395	return (NULL);			/* NB: consumed, bypass processing. */
1396}
1397#endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
1398
1399static void
1400udp_abort(struct socket *so)
1401{
1402	struct inpcb *inp;
1403
1404	inp = sotoinpcb(so);
1405	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1406	INP_WLOCK(inp);
1407	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1408		INP_HASH_WLOCK(&V_udbinfo);
1409		in_pcbdisconnect(inp);
1410		inp->inp_laddr.s_addr = INADDR_ANY;
1411		INP_HASH_WUNLOCK(&V_udbinfo);
1412		soisdisconnected(so);
1413	}
1414	INP_WUNLOCK(inp);
1415}
1416
1417static int
1418udp_attach(struct socket *so, int proto, struct thread *td)
1419{
1420	struct inpcb *inp;
1421	int error;
1422
1423	inp = sotoinpcb(so);
1424	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1425	error = soreserve(so, udp_sendspace, udp_recvspace);
1426	if (error)
1427		return (error);
1428	INP_INFO_WLOCK(&V_udbinfo);
1429	error = in_pcballoc(so, &V_udbinfo);
1430	if (error) {
1431		INP_INFO_WUNLOCK(&V_udbinfo);
1432		return (error);
1433	}
1434
1435	inp = sotoinpcb(so);
1436	inp->inp_vflag |= INP_IPV4;
1437	inp->inp_ip_ttl = V_ip_defttl;
1438
1439	error = udp_newudpcb(inp);
1440	if (error) {
1441		in_pcbdetach(inp);
1442		in_pcbfree(inp);
1443		INP_INFO_WUNLOCK(&V_udbinfo);
1444		return (error);
1445	}
1446
1447	INP_WUNLOCK(inp);
1448	INP_INFO_WUNLOCK(&V_udbinfo);
1449	return (0);
1450}
1451#endif /* INET */
1452
1453int
1454udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
1455{
1456	struct inpcb *inp;
1457	struct udpcb *up;
1458
1459	KASSERT(so->so_type == SOCK_DGRAM,
1460	    ("udp_set_kernel_tunneling: !dgram"));
1461	inp = sotoinpcb(so);
1462	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1463	INP_WLOCK(inp);
1464	up = intoudpcb(inp);
1465	if (up->u_tun_func != NULL) {
1466		INP_WUNLOCK(inp);
1467		return (EBUSY);
1468	}
1469	up->u_tun_func = f;
1470	INP_WUNLOCK(inp);
1471	return (0);
1472}
1473
1474#ifdef INET
1475static int
1476udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1477{
1478	struct inpcb *inp;
1479	int error;
1480
1481	inp = sotoinpcb(so);
1482	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1483	INP_WLOCK(inp);
1484	INP_HASH_WLOCK(&V_udbinfo);
1485	error = in_pcbbind(inp, nam, td->td_ucred);
1486	INP_HASH_WUNLOCK(&V_udbinfo);
1487	INP_WUNLOCK(inp);
1488	return (error);
1489}
1490
1491static void
1492udp_close(struct socket *so)
1493{
1494	struct inpcb *inp;
1495
1496	inp = sotoinpcb(so);
1497	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1498	INP_WLOCK(inp);
1499	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1500		INP_HASH_WLOCK(&V_udbinfo);
1501		in_pcbdisconnect(inp);
1502		inp->inp_laddr.s_addr = INADDR_ANY;
1503		INP_HASH_WUNLOCK(&V_udbinfo);
1504		soisdisconnected(so);
1505	}
1506	INP_WUNLOCK(inp);
1507}
1508
1509static int
1510udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1511{
1512	struct inpcb *inp;
1513	int error;
1514	struct sockaddr_in *sin;
1515
1516	inp = sotoinpcb(so);
1517	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1518	INP_WLOCK(inp);
1519	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1520		INP_WUNLOCK(inp);
1521		return (EISCONN);
1522	}
1523	sin = (struct sockaddr_in *)nam;
1524	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1525	if (error != 0) {
1526		INP_WUNLOCK(inp);
1527		return (error);
1528	}
1529	INP_HASH_WLOCK(&V_udbinfo);
1530	error = in_pcbconnect(inp, nam, td->td_ucred);
1531	INP_HASH_WUNLOCK(&V_udbinfo);
1532	if (error == 0)
1533		soisconnected(so);
1534	INP_WUNLOCK(inp);
1535	return (error);
1536}
1537
1538static void
1539udp_detach(struct socket *so)
1540{
1541	struct inpcb *inp;
1542	struct udpcb *up;
1543
1544	inp = sotoinpcb(so);
1545	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1546	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1547	    ("udp_detach: not disconnected"));
1548	INP_INFO_WLOCK(&V_udbinfo);
1549	INP_WLOCK(inp);
1550	up = intoudpcb(inp);
1551	KASSERT(up != NULL, ("%s: up == NULL", __func__));
1552	inp->inp_ppcb = NULL;
1553	in_pcbdetach(inp);
1554	in_pcbfree(inp);
1555	INP_INFO_WUNLOCK(&V_udbinfo);
1556	udp_discardcb(up);
1557}
1558
1559static int
1560udp_disconnect(struct socket *so)
1561{
1562	struct inpcb *inp;
1563
1564	inp = sotoinpcb(so);
1565	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1566	INP_WLOCK(inp);
1567	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1568		INP_WUNLOCK(inp);
1569		return (ENOTCONN);
1570	}
1571	INP_HASH_WLOCK(&V_udbinfo);
1572	in_pcbdisconnect(inp);
1573	inp->inp_laddr.s_addr = INADDR_ANY;
1574	INP_HASH_WUNLOCK(&V_udbinfo);
1575	SOCK_LOCK(so);
1576	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1577	SOCK_UNLOCK(so);
1578	INP_WUNLOCK(inp);
1579	return (0);
1580}
1581
1582static int
1583udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1584    struct mbuf *control, struct thread *td)
1585{
1586	struct inpcb *inp;
1587
1588	inp = sotoinpcb(so);
1589	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1590	return (udp_output(inp, m, addr, control, td));
1591}
1592#endif /* INET */
1593
1594int
1595udp_shutdown(struct socket *so)
1596{
1597	struct inpcb *inp;
1598
1599	inp = sotoinpcb(so);
1600	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1601	INP_WLOCK(inp);
1602	socantsendmore(so);
1603	INP_WUNLOCK(inp);
1604	return (0);
1605}
1606
1607#ifdef INET
1608struct pr_usrreqs udp_usrreqs = {
1609	.pru_abort =		udp_abort,
1610	.pru_attach =		udp_attach,
1611	.pru_bind =		udp_bind,
1612	.pru_connect =		udp_connect,
1613	.pru_control =		in_control,
1614	.pru_detach =		udp_detach,
1615	.pru_disconnect =	udp_disconnect,
1616	.pru_peeraddr =		in_getpeeraddr,
1617	.pru_send =		udp_send,
1618	.pru_soreceive =	soreceive_dgram,
1619	.pru_sosend =		sosend_dgram,
1620	.pru_shutdown =		udp_shutdown,
1621	.pru_sockaddr =		in_getsockaddr,
1622	.pru_sosetlabel =	in_pcbsosetlabel,
1623	.pru_close =		udp_close,
1624};
1625#endif /* INET */
1626