ip_output.c revision 71909
161348Sobrien/*
261348Sobrien * Copyright (c) 1982, 1986, 1988, 1990, 1993
3192856Sed *	The Regents of the University of California.  All rights reserved.
4192856Sed *
5192914Sed * Redistribution and use in source and binary forms, with or without
6196819Sache * modification, are permitted provided that the following conditions
710452Sjkh * are met:
810452Sjkh * 1. Redistributions of source code must retain the above copyright
961348Sobrien *    notice, this list of conditions and the following disclaimer.
1061348Sobrien * 2. Redistributions in binary form must reproduce the above copyright
1165909Sache *    notice, this list of conditions and the following disclaimer in the
1265909Sache *    documentation and/or other materials provided with the distribution.
1310452Sjkh * 3. All advertising materials mentioning features or use of this software
14192914Sed *    must display the following acknowledgement:
15192914Sed *	This product includes software developed by the University of
16245888Sbrooks *	California, Berkeley and its contributors.
17203699Sgabor * 4. Neither the name of the University nor the names of its contributors
1878014Sache *    may be used to endorse or promote products derived from this software
19246592Santoine *    without specific prior written permission.
2091634Sphantom *
2191634Sphantom * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2291634Sphantom * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2391634Sphantom * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2491634Sphantom * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25203699Sgabor * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2691633Sphantom * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2791633Sphantom * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2899257Sru * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29192856Sed * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3099257Sru * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31192856Sed * SUCH DAMAGE.
32192856Sed *
33192856Sed *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
3415853Swosch * $FreeBSD: head/sys/netinet/ip_output.c 71909 2001-02-02 00:18:00Z luigi $
3510452Sjkh */
3610452Sjkh
37#define _IP_VHL
38
39#include "opt_ipfw.h"
40#include "opt_ipdn.h"
41#include "opt_ipdivert.h"
42#include "opt_ipfilter.h"
43#include "opt_ipsec.h"
44#include "opt_pfil_hooks.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/kernel.h>
49#include <sys/malloc.h>
50#include <sys/mbuf.h>
51#include <sys/protosw.h>
52#include <sys/socket.h>
53#include <sys/socketvar.h>
54
55#include <net/if.h>
56#include <net/route.h>
57
58#include <netinet/in.h>
59#include <netinet/in_systm.h>
60#include <netinet/ip.h>
61#include <netinet/in_pcb.h>
62#include <netinet/in_var.h>
63#include <netinet/ip_var.h>
64
65#include "faith.h"
66
67#ifdef vax
68#include <machine/mtpr.h>
69#endif
70#include <machine/in_cksum.h>
71
72static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
73
74#ifdef IPSEC
75#include <netinet6/ipsec.h>
76#include <netkey/key.h>
77#ifdef IPSEC_DEBUG
78#include <netkey/key_debug.h>
79#else
80#define	KEYDEBUG(lev,arg)
81#endif
82#endif /*IPSEC*/
83
84#include <netinet/ip_fw.h>
85
86#ifdef DUMMYNET
87#include <netinet/ip_dummynet.h>
88#endif
89
#ifdef IPFIREWALL_FORWARD_DEBUG
/*
 * Debug helper: print the IPv4 address held in `a' in dotted-quad form.
 * `a' is an object with an s_addr member that is run through ntohl(),
 * e.g. the sin_addr of a struct sockaddr_in.
 *
 * The expansion is a single expression with no trailing semicolon, so it
 * is safe in unbraced if/else bodies.  Note that `a' is evaluated four
 * times; do not pass an expression with side effects.
 */
#define print_ip(a)							\
	printf("%lu.%lu.%lu.%lu",					\
	    (unsigned long)((ntohl((a).s_addr) >> 24) & 0xFF),		\
	    (unsigned long)((ntohl((a).s_addr) >> 16) & 0xFF),		\
	    (unsigned long)((ntohl((a).s_addr) >> 8) & 0xFF),		\
	    (unsigned long)(ntohl((a).s_addr) & 0xFF))
#endif
96
97u_short ip_id;
98
99static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *));
100static void	ip_mloopback
101	__P((struct ifnet *, struct mbuf *, struct sockaddr_in *, int));
102static int	ip_getmoptions
103	__P((struct sockopt *, struct ip_moptions *));
104static int	ip_pcbopts __P((int, struct mbuf **, struct mbuf *));
105static int	ip_setmoptions
106	__P((struct sockopt *, struct ip_moptions **));
107
108int	ip_optcopy __P((struct ip *, struct ip *));
109
110
111extern	struct protosw inetsw[];
112
113/*
114 * IP output.  The packet in mbuf chain m contains a skeletal IP
115 * header (with len, off, ttl, proto, tos, src, dst).
116 * The mbuf chain containing the packet will be freed.
117 * The mbuf opt, if present, will not be freed.
118 */
119int
120ip_output(m0, opt, ro, flags, imo)
121	struct mbuf *m0;
122	struct mbuf *opt;
123	struct route *ro;
124	int flags;
125	struct ip_moptions *imo;
126{
127	struct ip *ip, *mhip;
128	struct ifnet *ifp;
129	struct mbuf *m = m0;
130	int hlen = sizeof (struct ip);
131	int len, off, error = 0;
132	struct sockaddr_in *dst;
133	struct in_ifaddr *ia;
134	int isbroadcast, sw_csum;
135#ifdef IPSEC
136	struct route iproute;
137	struct socket *so = NULL;
138	struct secpolicy *sp = NULL;
139#endif
140	u_int16_t divert_cookie;		/* firewall cookie */
141#ifdef PFIL_HOOKS
142	struct packet_filter_hook *pfh;
143	struct mbuf *m1;
144	int rv;
145#endif /* PFIL_HOOKS */
146#ifdef IPFIREWALL_FORWARD
147	int fwd_rewrite_src = 0;
148#endif
149	struct ip_fw_chain *rule = NULL;
150
151#ifdef IPDIVERT
152	/* Get and reset firewall cookie */
153	divert_cookie = ip_divert_cookie;
154	ip_divert_cookie = 0;
155#else
156	divert_cookie = 0;
157#endif
158
159#if defined(IPFIREWALL) && defined(DUMMYNET)
160        /*
161         * dummynet packet are prepended a vestigial mbuf with
162         * m_type = MT_DUMMYNET and m_data pointing to the matching
163         * rule.
164         */
165        if (m->m_type == MT_DUMMYNET) {
166            /*
167             * the packet was already tagged, so part of the
168             * processing was already done, and we need to go down.
169             * Get parameters from the header.
170             */
171            rule = (struct ip_fw_chain *)(m->m_data) ;
172	    opt = NULL ;
173	    ro = & ( ((struct dn_pkt *)m)->ro ) ;
174	    imo = NULL ;
175	    dst = ((struct dn_pkt *)m)->dn_dst ;
176	    ifp = ((struct dn_pkt *)m)->ifp ;
177	    flags = ((struct dn_pkt *)m)->flags ;
178
179            m0 = m = m->m_next ;
180#ifdef IPSEC
181	    so = ipsec_getsocket(m);
182	    ipsec_setsocket(m, NULL);
183#endif
184            ip = mtod(m, struct ip *);
185            hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
186            goto sendit;
187        } else
188            rule = NULL ;
189#endif
190#ifdef IPSEC
191	so = ipsec_getsocket(m);
192	ipsec_setsocket(m, NULL);
193#endif
194
195#ifdef	DIAGNOSTIC
196	if ((m->m_flags & M_PKTHDR) == 0)
197		panic("ip_output no HDR");
198	if (!ro)
199		panic("ip_output no route, proto = %d",
200		      mtod(m, struct ip *)->ip_p);
201#endif
202	if (opt) {
203		m = ip_insertoptions(m, opt, &len);
204		hlen = len;
205	}
206	ip = mtod(m, struct ip *);
207	/*
208	 * Fill in IP header.
209	 */
210	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
211		ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
212		ip->ip_off &= IP_DF;
213		ip->ip_id = htons(ip_id++);
214		ipstat.ips_localout++;
215	} else {
216		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
217	}
218
219	dst = (struct sockaddr_in *)&ro->ro_dst;
220	/*
221	 * If there is a cached route,
222	 * check that it is to the same destination
223	 * and is still up.  If not, free it and try again.
224	 */
225	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
226	   dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
227		RTFREE(ro->ro_rt);
228		ro->ro_rt = (struct rtentry *)0;
229	}
230	if (ro->ro_rt == 0) {
231		dst->sin_family = AF_INET;
232		dst->sin_len = sizeof(*dst);
233		dst->sin_addr = ip->ip_dst;
234	}
235	/*
236	 * If routing to interface only,
237	 * short circuit routing lookup.
238	 */
239#define ifatoia(ifa)	((struct in_ifaddr *)(ifa))
240#define sintosa(sin)	((struct sockaddr *)(sin))
241	if (flags & IP_ROUTETOIF) {
242		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
243		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
244			ipstat.ips_noroute++;
245			error = ENETUNREACH;
246			goto bad;
247		}
248		ifp = ia->ia_ifp;
249		ip->ip_ttl = 1;
250		isbroadcast = in_broadcast(dst->sin_addr, ifp);
251	} else {
252		/*
253		 * If this is the case, we probably don't want to allocate
254		 * a protocol-cloned route since we didn't get one from the
255		 * ULP.  This lets TCP do its thing, while not burdening
256		 * forwarding or ICMP with the overhead of cloning a route.
257		 * Of course, we still want to do any cloning requested by
258		 * the link layer, as this is probably required in all cases
259		 * for correct operation (as it is for ARP).
260		 */
261		if (ro->ro_rt == 0)
262			rtalloc_ign(ro, RTF_PRCLONING);
263		if (ro->ro_rt == 0) {
264			ipstat.ips_noroute++;
265			error = EHOSTUNREACH;
266			goto bad;
267		}
268		ia = ifatoia(ro->ro_rt->rt_ifa);
269		ifp = ro->ro_rt->rt_ifp;
270		ro->ro_rt->rt_use++;
271		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
272			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
273		if (ro->ro_rt->rt_flags & RTF_HOST)
274			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
275		else
276			isbroadcast = in_broadcast(dst->sin_addr, ifp);
277	}
278	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
279		struct in_multi *inm;
280
281		m->m_flags |= M_MCAST;
282		/*
283		 * IP destination address is multicast.  Make sure "dst"
284		 * still points to the address in "ro".  (It may have been
285		 * changed to point to a gateway address, above.)
286		 */
287		dst = (struct sockaddr_in *)&ro->ro_dst;
288		/*
289		 * See if the caller provided any multicast options
290		 */
291		if (imo != NULL) {
292			ip->ip_ttl = imo->imo_multicast_ttl;
293			if (imo->imo_multicast_ifp != NULL)
294				ifp = imo->imo_multicast_ifp;
295			if (imo->imo_multicast_vif != -1)
296				ip->ip_src.s_addr =
297				    ip_mcast_src(imo->imo_multicast_vif);
298		} else
299			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
300		/*
301		 * Confirm that the outgoing interface supports multicast.
302		 */
303		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
304			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
305				ipstat.ips_noroute++;
306				error = ENETUNREACH;
307				goto bad;
308			}
309		}
310		/*
311		 * If source address not specified yet, use address
312		 * of outgoing interface.
313		 */
314		if (ip->ip_src.s_addr == INADDR_ANY) {
315			register struct in_ifaddr *ia1;
316
317			for (ia1 = in_ifaddrhead.tqh_first; ia1;
318			     ia1 = ia1->ia_link.tqe_next)
319				if (ia1->ia_ifp == ifp) {
320					ip->ip_src = IA_SIN(ia1)->sin_addr;
321					break;
322				}
323		}
324
325		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
326		if (inm != NULL &&
327		   (imo == NULL || imo->imo_multicast_loop)) {
328			/*
329			 * If we belong to the destination multicast group
330			 * on the outgoing interface, and the caller did not
331			 * forbid loopback, loop back a copy.
332			 */
333			ip_mloopback(ifp, m, dst, hlen);
334		}
335		else {
336			/*
337			 * If we are acting as a multicast router, perform
338			 * multicast forwarding as if the packet had just
339			 * arrived on the interface to which we are about
340			 * to send.  The multicast forwarding function
341			 * recursively calls this function, using the
342			 * IP_FORWARDING flag to prevent infinite recursion.
343			 *
344			 * Multicasts that are looped back by ip_mloopback(),
345			 * above, will be forwarded by the ip_input() routine,
346			 * if necessary.
347			 */
348			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
349				/*
350				 * Check if rsvp daemon is running. If not, don't
351				 * set ip_moptions. This ensures that the packet
352				 * is multicast and not just sent down one link
353				 * as prescribed by rsvpd.
354				 */
355				if (!rsvp_on)
356				  imo = NULL;
357				if (ip_mforward(ip, ifp, m, imo) != 0) {
358					m_freem(m);
359					goto done;
360				}
361			}
362		}
363
364		/*
365		 * Multicasts with a time-to-live of zero may be looped-
366		 * back, above, but must not be transmitted on a network.
367		 * Also, multicasts addressed to the loopback interface
368		 * are not sent -- the above call to ip_mloopback() will
369		 * loop back a copy if this host actually belongs to the
370		 * destination group on the loopback interface.
371		 */
372		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
373			m_freem(m);
374			goto done;
375		}
376
377		goto sendit;
378	}
379#ifndef notdef
380	/*
381	 * If source address not specified yet, use address
382	 * of outgoing interface.
383	 */
384	if (ip->ip_src.s_addr == INADDR_ANY) {
385		ip->ip_src = IA_SIN(ia)->sin_addr;
386#ifdef IPFIREWALL_FORWARD
387		/* Keep note that we did this - if the firewall changes
388		 * the next-hop, our interface may change, changing the
389		 * default source IP. It's a shame so much effort happens
390		 * twice. Oh well.
391		 */
392		fwd_rewrite_src++;
393#endif /* IPFIREWALL_FORWARD */
394	}
395#endif /* notdef */
396	/*
397	 * Verify that we have any chance at all of being able to queue
398	 *      the packet or packet fragments
399	 */
400	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
401		ifp->if_snd.ifq_maxlen) {
402			error = ENOBUFS;
403			goto bad;
404	}
405
406	/*
407	 * Look for broadcast address and
408	 * and verify user is allowed to send
409	 * such a packet.
410	 */
411	if (isbroadcast) {
412		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
413			error = EADDRNOTAVAIL;
414			goto bad;
415		}
416		if ((flags & IP_ALLOWBROADCAST) == 0) {
417			error = EACCES;
418			goto bad;
419		}
420		/* don't allow broadcast messages to be fragmented */
421		if ((u_short)ip->ip_len > ifp->if_mtu) {
422			error = EMSGSIZE;
423			goto bad;
424		}
425		m->m_flags |= M_BCAST;
426	} else {
427		m->m_flags &= ~M_BCAST;
428	}
429
430sendit:
431	/*
432	 * IpHack's section.
433	 * - Xlate: translate packet's addr/port (NAT).
434	 * - Firewall: deny/allow/etc.
435	 * - Wrap: fake packet's addr/port <unimpl.>
436	 * - Encapsulate: put it in another IP and send out. <unimp.>
437	 */
438#ifdef PFIL_HOOKS
439	/*
440	 * Run through list of hooks for output packets.
441	 */
442	m1 = m;
443	pfh = pfil_hook_get(PFIL_OUT, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh);
444	for (; pfh; pfh = pfh->pfil_link.tqe_next)
445		if (pfh->pfil_func) {
446			rv = pfh->pfil_func(ip, hlen, ifp, 1, &m1);
447			if (rv) {
448				error = EHOSTUNREACH;
449				goto done;
450			}
451			m = m1;
452			if (m == NULL)
453				goto done;
454			ip = mtod(m, struct ip *);
455		}
456#endif /* PFIL_HOOKS */
457
458	/*
459	 * Check with the firewall...
460	 */
461	if (fw_enable && ip_fw_chk_ptr) {
462		struct sockaddr_in *old = dst;
463
464		off = (*ip_fw_chk_ptr)(&ip,
465		    hlen, ifp, &divert_cookie, &m, &rule, &dst);
466                /*
467                 * On return we must do the following:
468                 * m == NULL         -> drop the pkt (old interface, deprecated)
469                 * (off & 0x40000)   -> drop the pkt (new interface)
470                 * 1<=off<= 0xffff   -> DIVERT
471                 * (off & 0x10000)   -> send to a DUMMYNET pipe
472                 * (off & 0x20000)   -> TEE the packet
473                 * dst != old        -> IPFIREWALL_FORWARD
474                 * off==0, dst==old  -> accept
475                 * If some of the above modules is not compiled in, then
476                 * we should't have to check the corresponding condition
477                 * (because the ipfw control socket should not accept
478                 * unsupported rules), but better play safe and drop
479                 * packets in case of doubt.
480                 */
481		if (off & IP_FW_PORT_DENY_FLAG) { /* XXX new interface-denied */
482		    if (m)
483			m_freem(m);
484		    error = EACCES ;
485		    goto done;
486		}
487		if (!m) { /* firewall said to reject */
488		    static int __debug=10;
489		    if (__debug >0) {
490			printf("firewall returns NULL, please update!\n");
491			__debug-- ;
492		    }
493		    error = EACCES;
494		    goto done;
495		}
496		if (off == 0 && dst == old) /* common case */
497			goto pass ;
498#ifdef DUMMYNET
499                if ((off & IP_FW_PORT_DYNT_FLAG) != 0) {
500                    /*
501                     * pass the pkt to dummynet. Need to include
502                     * pipe number, m, ifp, ro, dst because these are
503                     * not recomputed in the next pass.
504                     * All other parameters have been already used and
505                     * so they are not needed anymore.
506                     * XXX note: if the ifp or ro entry are deleted
507                     * while a pkt is in dummynet, we are in trouble!
508                     */
509		    error = dummynet_io(off & 0xffff, DN_TO_IP_OUT, m,
510				ifp,ro,dst,rule, flags);
511		    goto done;
512		}
513#endif
514#ifdef IPDIVERT
515		if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
516			struct mbuf *clone = NULL;
517
518			/* Clone packet if we're doing a 'tee' */
519			if ((off & IP_FW_PORT_TEE_FLAG) != 0)
520				clone = m_dup(m, M_DONTWAIT);
521
522			/*
523			 * XXX
524			 * delayed checksums are not currently compatible
525			 * with divert sockets.
526			 */
527			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
528				in_delayed_cksum(m);
529				m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
530			}
531
532			/* Restore packet header fields to original values */
533			HTONS(ip->ip_len);
534			HTONS(ip->ip_off);
535
536			/* Deliver packet to divert input routine */
537			ip_divert_cookie = divert_cookie;
538			divert_packet(m, 0, off & 0xffff);
539
540			/* If 'tee', continue with original packet */
541			if (clone != NULL) {
542				m = clone;
543				ip = mtod(m, struct ip *);
544				goto pass;
545			}
546			goto done;
547		}
548#endif
549
550#ifdef IPFIREWALL_FORWARD
551		/* Here we check dst to make sure it's directly reachable on the
552		 * interface we previously thought it was.
553		 * If it isn't (which may be likely in some situations) we have
554		 * to re-route it (ie, find a route for the next-hop and the
555		 * associated interface) and set them here. This is nested
556		 * forwarding which in most cases is undesirable, except where
557		 * such control is nigh impossible. So we do it here.
558		 * And I'm babbling.
559		 */
560		if (off == 0 && old != dst) {
561			struct in_ifaddr *ia;
562
563			/* It's changed... */
564			/* There must be a better way to do this next line... */
565			static struct route sro_fwd, *ro_fwd = &sro_fwd;
566#ifdef IPFIREWALL_FORWARD_DEBUG
567			printf("IPFIREWALL_FORWARD: New dst ip: ");
568			print_ip(dst->sin_addr);
569			printf("\n");
570#endif
571			/*
572			 * We need to figure out if we have been forwarded
573			 * to a local socket. If so then we should somehow
574			 * "loop back" to ip_input, and get directed to the
575			 * PCB as if we had received this packet. This is
576			 * because it may be dificult to identify the packets
577			 * you want to forward until they are being output
578			 * and have selected an interface. (e.g. locally
579			 * initiated packets) If we used the loopback inteface,
580			 * we would not be able to control what happens
581			 * as the packet runs through ip_input() as
582			 * it is done through a ISR.
583			 */
584			for (ia = TAILQ_FIRST(&in_ifaddrhead); ia;
585					ia = TAILQ_NEXT(ia, ia_link)) {
586				/*
587				 * If the addr to forward to is one
588				 * of ours, we pretend to
589				 * be the destination for this packet.
590				 */
591				if (IA_SIN(ia)->sin_addr.s_addr ==
592						 dst->sin_addr.s_addr)
593					break;
594			}
595			if (ia) {
596				/* tell ip_input "dont filter" */
597				ip_fw_fwd_addr = dst;
598				if (m->m_pkthdr.rcvif == NULL)
599					m->m_pkthdr.rcvif = ifunit("lo0");
600				if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
601					m->m_pkthdr.csum_flags |=
602					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
603					m0->m_pkthdr.csum_data = 0xffff;
604				}
605				m->m_pkthdr.csum_flags |=
606				    CSUM_IP_CHECKED | CSUM_IP_VALID;
607				HTONS(ip->ip_len);
608				HTONS(ip->ip_off);
609				ip_input(m);
610				goto done;
611			}
612			/* Some of the logic for this was
613			 * nicked from above.
614			 *
615			 * This rewrites the cached route in a local PCB.
616			 * Is this what we want to do?
617			 */
618			bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
619
620			ro_fwd->ro_rt = 0;
621			rtalloc_ign(ro_fwd, RTF_PRCLONING);
622
623			if (ro_fwd->ro_rt == 0) {
624				ipstat.ips_noroute++;
625				error = EHOSTUNREACH;
626				goto bad;
627			}
628
629			ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
630			ifp = ro_fwd->ro_rt->rt_ifp;
631			ro_fwd->ro_rt->rt_use++;
632			if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
633				dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway;
634			if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
635				isbroadcast =
636				    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
637			else
638				isbroadcast = in_broadcast(dst->sin_addr, ifp);
639			RTFREE(ro->ro_rt);
640			ro->ro_rt = ro_fwd->ro_rt;
641			dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
642
643			/*
644			 * If we added a default src ip earlier,
645			 * which would have been gotten from the-then
646			 * interface, do it again, from the new one.
647			 */
648			if (fwd_rewrite_src)
649				ip->ip_src = IA_SIN(ia)->sin_addr;
650			goto pass ;
651		}
652#endif /* IPFIREWALL_FORWARD */
653                /*
654                 * if we get here, none of the above matches, and
655                 * we have to drop the pkt
656                 */
657		m_freem(m);
658                error = EACCES; /* not sure this is the right error msg */
659                goto done;
660	}
661
662pass:
663#ifdef IPSEC
664	/* get SP for this packet */
665	if (so == NULL)
666		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
667	else
668		sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
669
670	if (sp == NULL) {
671		ipsecstat.out_inval++;
672		goto bad;
673	}
674
675	error = 0;
676
677	/* check policy */
678	switch (sp->policy) {
679	case IPSEC_POLICY_DISCARD:
680		/*
681		 * This packet is just discarded.
682		 */
683		ipsecstat.out_polvio++;
684		goto bad;
685
686	case IPSEC_POLICY_BYPASS:
687	case IPSEC_POLICY_NONE:
688		/* no need to do IPsec. */
689		goto skip_ipsec;
690
691	case IPSEC_POLICY_IPSEC:
692		if (sp->req == NULL) {
693			/* XXX should be panic ? */
694			printf("ip_output: No IPsec request specified.\n");
695			error = EINVAL;
696			goto bad;
697		}
698		break;
699
700	case IPSEC_POLICY_ENTRUST:
701	default:
702		printf("ip_output: Invalid policy found. %d\n", sp->policy);
703	}
704    {
705	struct ipsec_output_state state;
706	bzero(&state, sizeof(state));
707	state.m = m;
708	if (flags & IP_ROUTETOIF) {
709		state.ro = &iproute;
710		bzero(&iproute, sizeof(iproute));
711	} else
712		state.ro = ro;
713	state.dst = (struct sockaddr *)dst;
714
715	ip->ip_sum = 0;
716
717	/*
718	 * XXX
719	 * delayed checksums are not currently compatible with IPsec
720	 */
721	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
722		in_delayed_cksum(m);
723		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
724	}
725
726	HTONS(ip->ip_len);
727	HTONS(ip->ip_off);
728
729	error = ipsec4_output(&state, sp, flags);
730
731	m = state.m;
732	if (flags & IP_ROUTETOIF) {
733		/*
734		 * if we have tunnel mode SA, we may need to ignore
735		 * IP_ROUTETOIF.
736		 */
737		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
738			flags &= ~IP_ROUTETOIF;
739			ro = state.ro;
740		}
741	} else
742		ro = state.ro;
743	dst = (struct sockaddr_in *)state.dst;
744	if (error) {
745		/* mbuf is already reclaimed in ipsec4_output. */
746		m0 = NULL;
747		switch (error) {
748		case EHOSTUNREACH:
749		case ENETUNREACH:
750		case EMSGSIZE:
751		case ENOBUFS:
752		case ENOMEM:
753			break;
754		default:
755			printf("ip4_output (ipsec): error code %d\n", error);
756			/*fall through*/
757		case ENOENT:
758			/* don't show these error codes to the user */
759			error = 0;
760			break;
761		}
762		goto bad;
763	}
764    }
765
766	/* be sure to update variables that are affected by ipsec4_output() */
767	ip = mtod(m, struct ip *);
768#ifdef _IP_VHL
769	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
770#else
771	hlen = ip->ip_hl << 2;
772#endif
773	if (ro->ro_rt == NULL) {
774		if ((flags & IP_ROUTETOIF) == 0) {
775			printf("ip_output: "
776				"can't update route after IPsec processing\n");
777			error = EHOSTUNREACH;	/*XXX*/
778			goto bad;
779		}
780	} else {
781		ia = ifatoia(ro->ro_rt->rt_ifa);
782		ifp = ro->ro_rt->rt_ifp;
783	}
784
785	/* make it flipped, again. */
786	NTOHS(ip->ip_len);
787	NTOHS(ip->ip_off);
788skip_ipsec:
789#endif /*IPSEC*/
790
791	sw_csum = m->m_pkthdr.csum_flags | CSUM_IP;
792	m->m_pkthdr.csum_flags = sw_csum & ifp->if_hwassist;
793	sw_csum &= ~ifp->if_hwassist;
794	if (sw_csum & CSUM_DELAY_DATA) {
795		in_delayed_cksum(m);
796		sw_csum &= ~CSUM_DELAY_DATA;
797	}
798
799	/*
800	 * If small enough for interface, or the interface will take
801	 * care of the fragmentation for us, can just send directly.
802	 */
803	if ((u_short)ip->ip_len <= ifp->if_mtu ||
804	    ifp->if_hwassist & CSUM_FRAGMENT) {
805		HTONS(ip->ip_len);
806		HTONS(ip->ip_off);
807		ip->ip_sum = 0;
808		if (sw_csum & CSUM_DELAY_IP) {
809			if (ip->ip_vhl == IP_VHL_BORING) {
810				ip->ip_sum = in_cksum_hdr(ip);
811			} else {
812				ip->ip_sum = in_cksum(m, hlen);
813			}
814		}
815
816		/* Record statistics for this interface address. */
817		if (!(flags & IP_FORWARDING)) {
818			ia->ia_ifa.if_opackets++;
819			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
820		}
821
822		error = (*ifp->if_output)(ifp, m,
823				(struct sockaddr *)dst, ro->ro_rt);
824		goto done;
825	}
826	/*
827	 * Too large for interface; fragment if possible.
828	 * Must be able to put at least 8 bytes per fragment.
829	 */
830	if (ip->ip_off & IP_DF) {
831		error = EMSGSIZE;
832		/*
833		 * This case can happen if the user changed the MTU
834		 * of an interface after enabling IP on it.  Because
835		 * most netifs don't keep track of routes pointing to
836		 * them, there is no way for one to update all its
837		 * routes when the MTU is changed.
838		 */
839		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
840		    && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
841		    && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
842			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
843		}
844		ipstat.ips_cantfrag++;
845		goto bad;
846	}
847	len = (ifp->if_mtu - hlen) &~ 7;
848	if (len < 8) {
849		error = EMSGSIZE;
850		goto bad;
851	}
852
853	/*
854	 * if the interface will not calculate checksums on
855	 * fragmented packets, then do it here.
856	 */
857	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
858	    (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
859		in_delayed_cksum(m);
860		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
861	}
862
863    {
864	int mhlen, firstlen = len;
865	struct mbuf **mnext = &m->m_nextpkt;
866	int nfrags = 1;
867
868	/*
869	 * Loop through length of segment after first fragment,
870	 * make new header and copy data of each part and link onto chain.
871	 */
872	m0 = m;
873	mhlen = sizeof (struct ip);
874	for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
875		MGETHDR(m, M_DONTWAIT, MT_HEADER);
876		if (m == 0) {
877			error = ENOBUFS;
878			ipstat.ips_odropped++;
879			goto sendorfree;
880		}
881		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
882		m->m_data += max_linkhdr;
883		mhip = mtod(m, struct ip *);
884		*mhip = *ip;
885		if (hlen > sizeof (struct ip)) {
886			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
887			mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
888		}
889		m->m_len = mhlen;
890		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
891		if (off + len >= (u_short)ip->ip_len)
892			len = (u_short)ip->ip_len - off;
893		else
894			mhip->ip_off |= IP_MF;
895		mhip->ip_len = htons((u_short)(len + mhlen));
896		m->m_next = m_copy(m0, off, len);
897		if (m->m_next == 0) {
898			(void) m_free(m);
899			error = ENOBUFS;	/* ??? */
900			ipstat.ips_odropped++;
901			goto sendorfree;
902		}
903		m->m_pkthdr.len = mhlen + len;
904		m->m_pkthdr.rcvif = (struct ifnet *)0;
905		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
906		HTONS(mhip->ip_off);
907		mhip->ip_sum = 0;
908		if (sw_csum & CSUM_DELAY_IP) {
909			if (mhip->ip_vhl == IP_VHL_BORING) {
910				mhip->ip_sum = in_cksum_hdr(mhip);
911			} else {
912				mhip->ip_sum = in_cksum(m, mhlen);
913			}
914		}
915		*mnext = m;
916		mnext = &m->m_nextpkt;
917		nfrags++;
918	}
919	ipstat.ips_ofragments += nfrags;
920
921	/* set first/last markers for fragment chain */
922	m->m_flags |= M_LASTFRAG;
923	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
924	m0->m_pkthdr.csum_data = nfrags;
925
926	/*
927	 * Update first fragment by trimming what's been copied out
928	 * and updating header, then send each fragment (in order).
929	 */
930	m = m0;
931	m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
932	m->m_pkthdr.len = hlen + firstlen;
933	ip->ip_len = htons((u_short)m->m_pkthdr.len);
934	ip->ip_off |= IP_MF;
935	HTONS(ip->ip_off);
936	ip->ip_sum = 0;
937	if (sw_csum & CSUM_DELAY_IP) {
938		if (ip->ip_vhl == IP_VHL_BORING) {
939			ip->ip_sum = in_cksum_hdr(ip);
940		} else {
941			ip->ip_sum = in_cksum(m, hlen);
942		}
943	}
944sendorfree:
945	for (m = m0; m; m = m0) {
946		m0 = m->m_nextpkt;
947		m->m_nextpkt = 0;
948		if (error == 0) {
949			/* Record statistics for this interface address. */
950			ia->ia_ifa.if_opackets++;
951			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
952
953			error = (*ifp->if_output)(ifp, m,
954			    (struct sockaddr *)dst, ro->ro_rt);
955		} else
956			m_freem(m);
957	}
958
959	if (error == 0)
960		ipstat.ips_fragmented++;
961    }
962done:
963#ifdef IPSEC
964	if (ro == &iproute && ro->ro_rt) {
965		RTFREE(ro->ro_rt);
966		ro->ro_rt = NULL;
967	}
968	if (sp != NULL) {
969		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
970			printf("DP ip_output call free SP:%p\n", sp));
971		key_freesp(sp);
972	}
973#endif /* IPSEC */
974	return (error);
975bad:
976	m_freem(m0);
977	goto done;
978}
979
980void
981in_delayed_cksum(struct mbuf *m)
982{
983	struct ip *ip;
984	u_short csum, offset;
985
986	ip = mtod(m, struct ip *);
987	offset = IP_VHL_HL(ip->ip_vhl) << 2 ;
988	csum = in_cksum_skip(m, ip->ip_len, offset);
989	offset += m->m_pkthdr.csum_data;	/* checksum offset */
990
991	if (offset + sizeof(u_short) > m->m_len) {
992		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
993		    m->m_len, offset, ip->ip_p);
994		/*
995		 * XXX
996		 * this shouldn't happen, but if it does, the
997		 * correct behavior may be to insert the checksum
998		 * in the existing chain instead of rearranging it.
999		 */
1000		m = m_pullup(m, offset + sizeof(u_short));
1001	}
1002	*(u_short *)(m->m_data + offset) = csum;
1003}
1004
1005/*
1006 * Insert IP options into preformed packet.
1007 * Adjust IP destination as required for IP source routing,
1008 * as indicated by a non-zero in_addr at the start of the options.
1009 *
1010 * XXX This routine assumes that the packet has no options in place.
1011 */
1012static struct mbuf *
1013ip_insertoptions(m, opt, phlen)
1014	register struct mbuf *m;
1015	struct mbuf *opt;
1016	int *phlen;
1017{
1018	register struct ipoption *p = mtod(opt, struct ipoption *);
1019	struct mbuf *n;
1020	register struct ip *ip = mtod(m, struct ip *);
1021	unsigned optlen;
1022
1023	optlen = opt->m_len - sizeof(p->ipopt_dst);
1024	if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
1025		return (m);		/* XXX should fail */
1026	if (p->ipopt_dst.s_addr)
1027		ip->ip_dst = p->ipopt_dst;
1028	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1029		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1030		if (n == 0)
1031			return (m);
1032		n->m_pkthdr.rcvif = (struct ifnet *)0;
1033		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1034		m->m_len -= sizeof(struct ip);
1035		m->m_data += sizeof(struct ip);
1036		n->m_next = m;
1037		m = n;
1038		m->m_len = optlen + sizeof(struct ip);
1039		m->m_data += max_linkhdr;
1040		(void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
1041	} else {
1042		m->m_data -= optlen;
1043		m->m_len += optlen;
1044		m->m_pkthdr.len += optlen;
1045		ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
1046	}
1047	ip = mtod(m, struct ip *);
1048	bcopy(p->ipopt_list, ip + 1, optlen);
1049	*phlen = sizeof(struct ip) + optlen;
1050	ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
1051	ip->ip_len += optlen;
1052	return (m);
1053}
1054
1055/*
1056 * Copy options from ip to jp,
1057 * omitting those not copied during fragmentation.
1058 */
1059int
1060ip_optcopy(ip, jp)
1061	struct ip *ip, *jp;
1062{
1063	register u_char *cp, *dp;
1064	int opt, optlen, cnt;
1065
1066	cp = (u_char *)(ip + 1);
1067	dp = (u_char *)(jp + 1);
1068	cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
1069	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1070		opt = cp[0];
1071		if (opt == IPOPT_EOL)
1072			break;
1073		if (opt == IPOPT_NOP) {
1074			/* Preserve for IP mcast tunnel's LSRR alignment. */
1075			*dp++ = IPOPT_NOP;
1076			optlen = 1;
1077			continue;
1078		}
1079#ifdef DIAGNOSTIC
1080		if (cnt < IPOPT_OLEN + sizeof(*cp))
1081			panic("malformed IPv4 option passed to ip_optcopy");
1082#endif
1083		optlen = cp[IPOPT_OLEN];
1084#ifdef DIAGNOSTIC
1085		if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1086			panic("malformed IPv4 option passed to ip_optcopy");
1087#endif
1088		/* bogus lengths should have been caught by ip_dooptions */
1089		if (optlen > cnt)
1090			optlen = cnt;
1091		if (IPOPT_COPIED(opt)) {
1092			bcopy(cp, dp, optlen);
1093			dp += optlen;
1094		}
1095	}
1096	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1097		*dp++ = IPOPT_EOL;
1098	return (optlen);
1099}
1100
1101/*
1102 * IP socket option processing.
1103 */
1104int
1105ip_ctloutput(so, sopt)
1106	struct socket *so;
1107	struct sockopt *sopt;
1108{
1109	struct	inpcb *inp = sotoinpcb(so);
1110	int	error, optval;
1111
1112	error = optval = 0;
1113	if (sopt->sopt_level != IPPROTO_IP) {
1114		return (EINVAL);
1115	}
1116
1117	switch (sopt->sopt_dir) {
1118	case SOPT_SET:
1119		switch (sopt->sopt_name) {
1120		case IP_OPTIONS:
1121#ifdef notyet
1122		case IP_RETOPTS:
1123#endif
1124		{
1125			struct mbuf *m;
1126			if (sopt->sopt_valsize > MLEN) {
1127				error = EMSGSIZE;
1128				break;
1129			}
1130			MGET(m, sopt->sopt_p ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
1131			if (m == 0) {
1132				error = ENOBUFS;
1133				break;
1134			}
1135			m->m_len = sopt->sopt_valsize;
1136			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1137					    m->m_len);
1138
1139			return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1140					   m));
1141		}
1142
1143		case IP_TOS:
1144		case IP_TTL:
1145		case IP_RECVOPTS:
1146		case IP_RECVRETOPTS:
1147		case IP_RECVDSTADDR:
1148		case IP_RECVIF:
1149#if defined(NFAITH) && NFAITH > 0
1150		case IP_FAITH:
1151#endif
1152			error = sooptcopyin(sopt, &optval, sizeof optval,
1153					    sizeof optval);
1154			if (error)
1155				break;
1156
1157			switch (sopt->sopt_name) {
1158			case IP_TOS:
1159				inp->inp_ip_tos = optval;
1160				break;
1161
1162			case IP_TTL:
1163				inp->inp_ip_ttl = optval;
1164				break;
1165#define	OPTSET(bit) \
1166	if (optval) \
1167		inp->inp_flags |= bit; \
1168	else \
1169		inp->inp_flags &= ~bit;
1170
1171			case IP_RECVOPTS:
1172				OPTSET(INP_RECVOPTS);
1173				break;
1174
1175			case IP_RECVRETOPTS:
1176				OPTSET(INP_RECVRETOPTS);
1177				break;
1178
1179			case IP_RECVDSTADDR:
1180				OPTSET(INP_RECVDSTADDR);
1181				break;
1182
1183			case IP_RECVIF:
1184				OPTSET(INP_RECVIF);
1185				break;
1186
1187#if defined(NFAITH) && NFAITH > 0
1188			case IP_FAITH:
1189				OPTSET(INP_FAITH);
1190				break;
1191#endif
1192			}
1193			break;
1194#undef OPTSET
1195
1196		case IP_MULTICAST_IF:
1197		case IP_MULTICAST_VIF:
1198		case IP_MULTICAST_TTL:
1199		case IP_MULTICAST_LOOP:
1200		case IP_ADD_MEMBERSHIP:
1201		case IP_DROP_MEMBERSHIP:
1202			error = ip_setmoptions(sopt, &inp->inp_moptions);
1203			break;
1204
1205		case IP_PORTRANGE:
1206			error = sooptcopyin(sopt, &optval, sizeof optval,
1207					    sizeof optval);
1208			if (error)
1209				break;
1210
1211			switch (optval) {
1212			case IP_PORTRANGE_DEFAULT:
1213				inp->inp_flags &= ~(INP_LOWPORT);
1214				inp->inp_flags &= ~(INP_HIGHPORT);
1215				break;
1216
1217			case IP_PORTRANGE_HIGH:
1218				inp->inp_flags &= ~(INP_LOWPORT);
1219				inp->inp_flags |= INP_HIGHPORT;
1220				break;
1221
1222			case IP_PORTRANGE_LOW:
1223				inp->inp_flags &= ~(INP_HIGHPORT);
1224				inp->inp_flags |= INP_LOWPORT;
1225				break;
1226
1227			default:
1228				error = EINVAL;
1229				break;
1230			}
1231			break;
1232
1233#ifdef IPSEC
1234		case IP_IPSEC_POLICY:
1235		{
1236			caddr_t req;
1237			size_t len = 0;
1238			int priv;
1239			struct mbuf *m;
1240			int optname;
1241
1242			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1243				break;
1244			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1245				break;
1246			priv = (sopt->sopt_p != NULL &&
1247				suser(sopt->sopt_p) != 0) ? 0 : 1;
1248			req = mtod(m, caddr_t);
1249			len = m->m_len;
1250			optname = sopt->sopt_name;
1251			error = ipsec4_set_policy(inp, optname, req, len, priv);
1252			m_freem(m);
1253			break;
1254		}
1255#endif /*IPSEC*/
1256
1257		default:
1258			error = ENOPROTOOPT;
1259			break;
1260		}
1261		break;
1262
1263	case SOPT_GET:
1264		switch (sopt->sopt_name) {
1265		case IP_OPTIONS:
1266		case IP_RETOPTS:
1267			if (inp->inp_options)
1268				error = sooptcopyout(sopt,
1269						     mtod(inp->inp_options,
1270							  char *),
1271						     inp->inp_options->m_len);
1272			else
1273				sopt->sopt_valsize = 0;
1274			break;
1275
1276		case IP_TOS:
1277		case IP_TTL:
1278		case IP_RECVOPTS:
1279		case IP_RECVRETOPTS:
1280		case IP_RECVDSTADDR:
1281		case IP_RECVIF:
1282		case IP_PORTRANGE:
1283#if defined(NFAITH) && NFAITH > 0
1284		case IP_FAITH:
1285#endif
1286			switch (sopt->sopt_name) {
1287
1288			case IP_TOS:
1289				optval = inp->inp_ip_tos;
1290				break;
1291
1292			case IP_TTL:
1293				optval = inp->inp_ip_ttl;
1294				break;
1295
1296#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1297
1298			case IP_RECVOPTS:
1299				optval = OPTBIT(INP_RECVOPTS);
1300				break;
1301
1302			case IP_RECVRETOPTS:
1303				optval = OPTBIT(INP_RECVRETOPTS);
1304				break;
1305
1306			case IP_RECVDSTADDR:
1307				optval = OPTBIT(INP_RECVDSTADDR);
1308				break;
1309
1310			case IP_RECVIF:
1311				optval = OPTBIT(INP_RECVIF);
1312				break;
1313
1314			case IP_PORTRANGE:
1315				if (inp->inp_flags & INP_HIGHPORT)
1316					optval = IP_PORTRANGE_HIGH;
1317				else if (inp->inp_flags & INP_LOWPORT)
1318					optval = IP_PORTRANGE_LOW;
1319				else
1320					optval = 0;
1321				break;
1322
1323#if defined(NFAITH) && NFAITH > 0
1324			case IP_FAITH:
1325				optval = OPTBIT(INP_FAITH);
1326				break;
1327#endif
1328			}
1329			error = sooptcopyout(sopt, &optval, sizeof optval);
1330			break;
1331
1332		case IP_MULTICAST_IF:
1333		case IP_MULTICAST_VIF:
1334		case IP_MULTICAST_TTL:
1335		case IP_MULTICAST_LOOP:
1336		case IP_ADD_MEMBERSHIP:
1337		case IP_DROP_MEMBERSHIP:
1338			error = ip_getmoptions(sopt, inp->inp_moptions);
1339			break;
1340
1341#ifdef IPSEC
1342		case IP_IPSEC_POLICY:
1343		{
1344			struct mbuf *m = NULL;
1345			caddr_t req = NULL;
1346			size_t len = 0;
1347
1348			if (m != 0) {
1349				req = mtod(m, caddr_t);
1350				len = m->m_len;
1351			}
1352			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1353			if (error == 0)
1354				error = soopt_mcopyout(sopt, m); /* XXX */
1355			if (error == 0)
1356				m_freem(m);
1357			break;
1358		}
1359#endif /*IPSEC*/
1360
1361		default:
1362			error = ENOPROTOOPT;
1363			break;
1364		}
1365		break;
1366	}
1367	return (error);
1368}
1369
1370/*
1371 * Set up IP options in pcb for insertion in output packets.
1372 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1373 * with destination address if source routed.
1374 */
1375static int
1376ip_pcbopts(optname, pcbopt, m)
1377	int optname;
1378	struct mbuf **pcbopt;
1379	register struct mbuf *m;
1380{
1381	register int cnt, optlen;
1382	register u_char *cp;
1383	u_char opt;
1384
1385	/* turn off any old options */
1386	if (*pcbopt)
1387		(void)m_free(*pcbopt);
1388	*pcbopt = 0;
1389	if (m == (struct mbuf *)0 || m->m_len == 0) {
1390		/*
1391		 * Only turning off any previous options.
1392		 */
1393		if (m)
1394			(void)m_free(m);
1395		return (0);
1396	}
1397
1398#ifndef	vax
1399	if (m->m_len % sizeof(int32_t))
1400		goto bad;
1401#endif
1402	/*
1403	 * IP first-hop destination address will be stored before
1404	 * actual options; move other options back
1405	 * and clear it when none present.
1406	 */
1407	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1408		goto bad;
1409	cnt = m->m_len;
1410	m->m_len += sizeof(struct in_addr);
1411	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1412	ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
1413	bzero(mtod(m, caddr_t), sizeof(struct in_addr));
1414
1415	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1416		opt = cp[IPOPT_OPTVAL];
1417		if (opt == IPOPT_EOL)
1418			break;
1419		if (opt == IPOPT_NOP)
1420			optlen = 1;
1421		else {
1422			if (cnt < IPOPT_OLEN + sizeof(*cp))
1423				goto bad;
1424			optlen = cp[IPOPT_OLEN];
1425			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1426				goto bad;
1427		}
1428		switch (opt) {
1429
1430		default:
1431			break;
1432
1433		case IPOPT_LSRR:
1434		case IPOPT_SSRR:
1435			/*
1436			 * user process specifies route as:
1437			 *	->A->B->C->D
1438			 * D must be our final destination (but we can't
1439			 * check that since we may not have connected yet).
1440			 * A is first hop destination, which doesn't appear in
1441			 * actual IP option, but is stored before the options.
1442			 */
1443			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1444				goto bad;
1445			m->m_len -= sizeof(struct in_addr);
1446			cnt -= sizeof(struct in_addr);
1447			optlen -= sizeof(struct in_addr);
1448			cp[IPOPT_OLEN] = optlen;
1449			/*
1450			 * Move first hop before start of options.
1451			 */
1452			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1453			    sizeof(struct in_addr));
1454			/*
1455			 * Then copy rest of options back
1456			 * to close up the deleted entry.
1457			 */
1458			ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
1459			    sizeof(struct in_addr)),
1460			    (caddr_t)&cp[IPOPT_OFFSET+1],
1461			    (unsigned)cnt + sizeof(struct in_addr));
1462			break;
1463		}
1464	}
1465	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1466		goto bad;
1467	*pcbopt = m;
1468	return (0);
1469
1470bad:
1471	(void)m_free(m);
1472	return (EINVAL);
1473}
1474
1475/*
1476 * XXX
1477 * The whole multicast option thing needs to be re-thought.
1478 * Several of these options are equally applicable to non-multicast
1479 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1480 * standard option (IP_TTL).
1481 */
1482/*
1483 * Set the IP multicast options in response to user setsockopt().
1484 */
1485static int
1486ip_setmoptions(sopt, imop)
1487	struct sockopt *sopt;
1488	struct ip_moptions **imop;
1489{
1490	int error = 0;
1491	int i;
1492	struct in_addr addr;
1493	struct ip_mreq mreq;
1494	struct ifnet *ifp;
1495	struct ip_moptions *imo = *imop;
1496	struct route ro;
1497	struct sockaddr_in *dst;
1498	int s;
1499
1500	if (imo == NULL) {
1501		/*
1502		 * No multicast option buffer attached to the pcb;
1503		 * allocate one and initialize to default values.
1504		 */
1505		imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
1506		    M_WAITOK);
1507
1508		if (imo == NULL)
1509			return (ENOBUFS);
1510		*imop = imo;
1511		imo->imo_multicast_ifp = NULL;
1512		imo->imo_multicast_vif = -1;
1513		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1514		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1515		imo->imo_num_memberships = 0;
1516	}
1517
1518	switch (sopt->sopt_name) {
1519	/* store an index number for the vif you wanna use in the send */
1520	case IP_MULTICAST_VIF:
1521		if (legal_vif_num == 0) {
1522			error = EOPNOTSUPP;
1523			break;
1524		}
1525		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1526		if (error)
1527			break;
1528		if (!legal_vif_num(i) && (i != -1)) {
1529			error = EINVAL;
1530			break;
1531		}
1532		imo->imo_multicast_vif = i;
1533		break;
1534
1535	case IP_MULTICAST_IF:
1536		/*
1537		 * Select the interface for outgoing multicast packets.
1538		 */
1539		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1540		if (error)
1541			break;
1542		/*
1543		 * INADDR_ANY is used to remove a previous selection.
1544		 * When no interface is selected, a default one is
1545		 * chosen every time a multicast packet is sent.
1546		 */
1547		if (addr.s_addr == INADDR_ANY) {
1548			imo->imo_multicast_ifp = NULL;
1549			break;
1550		}
1551		/*
1552		 * The selected interface is identified by its local
1553		 * IP address.  Find the interface and confirm that
1554		 * it supports multicasting.
1555		 */
1556		s = splimp();
1557		INADDR_TO_IFP(addr, ifp);
1558		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1559			splx(s);
1560			error = EADDRNOTAVAIL;
1561			break;
1562		}
1563		imo->imo_multicast_ifp = ifp;
1564		splx(s);
1565		break;
1566
1567	case IP_MULTICAST_TTL:
1568		/*
1569		 * Set the IP time-to-live for outgoing multicast packets.
1570		 * The original multicast API required a char argument,
1571		 * which is inconsistent with the rest of the socket API.
1572		 * We allow either a char or an int.
1573		 */
1574		if (sopt->sopt_valsize == 1) {
1575			u_char ttl;
1576			error = sooptcopyin(sopt, &ttl, 1, 1);
1577			if (error)
1578				break;
1579			imo->imo_multicast_ttl = ttl;
1580		} else {
1581			u_int ttl;
1582			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1583					    sizeof ttl);
1584			if (error)
1585				break;
1586			if (ttl > 255)
1587				error = EINVAL;
1588			else
1589				imo->imo_multicast_ttl = ttl;
1590		}
1591		break;
1592
1593	case IP_MULTICAST_LOOP:
1594		/*
1595		 * Set the loopback flag for outgoing multicast packets.
1596		 * Must be zero or one.  The original multicast API required a
1597		 * char argument, which is inconsistent with the rest
1598		 * of the socket API.  We allow either a char or an int.
1599		 */
1600		if (sopt->sopt_valsize == 1) {
1601			u_char loop;
1602			error = sooptcopyin(sopt, &loop, 1, 1);
1603			if (error)
1604				break;
1605			imo->imo_multicast_loop = !!loop;
1606		} else {
1607			u_int loop;
1608			error = sooptcopyin(sopt, &loop, sizeof loop,
1609					    sizeof loop);
1610			if (error)
1611				break;
1612			imo->imo_multicast_loop = !!loop;
1613		}
1614		break;
1615
1616	case IP_ADD_MEMBERSHIP:
1617		/*
1618		 * Add a multicast group membership.
1619		 * Group must be a valid IP multicast address.
1620		 */
1621		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1622		if (error)
1623			break;
1624
1625		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1626			error = EINVAL;
1627			break;
1628		}
1629		s = splimp();
1630		/*
1631		 * If no interface address was provided, use the interface of
1632		 * the route to the given multicast address.
1633		 */
1634		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1635			bzero((caddr_t)&ro, sizeof(ro));
1636			dst = (struct sockaddr_in *)&ro.ro_dst;
1637			dst->sin_len = sizeof(*dst);
1638			dst->sin_family = AF_INET;
1639			dst->sin_addr = mreq.imr_multiaddr;
1640			rtalloc(&ro);
1641			if (ro.ro_rt == NULL) {
1642				error = EADDRNOTAVAIL;
1643				splx(s);
1644				break;
1645			}
1646			ifp = ro.ro_rt->rt_ifp;
1647			rtfree(ro.ro_rt);
1648		}
1649		else {
1650			INADDR_TO_IFP(mreq.imr_interface, ifp);
1651		}
1652
1653		/*
1654		 * See if we found an interface, and confirm that it
1655		 * supports multicast.
1656		 */
1657		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1658			error = EADDRNOTAVAIL;
1659			splx(s);
1660			break;
1661		}
1662		/*
1663		 * See if the membership already exists or if all the
1664		 * membership slots are full.
1665		 */
1666		for (i = 0; i < imo->imo_num_memberships; ++i) {
1667			if (imo->imo_membership[i]->inm_ifp == ifp &&
1668			    imo->imo_membership[i]->inm_addr.s_addr
1669						== mreq.imr_multiaddr.s_addr)
1670				break;
1671		}
1672		if (i < imo->imo_num_memberships) {
1673			error = EADDRINUSE;
1674			splx(s);
1675			break;
1676		}
1677		if (i == IP_MAX_MEMBERSHIPS) {
1678			error = ETOOMANYREFS;
1679			splx(s);
1680			break;
1681		}
1682		/*
1683		 * Everything looks good; add a new record to the multicast
1684		 * address list for the given interface.
1685		 */
1686		if ((imo->imo_membership[i] =
1687		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1688			error = ENOBUFS;
1689			splx(s);
1690			break;
1691		}
1692		++imo->imo_num_memberships;
1693		splx(s);
1694		break;
1695
1696	case IP_DROP_MEMBERSHIP:
1697		/*
1698		 * Drop a multicast group membership.
1699		 * Group must be a valid IP multicast address.
1700		 */
1701		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1702		if (error)
1703			break;
1704
1705		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1706			error = EINVAL;
1707			break;
1708		}
1709
1710		s = splimp();
1711		/*
1712		 * If an interface address was specified, get a pointer
1713		 * to its ifnet structure.
1714		 */
1715		if (mreq.imr_interface.s_addr == INADDR_ANY)
1716			ifp = NULL;
1717		else {
1718			INADDR_TO_IFP(mreq.imr_interface, ifp);
1719			if (ifp == NULL) {
1720				error = EADDRNOTAVAIL;
1721				splx(s);
1722				break;
1723			}
1724		}
1725		/*
1726		 * Find the membership in the membership array.
1727		 */
1728		for (i = 0; i < imo->imo_num_memberships; ++i) {
1729			if ((ifp == NULL ||
1730			     imo->imo_membership[i]->inm_ifp == ifp) &&
1731			     imo->imo_membership[i]->inm_addr.s_addr ==
1732			     mreq.imr_multiaddr.s_addr)
1733				break;
1734		}
1735		if (i == imo->imo_num_memberships) {
1736			error = EADDRNOTAVAIL;
1737			splx(s);
1738			break;
1739		}
1740		/*
1741		 * Give up the multicast address record to which the
1742		 * membership points.
1743		 */
1744		in_delmulti(imo->imo_membership[i]);
1745		/*
1746		 * Remove the gap in the membership array.
1747		 */
1748		for (++i; i < imo->imo_num_memberships; ++i)
1749			imo->imo_membership[i-1] = imo->imo_membership[i];
1750		--imo->imo_num_memberships;
1751		splx(s);
1752		break;
1753
1754	default:
1755		error = EOPNOTSUPP;
1756		break;
1757	}
1758
1759	/*
1760	 * If all options have default values, no need to keep the mbuf.
1761	 */
1762	if (imo->imo_multicast_ifp == NULL &&
1763	    imo->imo_multicast_vif == -1 &&
1764	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
1765	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
1766	    imo->imo_num_memberships == 0) {
1767		free(*imop, M_IPMOPTS);
1768		*imop = NULL;
1769	}
1770
1771	return (error);
1772}
1773
1774/*
1775 * Return the IP multicast options in response to user getsockopt().
1776 */
1777static int
1778ip_getmoptions(sopt, imo)
1779	struct sockopt *sopt;
1780	register struct ip_moptions *imo;
1781{
1782	struct in_addr addr;
1783	struct in_ifaddr *ia;
1784	int error, optval;
1785	u_char coptval;
1786
1787	error = 0;
1788	switch (sopt->sopt_name) {
1789	case IP_MULTICAST_VIF:
1790		if (imo != NULL)
1791			optval = imo->imo_multicast_vif;
1792		else
1793			optval = -1;
1794		error = sooptcopyout(sopt, &optval, sizeof optval);
1795		break;
1796
1797	case IP_MULTICAST_IF:
1798		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1799			addr.s_addr = INADDR_ANY;
1800		else {
1801			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1802			addr.s_addr = (ia == NULL) ? INADDR_ANY
1803				: IA_SIN(ia)->sin_addr.s_addr;
1804		}
1805		error = sooptcopyout(sopt, &addr, sizeof addr);
1806		break;
1807
1808	case IP_MULTICAST_TTL:
1809		if (imo == 0)
1810			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1811		else
1812			optval = coptval = imo->imo_multicast_ttl;
1813		if (sopt->sopt_valsize == 1)
1814			error = sooptcopyout(sopt, &coptval, 1);
1815		else
1816			error = sooptcopyout(sopt, &optval, sizeof optval);
1817		break;
1818
1819	case IP_MULTICAST_LOOP:
1820		if (imo == 0)
1821			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1822		else
1823			optval = coptval = imo->imo_multicast_loop;
1824		if (sopt->sopt_valsize == 1)
1825			error = sooptcopyout(sopt, &coptval, 1);
1826		else
1827			error = sooptcopyout(sopt, &optval, sizeof optval);
1828		break;
1829
1830	default:
1831		error = ENOPROTOOPT;
1832		break;
1833	}
1834	return (error);
1835}
1836
1837/*
1838 * Discard the IP multicast options.
1839 */
1840void
1841ip_freemoptions(imo)
1842	register struct ip_moptions *imo;
1843{
1844	register int i;
1845
1846	if (imo != NULL) {
1847		for (i = 0; i < imo->imo_num_memberships; ++i)
1848			in_delmulti(imo->imo_membership[i]);
1849		free(imo, M_IPMOPTS);
1850	}
1851}
1852
1853/*
1854 * Routine called from ip_output() to loop back a copy of an IP multicast
1855 * packet to the input queue of a specified interface.  Note that this
1856 * calls the output routine of the loopback "driver", but with an interface
1857 * pointer that might NOT be a loopback interface -- evil, but easier than
1858 * replicating that code here.
1859 */
1860static void
1861ip_mloopback(ifp, m, dst, hlen)
1862	struct ifnet *ifp;
1863	register struct mbuf *m;
1864	register struct sockaddr_in *dst;
1865	int hlen;
1866{
1867	register struct ip *ip;
1868	struct mbuf *copym;
1869
1870	copym = m_copy(m, 0, M_COPYALL);
1871	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1872		copym = m_pullup(copym, hlen);
1873	if (copym != NULL) {
1874		/*
1875		 * We don't bother to fragment if the IP length is greater
1876		 * than the interface's MTU.  Can this possibly matter?
1877		 */
1878		ip = mtod(copym, struct ip *);
1879		HTONS(ip->ip_len);
1880		HTONS(ip->ip_off);
1881		ip->ip_sum = 0;
1882		if (ip->ip_vhl == IP_VHL_BORING) {
1883			ip->ip_sum = in_cksum_hdr(ip);
1884		} else {
1885			ip->ip_sum = in_cksum(copym, hlen);
1886		}
1887		/*
1888		 * NB:
1889		 * It's not clear whether there are any lingering
1890		 * reentrancy problems in other areas which might
1891		 * be exposed by using ip_input directly (in
1892		 * particular, everything which modifies the packet
1893		 * in-place).  Yet another option is using the
1894		 * protosw directly to deliver the looped back
1895		 * packet.  For the moment, we'll err on the side
1896		 * of safety by using if_simloop().
1897		 */
1898#if 1 /* XXX */
1899		if (dst->sin_family != AF_INET) {
1900			printf("ip_mloopback: bad address family %d\n",
1901						dst->sin_family);
1902			dst->sin_family = AF_INET;
1903		}
1904#endif
1905
1906#ifdef notdef
1907		copym->m_pkthdr.rcvif = ifp;
1908		ip_input(copym);
1909#else
1910		/* if the checksum hasn't been computed, mark it as valid */
1911		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1912			copym->m_pkthdr.csum_flags |=
1913			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1914			copym->m_pkthdr.csum_data = 0xffff;
1915		}
1916		if_simloop(ifp, copym, dst->sin_family, 0);
1917#endif
1918	}
1919}
1920