ip_reass.c revision 71909
1/*
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
34 * $FreeBSD: head/sys/netinet/ip_input.c 71909 2001-02-02 00:18:00Z luigi $
35 */
36
37#define	_IP_VHL
38
39#include "opt_bootp.h"
40#include "opt_ipfw.h"
41#include "opt_ipdn.h"
42#include "opt_ipdivert.h"
43#include "opt_ipfilter.h"
44#include "opt_ipstealth.h"
45#include "opt_ipsec.h"
46#include "opt_pfil_hooks.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/mbuf.h>
51#include <sys/malloc.h>
52#include <sys/domain.h>
53#include <sys/protosw.h>
54#include <sys/socket.h>
55#include <sys/time.h>
56#include <sys/kernel.h>
57#include <sys/syslog.h>
58#include <sys/sysctl.h>
59
60#include <net/pfil.h>
61#include <net/if.h>
62#include <net/if_var.h>
63#include <net/if_dl.h>
64#include <net/route.h>
65#include <net/netisr.h>
66#include <net/intrq.h>
67
68#include <netinet/in.h>
69#include <netinet/in_systm.h>
70#include <netinet/in_var.h>
71#include <netinet/ip.h>
72#include <netinet/in_pcb.h>
73#include <netinet/ip_var.h>
74#include <netinet/ip_icmp.h>
75#include <machine/in_cksum.h>
76
77#include <netinet/ipprotosw.h>
78
79#include <sys/socketvar.h>
80
81#include <netinet/ip_fw.h>
82
83#ifdef IPSEC
84#include <netinet6/ipsec.h>
85#include <netkey/key.h>
86#endif
87
88#include "faith.h"
89#if defined(NFAITH) && NFAITH > 0
90#include <net/if_types.h>
91#endif
92
93#ifdef DUMMYNET
94#include <netinet/ip_dummynet.h>
95#endif
96
/*
 * RSVP state.  ip_input() tests rsvp_on: while it is non-zero every
 * packet whose protocol is IPPROTO_RSVP is delivered locally, whatever
 * its destination address.
 */
97int rsvp_on = 0;
98static int ip_rsvp_on;
99struct socket *ip_rsvpd;
100
/*
 * When zero, ip_input() frees packets that are not addressed to this
 * host and counts them in ipstat.ips_cantforward instead of calling
 * ip_forward().
 */
101int	ipforwarding = 0;
102SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
103    &ipforwarding, 0, "Enable IP forwarding between interfaces");
104
105static int	ipsendredirects = 1; /* XXX */
106SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
107    &ipsendredirects, 0, "Enable sending IP redirects");
108
/*
 * NOTE(review): the sysctl description below says "Maximum TTL", while
 * the variable name and its IPDEFTTL initializer suggest a default TTL;
 * confirm which is intended before relying on the description.
 */
109int	ip_defttl = IPDEFTTL;
110SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
111    &ip_defttl, 0, "Maximum TTL on IP packets");
112
/*
 * Checked by ip_dooptions() for LSRR/SSRR options: when zero, a packet
 * that would have to be source-routed onward is rejected.
 */
113static int	ip_dosourceroute = 0;
114SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
115    &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
116
/*
 * Checked by ip_dooptions() when a source route ends at this host: when
 * zero, the packet is rejected instead of having its route saved.
 */
117static int	ip_acceptsourceroute = 0;
118SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
119    CTLFLAG_RW, &ip_acceptsourceroute, 0,
120    "Enable accepting source routed IP packets");
121
/*
 * When non-zero, ip_input() keeps TCP and ICMP packets that arrive on an
 * IFT_FAITH interface; all other packets from such an interface are freed.
 */
122static int	ip_keepfaith = 0;
123SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
124	&ip_keepfaith,	0,
125	"Enable packet capture for FAITH IPv4->IPv6 translater daemon");
126
127#ifdef DIAGNOSTIC
128static int	ipprintfs = 0;
129#endif
130
131extern	struct domain inetdomain;
132extern	struct ipprotosw inetsw[];
/*
 * Maps an IP protocol number to an index into inetsw[].  Filled in by
 * ip_init(); ip_input() uses it to pick the protocol input routine.
 */
133u_char	ip_protox[IPPROTO_MAX];
/* Initial limit for the IP input queue; ip_init() copies it to ipintrq. */
134static int	ipqmaxlen = IFQ_MAXLEN;
135struct	in_ifaddrhead in_ifaddrhead; /* first inet address */
136SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW,
137    &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue");
138SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD,
139    &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue");
140
/* Counters bumped throughout this file (ips_total, ips_badsum, ...). */
141struct ipstat ipstat;
142SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD,
143    &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");
144
145/* Packet reassembly stuff */
/*
 * Reassembly queues live in IPREASS_NHASH (64) hash buckets.
 * IPREASS_HASH(x,y) builds one byte from the low nibble of x and the low
 * nibble of x's second byte, XORs it with y and masks the result down to
 * a bucket index.  ip_input() calls it with the source address and IP id.
 */
146#define IPREASS_NHASH_LOG2      6
147#define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
148#define IPREASS_HMASK           (IPREASS_NHASH - 1)
149#define IPREASS_HASH(x,y) \
150	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
151
/*
 * Bucket heads.  ip_init() links each head to itself, so an empty bucket
 * is one whose next/prev point back at the head.
 */
152static struct ipq ipq[IPREASS_NHASH];
153static int    nipq = 0;         /* total # of reass queues */
/* Set by ip_init() to nmbclusters/4; ip_input() evicts a queue once nipq exceeds it. */
154static int    maxnipq;
155const  int    ipintrq_present = 1;
156
157#ifdef IPCTL_DEFMTU
158SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
159    &ip_mtu, 0, "Default MTU");
160#endif
161
162#ifdef IPSTEALTH
163static int	ipstealth = 0;
164SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
165    &ipstealth, 0, "");
166#endif
167
168
169/* Firewall hooks */
/*
 * ip_input() calls through ip_fw_chk_ptr only when it is non-NULL and
 * fw_enable is non-zero; otherwise the packet bypasses the firewall.
 */
170ip_fw_chk_t *ip_fw_chk_ptr;
171ip_fw_ctl_t *ip_fw_ctl_ptr;
172int fw_enable = 1 ;
173
174#ifdef DUMMYNET
175ip_dn_ctl_t *ip_dn_ctl_ptr;
176#endif
177
178
179/*
180 * We need to save the IP options in case a protocol wants to respond
181 * to an incoming packet over the same route if the packet got here
182 * using IP source routing.  This allows connection establishment and
183 * maintenance when the remote end is on a network that is not known
184 * to us.
185 */
/* Reset to 0 by ip_input() before the options of each packet are processed. */
186static int	ip_nhops = 0;
187static	struct ip_srcrt {
188	struct	in_addr dst;			/* final destination */
189	char	nop;				/* one NOP to align */
190	char	srcopt[IPOPT_OFFSET + 1];	/* OPTVAL, OLEN and OFFSET */
191	struct	in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
192} ip_srcrt;
193
/*
 * Per-packet forwarding override shared with the firewall.  ip_input()
 * passes its address to the firewall check; while it is non-NULL (and
 * IPFIREWALL_FORWARD is defined) ip_input() skips the firewall and matches
 * local addresses against it instead of the packet's destination.
 */
194struct sockaddr_in *ip_fw_fwd_addr;
195
/* Forward declarations of the file-local routines. */
196static void	save_rte __P((u_char *, struct in_addr));
197static int	ip_dooptions __P((struct mbuf *));
198static void	ip_forward __P((struct mbuf *, int));
199static void	ip_freef __P((struct ipq *));
/* ip_reass() takes two extra in/out divert arguments when IPDIVERT is defined. */
200#ifdef IPDIVERT
201static struct	mbuf *ip_reass __P((struct mbuf *,
202			struct ipq *, struct ipq *, u_int32_t *, u_int16_t *));
203#else
204static struct	mbuf *ip_reass __P((struct mbuf *, struct ipq *, struct ipq *));
205#endif
206static struct	in_ifaddr *ip_rtaddr __P((struct in_addr));
207static void	ipintr __P((void));
208
209/*
210 * IP initialization: fill in IP protocol switch table.
211 * All protocols not implemented in kernel go to raw IP protocol handler.
212 */
213void
214ip_init()
215{
216	register struct ipprotosw *pr;
217	register int i;
218
219	TAILQ_INIT(&in_ifaddrhead);
220	pr = (struct ipprotosw *)pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
221	if (pr == 0)
222		panic("ip_init");
223	for (i = 0; i < IPPROTO_MAX; i++)
224		ip_protox[i] = pr - inetsw;
225	for (pr = (struct ipprotosw *)inetdomain.dom_protosw;
226	    pr < (struct ipprotosw *)inetdomain.dom_protoswNPROTOSW; pr++)
227		if (pr->pr_domain->dom_family == PF_INET &&
228		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
229			ip_protox[pr->pr_protocol] = pr - inetsw;
230
231	for (i = 0; i < IPREASS_NHASH; i++)
232	    ipq[i].next = ipq[i].prev = &ipq[i];
233
234	maxnipq = nmbclusters/4;
235
236	ip_id = time_second & 0xffff;
237	ipintrq.ifq_maxlen = ipqmaxlen;
238	mtx_init(&ipintrq.ifq_mtx, "ip_inq", MTX_DEF);
239
240	register_netisr(NETISR_IP, ipintr);
241}
242
/*
 * Scratch AF_INET sockaddr; ip_dooptions() stores an address in sin_addr
 * before handing it to the interface lookup routines.
 */
243static struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* NOTE(review): presumably the route cache used by ip_forward() -- confirm. */
244static struct	route ipforward_rt;
245
246/*
247 * Ip input routine.  Checksum and byte swap header.  If fragmented
248 * try to reassemble.  Process options.  Pass to next level.
249 */
250void
251ip_input(struct mbuf *m)
252{
253	struct ip *ip;
254	struct ipq *fp;
255	struct in_ifaddr *ia = NULL;
256	int    i, hlen;
257	u_short sum;
258	u_int16_t divert_cookie;		/* firewall cookie */
259#ifdef IPDIVERT
260	u_int32_t divert_info = 0;		/* packet divert/tee info */
261#endif
262	struct ip_fw_chain *rule = NULL;
263#ifdef PFIL_HOOKS
264	struct packet_filter_hook *pfh;
265	struct mbuf *m0;
266	int rv;
267#endif /* PFIL_HOOKS */
268
269#ifdef IPDIVERT
270	/* Get and reset firewall cookie */
271	divert_cookie = ip_divert_cookie;
272	ip_divert_cookie = 0;
273#else
274	divert_cookie = 0;
275#endif
276
277#if defined(IPFIREWALL) && defined(DUMMYNET)
278        /*
279         * dummynet packet are prepended a vestigial mbuf with
280         * m_type = MT_DUMMYNET and m_data pointing to the matching
281         * rule.
282         */
283        if (m->m_type == MT_DUMMYNET) {
284            rule = (struct ip_fw_chain *)(m->m_data) ;
285            m = m->m_next ;
286            ip = mtod(m, struct ip *);
287            hlen = IP_VHL_HL(ip->ip_vhl) << 2;
288            goto iphack ;
289        } else
290            rule = NULL ;
291#endif
292
293#ifdef	DIAGNOSTIC
294	if (m == NULL || (m->m_flags & M_PKTHDR) == 0)
295		panic("ip_input no HDR");
296#endif
297	ipstat.ips_total++;
298
299	if (m->m_pkthdr.len < sizeof(struct ip))
300		goto tooshort;
301
302	if (m->m_len < sizeof (struct ip) &&
303	    (m = m_pullup(m, sizeof (struct ip))) == 0) {
304		ipstat.ips_toosmall++;
305		return;
306	}
307	ip = mtod(m, struct ip *);
308
309	if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
310		ipstat.ips_badvers++;
311		goto bad;
312	}
313
314	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
315	if (hlen < sizeof(struct ip)) {	/* minimum header length */
316		ipstat.ips_badhlen++;
317		goto bad;
318	}
319	if (hlen > m->m_len) {
320		if ((m = m_pullup(m, hlen)) == 0) {
321			ipstat.ips_badhlen++;
322			return;
323		}
324		ip = mtod(m, struct ip *);
325	}
326	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
327		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
328	} else {
329		if (hlen == sizeof(struct ip)) {
330			sum = in_cksum_hdr(ip);
331		} else {
332			sum = in_cksum(m, hlen);
333		}
334	}
335	if (sum) {
336		ipstat.ips_badsum++;
337		goto bad;
338	}
339
340	/*
341	 * Convert fields to host representation.
342	 */
343	NTOHS(ip->ip_len);
344	if (ip->ip_len < hlen) {
345		ipstat.ips_badlen++;
346		goto bad;
347	}
348	NTOHS(ip->ip_off);
349
350	/*
351	 * Check that the amount of data in the buffers
352	 * is as at least much as the IP header would have us expect.
353	 * Trim mbufs if longer than we expect.
354	 * Drop packet if shorter than we expect.
355	 */
356	if (m->m_pkthdr.len < ip->ip_len) {
357tooshort:
358		ipstat.ips_tooshort++;
359		goto bad;
360	}
361	if (m->m_pkthdr.len > ip->ip_len) {
362		if (m->m_len == m->m_pkthdr.len) {
363			m->m_len = ip->ip_len;
364			m->m_pkthdr.len = ip->ip_len;
365		} else
366			m_adj(m, ip->ip_len - m->m_pkthdr.len);
367	}
368	/*
369	 * IpHack's section.
370	 * Right now when no processing on packet has done
371	 * and it is still fresh out of network we do our black
372	 * deals with it.
373	 * - Firewall: deny/allow/divert
374	 * - Xlate: translate packet's addr/port (NAT).
375	 * - Pipe: pass pkt through dummynet.
376	 * - Wrap: fake packet's addr/port <unimpl.>
377	 * - Encapsulate: put it in another IP and send out. <unimp.>
378 	 */
379
380#if defined(IPFIREWALL) && defined(DUMMYNET)
381iphack:
382#endif
383
384#ifdef PFIL_HOOKS
385	/*
386	 * Run through list of hooks for input packets.  If there are any
387	 * filters which require that additional packets in the flow are
388	 * not fast-forwarded, they must clear the M_CANFASTFWD flag.
389	 * Note that filters must _never_ set this flag, as another filter
390	 * in the list may have previously cleared it.
391	 */
392	m0 = m;
393	pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh);
394	for (; pfh; pfh = pfh->pfil_link.tqe_next)
395		if (pfh->pfil_func) {
396			rv = pfh->pfil_func(ip, hlen,
397					    m->m_pkthdr.rcvif, 0, &m0);
398			if (rv)
399				return;
400			m = m0;
401			if (m == NULL)
402				return;
403			ip = mtod(m, struct ip *);
404		}
405#endif /* PFIL_HOOKS */
406
407	if (fw_enable && ip_fw_chk_ptr) {
408#ifdef IPFIREWALL_FORWARD
409		/*
410		 * If we've been forwarded from the output side, then
411		 * skip the firewall a second time
412		 */
413		if (ip_fw_fwd_addr)
414			goto ours;
415#endif	/* IPFIREWALL_FORWARD */
416		/*
417		 * See the comment in ip_output for the return values
418		 * produced by the firewall.
419		 */
420		i = (*ip_fw_chk_ptr)(&ip,
421		    hlen, NULL, &divert_cookie, &m, &rule, &ip_fw_fwd_addr);
422		if (i & IP_FW_PORT_DENY_FLAG) { /* XXX new interface-denied */
423		    if (m)
424			m_freem(m);
425		    return ;
426		}
427		if (m == NULL) {	/* Packet discarded by firewall */
428		    static int __debug=10;
429		    if (__debug >0) {
430			printf("firewall returns NULL, please update!\n");
431			__debug-- ;
432		    }
433		    return;
434		}
435		if (i == 0 && ip_fw_fwd_addr == NULL)	/* common case */
436			goto pass;
437#ifdef DUMMYNET
438                if ((i & IP_FW_PORT_DYNT_FLAG) != 0) {
439                        /* Send packet to the appropriate pipe */
440                        dummynet_io(i&0xffff,DN_TO_IP_IN,m,NULL,NULL,0, rule,
441				    0);
442			return;
443		}
444#endif
445#ifdef IPDIVERT
446		if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
447			/* Divert or tee packet */
448			divert_info = i;
449			goto ours;
450		}
451#endif
452#ifdef IPFIREWALL_FORWARD
453		if (i == 0 && ip_fw_fwd_addr != NULL)
454			goto pass;
455#endif
456		/*
457		 * if we get here, the packet must be dropped
458		 */
459		m_freem(m);
460		return;
461	}
462pass:
463
464	/*
465	 * Process options and, if not destined for us,
466	 * ship it on.  ip_dooptions returns 1 when an
467	 * error was detected (causing an icmp message
468	 * to be sent and the original packet to be freed).
469	 */
470	ip_nhops = 0;		/* for source routed packets */
471	if (hlen > sizeof (struct ip) && ip_dooptions(m)) {
472#ifdef IPFIREWALL_FORWARD
473		ip_fw_fwd_addr = NULL;
474#endif
475		return;
476	}
477
478        /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
479         * matter if it is destined to another node, or whether it is
480         * a multicast one, RSVP wants it! and prevents it from being forwarded
481         * anywhere else. Also checks if the rsvp daemon is running before
482	 * grabbing the packet.
483         */
484	if (rsvp_on && ip->ip_p==IPPROTO_RSVP)
485		goto ours;
486
487	/*
488	 * Check our list of addresses, to see if the packet is for us.
489	 * If we don't have any addresses, assume any unicast packet
490	 * we receive might be for us (and let the upper layers deal
491	 * with it).
492	 */
493	if (TAILQ_EMPTY(&in_ifaddrhead) &&
494	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
495		goto ours;
496
497	for (ia = TAILQ_FIRST(&in_ifaddrhead); ia;
498					ia = TAILQ_NEXT(ia, ia_link)) {
499#define	satosin(sa)	((struct sockaddr_in *)(sa))
500
501#ifdef BOOTP_COMPAT
502		if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY)
503			goto ours;
504#endif
505#ifdef IPFIREWALL_FORWARD
506		/*
507		 * If the addr to forward to is one of ours, we pretend to
508		 * be the destination for this packet.
509		 */
510		if (ip_fw_fwd_addr == NULL) {
511			if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr)
512				goto ours;
513		} else if (IA_SIN(ia)->sin_addr.s_addr ==
514					 ip_fw_fwd_addr->sin_addr.s_addr)
515			goto ours;
516#else
517		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr)
518			goto ours;
519#endif
520		if (ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) {
521			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
522			    ip->ip_dst.s_addr)
523				goto ours;
524			if (ip->ip_dst.s_addr == ia->ia_netbroadcast.s_addr)
525				goto ours;
526		}
527	}
528	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
529		struct in_multi *inm;
530		if (ip_mrouter) {
531			/*
532			 * If we are acting as a multicast router, all
533			 * incoming multicast packets are passed to the
534			 * kernel-level multicast forwarding function.
535			 * The packet is returned (relatively) intact; if
536			 * ip_mforward() returns a non-zero value, the packet
537			 * must be discarded, else it may be accepted below.
538			 */
539			if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) {
540				ipstat.ips_cantforward++;
541				m_freem(m);
542				return;
543			}
544
545			/*
546			 * The process-level routing demon needs to receive
547			 * all multicast IGMP packets, whether or not this
548			 * host belongs to their destination groups.
549			 */
550			if (ip->ip_p == IPPROTO_IGMP)
551				goto ours;
552			ipstat.ips_forward++;
553		}
554		/*
555		 * See if we belong to the destination multicast group on the
556		 * arrival interface.
557		 */
558		IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
559		if (inm == NULL) {
560			ipstat.ips_notmember++;
561			m_freem(m);
562			return;
563		}
564		goto ours;
565	}
566	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
567		goto ours;
568	if (ip->ip_dst.s_addr == INADDR_ANY)
569		goto ours;
570
571#if defined(NFAITH) && 0 < NFAITH
572	/*
573	 * FAITH(Firewall Aided Internet Translator)
574	 */
575	if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
576		if (ip_keepfaith) {
577			if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
578				goto ours;
579		}
580		m_freem(m);
581		return;
582	}
583#endif
584	/*
585	 * Not for us; forward if possible and desirable.
586	 */
587	if (ipforwarding == 0) {
588		ipstat.ips_cantforward++;
589		m_freem(m);
590	} else
591		ip_forward(m, 0);
592#ifdef IPFIREWALL_FORWARD
593	ip_fw_fwd_addr = NULL;
594#endif
595	return;
596
597ours:
598	/* Count the packet in the ip address stats */
599	if (ia != NULL) {
600		ia->ia_ifa.if_ipackets++;
601		ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
602	}
603
604	/*
605	 * If offset or IP_MF are set, must reassemble.
606	 * Otherwise, nothing need be done.
607	 * (We could look in the reassembly queue to see
608	 * if the packet was previously fragmented,
609	 * but it's not worth the time; just let them time out.)
610	 */
611	if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
612
613		sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
614		/*
615		 * Look for queue of fragments
616		 * of this datagram.
617		 */
618		for (fp = ipq[sum].next; fp != &ipq[sum]; fp = fp->next)
619			if (ip->ip_id == fp->ipq_id &&
620			    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
621			    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
622			    ip->ip_p == fp->ipq_p)
623				goto found;
624
625		fp = 0;
626
627		/* check if there's a place for the new queue */
628		if (nipq > maxnipq) {
629		    /*
630		     * drop something from the tail of the current queue
631		     * before proceeding further
632		     */
633		    if (ipq[sum].prev == &ipq[sum]) {   /* gak */
634			for (i = 0; i < IPREASS_NHASH; i++) {
635			    if (ipq[i].prev != &ipq[i]) {
636				ip_freef(ipq[i].prev);
637				break;
638			    }
639			}
640		    } else
641			ip_freef(ipq[sum].prev);
642		}
643found:
644		/*
645		 * Adjust ip_len to not reflect header,
646		 * convert offset of this to bytes.
647		 */
648		ip->ip_len -= hlen;
649		if (ip->ip_off & IP_MF) {
650		        /*
651		         * Make sure that fragments have a data length
652			 * that's a non-zero multiple of 8 bytes.
653		         */
654			if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
655				ipstat.ips_toosmall++; /* XXX */
656				goto bad;
657			}
658			m->m_flags |= M_FRAG;
659		}
660		ip->ip_off <<= 3;
661
662		/*
663		 * Attempt reassembly; if it succeeds, proceed.
664		 */
665		ipstat.ips_fragments++;
666		m->m_pkthdr.header = ip;
667#ifdef IPDIVERT
668		m = ip_reass(m,
669		    fp, &ipq[sum], &divert_info, &divert_cookie);
670#else
671		m = ip_reass(m, fp, &ipq[sum]);
672#endif
673		if (m == 0) {
674#ifdef IPFIREWALL_FORWARD
675			ip_fw_fwd_addr = NULL;
676#endif
677			return;
678		}
679		ipstat.ips_reassembled++;
680		ip = mtod(m, struct ip *);
681		/* Get the header length of the reassembled packet */
682		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
683#ifdef IPDIVERT
684		/* Restore original checksum before diverting packet */
685		if (divert_info != 0) {
686			ip->ip_len += hlen;
687			HTONS(ip->ip_len);
688			HTONS(ip->ip_off);
689			ip->ip_sum = 0;
690			if (hlen == sizeof(struct ip))
691				ip->ip_sum = in_cksum_hdr(ip);
692			else
693				ip->ip_sum = in_cksum(m, hlen);
694			NTOHS(ip->ip_off);
695			NTOHS(ip->ip_len);
696			ip->ip_len -= hlen;
697		}
698#endif
699	} else
700		ip->ip_len -= hlen;
701
702#ifdef IPDIVERT
703	/*
704	 * Divert or tee packet to the divert protocol if required.
705	 *
706	 * If divert_info is zero then cookie should be too, so we shouldn't
707	 * need to clear them here.  Assume divert_packet() does so also.
708	 */
709	if (divert_info != 0) {
710		struct mbuf *clone = NULL;
711
712		/* Clone packet if we're doing a 'tee' */
713		if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0)
714			clone = m_dup(m, M_DONTWAIT);
715
716		/* Restore packet header fields to original values */
717		ip->ip_len += hlen;
718		HTONS(ip->ip_len);
719		HTONS(ip->ip_off);
720
721		/* Deliver packet to divert input routine */
722		ip_divert_cookie = divert_cookie;
723		divert_packet(m, 1, divert_info & 0xffff);
724		ipstat.ips_delivered++;
725
726		/* If 'tee', continue with original packet */
727		if (clone == NULL)
728			return;
729		m = clone;
730		ip = mtod(m, struct ip *);
731	}
732#endif
733
734	/*
735	 * Switch out to protocol's input routine.
736	 */
737	ipstat.ips_delivered++;
738    {
739	int off = hlen, nh = ip->ip_p;
740
741	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, off, nh);
742#ifdef	IPFIREWALL_FORWARD
743	ip_fw_fwd_addr = NULL;	/* tcp needed it */
744#endif
745	return;
746    }
747bad:
748#ifdef	IPFIREWALL_FORWARD
749	ip_fw_fwd_addr = NULL;
750#endif
751	m_freem(m);
752}
753
754/*
755 * IP software interrupt routine - to go away sometime soon
756 */
757static void
758ipintr(void)
759{
760	struct mbuf *m;
761
762	while (1) {
763		IF_DEQUEUE(&ipintrq, m);
764		if (m == 0)
765			return;
766		ip_input(m);
767	}
768}
769
770/*
771 * Take incoming datagram fragment and try to reassemble it into
772 * whole datagram.  If a chain for reassembly of this datagram already
773 * exists, then it is given as fp; otherwise have to make a chain.
774 *
775 * When IPDIVERT enabled, keep additional state with each packet that
776 * tells us if we need to divert or tee the packet we're building.
777 */
778
779static struct mbuf *
780#ifdef IPDIVERT
781ip_reass(m, fp, where, divinfo, divcookie)
782#else
783ip_reass(m, fp, where)
784#endif
785	register struct mbuf *m;
786	register struct ipq *fp;
787	struct   ipq    *where;
788#ifdef IPDIVERT
789	u_int32_t *divinfo;
790	u_int16_t *divcookie;
791#endif
792{
793	struct ip *ip = mtod(m, struct ip *);
794	register struct mbuf *p, *q, *nq;
795	struct mbuf *t;
796	int hlen = IP_VHL_HL(ip->ip_vhl) << 2;
797	int i, next;
798
799	/*
800	 * Presence of header sizes in mbufs
801	 * would confuse code below.
802	 */
803	m->m_data += hlen;
804	m->m_len -= hlen;
805
806	/*
807	 * If first fragment to arrive, create a reassembly queue.
808	 */
809	if (fp == 0) {
810		if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL)
811			goto dropfrag;
812		fp = mtod(t, struct ipq *);
813		insque(fp, where);
814		nipq++;
815		fp->ipq_ttl = IPFRAGTTL;
816		fp->ipq_p = ip->ip_p;
817		fp->ipq_id = ip->ip_id;
818		fp->ipq_src = ip->ip_src;
819		fp->ipq_dst = ip->ip_dst;
820		fp->ipq_frags = m;
821		m->m_nextpkt = NULL;
822#ifdef IPDIVERT
823		fp->ipq_div_info = 0;
824		fp->ipq_div_cookie = 0;
825#endif
826		goto inserted;
827	}
828
829#define GETIP(m)	((struct ip*)((m)->m_pkthdr.header))
830
831	/*
832	 * Find a segment which begins after this one does.
833	 */
834	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
835		if (GETIP(q)->ip_off > ip->ip_off)
836			break;
837
838	/*
839	 * If there is a preceding segment, it may provide some of
840	 * our data already.  If so, drop the data from the incoming
841	 * segment.  If it provides all of our data, drop us, otherwise
842	 * stick new segment in the proper place.
843	 *
844	 * If some of the data is dropped from the the preceding
845	 * segment, then it's checksum is invalidated.
846	 */
847	if (p) {
848		i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
849		if (i > 0) {
850			if (i >= ip->ip_len)
851				goto dropfrag;
852			m_adj(m, i);
853			m->m_pkthdr.csum_flags = 0;
854			ip->ip_off += i;
855			ip->ip_len -= i;
856		}
857		m->m_nextpkt = p->m_nextpkt;
858		p->m_nextpkt = m;
859	} else {
860		m->m_nextpkt = fp->ipq_frags;
861		fp->ipq_frags = m;
862	}
863
864	/*
865	 * While we overlap succeeding segments trim them or,
866	 * if they are completely covered, dequeue them.
867	 */
868	for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
869	     q = nq) {
870		i = (ip->ip_off + ip->ip_len) -
871		    GETIP(q)->ip_off;
872		if (i < GETIP(q)->ip_len) {
873			GETIP(q)->ip_len -= i;
874			GETIP(q)->ip_off += i;
875			m_adj(q, i);
876			q->m_pkthdr.csum_flags = 0;
877			break;
878		}
879		nq = q->m_nextpkt;
880		m->m_nextpkt = nq;
881		m_freem(q);
882	}
883
884inserted:
885
886#ifdef IPDIVERT
887	/*
888	 * Transfer firewall instructions to the fragment structure.
889	 * Any fragment diverting causes the whole packet to divert.
890	 */
891	fp->ipq_div_info = *divinfo;
892	fp->ipq_div_cookie = *divcookie;
893	*divinfo = 0;
894	*divcookie = 0;
895#endif
896
897	/*
898	 * Check for complete reassembly.
899	 */
900	next = 0;
901	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
902		if (GETIP(q)->ip_off != next)
903			return (0);
904		next += GETIP(q)->ip_len;
905	}
906	/* Make sure the last packet didn't have the IP_MF flag */
907	if (p->m_flags & M_FRAG)
908		return (0);
909
910	/*
911	 * Reassembly is complete.  Make sure the packet is a sane size.
912	 */
913	q = fp->ipq_frags;
914	ip = GETIP(q);
915	if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
916		ipstat.ips_toolong++;
917		ip_freef(fp);
918		return (0);
919	}
920
921	/*
922	 * Concatenate fragments.
923	 */
924	m = q;
925	t = m->m_next;
926	m->m_next = 0;
927	m_cat(m, t);
928	nq = q->m_nextpkt;
929	q->m_nextpkt = 0;
930	for (q = nq; q != NULL; q = nq) {
931		nq = q->m_nextpkt;
932		q->m_nextpkt = NULL;
933		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
934		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
935		m_cat(m, q);
936	}
937
938#ifdef IPDIVERT
939	/*
940	 * Extract firewall instructions from the fragment structure.
941	 */
942	*divinfo = fp->ipq_div_info;
943	*divcookie = fp->ipq_div_cookie;
944#endif
945
946	/*
947	 * Create header for new ip packet by
948	 * modifying header of first packet;
949	 * dequeue and discard fragment reassembly header.
950	 * Make header visible.
951	 */
952	ip->ip_len = next;
953	ip->ip_src = fp->ipq_src;
954	ip->ip_dst = fp->ipq_dst;
955	remque(fp);
956	nipq--;
957	(void) m_free(dtom(fp));
958	m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
959	m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
960	/* some debugging cruft by sklower, below, will go away soon */
961	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
962		register int plen = 0;
963		for (t = m; t; t = t->m_next)
964			plen += t->m_len;
965		m->m_pkthdr.len = plen;
966	}
967	return (m);
968
969dropfrag:
970#ifdef IPDIVERT
971	*divinfo = 0;
972	*divcookie = 0;
973#endif
974	ipstat.ips_fragdropped++;
975	m_freem(m);
976	return (0);
977
978#undef GETIP
979}
980
981/*
982 * Free a fragment reassembly header and all
983 * associated datagrams.
984 */
985static void
986ip_freef(fp)
987	struct ipq *fp;
988{
989	register struct mbuf *q;
990
991	while (fp->ipq_frags) {
992		q = fp->ipq_frags;
993		fp->ipq_frags = q->m_nextpkt;
994		m_freem(q);
995	}
996	remque(fp);
997	(void) m_free(dtom(fp));
998	nipq--;
999}
1000
1001/*
1002 * IP timer processing;
1003 * if a timer expires on a reassembly
1004 * queue, discard it.
1005 */
1006void
1007ip_slowtimo()
1008{
1009	register struct ipq *fp;
1010	int s = splnet();
1011	int i;
1012
1013	for (i = 0; i < IPREASS_NHASH; i++) {
1014		fp = ipq[i].next;
1015		if (fp == 0)
1016			continue;
1017		while (fp != &ipq[i]) {
1018			--fp->ipq_ttl;
1019			fp = fp->next;
1020			if (fp->prev->ipq_ttl == 0) {
1021				ipstat.ips_fragtimeout++;
1022				ip_freef(fp->prev);
1023			}
1024		}
1025	}
1026	ipflow_slowtimo();
1027	splx(s);
1028}
1029
1030/*
1031 * Drain off all datagram fragments.
1032 */
1033void
1034ip_drain()
1035{
1036	int     i;
1037
1038	for (i = 0; i < IPREASS_NHASH; i++) {
1039		while (ipq[i].next != &ipq[i]) {
1040			ipstat.ips_fragdropped++;
1041			ip_freef(ipq[i].next);
1042		}
1043	}
1044	in_rtqdrain();
1045}
1046
1047/*
1048 * Do option processing on a datagram,
1049 * possibly discarding it if bad options are encountered,
1050 * or forwarding it if source-routed.
1051 * Returns 1 if packet has been forwarded/freed,
1052 * 0 if the packet should be processed further.
1053 */
1054static int
1055ip_dooptions(m)
1056	struct mbuf *m;
1057{
1058	register struct ip *ip = mtod(m, struct ip *);
1059	register u_char *cp;
1060	register struct ip_timestamp *ipt;
1061	register struct in_ifaddr *ia;
1062	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
1063	struct in_addr *sin, dst;
1064	n_time ntime;
1065
1066	dst = ip->ip_dst;
1067	cp = (u_char *)(ip + 1);
1068	cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
1069	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1070		opt = cp[IPOPT_OPTVAL];
1071		if (opt == IPOPT_EOL)
1072			break;
1073		if (opt == IPOPT_NOP)
1074			optlen = 1;
1075		else {
1076			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
1077				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1078				goto bad;
1079			}
1080			optlen = cp[IPOPT_OLEN];
1081			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
1082				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1083				goto bad;
1084			}
1085		}
1086		switch (opt) {
1087
1088		default:
1089			break;
1090
1091		/*
1092		 * Source routing with record.
1093		 * Find interface with current destination address.
1094		 * If none on this machine then drop if strictly routed,
1095		 * or do nothing if loosely routed.
1096		 * Record interface address and bring up next address
1097		 * component.  If strictly routed make sure next
1098		 * address is on directly accessible net.
1099		 */
1100		case IPOPT_LSRR:
1101		case IPOPT_SSRR:
1102			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1103				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1104				goto bad;
1105			}
1106			ipaddr.sin_addr = ip->ip_dst;
1107			ia = (struct in_ifaddr *)
1108				ifa_ifwithaddr((struct sockaddr *)&ipaddr);
1109			if (ia == 0) {
1110				if (opt == IPOPT_SSRR) {
1111					type = ICMP_UNREACH;
1112					code = ICMP_UNREACH_SRCFAIL;
1113					goto bad;
1114				}
1115				if (!ip_dosourceroute)
1116					goto nosourcerouting;
1117				/*
1118				 * Loose routing, and not at next destination
1119				 * yet; nothing to do except forward.
1120				 */
1121				break;
1122			}
1123			off--;			/* 0 origin */
1124			if (off > optlen - (int)sizeof(struct in_addr)) {
1125				/*
1126				 * End of source route.  Should be for us.
1127				 */
1128				if (!ip_acceptsourceroute)
1129					goto nosourcerouting;
1130				save_rte(cp, ip->ip_src);
1131				break;
1132			}
1133
1134			if (!ip_dosourceroute) {
1135				if (ipforwarding) {
1136					char buf[16]; /* aaa.bbb.ccc.ddd\0 */
1137					/*
1138					 * Acting as a router, so generate ICMP
1139					 */
1140nosourcerouting:
1141					strcpy(buf, inet_ntoa(ip->ip_dst));
1142					log(LOG_WARNING,
1143					    "attempted source route from %s to %s\n",
1144					    inet_ntoa(ip->ip_src), buf);
1145					type = ICMP_UNREACH;
1146					code = ICMP_UNREACH_SRCFAIL;
1147					goto bad;
1148				} else {
1149					/*
1150					 * Not acting as a router, so silently drop.
1151					 */
1152					ipstat.ips_cantforward++;
1153					m_freem(m);
1154					return (1);
1155				}
1156			}
1157
1158			/*
1159			 * locate outgoing interface
1160			 */
1161			(void)memcpy(&ipaddr.sin_addr, cp + off,
1162			    sizeof(ipaddr.sin_addr));
1163
1164			if (opt == IPOPT_SSRR) {
1165#define	INA	struct in_ifaddr *
1166#define	SA	struct sockaddr *
1167			    if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0)
1168				ia = (INA)ifa_ifwithnet((SA)&ipaddr);
1169			} else
1170				ia = ip_rtaddr(ipaddr.sin_addr);
1171			if (ia == 0) {
1172				type = ICMP_UNREACH;
1173				code = ICMP_UNREACH_SRCFAIL;
1174				goto bad;
1175			}
1176			ip->ip_dst = ipaddr.sin_addr;
1177			(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
1178			    sizeof(struct in_addr));
1179			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1180			/*
1181			 * Let ip_intr's mcast routing check handle mcast pkts
1182			 */
1183			forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
1184			break;
1185
1186		case IPOPT_RR:
1187			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1188				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1189				goto bad;
1190			}
1191			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1192				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1193				goto bad;
1194			}
1195			/*
1196			 * If no space remains, ignore.
1197			 */
1198			off--;			/* 0 origin */
1199			if (off > optlen - (int)sizeof(struct in_addr))
1200				break;
1201			(void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
1202			    sizeof(ipaddr.sin_addr));
1203			/*
1204			 * locate outgoing interface; if we're the destination,
1205			 * use the incoming interface (should be same).
1206			 */
1207			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 &&
1208			    (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) {
1209				type = ICMP_UNREACH;
1210				code = ICMP_UNREACH_HOST;
1211				goto bad;
1212			}
1213			(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
1214			    sizeof(struct in_addr));
1215			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1216			break;
1217
1218		case IPOPT_TS:
1219			code = cp - (u_char *)ip;
1220			ipt = (struct ip_timestamp *)cp;
1221			if (ipt->ipt_len < 5)
1222				goto bad;
1223			if (ipt->ipt_ptr >
1224			    ipt->ipt_len - (int)sizeof(int32_t)) {
1225				if (++ipt->ipt_oflw == 0)
1226					goto bad;
1227				break;
1228			}
1229			sin = (struct in_addr *)(cp + ipt->ipt_ptr - 1);
1230			switch (ipt->ipt_flg) {
1231
1232			case IPOPT_TS_TSONLY:
1233				break;
1234
1235			case IPOPT_TS_TSANDADDR:
1236				if (ipt->ipt_ptr - 1 + sizeof(n_time) +
1237				    sizeof(struct in_addr) > ipt->ipt_len)
1238					goto bad;
1239				ipaddr.sin_addr = dst;
1240				ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
1241							    m->m_pkthdr.rcvif);
1242				if (ia == 0)
1243					continue;
1244				(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
1245				    sizeof(struct in_addr));
1246				ipt->ipt_ptr += sizeof(struct in_addr);
1247				break;
1248
1249			case IPOPT_TS_PRESPEC:
1250				if (ipt->ipt_ptr - 1 + sizeof(n_time) +
1251				    sizeof(struct in_addr) > ipt->ipt_len)
1252					goto bad;
1253				(void)memcpy(&ipaddr.sin_addr, sin,
1254				    sizeof(struct in_addr));
1255				if (ifa_ifwithaddr((SA)&ipaddr) == 0)
1256					continue;
1257				ipt->ipt_ptr += sizeof(struct in_addr);
1258				break;
1259
1260			default:
1261				goto bad;
1262			}
1263			ntime = iptime();
1264			(void)memcpy(cp + ipt->ipt_ptr - 1, &ntime,
1265			    sizeof(n_time));
1266			ipt->ipt_ptr += sizeof(n_time);
1267		}
1268	}
1269	if (forward && ipforwarding) {
1270		ip_forward(m, 1);
1271		return (1);
1272	}
1273	return (0);
1274bad:
1275	icmp_error(m, type, code, 0, 0);
1276	ipstat.ips_badoptions++;
1277	return (1);
1278}
1279
1280/*
1281 * Given address of next destination (final or next hop),
1282 * return internet address info of interface to be used to get there.
1283 */
1284static struct in_ifaddr *
1285ip_rtaddr(dst)
1286	 struct in_addr dst;
1287{
1288	register struct sockaddr_in *sin;
1289
1290	sin = (struct sockaddr_in *) &ipforward_rt.ro_dst;
1291
1292	if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) {
1293		if (ipforward_rt.ro_rt) {
1294			RTFREE(ipforward_rt.ro_rt);
1295			ipforward_rt.ro_rt = 0;
1296		}
1297		sin->sin_family = AF_INET;
1298		sin->sin_len = sizeof(*sin);
1299		sin->sin_addr = dst;
1300
1301		rtalloc_ign(&ipforward_rt, RTF_PRCLONING);
1302	}
1303	if (ipforward_rt.ro_rt == 0)
1304		return ((struct in_ifaddr *)0);
1305	return ((struct in_ifaddr *) ipforward_rt.ro_rt->rt_ifa);
1306}
1307
1308/*
1309 * Save incoming source route for use in replies,
1310 * to be picked up later by ip_srcroute if the receiver is interested.
1311 */
1312void
1313save_rte(option, dst)
1314	u_char *option;
1315	struct in_addr dst;
1316{
1317	unsigned olen;
1318
1319	olen = option[IPOPT_OLEN];
1320#ifdef DIAGNOSTIC
1321	if (ipprintfs)
1322		printf("save_rte: olen %d\n", olen);
1323#endif
1324	if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1325		return;
1326	bcopy(option, ip_srcrt.srcopt, olen);
1327	ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1328	ip_srcrt.dst = dst;
1329}
1330
1331/*
1332 * Retrieve incoming source route for use in replies,
1333 * in the same form used by setsockopt.
1334 * The first hop is placed before the options, will be removed later.
1335 */
1336struct mbuf *
1337ip_srcroute()
1338{
1339	register struct in_addr *p, *q;
1340	register struct mbuf *m;
1341
1342	if (ip_nhops == 0)
1343		return ((struct mbuf *)0);
1344	m = m_get(M_DONTWAIT, MT_HEADER);
1345	if (m == 0)
1346		return ((struct mbuf *)0);
1347
1348#define OPTSIZ	(sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1349
1350	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1351	m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1352	    OPTSIZ;
1353#ifdef DIAGNOSTIC
1354	if (ipprintfs)
1355		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1356#endif
1357
1358	/*
1359	 * First save first hop for return route
1360	 */
1361	p = &ip_srcrt.route[ip_nhops - 1];
1362	*(mtod(m, struct in_addr *)) = *p--;
1363#ifdef DIAGNOSTIC
1364	if (ipprintfs)
1365		printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr));
1366#endif
1367
1368	/*
1369	 * Copy option fields and padding (nop) to mbuf.
1370	 */
1371	ip_srcrt.nop = IPOPT_NOP;
1372	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1373	(void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
1374	    &ip_srcrt.nop, OPTSIZ);
1375	q = (struct in_addr *)(mtod(m, caddr_t) +
1376	    sizeof(struct in_addr) + OPTSIZ);
1377#undef OPTSIZ
1378	/*
1379	 * Record return path as an IP source route,
1380	 * reversing the path (pointers are now aligned).
1381	 */
1382	while (p >= ip_srcrt.route) {
1383#ifdef DIAGNOSTIC
1384		if (ipprintfs)
1385			printf(" %lx", (u_long)ntohl(q->s_addr));
1386#endif
1387		*q++ = *p--;
1388	}
1389	/*
1390	 * Last hop goes to final destination.
1391	 */
1392	*q = ip_srcrt.dst;
1393#ifdef DIAGNOSTIC
1394	if (ipprintfs)
1395		printf(" %lx\n", (u_long)ntohl(q->s_addr));
1396#endif
1397	return (m);
1398}
1399
1400/*
1401 * Strip out IP options, at higher
1402 * level protocol in the kernel.
1403 * Second argument is buffer to which options
1404 * will be moved, and return value is their length.
1405 * XXX should be deleted; last arg currently ignored.
1406 */
1407void
1408ip_stripoptions(m, mopt)
1409	register struct mbuf *m;
1410	struct mbuf *mopt;
1411{
1412	register int i;
1413	struct ip *ip = mtod(m, struct ip *);
1414	register caddr_t opts;
1415	int olen;
1416
1417	olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
1418	opts = (caddr_t)(ip + 1);
1419	i = m->m_len - (sizeof (struct ip) + olen);
1420	bcopy(opts + olen, opts, (unsigned)i);
1421	m->m_len -= olen;
1422	if (m->m_flags & M_PKTHDR)
1423		m->m_pkthdr.len -= olen;
1424	ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
1425}
1426
1427u_char inetctlerrmap[PRC_NCMDS] = {
1428	0,		0,		0,		0,
1429	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
1430	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
1431	EMSGSIZE,	EHOSTUNREACH,	0,		0,
1432	0,		0,		0,		0,
1433	ENOPROTOOPT
1434};
1435
1436/*
1437 * Forward a packet.  If some error occurs return the sender
1438 * an icmp packet.  Note we can't always generate a meaningful
1439 * icmp message because icmp doesn't have a large enough repertoire
1440 * of codes and types.
1441 *
1442 * If not forwarding, just drop the packet.  This could be confusing
1443 * if ipforwarding was zero but some routing protocol was advancing
1444 * us as a gateway to somewhere.  However, we must let the routing
1445 * protocol deal with that.
1446 *
1447 * The srcrt parameter indicates whether the packet is being forwarded
1448 * via a source route.
1449 */
1450static void
1451ip_forward(m, srcrt)
1452	struct mbuf *m;
1453	int srcrt;
1454{
1455	register struct ip *ip = mtod(m, struct ip *);
1456	register struct sockaddr_in *sin;
1457	register struct rtentry *rt;
1458	int error, type = 0, code = 0;
1459	struct mbuf *mcopy;
1460	n_long dest;
1461	struct ifnet *destifp;
1462#ifdef IPSEC
1463	struct ifnet dummyifp;
1464#endif
1465
1466	dest = 0;
1467#ifdef DIAGNOSTIC
1468	if (ipprintfs)
1469		printf("forward: src %lx dst %lx ttl %x\n",
1470		    (u_long)ip->ip_src.s_addr, (u_long)ip->ip_dst.s_addr,
1471		    ip->ip_ttl);
1472#endif
1473
1474
1475	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
1476		ipstat.ips_cantforward++;
1477		m_freem(m);
1478		return;
1479	}
1480#ifdef IPSTEALTH
1481	if (!ipstealth) {
1482#endif
1483		if (ip->ip_ttl <= IPTTLDEC) {
1484			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
1485			    dest, 0);
1486			return;
1487		}
1488#ifdef IPSTEALTH
1489	}
1490#endif
1491
1492	sin = (struct sockaddr_in *)&ipforward_rt.ro_dst;
1493	if ((rt = ipforward_rt.ro_rt) == 0 ||
1494	    ip->ip_dst.s_addr != sin->sin_addr.s_addr) {
1495		if (ipforward_rt.ro_rt) {
1496			RTFREE(ipforward_rt.ro_rt);
1497			ipforward_rt.ro_rt = 0;
1498		}
1499		sin->sin_family = AF_INET;
1500		sin->sin_len = sizeof(*sin);
1501		sin->sin_addr = ip->ip_dst;
1502
1503		rtalloc_ign(&ipforward_rt, RTF_PRCLONING);
1504		if (ipforward_rt.ro_rt == 0) {
1505			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
1506			return;
1507		}
1508		rt = ipforward_rt.ro_rt;
1509	}
1510
1511	/*
1512	 * Save at most 64 bytes of the packet in case
1513	 * we need to generate an ICMP message to the src.
1514	 */
1515	mcopy = m_copy(m, 0, imin((int)ip->ip_len, 64));
1516	if (mcopy && (mcopy->m_flags & M_EXT))
1517		m_copydata(mcopy, 0, sizeof(struct ip), mtod(mcopy, caddr_t));
1518
1519#ifdef IPSTEALTH
1520	if (!ipstealth) {
1521#endif
1522		ip->ip_ttl -= IPTTLDEC;
1523#ifdef IPSTEALTH
1524	}
1525#endif
1526
1527	/*
1528	 * If forwarding packet using same interface that it came in on,
1529	 * perhaps should send a redirect to sender to shortcut a hop.
1530	 * Only send redirect if source is sending directly to us,
1531	 * and if packet was not source routed (or has any options).
1532	 * Also, don't send redirect if forwarding using a default route
1533	 * or a route modified by a redirect.
1534	 */
1535#define	satosin(sa)	((struct sockaddr_in *)(sa))
1536	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1537	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1538	    satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
1539	    ipsendredirects && !srcrt) {
1540#define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
1541		u_long src = ntohl(ip->ip_src.s_addr);
1542
1543		if (RTA(rt) &&
1544		    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
1545		    if (rt->rt_flags & RTF_GATEWAY)
1546			dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1547		    else
1548			dest = ip->ip_dst.s_addr;
1549		    /* Router requirements says to only send host redirects */
1550		    type = ICMP_REDIRECT;
1551		    code = ICMP_REDIRECT_HOST;
1552#ifdef DIAGNOSTIC
1553		    if (ipprintfs)
1554		        printf("redirect (%d) to %lx\n", code, (u_long)dest);
1555#endif
1556		}
1557	}
1558
1559	error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
1560			  IP_FORWARDING, 0);
1561	if (error)
1562		ipstat.ips_cantforward++;
1563	else {
1564		ipstat.ips_forward++;
1565		if (type)
1566			ipstat.ips_redirectsent++;
1567		else {
1568			if (mcopy) {
1569				ipflow_create(&ipforward_rt, mcopy);
1570				m_freem(mcopy);
1571			}
1572			return;
1573		}
1574	}
1575	if (mcopy == NULL)
1576		return;
1577	destifp = NULL;
1578
1579	switch (error) {
1580
1581	case 0:				/* forwarded, but need redirect */
1582		/* type, code set above */
1583		break;
1584
1585	case ENETUNREACH:		/* shouldn't happen, checked above */
1586	case EHOSTUNREACH:
1587	case ENETDOWN:
1588	case EHOSTDOWN:
1589	default:
1590		type = ICMP_UNREACH;
1591		code = ICMP_UNREACH_HOST;
1592		break;
1593
1594	case EMSGSIZE:
1595		type = ICMP_UNREACH;
1596		code = ICMP_UNREACH_NEEDFRAG;
1597#ifndef IPSEC
1598		if (ipforward_rt.ro_rt)
1599			destifp = ipforward_rt.ro_rt->rt_ifp;
1600#else
1601		/*
1602		 * If the packet is routed over IPsec tunnel, tell the
1603		 * originator the tunnel MTU.
1604		 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
1605		 * XXX quickhack!!!
1606		 */
1607		if (ipforward_rt.ro_rt) {
1608			struct secpolicy *sp = NULL;
1609			int ipsecerror;
1610			int ipsechdr;
1611			struct route *ro;
1612
1613			sp = ipsec4_getpolicybyaddr(mcopy,
1614						    IPSEC_DIR_OUTBOUND,
1615			                            IP_FORWARDING,
1616			                            &ipsecerror);
1617
1618			if (sp == NULL)
1619				destifp = ipforward_rt.ro_rt->rt_ifp;
1620			else {
1621				/* count IPsec header size */
1622				ipsechdr = ipsec4_hdrsiz(mcopy,
1623							 IPSEC_DIR_OUTBOUND,
1624							 NULL);
1625
1626				/*
1627				 * find the correct route for outer IPv4
1628				 * header, compute tunnel MTU.
1629				 *
1630				 * XXX BUG ALERT
1631				 * The "dummyifp" code relies upon the fact
1632				 * that icmp_error() touches only ifp->if_mtu.
1633				 */
1634				/*XXX*/
1635				destifp = NULL;
1636				if (sp->req != NULL
1637				 && sp->req->sav != NULL
1638				 && sp->req->sav->sah != NULL) {
1639					ro = &sp->req->sav->sah->sa_route;
1640					if (ro->ro_rt && ro->ro_rt->rt_ifp) {
1641						dummyifp.if_mtu =
1642						    ro->ro_rt->rt_ifp->if_mtu;
1643						dummyifp.if_mtu -= ipsechdr;
1644						destifp = &dummyifp;
1645					}
1646				}
1647
1648				key_freesp(sp);
1649			}
1650		}
1651#endif /*IPSEC*/
1652		ipstat.ips_cantfrag++;
1653		break;
1654
1655	case ENOBUFS:
1656		type = ICMP_SOURCEQUENCH;
1657		code = 0;
1658		break;
1659
1660	case EACCES:			/* ipfw denied packet */
1661		m_freem(mcopy);
1662		return;
1663	}
1664	if (mcopy->m_flags & M_EXT)
1665		m_copyback(mcopy, 0, sizeof(struct ip), mtod(mcopy, caddr_t));
1666	icmp_error(mcopy, type, code, dest, destifp);
1667}
1668
1669void
1670ip_savecontrol(inp, mp, ip, m)
1671	register struct inpcb *inp;
1672	register struct mbuf **mp;
1673	register struct ip *ip;
1674	register struct mbuf *m;
1675{
1676	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
1677		struct timeval tv;
1678
1679		microtime(&tv);
1680		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
1681			SCM_TIMESTAMP, SOL_SOCKET);
1682		if (*mp)
1683			mp = &(*mp)->m_next;
1684	}
1685	if (inp->inp_flags & INP_RECVDSTADDR) {
1686		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
1687		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
1688		if (*mp)
1689			mp = &(*mp)->m_next;
1690	}
1691#ifdef notyet
1692	/* XXX
1693	 * Moving these out of udp_input() made them even more broken
1694	 * than they already were.
1695	 */
1696	/* options were tossed already */
1697	if (inp->inp_flags & INP_RECVOPTS) {
1698		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
1699		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
1700		if (*mp)
1701			mp = &(*mp)->m_next;
1702	}
1703	/* ip_srcroute doesn't do what we want here, need to fix */
1704	if (inp->inp_flags & INP_RECVRETOPTS) {
1705		*mp = sbcreatecontrol((caddr_t) ip_srcroute(),
1706		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
1707		if (*mp)
1708			mp = &(*mp)->m_next;
1709	}
1710#endif
1711	if (inp->inp_flags & INP_RECVIF) {
1712		struct ifnet *ifp;
1713		struct sdlbuf {
1714			struct sockaddr_dl sdl;
1715			u_char	pad[32];
1716		} sdlbuf;
1717		struct sockaddr_dl *sdp;
1718		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
1719
1720		if (((ifp = m->m_pkthdr.rcvif))
1721		&& ( ifp->if_index && (ifp->if_index <= if_index))) {
1722			sdp = (struct sockaddr_dl *)(ifnet_addrs
1723					[ifp->if_index - 1]->ifa_addr);
1724			/*
1725			 * Change our mind and don't try copy.
1726			 */
1727			if ((sdp->sdl_family != AF_LINK)
1728			|| (sdp->sdl_len > sizeof(sdlbuf))) {
1729				goto makedummy;
1730			}
1731			bcopy(sdp, sdl2, sdp->sdl_len);
1732		} else {
1733makedummy:
1734			sdl2->sdl_len
1735				= offsetof(struct sockaddr_dl, sdl_data[0]);
1736			sdl2->sdl_family = AF_LINK;
1737			sdl2->sdl_index = 0;
1738			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
1739		}
1740		*mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
1741			IP_RECVIF, IPPROTO_IP);
1742		if (*mp)
1743			mp = &(*mp)->m_next;
1744	}
1745}
1746
1747int
1748ip_rsvp_init(struct socket *so)
1749{
1750	if (so->so_type != SOCK_RAW ||
1751	    so->so_proto->pr_protocol != IPPROTO_RSVP)
1752	  return EOPNOTSUPP;
1753
1754	if (ip_rsvpd != NULL)
1755	  return EADDRINUSE;
1756
1757	ip_rsvpd = so;
1758	/*
1759	 * This may seem silly, but we need to be sure we don't over-increment
1760	 * the RSVP counter, in case something slips up.
1761	 */
1762	if (!ip_rsvp_on) {
1763		ip_rsvp_on = 1;
1764		rsvp_on++;
1765	}
1766
1767	return 0;
1768}
1769
1770int
1771ip_rsvp_done(void)
1772{
1773	ip_rsvpd = NULL;
1774	/*
1775	 * This may seem silly, but we need to be sure we don't over-decrement
1776	 * the RSVP counter, in case something slips up.
1777	 */
1778	if (ip_rsvp_on) {
1779		ip_rsvp_on = 0;
1780		rsvp_on--;
1781	}
1782	return 0;
1783}
1784