ip_input.c revision 11042:2d6e217af1b4
1203954Srdivacky/*
2203954Srdivacky * CDDL HEADER START
3203954Srdivacky *
4203954Srdivacky * The contents of this file are subject to the terms of the
5203954Srdivacky * Common Development and Distribution License (the "License").
6203954Srdivacky * You may not use this file except in compliance with the License.
7203954Srdivacky *
8203954Srdivacky * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9203954Srdivacky * or http://www.opensolaris.org/os/licensing.
10203954Srdivacky * See the License for the specific language governing permissions
11203954Srdivacky * and limitations under the License.
12221345Sdim *
13203954Srdivacky * When distributing Covered Code, include this CDDL HEADER in each
14221345Sdim * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15204642Srdivacky * If applicable, add the following below this CDDL HEADER, with the
16203954Srdivacky * fields enclosed by brackets "[]" replaced with your own identifying
17204642Srdivacky * information: Portions Copyright [yyyy] [name of copyright owner]
18203954Srdivacky *
19203954Srdivacky * CDDL HEADER END
20204642Srdivacky */
21204642Srdivacky
22204642Srdivacky/*
23204642Srdivacky * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24204642Srdivacky * Use is subject to license terms.
25204642Srdivacky */
26204642Srdivacky/* Copyright (c) 1990 Mentat Inc. */
27204642Srdivacky
28204642Srdivacky#include <sys/types.h>
29204642Srdivacky#include <sys/stream.h>
30218893Sdim#include <sys/dlpi.h>
31204642Srdivacky#include <sys/stropts.h>
32204642Srdivacky#include <sys/sysmacros.h>
33204642Srdivacky#include <sys/strsubr.h>
34204642Srdivacky#include <sys/strlog.h>
35218893Sdim#include <sys/strsun.h>
36204642Srdivacky#include <sys/zone.h>
37204642Srdivacky#define	_SUN_TPI_VERSION 2
38204642Srdivacky#include <sys/tihdr.h>
39204642Srdivacky#include <sys/xti_inet.h>
40204642Srdivacky#include <sys/ddi.h>
41204642Srdivacky#include <sys/sunddi.h>
42204642Srdivacky#include <sys/cmn_err.h>
43204642Srdivacky#include <sys/debug.h>
44204642Srdivacky#include <sys/kobj.h>
45204642Srdivacky#include <sys/modctl.h>
46204642Srdivacky#include <sys/atomic.h>
47204642Srdivacky#include <sys/policy.h>
48204642Srdivacky#include <sys/priv.h>
49203954Srdivacky
50203954Srdivacky#include <sys/systm.h>
51203954Srdivacky#include <sys/param.h>
52203954Srdivacky#include <sys/kmem.h>
53218893Sdim#include <sys/sdt.h>
54203954Srdivacky#include <sys/socket.h>
55203954Srdivacky#include <sys/vtrace.h>
56203954Srdivacky#include <sys/isa_defs.h>
57203954Srdivacky#include <sys/mac.h>
58218893Sdim#include <net/if.h>
59203954Srdivacky#include <net/if_arp.h>
60203954Srdivacky#include <net/route.h>
61203954Srdivacky#include <sys/sockio.h>
62203954Srdivacky#include <netinet/in.h>
63218893Sdim#include <net/if_dl.h>
64204642Srdivacky
65204642Srdivacky#include <inet/common.h>
66204642Srdivacky#include <inet/mi.h>
67203954Srdivacky#include <inet/mib2.h>
68218893Sdim#include <inet/nd.h>
69204642Srdivacky#include <inet/arp.h>
70204642Srdivacky#include <inet/snmpcom.h>
71204642Srdivacky#include <inet/kstatcom.h>
72204642Srdivacky
73218893Sdim#include <netinet/igmp_var.h>
74218893Sdim#include <netinet/ip6.h>
75218893Sdim#include <netinet/icmp6.h>
76218893Sdim#include <netinet/sctp.h>
77204792Srdivacky
78204792Srdivacky#include <inet/ip.h>
79204792Srdivacky#include <inet/ip_impl.h>
80204792Srdivacky#include <inet/ip6.h>
81204792Srdivacky#include <inet/ip6_asp.h>
82204792Srdivacky#include <inet/optcom.h>
83204792Srdivacky#include <inet/tcp.h>
84218893Sdim#include <inet/tcp_impl.h>
85204642Srdivacky#include <inet/ip_multi.h>
86204642Srdivacky#include <inet/ip_if.h>
87204642Srdivacky#include <inet/ip_ire.h>
88204642Srdivacky#include <inet/ip_ftable.h>
89218893Sdim#include <inet/ip_rts.h>
90204642Srdivacky#include <inet/ip_ndp.h>
91204642Srdivacky#include <inet/ip_listutils.h>
92218893Sdim#include <netinet/igmp.h>
93204642Srdivacky#include <netinet/ip_mroute.h>
94204642Srdivacky#include <inet/ipp_common.h>
95204642Srdivacky
96221345Sdim#include <net/pfkeyv2.h>
97221345Sdim#include <inet/sadb.h>
98221345Sdim#include <inet/ipsec_impl.h>
99221345Sdim#include <inet/ipdrop.h>
100203954Srdivacky#include <inet/ip_netinfo.h>
101203954Srdivacky#include <inet/ilb_ip.h>
102218893Sdim#include <sys/squeue_impl.h>
103203954Srdivacky#include <sys/squeue.h>
104203954Srdivacky
105203954Srdivacky#include <sys/ethernet.h>
106218893Sdim#include <net/if_types.h>
107204642Srdivacky#include <sys/cpuvar.h>
108204642Srdivacky
109218893Sdim#include <ipp/ipp.h>
110204642Srdivacky#include <ipp/ipp_impl.h>
111203954Srdivacky#include <ipp/ipgpc/ipgpc.h>
112204642Srdivacky
113203954Srdivacky#include <sys/pattr.h>
114218893Sdim#include <inet/ipclassifier.h>
115204642Srdivacky#include <inet/sctp_ip.h>
116203954Srdivacky#include <inet/sctp/sctp_impl.h>
117203954Srdivacky#include <inet/udp_impl.h>
118203954Srdivacky#include <sys/sunddi.h>
119203954Srdivacky
120218893Sdim#include <sys/tsol/label.h>
121204642Srdivacky#include <sys/tsol/tnet.h>
122204642Srdivacky
123204642Srdivacky#include <rpc/pmap_prot.h>
124204642Srdivacky
125204642Srdivacky#ifdef	DEBUG
126204642Srdivackyextern boolean_t skip_sctp_cksum;
127204642Srdivacky#endif
128204642Srdivacky
129204642Srdivackystatic void	ip_input_local_v4(ire_t *, mblk_t *, ipha_t *,
130204642Srdivacky    ip_recv_attr_t *);
131204642Srdivacky
132218893Sdimstatic void	ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *,
133204642Srdivacky    ip_recv_attr_t *);
134204642Srdivackystatic void	ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
135204642Srdivacky    ip_recv_attr_t *);
136204642Srdivacky
137204642Srdivacky#pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4)
138204642Srdivacky
139204642Srdivacky/*
140204642Srdivacky * Direct read side procedure capable of dealing with chains. GLDv3 based
141204642Srdivacky * drivers call this function directly with mblk chains while STREAMS
142204642Srdivacky * read side procedure ip_rput() calls this for single packet with ip_ring
143204642Srdivacky * set to NULL to process one packet at a time.
144218893Sdim *
145203954Srdivacky * The ill will always be valid if this function is called directly from
146203954Srdivacky * the driver.
147203954Srdivacky *
148203954Srdivacky * If ip_input() is called from GLDv3:
149203954Srdivacky *
150204642Srdivacky *   - This must be a non-VLAN IP stream.
151203954Srdivacky *   - 'mp' is either an untagged or a special priority-tagged packet.
152203954Srdivacky *   - Any VLAN tag that was in the MAC header has been stripped.
153203954Srdivacky *
154203954Srdivacky * If the IP header in packet is not 32-bit aligned, every message in the
155203954Srdivacky * chain will be aligned before further operations. This is required on SPARC
156203954Srdivacky * platform.
157203954Srdivacky */
158203954Srdivackyvoid
159203954Srdivackyip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
160203954Srdivacky    struct mac_header_info_s *mhip)
161203954Srdivacky{
162203954Srdivacky	(void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL,
163203954Srdivacky	    NULL);
164203954Srdivacky}
165218893Sdim
166203954Srdivacky/*
167203954Srdivacky * ip_accept_tcp() - This function is called by the squeue when it retrieves
168221345Sdim * a chain of packets in the poll mode. The packets have gone through the
169221345Sdim * data link processing but not IP processing. For performance and latency
170221345Sdim * reasons, the squeue wants to process the chain in line instead of feeding
171221345Sdim * it back via ip_input path.
172221345Sdim *
173221345Sdim * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4
174203954Srdivacky * will pass back any TCP packets matching the target sqp to
175203954Srdivacky * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
176203954Srdivacky * ip_input_v4 and ip_fanout_v4 as normal.
177203954Srdivacky * The TCP packets that match the target squeue are returned to the caller
178203954Srdivacky * as a b_next chain after each packet has been prepend with an mblk
179203954Srdivacky * from ip_recv_attr_to_mblk.
180203954Srdivacky */
181203954Srdivackymblk_t *
182203954Srdivackyip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
183203954Srdivacky    mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
184218893Sdim{
185203954Srdivacky	return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp,
186203954Srdivacky	    last, cnt));
187203954Srdivacky}
188203954Srdivacky
189203954Srdivacky/*
190203954Srdivacky * Used by ip_input and ip_accept_tcp
191203954Srdivacky * The last three arguments are only used by ip_accept_tcp, and mhip is
192203954Srdivacky * only used by ip_input.
193203954Srdivacky */
194203954Srdivackymblk_t *
195203954Srdivackyip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
196203954Srdivacky    struct mac_header_info_s *mhip, squeue_t *target_sqp,
197218893Sdim    mblk_t **last, uint_t *cnt)
198204642Srdivacky{
199203954Srdivacky	mblk_t		*mp;
200204642Srdivacky	ipha_t		*ipha;
201203954Srdivacky	ip_recv_attr_t	iras;	/* Receive attributes */
202204642Srdivacky	rtc_t		rtc;
203203954Srdivacky	iaflags_t	chain_flags = 0;	/* Fixed for chain */
204203954Srdivacky	mblk_t 		*ahead = NULL;	/* Accepted head */
205203954Srdivacky	mblk_t		*atail = NULL;	/* Accepted tail */
206203954Srdivacky	uint_t		acnt = 0;	/* Accepted count */
207204642Srdivacky
208204642Srdivacky	ASSERT(mp_chain != NULL);
209204642Srdivacky	ASSERT(ill != NULL);
210203954Srdivacky
211203954Srdivacky	/* These ones do not change as we loop over packets */
212203954Srdivacky	iras.ira_ill = iras.ira_rill = ill;
213203954Srdivacky	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
214218893Sdim	iras.ira_rifindex = iras.ira_ruifindex;
215203954Srdivacky	iras.ira_sqp = NULL;
216204642Srdivacky	iras.ira_ring = ip_ring;
217204642Srdivacky	/* For ECMP and outbound transmit ring selection */
218204642Srdivacky	iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
219204642Srdivacky
220204642Srdivacky	iras.ira_target_sqp = target_sqp;
221204642Srdivacky	iras.ira_target_sqp_mp = NULL;
222204642Srdivacky	if (target_sqp != NULL)
223204642Srdivacky		chain_flags |= IRAF_TARGET_SQP;
224204642Srdivacky
225204642Srdivacky	/*
226204642Srdivacky	 * We try to have a mhip pointer when possible, but
227218893Sdim	 * it might be NULL in some cases. In those cases we
228203954Srdivacky	 * have to assume unicast.
229203954Srdivacky	 */
230203954Srdivacky	iras.ira_mhip = mhip;
231203954Srdivacky	iras.ira_flags = 0;
232203954Srdivacky	if (mhip != NULL) {
233218893Sdim		switch (mhip->mhi_dsttype) {
234203954Srdivacky		case MAC_ADDRTYPE_MULTICAST :
235203954Srdivacky			chain_flags |= IRAF_L2DST_MULTICAST;
236218893Sdim			break;
237203954Srdivacky		case MAC_ADDRTYPE_BROADCAST :
238208599Srdivacky			chain_flags |= IRAF_L2DST_BROADCAST;
239203954Srdivacky			break;
240203954Srdivacky		}
241203954Srdivacky	}
242204642Srdivacky
243204642Srdivacky	/*
244218893Sdim	 * Initialize the one-element route cache.
245204642Srdivacky	 *
246204642Srdivacky	 * We do ire caching from one iteration to
247204642Srdivacky	 * another. In the event the packet chain contains
248204642Srdivacky	 * all packets from the same dst, this caching saves
249204642Srdivacky	 * an ire_route_recursive for each of the succeeding
250204642Srdivacky	 * packets in a packet chain.
251218893Sdim	 */
252203954Srdivacky	rtc.rtc_ire = NULL;
253204642Srdivacky	rtc.rtc_ipaddr = INADDR_ANY;
254218893Sdim
255203954Srdivacky	/* Loop over b_next */
256204642Srdivacky	for (mp = mp_chain; mp != NULL; mp = mp_chain) {
257218893Sdim		mp_chain = mp->b_next;
258203954Srdivacky		mp->b_next = NULL;
259204642Srdivacky
260204642Srdivacky		ASSERT(DB_TYPE(mp) == M_DATA);
261204642Srdivacky
262204642Srdivacky
263204642Srdivacky		/*
264204642Srdivacky		 * if db_ref > 1 then copymsg and free original. Packet
265204642Srdivacky		 * may be changed and we do not want the other entity
266204792Srdivacky		 * who has a reference to this message to trip over the
267204792Srdivacky		 * changes. This is a blind change because trying to
268204792Srdivacky		 * catch all places that might change the packet is too
269204642Srdivacky		 * difficult.
270203954Srdivacky		 *
271218893Sdim		 * This corresponds to the fast path case, where we have
272203954Srdivacky		 * a chain of M_DATA mblks.  We check the db_ref count
273203954Srdivacky		 * of only the 1st data block in the mblk chain. There
274203954Srdivacky		 * doesn't seem to be a reason why a device driver would
275203954Srdivacky		 * send up data with varying db_ref counts in the mblk
276203954Srdivacky		 * chain. In any case the Fast path is a private
277203954Srdivacky		 * interface, and our drivers don't do such a thing.
278203954Srdivacky		 * Given the above assumption, there is no need to walk
279203954Srdivacky		 * down the entire mblk chain (which could have a
280218893Sdim		 * potential performance problem)
281203954Srdivacky		 *
282203954Srdivacky		 * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
283203954Srdivacky		 * to here because of exclusive ip stacks and vnics.
284203954Srdivacky		 * Packets transmitted from exclusive stack over vnic
285203954Srdivacky		 * can have db_ref > 1 and when it gets looped back to
286203954Srdivacky		 * another vnic in a different zone, you have ip_input()
287203954Srdivacky		 * getting dblks with db_ref > 1. So if someone
288203954Srdivacky		 * complains of TCP performance under this scenario,
289203954Srdivacky		 * take a serious look here on the impact of copymsg().
290203954Srdivacky		 */
291218893Sdim		if (DB_REF(mp) > 1) {
292203954Srdivacky			if ((mp = ip_fix_dbref(mp, &iras)) == NULL) {
293204642Srdivacky				/* mhip might point into 1st packet in chain */
294204642Srdivacky				iras.ira_mhip = NULL;
295203954Srdivacky				continue;
296203954Srdivacky			}
297204642Srdivacky		}
298204642Srdivacky
299204642Srdivacky		/*
300204642Srdivacky		 * IP header ptr not aligned?
301204642Srdivacky		 * OR IP header not complete in first mblk
302204642Srdivacky		 */
303204642Srdivacky		ipha = (ipha_t *)mp->b_rptr;
304203954Srdivacky		if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
305204642Srdivacky			mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH,
306203954Srdivacky			    &iras);
307204642Srdivacky			if (mp == NULL) {
308203954Srdivacky				/* mhip might point into 1st packet in chain */
309203954Srdivacky				iras.ira_mhip = NULL;
310204642Srdivacky				continue;
311203954Srdivacky			}
312204642Srdivacky			ipha = (ipha_t *)mp->b_rptr;
313203954Srdivacky		}
314203954Srdivacky
315203954Srdivacky		/* Protect against a mix of Ethertypes and IP versions */
316203954Srdivacky		if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
317218893Sdim			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
318203954Srdivacky			ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
319204642Srdivacky			freemsg(mp);
320218893Sdim			/* mhip might point into 1st packet in the chain. */
321204642Srdivacky			iras.ira_mhip = NULL;
322204642Srdivacky			continue;
323204642Srdivacky		}
324204642Srdivacky
325218893Sdim		/*
326203954Srdivacky		 * Check for Martian addrs; we have to explicitly
327203954Srdivacky		 * test for for zero dst since this is also used as
328203954Srdivacky		 * an indication that the rtc is not used.
329204642Srdivacky		 */
330204642Srdivacky		if (ipha->ipha_dst == INADDR_ANY) {
331204642Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
332204642Srdivacky			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
333204642Srdivacky			freemsg(mp);
334204642Srdivacky			/* mhip might point into 1st packet in the chain. */
335204642Srdivacky			iras.ira_mhip = NULL;
336218893Sdim			continue;
337204642Srdivacky		}
338204642Srdivacky
339203954Srdivacky		/*
340203954Srdivacky		 * Keep L2SRC from a previous packet in chain since mhip
341204642Srdivacky		 * might point into an earlier packet in the chain.
342204642Srdivacky		 * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
343204642Srdivacky		 * source check in forwarding path.
344204642Srdivacky		 */
345204642Srdivacky		chain_flags |= (iras.ira_flags &
346204642Srdivacky		    (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC));
347204642Srdivacky
348204642Srdivacky		iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM |
349204642Srdivacky		    IRAF_VERIFY_ULP_CKSUM | chain_flags;
350204642Srdivacky		iras.ira_free_flags = 0;
351204642Srdivacky		iras.ira_cred = NULL;
352204642Srdivacky		iras.ira_cpid = NOPID;
353204642Srdivacky		iras.ira_tsl = NULL;
354204642Srdivacky		iras.ira_zoneid = ALL_ZONES;	/* Default for forwarding */
355204642Srdivacky
356204642Srdivacky		/*
357204642Srdivacky		 * We must count all incoming packets, even if they end
358204642Srdivacky		 * up being dropped later on. Defer counting bytes until
359204642Srdivacky		 * we have the whole IP header in first mblk.
360204642Srdivacky		 */
361204642Srdivacky		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
362204642Srdivacky
363204642Srdivacky		iras.ira_pktlen = ntohs(ipha->ipha_length);
364203954Srdivacky		UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
365203954Srdivacky		    iras.ira_pktlen);
366203954Srdivacky
367218893Sdim		/*
368203954Srdivacky		 * Call one of:
369203954Srdivacky		 * 	ill_input_full_v4
370203954Srdivacky		 *	ill_input_short_v4
371218893Sdim		 * The former is used in unusual cases. See ill_set_inputfn().
372203954Srdivacky		 */
373203954Srdivacky		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
374203954Srdivacky
375203954Srdivacky		/* Any references to clean up? No hold on ira_ill */
376203954Srdivacky		if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
377203954Srdivacky			ira_cleanup(&iras, B_FALSE);
378203954Srdivacky
379203954Srdivacky		if (iras.ira_target_sqp_mp != NULL) {
380218893Sdim			/* Better be called from ip_accept_tcp */
381218893Sdim			ASSERT(target_sqp != NULL);
382203954Srdivacky
383218893Sdim			/* Found one packet to accept */
384203954Srdivacky			mp = iras.ira_target_sqp_mp;
385204642Srdivacky			iras.ira_target_sqp_mp = NULL;
386203954Srdivacky			ASSERT(ip_recv_attr_is_mblk(mp));
387203954Srdivacky
388204642Srdivacky			if (atail != NULL)
389218893Sdim				atail->b_next = mp;
390218893Sdim			else
391204642Srdivacky				ahead = mp;
392218893Sdim			atail = mp;
393218893Sdim			acnt++;
394204642Srdivacky			mp = NULL;
395204642Srdivacky		}
396218893Sdim		/* mhip might point into 1st packet in the chain. */
397204642Srdivacky		iras.ira_mhip = NULL;
398218893Sdim	}
399218893Sdim	/* Any remaining references to the route cache? */
400204642Srdivacky	if (rtc.rtc_ire != NULL) {
401218893Sdim		ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
402218893Sdim		ire_refrele(rtc.rtc_ire);
403218893Sdim	}
404218893Sdim
405218893Sdim	if (ahead != NULL) {
406218893Sdim		/* Better be called from ip_accept_tcp */
407218893Sdim		ASSERT(target_sqp != NULL);
408203954Srdivacky		*last = atail;
409203954Srdivacky		*cnt = acnt;
410203954Srdivacky		return (ahead);
411204642Srdivacky	}
412203954Srdivacky
413204642Srdivacky	return (NULL);
414203954Srdivacky}
415203954Srdivacky
416203954Srdivacky/*
417203954Srdivacky * This input function is used when
418203954Srdivacky *  - is_system_labeled()
419203954Srdivacky *  - CGTP filtering
420203954Srdivacky *  - DHCP unicast before we have an IP address configured
421203954Srdivacky *  - there is an listener for IPPROTO_RSVP
422203954Srdivacky */
423206083Srdivackyvoid
424218893Sdimill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
425206083Srdivacky    ip_recv_attr_t *ira, rtc_t *rtc)
426206083Srdivacky{
427206083Srdivacky	ipha_t		*ipha = (ipha_t *)iph_arg;
428203954Srdivacky	ipaddr_t	nexthop = *(ipaddr_t *)nexthop_arg;
429206083Srdivacky	ill_t		*ill = ira->ira_ill;
430203954Srdivacky	ip_stack_t	*ipst = ill->ill_ipst;
431218893Sdim	int		cgtp_flt_pkt;
432203954Srdivacky
433203954Srdivacky	ASSERT(ira->ira_tsl == NULL);
434203954Srdivacky
435203954Srdivacky	/*
436203954Srdivacky	 * Attach any necessary label information to
437204642Srdivacky	 * this packet
438204642Srdivacky	 */
439203954Srdivacky	if (is_system_labeled()) {
440203954Srdivacky		ira->ira_flags |= IRAF_SYSTEM_LABELED;
441203954Srdivacky
442203954Srdivacky		/*
443203954Srdivacky		 * This updates ira_cred, ira_tsl and ira_free_flags based
444203954Srdivacky		 * on the label.
445204642Srdivacky		 */
446203954Srdivacky		if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) {
447203954Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
448203954Srdivacky			ip_drop_input("ipIfStatsInDiscards", mp, ill);
449218893Sdim			freemsg(mp);
450203954Srdivacky			return;
451203954Srdivacky		}
452203954Srdivacky		/* Note that ira_tsl can be NULL here. */
453203954Srdivacky
454218893Sdim		/* tsol_get_pkt_label sometimes does pullupmsg */
455204961Srdivacky		ipha = (ipha_t *)mp->b_rptr;
456204961Srdivacky	}
457204961Srdivacky
458218893Sdim	/*
459206083Srdivacky	 * Invoke the CGTP (multirouting) filtering module to process
460206083Srdivacky	 * the incoming packet. Packets identified as duplicates
461206083Srdivacky	 * must be discarded. Filtering is active only if the
462203954Srdivacky	 * the ip_cgtp_filter ndd variable is non-zero.
463203954Srdivacky	 */
464204642Srdivacky	cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
465204642Srdivacky	if (ipst->ips_ip_cgtp_filter &&
466204642Srdivacky	    ipst->ips_ip_cgtp_filter_ops != NULL) {
467204642Srdivacky		netstackid_t stackid;
468204642Srdivacky
469204642Srdivacky		stackid = ipst->ips_netstack->netstack_stackid;
470204642Srdivacky		/*
471204642Srdivacky		 * CGTP and IPMP are mutually exclusive so
472204642Srdivacky		 * phyint_ifindex is fine here.
473204642Srdivacky		 */
474204642Srdivacky		cgtp_flt_pkt =
475204642Srdivacky		    ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
476204642Srdivacky		    ill->ill_phyint->phyint_ifindex, mp);
477218893Sdim		if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
478204642Srdivacky			ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
479204642Srdivacky			freemsg(mp);
480204642Srdivacky			return;
481204642Srdivacky		}
482218893Sdim	}
483204792Srdivacky
484204792Srdivacky	/*
485218893Sdim	 * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
486203954Srdivacky	 * server to unicast DHCP packets to a DHCP client using the
487203954Srdivacky	 * IP address it is offering to the client.  This can be
488203954Srdivacky	 * disabled through the "broadcast bit", but not all DHCP
489204642Srdivacky	 * servers honor that bit.  Therefore, to interoperate with as
490218893Sdim	 * many DHCP servers as possible, the DHCP client allows the
491204792Srdivacky	 * server to unicast, but we treat those packets as broadcast
492204792Srdivacky	 * here.  Note that we don't rewrite the packet itself since
493204792Srdivacky	 * (a) that would mess up the checksums and (b) the DHCP
494204792Srdivacky	 * client conn is bound to INADDR_ANY so ip_fanout_udp() will
495204792Srdivacky	 * hand it the packet regardless.
496204792Srdivacky	 */
497218893Sdim	if (ill->ill_dhcpinit != 0 &&
498204792Srdivacky	    ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION &&
499204792Srdivacky	    ipha->ipha_protocol == IPPROTO_UDP) {
500204642Srdivacky		udpha_t *udpha;
501204792Srdivacky
502204792Srdivacky		ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira);
503204792Srdivacky		if (ipha == NULL) {
504204792Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
505204792Srdivacky			ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill);
506218893Sdim			freemsg(mp);
507204792Srdivacky			return;
508204792Srdivacky		}
509218893Sdim		/* Reload since pullupmsg() can change b_rptr. */
510204792Srdivacky		udpha = (udpha_t *)&ipha[1];
511204792Srdivacky
512204792Srdivacky		if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
513204792Srdivacky			DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
514218893Sdim			    mblk_t *, mp);
515204792Srdivacky			/*
516204792Srdivacky			 * This assumes that we deliver to all conns for
517204792Srdivacky			 * multicast and broadcast packets.
518204792Srdivacky			 */
519204792Srdivacky			nexthop = INADDR_BROADCAST;
520204792Srdivacky			ira->ira_flags |= IRAF_DHCP_UNICAST;
521204792Srdivacky		}
522218893Sdim	}
523204792Srdivacky
524204792Srdivacky	/*
525204792Srdivacky	 * If rsvpd is running, let RSVP daemon handle its processing
526204792Srdivacky	 * and forwarding of RSVP multicast/unicast packets.
527204792Srdivacky	 * If rsvpd is not running but mrouted is running, RSVP
528218893Sdim	 * multicast packets are forwarded as multicast traffic
529218893Sdim	 * and RSVP unicast packets are forwarded by unicast router.
530204792Srdivacky	 * If neither rsvpd nor mrouted is running, RSVP multicast
531204792Srdivacky	 * packets are not forwarded, but the unicast packets are
532218893Sdim	 * forwarded like unicast traffic.
533204642Srdivacky	 */
534203954Srdivacky	if (ipha->ipha_protocol == IPPROTO_RSVP &&
535203954Srdivacky	    ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
536203954Srdivacky		/* RSVP packet and rsvpd running. Treat as ours */
537204642Srdivacky		ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop)));
538204642Srdivacky		/*
539204642Srdivacky		 * We use a multicast address to get the packet to
540204642Srdivacky		 * ire_recv_multicast_v4. There will not be a membership
541204642Srdivacky		 * check since we set IRAF_RSVP
542204642Srdivacky		 */
543204642Srdivacky		nexthop = htonl(INADDR_UNSPEC_GROUP);
544218893Sdim		ira->ira_flags |= IRAF_RSVP;
545204642Srdivacky	}
546204642Srdivacky
547204642Srdivacky	ill_input_short_v4(mp, ipha, &nexthop, ira, rtc);
548204792Srdivacky}
549204792Srdivacky
550204792Srdivacky/*
551204792Srdivacky * This is the tail-end of the full receive side packet handling.
552204792Srdivacky * It can be used directly when the configuration is simple.
553204792Srdivacky */
554204792Srdivackyvoid
555218893Sdimill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
556204642Srdivacky    ip_recv_attr_t *ira, rtc_t *rtc)
557204642Srdivacky{
558204642Srdivacky	ire_t		*ire;
559204792Srdivacky	uint_t		opt_len;
560204642Srdivacky	ill_t		*ill = ira->ira_ill;
561204642Srdivacky	ip_stack_t	*ipst = ill->ill_ipst;
562204642Srdivacky	uint_t		pkt_len;
563204792Srdivacky	ssize_t 	len;
564204792Srdivacky	ipha_t		*ipha = (ipha_t *)iph_arg;
565204642Srdivacky	ipaddr_t	nexthop = *(ipaddr_t *)nexthop_arg;
566204642Srdivacky	ilb_stack_t	*ilbs = ipst->ips_netstack->netstack_ilb;
567204642Srdivacky#define	rptr	((uchar_t *)ipha)
568204642Srdivacky
569204642Srdivacky	ASSERT(DB_TYPE(mp) == M_DATA);
570204642Srdivacky
571204642Srdivacky	/*
572204642Srdivacky	 * The following test for loopback is faster than
573204642Srdivacky	 * IP_LOOPBACK_ADDR(), because it avoids any bitwise
574204642Srdivacky	 * operations.
575204642Srdivacky	 * Note that these addresses are always in network byte order
576218893Sdim	 */
577204642Srdivacky	if (((*(uchar_t *)&ipha->ipha_dst) == 127) ||
578204642Srdivacky	    ((*(uchar_t *)&ipha->ipha_src) == 127)) {
579204642Srdivacky		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
580204642Srdivacky		ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
581204642Srdivacky		freemsg(mp);
582204642Srdivacky		return;
583218893Sdim	}
584204642Srdivacky
585205407Srdivacky	len = mp->b_wptr - rptr;
586204642Srdivacky	pkt_len = ira->ira_pktlen;
587204642Srdivacky
588204642Srdivacky	/* multiple mblk or too short */
589218893Sdim	len -= pkt_len;
590204642Srdivacky	if (len != 0) {
591204642Srdivacky		mp = ip_check_length(mp, rptr, len, pkt_len,
592204642Srdivacky		    IP_SIMPLE_HDR_LENGTH, ira);
593221345Sdim		if (mp == NULL)
594221345Sdim			return;
595204642Srdivacky		ipha = (ipha_t *)mp->b_rptr;
596204642Srdivacky	}
597204642Srdivacky
598218893Sdim	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
599204642Srdivacky	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
600205407Srdivacky	    int, 0);
601204642Srdivacky
602204642Srdivacky	/*
603204642Srdivacky	 * The event for packets being received from a 'physical'
604218893Sdim	 * interface is placed after validation of the source and/or
605204642Srdivacky	 * destination address as being local so that packets can be
606204642Srdivacky	 * redirected to loopback addresses using ipnat.
607204642Srdivacky	 */
608204642Srdivacky	DTRACE_PROBE4(ip4__physical__in__start,
609204642Srdivacky	    ill_t *, ill, ill_t *, NULL,
610204642Srdivacky	    ipha_t *, ipha, mblk_t *, mp);
611204642Srdivacky
612204642Srdivacky	if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
613208599Srdivacky		int	ll_multicast = 0;
614208599Srdivacky		int	error;
615208599Srdivacky		ipaddr_t orig_dst = ipha->ipha_dst;
616208599Srdivacky
617208599Srdivacky		if (ira->ira_flags & IRAF_L2DST_MULTICAST)
618208599Srdivacky			ll_multicast = HPE_MULTICAST;
619208599Srdivacky		else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
620208599Srdivacky			ll_multicast = HPE_BROADCAST;
621204642Srdivacky
622218893Sdim		FW_HOOKS(ipst->ips_ip4_physical_in_event,
623204642Srdivacky		    ipst->ips_ipv4firewall_physical_in,
624204642Srdivacky		    ill, NULL, ipha, mp, mp, ll_multicast, ipst, error);
625204642Srdivacky
626204642Srdivacky		DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
627204642Srdivacky
628218893Sdim		if (mp == NULL)
629204642Srdivacky			return;
630204642Srdivacky		/* The length could have changed */
631204642Srdivacky		ipha = (ipha_t *)mp->b_rptr;
632218893Sdim		ira->ira_pktlen = ntohs(ipha->ipha_length);
633204642Srdivacky		pkt_len = ira->ira_pktlen;
634204642Srdivacky
635204642Srdivacky		/*
636204642Srdivacky		 * In case the destination changed we override any previous
637204642Srdivacky		 * change to nexthop.
638204642Srdivacky		 */
639204642Srdivacky		if (orig_dst != ipha->ipha_dst)
640204642Srdivacky			nexthop = ipha->ipha_dst;
641218893Sdim		if (nexthop == INADDR_ANY) {
642204642Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
643204642Srdivacky			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
644204642Srdivacky			freemsg(mp);
645218893Sdim			return;
646204642Srdivacky		}
647204642Srdivacky	}
648204642Srdivacky
649223017Sdim	if (ipst->ips_ip4_observe.he_interested) {
650223017Sdim		zoneid_t dzone;
651223017Sdim
652223017Sdim		/*
653223017Sdim		 * On the inbound path the src zone will be unknown as
654223017Sdim		 * this packet has come from the wire.
655223017Sdim		 */
656223017Sdim		dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES);
657223017Sdim		ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
658223017Sdim	}
659223017Sdim
660223017Sdim	/*
661223017Sdim	 * If there is a good HW IP header checksum we clear the need
662223017Sdim	 * look at the IP header checksum.
663223017Sdim	 */
664223017Sdim	if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
665223017Sdim	    ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
666223017Sdim		/* Header checksum was ok. Clear the flag */
667223017Sdim		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
668223017Sdim		ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
669223017Sdim	}
670223017Sdim
671223017Sdim	/*
672223017Sdim	 * Here we check to see if we machine is setup as
673223017Sdim	 * L3 loadbalancer and if the incoming packet is for a VIP
674223017Sdim	 *
675223017Sdim	 * Check the following:
676223017Sdim	 * - there is at least a rule
677223017Sdim	 * - protocol of the packet is supported
678204642Srdivacky	 */
679204642Srdivacky	if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
680204642Srdivacky		ipaddr_t	lb_dst;
681204642Srdivacky		int		lb_ret;
682204642Srdivacky
683205407Srdivacky		/* For convenience, we pull up the mblk. */
684204642Srdivacky		if (mp->b_cont != NULL) {
685218893Sdim			if (pullupmsg(mp, -1) == 0) {
686204642Srdivacky				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
687204642Srdivacky				ip_drop_input("ipIfStatsInDiscards - pullupmsg",
688204642Srdivacky				    mp, ill);
689204642Srdivacky				freemsg(mp);
690204642Srdivacky				return;
691204642Srdivacky			}
692204642Srdivacky			ipha = (ipha_t *)mp->b_rptr;
693204642Srdivacky		}
694218893Sdim
695204642Srdivacky		/*
696204642Srdivacky		 * We just drop all fragments going to any VIP, at
697218893Sdim		 * least for now....
698204642Srdivacky		 */
699204642Srdivacky		if (ntohs(ipha->ipha_fragment_offset_and_flags) &
700218893Sdim		    (IPH_MF | IPH_OFFSET)) {
701218893Sdim			if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) {
702204642Srdivacky				goto after_ilb;
703204642Srdivacky			}
704218893Sdim
705218893Sdim			ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
706218893Sdim			ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
707204642Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
708204642Srdivacky			ip_drop_input("ILB fragment", mp, ill);
709218893Sdim			freemsg(mp);
710204642Srdivacky			return;
711204642Srdivacky		}
712204642Srdivacky		lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol,
713204642Srdivacky		    (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst);
714218893Sdim
715204642Srdivacky		if (lb_ret == ILB_DROPPED) {
716204642Srdivacky			/* Is this the right counter to increase? */
717204642Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
718204642Srdivacky			ip_drop_input("ILB_DROPPED", mp, ill);
719204642Srdivacky			freemsg(mp);
720204642Srdivacky			return;
721204642Srdivacky		}
722204642Srdivacky		if (lb_ret == ILB_BALANCED) {
723218893Sdim			/* Set the dst to that of the chosen server */
724204642Srdivacky			nexthop = lb_dst;
725218893Sdim			DB_CKSUMFLAGS(mp) = 0;
726204642Srdivacky		}
727218893Sdim	}
728204642Srdivacky
729204642Srdivackyafter_ilb:
730204642Srdivacky	opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
731204642Srdivacky	ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
732204642Srdivacky	if (opt_len != 0) {
733212904Sdim		int error = 0;
734218893Sdim
735204642Srdivacky		ira->ira_ip_hdr_length += (opt_len << 2);
736204642Srdivacky		ira->ira_flags |= IRAF_IPV4_OPTIONS;
737204642Srdivacky
738204642Srdivacky		/* IP Options present!  Validate the length. */
739218893Sdim		mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira);
740206083Srdivacky		if (mp == NULL)
741218893Sdim			return;
742204642Srdivacky
743204642Srdivacky		/* Might have changed */
744206083Srdivacky		ipha = (ipha_t *)mp->b_rptr;
745206083Srdivacky
746206083Srdivacky		/* Verify IP header checksum before parsing the options */
747218893Sdim		if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
748206083Srdivacky		    ip_csum_hdr(ipha)) {
749206083Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
750206083Srdivacky			ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
751206083Srdivacky			freemsg(mp);
752218893Sdim			return;
753204642Srdivacky		}
754204642Srdivacky		ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
755218893Sdim
756218893Sdim		/*
757218893Sdim		 * Go off to ip_input_options which returns the next hop
758204642Srdivacky		 * destination address, which may have been affected
759204642Srdivacky		 * by source routing.
760204642Srdivacky		 */
761204642Srdivacky		IP_STAT(ipst, ip_opt);
762204642Srdivacky
763204642Srdivacky		nexthop = ip_input_options(ipha, nexthop, mp, ira, &error);
764205407Srdivacky		if (error != 0) {
765218893Sdim			/*
766218893Sdim			 * An ICMP error has been sent and the packet has
767218893Sdim			 * been dropped.
768204642Srdivacky			 */
769218893Sdim			return;
770218893Sdim		}
771218893Sdim	}
772204642Srdivacky	/* Can not use route cache with TX since the labels can differ */
773204642Srdivacky	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
774206083Srdivacky		if (CLASSD(nexthop)) {
775206083Srdivacky			ire = ire_multicast(ill);
776218893Sdim		} else {
777204642Srdivacky			/* Match destination and label */
778204642Srdivacky			ire = ire_route_recursive_v4(nexthop, 0, NULL,
779204642Srdivacky			    ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
780204642Srdivacky			    (ill->ill_flags & ILLF_ROUTER),
781204642Srdivacky			    ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
782206083Srdivacky		}
783205407Srdivacky		/* Update the route cache so we do the ire_refrele */
784205407Srdivacky		ASSERT(ire != NULL);
785205407Srdivacky		if (rtc->rtc_ire != NULL)
786206083Srdivacky			ire_refrele(rtc->rtc_ire);
787205407Srdivacky		rtc->rtc_ire = ire;
788218893Sdim		rtc->rtc_ipaddr = nexthop;
789205407Srdivacky	} else if (nexthop == rtc->rtc_ipaddr) {
790205407Srdivacky		/* Use the route cache */
791205407Srdivacky		ASSERT(rtc->rtc_ire != NULL);
792205407Srdivacky		ire = rtc->rtc_ire;
793205407Srdivacky	} else {
794204642Srdivacky		/* Update the route cache */
795204642Srdivacky		if (CLASSD(nexthop)) {
796205407Srdivacky			ire = ire_multicast(ill);
797205407Srdivacky		} else {
798205407Srdivacky			/* Just match the destination */
799204642Srdivacky			ire = ire_route_recursive_dstonly_v4(nexthop,
800205407Srdivacky			    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint,
801205407Srdivacky			    ipst);
802204642Srdivacky		}
803218893Sdim		ASSERT(ire != NULL);
804223017Sdim		if (rtc->rtc_ire != NULL)
805223017Sdim			ire_refrele(rtc->rtc_ire);
806223017Sdim		rtc->rtc_ire = ire;
807223017Sdim		rtc->rtc_ipaddr = nexthop;
808204642Srdivacky	}
809223017Sdim
810223017Sdim	ire->ire_ib_pkt_count++;
811223017Sdim
812223017Sdim	/*
813204642Srdivacky	 * Based on ire_type and ire_flags call one of:
814223017Sdim	 *	ire_recv_local_v4 - for IRE_LOCAL
815223017Sdim	 *	ire_recv_loopback_v4 - for IRE_LOOPBACK
816223017Sdim	 *	ire_recv_multirt_v4 - if RTF_MULTIRT
817223017Sdim	 *	ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
818223017Sdim	 *	ire_recv_multicast_v4 - for IRE_MULTICAST
819223017Sdim	 *	ire_recv_broadcast_v4 - for IRE_BROADCAST
820223017Sdim	 *	ire_recv_noaccept_v4 - for ire_noaccept ones
821223017Sdim	 *	ire_recv_forward_v4 - for the rest.
822223017Sdim	 */
823223017Sdim	(*ire->ire_recvfn)(ire, mp, ipha, ira);
824223017Sdim}
825218893Sdim#undef rptr
826206083Srdivacky
827218893Sdim/*
828204642Srdivacky * ire_recvfn for IREs that need forwarding
829204642Srdivacky */
830204642Srdivackyvoid
831218893Sdimire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
832204642Srdivacky{
833204642Srdivacky	ipha_t		*ipha = (ipha_t *)iph_arg;
834218893Sdim	ill_t		*ill = ira->ira_ill;
835218893Sdim	ip_stack_t	*ipst = ill->ill_ipst;
836204642Srdivacky	ill_t		*dst_ill;
837218893Sdim	nce_t		*nce;
838204642Srdivacky	ipaddr_t	src = ipha->ipha_src;
839204642Srdivacky	uint32_t	added_tx_len;
840204642Srdivacky	uint32_t	mtu, iremtu;
841204642Srdivacky
842204642Srdivacky	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
843204642Srdivacky		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
844204642Srdivacky		ip_drop_input("l2 multicast not forwarded", mp, ill);
845204642Srdivacky		freemsg(mp);
846204642Srdivacky		return;
847204642Srdivacky	}
848204642Srdivacky
849218893Sdim	if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
850204642Srdivacky		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
851204642Srdivacky		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
852204642Srdivacky		freemsg(mp);
853204642Srdivacky		return;
854204642Srdivacky	}
855204642Srdivacky
856204642Srdivacky	/*
857204642Srdivacky	 * Either ire_nce_capable or ire_dep_parent would be set for the IRE
858204642Srdivacky	 * when it is found by ire_route_recursive, but that some other thread
859204642Srdivacky	 * could have changed the routes with the effect of clearing
860204642Srdivacky	 * ire_dep_parent. In that case we'd end up dropping the packet, or
861204642Srdivacky	 * finding a new nce below.
862204642Srdivacky	 * Get, allocate, or update the nce.
863204642Srdivacky	 * We get a refhold on ire_nce_cache as a result of this to avoid races
864204642Srdivacky	 * where ire_nce_cache is deleted.
865204642Srdivacky	 *
866204642Srdivacky	 * This ensures that we don't forward if the interface is down since
867204642Srdivacky	 * ipif_down removes all the nces.
868204642Srdivacky	 */
869204642Srdivacky	mutex_enter(&ire->ire_lock);
870204642Srdivacky	nce = ire->ire_nce_cache;
871204642Srdivacky	if (nce == NULL) {
872204642Srdivacky		/* Not yet set up - try to set one up */
873204642Srdivacky		mutex_exit(&ire->ire_lock);
874204642Srdivacky		(void) ire_revalidate_nce(ire);
875204642Srdivacky		mutex_enter(&ire->ire_lock);
876204642Srdivacky		nce = ire->ire_nce_cache;
877204642Srdivacky		if (nce == NULL) {
878204642Srdivacky			mutex_exit(&ire->ire_lock);
879204642Srdivacky			/* The ire_dep_parent chain went bad, or no memory */
880204642Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
881204642Srdivacky			ip_drop_input("No ire_dep_parent", mp, ill);
882204642Srdivacky			freemsg(mp);
883204642Srdivacky			return;
884204642Srdivacky		}
885204642Srdivacky	}
886204642Srdivacky	nce_refhold(nce);
887204642Srdivacky	mutex_exit(&ire->ire_lock);
888218893Sdim
889204642Srdivacky	if (nce->nce_is_condemned) {
890204642Srdivacky		nce_t *nce1;
891204642Srdivacky
892204642Srdivacky		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE);
893204642Srdivacky		nce_refrele(nce);
894204642Srdivacky		if (nce1 == NULL) {
895206083Srdivacky			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
896218893Sdim			ip_drop_input("No nce", mp, ill);
897206083Srdivacky			freemsg(mp);
898204642Srdivacky			return;
899206083Srdivacky		}
900218893Sdim		nce = nce1;
901206083Srdivacky	}
902206083Srdivacky	dst_ill = nce->nce_ill;
903206083Srdivacky
904206083Srdivacky	/*
905206083Srdivacky	 * Unless we are forwarding, drop the packet.
906206083Srdivacky	 * We have to let source routed packets through if they go out
907206083Srdivacky	 * the same interface i.e., they are 'ping -l' packets.
908206083Srdivacky	 */
909206083Srdivacky	if (!(dst_ill->ill_flags & ILLF_ROUTER) &&
910206083Srdivacky	    !(ip_source_routed(ipha, ipst) && dst_ill == ill)) {
911206083Srdivacky		if (ip_source_routed(ipha, ipst)) {
912206083Srdivacky			ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
913206083Srdivacky			icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
914218893Sdim			nce_refrele(nce);
915206083Srdivacky			return;
916206083Srdivacky		}
917206083Srdivacky		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
918206083Srdivacky		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
919206083Srdivacky		freemsg(mp);
920218893Sdim		nce_refrele(nce);
921218893Sdim		return;
922204642Srdivacky	}
923204642Srdivacky
924204642Srdivacky	if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
925218893Sdim		ipaddr_t	dst = ipha->ipha_dst;
926204642Srdivacky
927218893Sdim		ire->ire_ib_pkt_count--;
928218893Sdim		/*
929218893Sdim		 * Should only use IREs that are visible from the
930218893Sdim		 * global zone for forwarding.
931204642Srdivacky		 * Take a source route into account the same way as ip_input
932204642Srdivacky		 * did.
933204642Srdivacky		 */
934204642Srdivacky		if (ira->ira_flags & IRAF_IPV4_OPTIONS) {
935204642Srdivacky			int		error = 0;
936204642Srdivacky
937204642Srdivacky			dst = ip_input_options(ipha, dst, mp, ira, &error);
938204642Srdivacky			ASSERT(error == 0);	/* ip_input checked */
939204642Srdivacky		}
940203954Srdivacky		ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID,
941203954Srdivacky		    ira->ira_tsl, MATCH_IRE_SECATTR,
942203954Srdivacky		    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst,
943204642Srdivacky		    NULL, NULL, NULL);
944204642Srdivacky		ire->ire_ib_pkt_count++;
945218893Sdim		(*ire->ire_recvfn)(ire, mp, ipha, ira);
946204642Srdivacky		ire_refrele(ire);
947204642Srdivacky		nce_refrele(nce);
948204642Srdivacky		return;
949204642Srdivacky	}
950218893Sdim
951203954Srdivacky	/*
952204642Srdivacky	 * ipIfStatsHCInForwDatagrams should only be increment if there
953203954Srdivacky	 * will be an attempt to forward the packet, which is why we
954203954Srdivacky	 * increment after the above condition has been checked.
955204642Srdivacky	 */
956203954Srdivacky	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
957
958	/* Initiate Read side IPPF processing */
959	if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
960		/* ip_process translates an IS_UNDER_IPMP */
961		mp = ip_process(IPP_FWD_IN, mp, ill, ill);
962		if (mp == NULL) {
963			/* ip_drop_packet and MIB done */
964			ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred "
965			    "during IPPF processing\n"));
966			nce_refrele(nce);
967			return;
968		}
969	}
970
971	DTRACE_PROBE4(ip4__forwarding__start,
972	    ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp);
973
974	if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
975		int error;
976
977		FW_HOOKS(ipst->ips_ip4_forwarding_event,
978		    ipst->ips_ipv4firewall_forwarding,
979		    ill, dst_ill, ipha, mp, mp, 0, ipst, error);
980
981		DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
982
983		if (mp == NULL) {
984			nce_refrele(nce);
985			return;
986		}
987		/*
988		 * Even if the destination was changed by the filter we use the
989		 * forwarding decision that was made based on the address
990		 * in ip_input.
991		 */
992
993		/* Might have changed */
994		ipha = (ipha_t *)mp->b_rptr;
995		ira->ira_pktlen = ntohs(ipha->ipha_length);
996	}
997
998	/* Packet is being forwarded. Turning off hwcksum flag. */
999	DB_CKSUMFLAGS(mp) = 0;
1000
1001	/*
1002	 * Martian Address Filtering [RFC 1812, Section 5.3.7]
1003	 * The loopback address check for both src and dst has already
1004	 * been checked in ip_input
1005	 * In the future one can envision adding RPF checks using number 3.
1006	 * If we already checked the same source address we can skip this.
1007	 */
1008	if (!(ira->ira_flags & IRAF_VERIFIED_SRC) ||
1009	    src != ira->ira_verified_src) {
1010		switch (ipst->ips_src_check) {
1011		case 0:
1012			break;
1013		case 2:
1014			if (ip_type_v4(src, ipst) == IRE_BROADCAST) {
1015				BUMP_MIB(ill->ill_ip_mib,
1016				    ipIfStatsForwProhibits);
1017				BUMP_MIB(ill->ill_ip_mib,
1018				    ipIfStatsInAddrErrors);
1019				ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1020				freemsg(mp);
1021				nce_refrele(nce);
1022				return;
1023			}
1024			/* FALLTHRU */
1025
1026		case 1:
1027			if (CLASSD(src)) {
1028				BUMP_MIB(ill->ill_ip_mib,
1029				    ipIfStatsForwProhibits);
1030				BUMP_MIB(ill->ill_ip_mib,
1031				    ipIfStatsInAddrErrors);
1032				ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1033				freemsg(mp);
1034				nce_refrele(nce);
1035				return;
1036			}
1037			break;
1038		}
1039		/* Remember for next packet */
1040		ira->ira_flags |= IRAF_VERIFIED_SRC;
1041		ira->ira_verified_src = src;
1042	}
1043
1044	/*
1045	 * Check if packet is going out the same link on which it arrived.
1046	 * Means we might need to send a redirect.
1047	 */
1048	if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) {
1049		ip_send_potential_redirect_v4(mp, ipha, ire, ira);
1050	}
1051
1052	added_tx_len = 0;
1053	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
1054		mblk_t		*mp1;
1055		uint32_t	old_pkt_len = ira->ira_pktlen;
1056
1057		/*
1058		 * Check if it can be forwarded and add/remove
1059		 * CIPSO options as needed.
1060		 */
1061		if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
1062			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1063			ip_drop_input("tsol_ip_forward", mp, ill);
1064			freemsg(mp);
1065			nce_refrele(nce);
1066			return;
1067		}
1068		/*
1069		 * Size may have changed. Remember amount added in case
1070		 * IP needs to send an ICMP too big.
1071		 */
1072		mp = mp1;
1073		ipha = (ipha_t *)mp->b_rptr;
1074		ira->ira_pktlen = ntohs(ipha->ipha_length);
1075		ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
1076		if (ira->ira_pktlen > old_pkt_len)
1077			added_tx_len = ira->ira_pktlen - old_pkt_len;
1078
1079		/* Options can have been added or removed */
1080		if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH)
1081			ira->ira_flags |= IRAF_IPV4_OPTIONS;
1082		else
1083			ira->ira_flags &= ~IRAF_IPV4_OPTIONS;
1084	}
1085
1086	mtu = dst_ill->ill_mtu;
1087	if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
1088		mtu = iremtu;
1089	ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len);
1090	nce_refrele(nce);
1091}
1092
1093/*
1094 * Used for sending out unicast and multicast packets that are
1095 * forwarded.
1096 */
1097void
1098ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1099    ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len)
1100{
1101	ill_t		*dst_ill = nce->nce_ill;
1102	uint32_t	pkt_len;
1103	uint32_t	sum;
1104	iaflags_t	iraflags = ira->ira_flags;
1105	ip_stack_t	*ipst = ill->ill_ipst;
1106	iaflags_t	ixaflags;
1107
1108	if (ipha->ipha_ttl <= 1) {
1109		/* Perhaps the checksum was bad */
1110		if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1111			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1112			ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1113			freemsg(mp);
1114			return;
1115		}
1116		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1117		ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill);
1118		icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
1119		return;
1120	}
1121	ipha->ipha_ttl--;
1122	/* Adjust the checksum to reflect the ttl decrement. */
1123	sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
1124	ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
1125
1126	/* Check if there are options to update */
1127	if (iraflags & IRAF_IPV4_OPTIONS) {
1128		ASSERT(ipha->ipha_version_and_hdr_length !=
1129		    IP_SIMPLE_HDR_VERSION);
1130		ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM));
1131
1132		if (!ip_forward_options(mp, ipha, dst_ill, ira)) {
1133			/* ipIfStatsForwProhibits and ip_drop_input done */
1134			return;
1135		}
1136
1137		ipha->ipha_hdr_checksum = 0;
1138		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1139	}
1140
1141	/* Initiate Write side IPPF processing before any fragmentation */
1142	if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
1143		/* ip_process translates an IS_UNDER_IPMP */
1144		mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
1145		if (mp == NULL) {
1146			/* ip_drop_packet and MIB done */
1147			ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \
1148			    " during IPPF processing\n"));
1149			return;
1150		}
1151	}
1152
1153	pkt_len = ira->ira_pktlen;
1154
1155	BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
1156
1157	ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL;
1158
1159	if (pkt_len > mtu) {
1160		/*
1161		 * It needs fragging on its way out.  If we haven't
1162		 * verified the header checksum yet we do it now since
1163		 * are going to put a surely good checksum in the
1164		 * outgoing header, we have to make sure that it
1165		 * was good coming in.
1166		 */
1167		if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1168			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1169			ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1170			freemsg(mp);
1171			return;
1172		}
1173		if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) {
1174			BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
1175			ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
1176			if (iraflags & IRAF_SYSTEM_LABELED) {
1177				/*
1178				 * Remove any CIPSO option added by
1179				 * tsol_ip_forward, and make sure we report
1180				 * a path MTU so that there
1181				 * is room to add such a CIPSO option for future
1182				 * packets.
1183				 */
1184				mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
1185				    AF_INET);
1186			}
1187
1188			icmp_frag_needed(mp, mtu, ira);
1189			return;
1190		}
1191
1192		(void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu,
1193		    ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL);
1194		return;
1195	}
1196
1197	ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
1198	if (iraflags & IRAF_LOOPBACK_COPY) {
1199		/*
1200		 * IXAF_NO_LOOP_ZONEID is not set hence 7th arg
1201		 * is don't care
1202		 */
1203		(void) ip_postfrag_loopcheck(mp, nce,
1204		    ixaflags | IXAF_LOOPBACK_COPY,
1205		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
1206	} else {
1207		(void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint,
1208		    GLOBAL_ZONEID, 0, NULL);
1209	}
1210}
1211
1212/*
1213 * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
1214 * which is what ire_route_recursive returns when there is no matching ire.
1215 * Send ICMP unreachable unless blackhole.
1216 */
1217void
1218ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1219{
1220	ipha_t		*ipha = (ipha_t *)iph_arg;
1221	ill_t		*ill = ira->ira_ill;
1222	ip_stack_t	*ipst = ill->ill_ipst;
1223
1224	/* Would we have forwarded this packet if we had a route? */
1225	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
1226		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1227		ip_drop_input("l2 multicast not forwarded", mp, ill);
1228		freemsg(mp);
1229		return;
1230	}
1231
1232	if (!(ill->ill_flags & ILLF_ROUTER)) {
1233		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1234		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
1235		freemsg(mp);
1236		return;
1237	}
1238	/*
1239	 * If we had a route this could have been forwarded. Count as such.
1240	 *
1241	 * ipIfStatsHCInForwDatagrams should only be increment if there
1242	 * will be an attempt to forward the packet, which is why we
1243	 * increment after the above condition has been checked.
1244	 */
1245	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
1246
1247	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1248
1249	ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
1250	    ipst);
1251
1252	if (ire->ire_flags & RTF_BLACKHOLE) {
1253		ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
1254		freemsg(mp);
1255	} else {
1256		ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);
1257
1258		if (ip_source_routed(ipha, ipst)) {
1259			icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
1260		} else {
1261			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira);
1262		}
1263	}
1264}
1265
1266/*
1267 * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
1268 * VRRP when in noaccept mode.
1269 * We silently drop the packet. ARP handles packets even if noaccept is set.
1270 */
1271/* ARGSUSED */
1272void
1273ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1274    ip_recv_attr_t *ira)
1275{
1276	ill_t		*ill = ira->ira_ill;
1277
1278	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1279	ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
1280	freemsg(mp);
1281}
1282
1283/*
1284 * ire_recvfn for IRE_BROADCAST.
1285 */
1286void
1287ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1288    ip_recv_attr_t *ira)
1289{
1290	ipha_t		*ipha = (ipha_t *)iph_arg;
1291	ill_t		*ill = ira->ira_ill;
1292	ill_t		*dst_ill = ire->ire_ill;
1293	ip_stack_t	*ipst = ill->ill_ipst;
1294	ire_t		*alt_ire;
1295	nce_t		*nce;
1296	ipaddr_t	ipha_dst;
1297
1298	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
1299
1300	/* Tag for higher-level protocols */
1301	ira->ira_flags |= IRAF_BROADCAST;
1302
1303	/*
1304	 * Whether local or directed broadcast forwarding: don't allow
1305	 * for TCP.
1306	 */
1307	if (ipha->ipha_protocol == IPPROTO_TCP) {
1308		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1309		ip_drop_input("ipIfStatsInDiscards", mp, ill);
1310		freemsg(mp);
1311		return;
1312	}
1313
1314	/*
1315	 * So that we don't end up with dups, only one ill an IPMP group is
1316	 * nominated to receive broadcast traffic.
1317	 * If we have no cast_ill we are liberal and accept everything.
1318	 */
1319	if (IS_UNDER_IPMP(ill)) {
1320		/* For an under ill_grp can change under lock */
1321		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1322		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1323		    ill->ill_grp->ig_cast_ill != NULL) {
1324			rw_exit(&ipst->ips_ill_g_lock);
1325			/* No MIB since this is normal operation */
1326			ip_drop_input("not nom_cast", mp, ill);
1327			freemsg(mp);
1328			return;
1329		}
1330		rw_exit(&ipst->ips_ill_g_lock);
1331
1332		ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1333	}
1334
1335	/*
1336	 * After reassembly and IPsec we will need to duplicate the
1337	 * broadcast packet for all matching zones on the ill.
1338	 */
1339	ira->ira_zoneid = ALL_ZONES;
1340
1341	/*
1342	 * Check for directed broadcast i.e. ire->ire_ill is different than
1343	 * the incoming ill.
1344	 * The same broadcast address can be assigned to multiple interfaces
1345	 * so have to check explicitly for that case by looking up the alt_ire
1346	 */
1347	if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) {
1348		/* Reassemble on the ill on which the packet arrived */
1349		ip_input_local_v4(ire, mp, ipha, ira);
1350		/* Restore */
1351		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1352		return;
1353	}
1354
1355	/* Is there an IRE_BROADCAST on the incoming ill? */
1356	ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST :
1357	    ipha->ipha_dst);
1358	alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill,
1359	    ALL_ZONES, ira->ira_tsl,
1360	    MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL);
1361	if (alt_ire != NULL) {
1362		/* Not a directed broadcast */
1363		/*
1364		 * In the special case of multirouted broadcast
1365		 * packets, we unconditionally need to "gateway"
1366		 * them to the appropriate interface here so that reassembly
1367		 * works. We know that the IRE_BROADCAST on cgtp0 doesn't
1368		 * have RTF_MULTIRT set so we look for such an IRE in the
1369		 * bucket.
1370		 */
1371		if (alt_ire->ire_flags & RTF_MULTIRT) {
1372			irb_t		*irb;
1373			ire_t		*ire1;
1374
1375			irb = ire->ire_bucket;
1376			irb_refhold(irb);
1377			for (ire1 = irb->irb_ire; ire1 != NULL;
1378			    ire1 = ire1->ire_next) {
1379				if (IRE_IS_CONDEMNED(ire1))
1380					continue;
1381				if (!(ire1->ire_type & IRE_BROADCAST) ||
1382				    (ire1->ire_flags & RTF_MULTIRT))
1383					continue;
1384				ill = ire1->ire_ill;
1385				ill_refhold(ill);
1386				break;
1387			}
1388			irb_refrele(irb);
1389			if (ire1 != NULL) {
1390				ill_t *orig_ill = ira->ira_ill;
1391
1392				ire_refrele(alt_ire);
1393				/* Reassemble on the new ill */
1394				ira->ira_ill = ill;
1395				ip_input_local_v4(ire, mp, ipha, ira);
1396				ill_refrele(ill);
1397				/* Restore */
1398				ira->ira_ill = orig_ill;
1399				ira->ira_ruifindex =
1400				    orig_ill->ill_phyint->phyint_ifindex;
1401				return;
1402			}
1403		}
1404		ire_refrele(alt_ire);
1405		/* Reassemble on the ill on which the packet arrived */
1406		ip_input_local_v4(ire, mp, ipha, ira);
1407		goto done;
1408	}
1409
1410	/*
1411	 * This is a directed broadcast
1412	 *
1413	 * If directed broadcast is allowed, then forward the packet out
1414	 * the destination interface with IXAF_LOOPBACK_COPY set. That will
1415	 * result in ip_input() receiving a copy of the packet on the
1416	 * appropriate ill. (We could optimize this to avoid the extra trip
1417	 * via ip_input(), but since directed broadcasts are normally disabled
1418	 * it doesn't make sense to optimize it.)
1419	 */
1420	if (!ipst->ips_ip_g_forward_directed_bcast ||
1421	    (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) {
1422		ip_drop_input("directed broadcast not allowed", mp, ill);
1423		freemsg(mp);
1424		goto done;
1425	}
1426	if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1427		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1428		ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1429		freemsg(mp);
1430		goto done;
1431	}
1432
1433	/*
1434	 * Clear the indication that this may have hardware
1435	 * checksum as we are not using it for forwarding.
1436	 */
1437	DB_CKSUMFLAGS(mp) = 0;
1438
1439	/*
1440	 * Adjust ttl to 2 (1+1 - the forward engine will decrement it by one.
1441	 */
1442	ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
1443	ipha->ipha_hdr_checksum = 0;
1444	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1445
1446	/*
1447	 * We use ip_forward_xmit to do any fragmentation.
1448	 * and loopback copy on the outbound interface.
1449	 *
1450	 * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side.
1451	 */
1452	ira->ira_flags |= IRAF_LOOPBACK_COPY;
1453
1454	nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST);
1455	if (nce == NULL) {
1456		BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
1457		ip_drop_output("No nce", mp, dst_ill);
1458		freemsg(mp);
1459		goto done;
1460	}
1461
1462	ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mtu, 0);
1463	nce_refrele(nce);
1464done:
1465	/* Restore */
1466	ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1467}
1468
1469/*
1470 * ire_recvfn for IRE_MULTICAST.
1471 */
1472void
1473ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1474    ip_recv_attr_t *ira)
1475{
1476	ipha_t		*ipha = (ipha_t *)iph_arg;
1477	ill_t		*ill = ira->ira_ill;
1478	ip_stack_t	*ipst = ill->ill_ipst;
1479
1480	ASSERT(ire->ire_ill == ira->ira_ill);
1481
1482	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
1483	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
1484
1485	/* RSVP hook */
1486	if (ira->ira_flags & IRAF_RSVP)
1487		goto forus;
1488
1489	/* Tag for higher-level protocols */
1490	ira->ira_flags |= IRAF_MULTICAST;
1491
1492	/*
1493	 * So that we don't end up with dups, only one ill an IPMP group is
1494	 * nominated to receive multicast traffic.
1495	 * If we have no cast_ill we are liberal and accept everything.
1496	 */
1497	if (IS_UNDER_IPMP(ill)) {
1498		ip_stack_t	*ipst = ill->ill_ipst;
1499
1500		/* For an under ill_grp can change under lock */
1501		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1502		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1503		    ill->ill_grp->ig_cast_ill != NULL) {
1504			rw_exit(&ipst->ips_ill_g_lock);
1505			ip_drop_input("not on cast ill", mp, ill);
1506			freemsg(mp);
1507			return;
1508		}
1509		rw_exit(&ipst->ips_ill_g_lock);
1510		/*
1511		 * We switch to the upper ill so that mrouter and hasmembers
1512		 * can operate on upper here and in ip_input_multicast.
1513		 */
1514		ill = ipmp_ill_hold_ipmp_ill(ill);
1515		if (ill != NULL) {
1516			ASSERT(ill != ira->ira_ill);
1517			ASSERT(ire->ire_ill == ira->ira_ill);
1518			ira->ira_ill = ill;
1519			ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1520		} else {
1521			ill = ira->ira_ill;
1522		}
1523	}
1524
1525	/*
1526	 * Check if we are a multicast router - send ip_mforward a copy of
1527	 * the packet.
1528	 * Due to mroute_decap tunnels we consider forwarding packets even if
1529	 * mrouted has not joined the allmulti group on this interface.
1530	 */
1531	if (ipst->ips_ip_g_mrouter) {
1532		int retval;
1533
1534		/*
1535		 * Clear the indication that this may have hardware
1536		 * checksum as we are not using it for forwarding.
1537		 */
1538		DB_CKSUMFLAGS(mp) = 0;
1539
1540		/*
1541		 * ip_mforward helps us make these distinctions: If received
1542		 * on tunnel and not IGMP, then drop.
1543		 * If IGMP packet, then don't check membership
1544		 * If received on a phyint and IGMP or PIM, then
1545		 * don't check membership
1546		 */
1547		retval = ip_mforward(mp, ira);
1548		/* ip_mforward updates mib variables if needed */
1549
1550		switch (retval) {
1551		case 0:
1552			/*
1553			 * pkt is okay and arrived on phyint.
1554			 *
1555			 * If we are running as a multicast router
1556			 * we need to see all IGMP and/or PIM packets.
1557			 */
1558			if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
1559			    (ipha->ipha_protocol == IPPROTO_PIM)) {
1560				goto forus;
1561			}
1562			break;
1563		case -1:
1564			/* pkt is mal-formed, toss it */
1565			freemsg(mp);
1566			goto done;
1567		case 1:
1568			/*
1569			 * pkt is okay and arrived on a tunnel
1570			 *
1571			 * If we are running a multicast router
1572			 * we need to see all igmp packets.
1573			 */
1574			if (ipha->ipha_protocol == IPPROTO_IGMP) {
1575				goto forus;
1576			}
1577			ip_drop_input("Multicast on tunnel ignored", mp, ill);
1578			freemsg(mp);
1579			goto done;
1580		}
1581	}
1582
1583	/*
1584	 * Check if we have members on this ill. This is not necessary for
1585	 * correctness because even if the NIC/GLD had a leaky filter, we
1586	 * filter before passing to each conn_t.
1587	 */
1588	if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
1589		/*
1590		 * Nobody interested
1591		 *
1592		 * This might just be caused by the fact that
1593		 * multiple IP Multicast addresses map to the same
1594		 * link layer multicast - no need to increment counter!
1595		 */
1596		ip_drop_input("Multicast with no members", mp, ill);
1597		freemsg(mp);
1598		goto done;
1599	}
1600forus:
1601	ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
1602	    ntohl(ipha->ipha_dst)));
1603
1604	/*
1605	 * After reassembly and IPsec we will need to duplicate the
1606	 * multicast packet for all matching zones on the ill.
1607	 */
1608	ira->ira_zoneid = ALL_ZONES;
1609
1610	/* Reassemble on the ill on which the packet arrived */
1611	ip_input_local_v4(ire, mp, ipha, ira);
1612done:
1613	if (ill != ire->ire_ill) {
1614		ill_refrele(ill);
1615		ira->ira_ill = ire->ire_ill;
1616		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
1617	}
1618}
1619
1620/*
1621 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
1622 * Drop packets since we don't forward out multirt routes.
1623 */
1624/* ARGSUSED */
1625void
1626ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1627{
1628	ill_t		*ill = ira->ira_ill;
1629
1630	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1631	ip_drop_input("Not forwarding out MULTIRT", mp, ill);
1632	freemsg(mp);
1633}
1634
1635/*
1636 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
1637 * has rewritten the packet to have a loopback destination address (We
1638 * filter out packet with a loopback destination from arriving over the wire).
1639 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
1640 */
1641void
1642ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1643{
1644	ipha_t		*ipha = (ipha_t *)iph_arg;
1645	ill_t		*ill = ira->ira_ill;
1646	ill_t		*ire_ill = ire->ire_ill;
1647
1648	ira->ira_zoneid = GLOBAL_ZONEID;
1649
1650	/* Switch to the lo0 ill for further processing  */
1651	if (ire_ill != ill) {
1652		/*
1653		 * Update ira_ill to be the ILL on which the IP address
1654		 * is hosted.
1655		 * No need to hold the ill since we have a hold on the ire
1656		 */
1657		ASSERT(ira->ira_ill == ira->ira_rill);
1658		ira->ira_ill = ire_ill;
1659
1660		ip_input_local_v4(ire, mp, ipha, ira);
1661
1662		/* Restore */
1663		ASSERT(ira->ira_ill == ire_ill);
1664		ira->ira_ill = ill;
1665		return;
1666
1667	}
1668	ip_input_local_v4(ire, mp, ipha, ira);
1669}
1670
1671/*
1672 * ire_recvfn for IRE_LOCAL.
1673 */
1674void
1675ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1676{
1677	ipha_t		*ipha = (ipha_t *)iph_arg;
1678	ill_t		*ill = ira->ira_ill;
1679	ill_t		*ire_ill = ire->ire_ill;
1680
1681	/* Make a note for DAD that this address is in use */
1682	ire->ire_last_used_time = lbolt;
1683
1684	/* Only target the IRE_LOCAL with the right zoneid. */
1685	ira->ira_zoneid = ire->ire_zoneid;
1686
1687	/*
1688	 * If the packet arrived on the wrong ill, we check that
1689	 * this is ok.
1690	 * If it is, then we ensure that we do the reassembly on
1691	 * the ill on which the address is hosted. We keep ira_rill as
1692	 * the one on which the packet arrived, so that IP_PKTINFO and
1693	 * friends can report this.
1694	 */
1695	if (ire_ill != ill) {
1696		ire_t *new_ire;
1697
1698		new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
1699		if (new_ire == NULL) {
1700			/* Drop packet */
1701			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1702			ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
1703			freemsg(mp);
1704			return;
1705		}
1706		/*
1707		 * Update ira_ill to be the ILL on which the IP address
1708		 * is hosted. No need to hold the ill since we have a
1709		 * hold on the ire. Note that we do the switch even if
1710		 * new_ire == ire (for IPMP, ire would be the one corresponding
1711		 * to the IPMP ill).
1712		 */
1713		ASSERT(ira->ira_ill == ira->ira_rill);
1714		ira->ira_ill = new_ire->ire_ill;
1715
1716		/* ira_ruifindex tracks the upper for ira_rill */
1717		if (IS_UNDER_IPMP(ill))
1718			ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1719
1720		ip_input_local_v4(new_ire, mp, ipha, ira);
1721
1722		/* Restore */
1723		ASSERT(ira->ira_ill == new_ire->ire_ill);
1724		ira->ira_ill = ill;
1725		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1726
1727		if (new_ire != ire)
1728			ire_refrele(new_ire);
1729		return;
1730	}
1731
1732	ip_input_local_v4(ire, mp, ipha, ira);
1733}
1734
1735/*
1736 * Common function for packets arriving for the host. Handles
1737 * checksum verification, reassembly checks, etc.
1738 */
1739static void
1740ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1741{
1742	ill_t		*ill = ira->ira_ill;
1743	iaflags_t	iraflags = ira->ira_flags;
1744
1745	/*
1746	 * Verify IP header checksum. If the packet was AH or ESP then
1747	 * this flag has already been cleared. Likewise if the packet
1748	 * had a hardware checksum.
1749	 */
1750	if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1751		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1752		ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1753		freemsg(mp);
1754		return;
1755	}
1756
1757	if (iraflags & IRAF_IPV4_OPTIONS) {
1758		if (!ip_input_local_options(mp, ipha, ira)) {
1759			/* Error has been sent and mp consumed */
1760			return;
1761		}
1762	}
1763
1764	/*
1765	 * Is packet part of fragmented IP packet?
1766	 * We compare against defined values in network byte order
1767	 */
1768	if (ipha->ipha_fragment_offset_and_flags &
1769	    (IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
1770		/*
1771		 * Make sure we have ira_l2src before we loose the original
1772		 * mblk
1773		 */
1774		if (!(ira->ira_flags & IRAF_L2SRC_SET))
1775			ip_setl2src(mp, ira, ira->ira_rill);
1776
1777		mp = ip_input_fragment(mp, ipha, ira);
1778		if (mp == NULL)
1779			return;
1780		/* Completed reassembly */
1781		ipha = (ipha_t *)mp->b_rptr;
1782	}
1783
1784	/*
1785	 * For broadcast and multicast we need some extra work before
1786	 * we call ip_fanout_v4(), since in the case of shared-IP zones
1787	 * we need to pretend that a packet arrived for each zoneid.
1788	 */
1789	if (iraflags & IRAF_MULTIBROADCAST) {
1790		if (iraflags & IRAF_BROADCAST)
1791			ip_input_broadcast_v4(ire, mp, ipha, ira);
1792		else
1793			ip_input_multicast_v4(ire, mp, ipha, ira);
1794		return;
1795	}
1796	ip_fanout_v4(mp, ipha, ira);
1797}
1798
1799
1800/*
1801 * Handle multiple zones which match the same broadcast address
1802 * and ill by delivering a packet to each of them.
1803 * Walk the bucket and look for different ire_zoneid but otherwise
1804 * the same IRE (same ill/addr/mask/type).
1805 * Note that ire_add() tracks IREs that are identical in all
1806 * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
1807 * increasing ire_identical_cnt. Thus we don't need to be concerned
1808 * about those.
1809 */
1810static void
1811ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1812{
1813	ill_t		*ill = ira->ira_ill;
1814	ip_stack_t	*ipst = ill->ill_ipst;
1815	netstack_t	*ns = ipst->ips_netstack;
1816	irb_t		*irb;
1817	ire_t		*ire1;
1818	mblk_t		*mp1;
1819	ipha_t		*ipha1;
1820
1821	irb = ire->ire_bucket;
1822
1823	/*
1824	 * If we don't have more than one shared-IP zone, or if
1825	 * there can't be more than one IRE_BROADCAST for this
1826	 * IP address, then just set the zoneid and proceed.
1827	 */
1828	if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) {
1829		ira->ira_zoneid = ire->ire_zoneid;
1830
1831		ip_fanout_v4(mp, ipha, ira);
1832		return;
1833	}
1834	irb_refhold(irb);
1835	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1836		/* We do the main IRE after the end of the loop */
1837		if (ire1 == ire)
1838			continue;
1839
1840		/*
1841		 * Only IREs for the same IP address should be in the same
1842		 * bucket.
1843		 * But could have IRE_HOSTs in the case of CGTP.
1844		 */
1845		ASSERT(ire1->ire_addr == ire->ire_addr);
1846		if (!(ire1->ire_type & IRE_BROADCAST))
1847			continue;
1848
1849		if (IRE_IS_CONDEMNED(ire1))
1850			continue;
1851
1852		mp1 = copymsg(mp);
1853		if (mp1 == NULL) {
1854			/* Failed to deliver to one zone */
1855			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1856			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1857			continue;
1858		}
1859		ira->ira_zoneid = ire1->ire_zoneid;
1860		ipha1 = (ipha_t *)mp1->b_rptr;
1861		ip_fanout_v4(mp1, ipha1, ira);
1862	}
1863	irb_refrele(irb);
1864	/* Do the main ire */
1865	ira->ira_zoneid = ire->ire_zoneid;
1866	ip_fanout_v4(mp, ipha, ira);
1867}
1868
1869/*
1870 * Handle multiple zones which want to receive the same multicast packets
1871 * on this ill by delivering a packet to each of them.
1872 *
1873 * Note that for packets delivered to transports we could instead do this
1874 * as part of the fanout code, but since we need to handle icmp_inbound
1875 * it is simpler to have multicast work the same as broadcast.
1876 *
1877 * The ip_fanout matching for multicast matches based on ilm independent of
1878 * zoneid since the zoneid restriction is applied when joining a multicast
1879 * group.
1880 */
1881/* ARGSUSED */
1882static void
1883ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1884{
1885	ill_t		*ill = ira->ira_ill;
1886	iaflags_t	iraflags = ira->ira_flags;
1887	ip_stack_t	*ipst = ill->ill_ipst;
1888	netstack_t	*ns = ipst->ips_netstack;
1889	zoneid_t	zoneid;
1890	mblk_t		*mp1;
1891	ipha_t		*ipha1;
1892
1893	/* ire_recv_multicast has switched to the upper ill for IPMP */
1894	ASSERT(!IS_UNDER_IPMP(ill));
1895
1896	/*
1897	 * If we don't have more than one shared-IP zone, or if
1898	 * there are no members in anything but the global zone,
1899	 * then just set the zoneid and proceed.
1900	 */
1901	if (ns->netstack_numzones == 1 ||
1902	    !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1903	    GLOBAL_ZONEID)) {
1904		ira->ira_zoneid = GLOBAL_ZONEID;
1905
1906		/* If sender didn't want this zone to receive it, drop */
1907		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1908		    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1909			ip_drop_input("Multicast but wrong zoneid", mp, ill);
1910			freemsg(mp);
1911			return;
1912		}
1913		ip_fanout_v4(mp, ipha, ira);
1914		return;
1915	}
1916
1917	/*
1918	 * Here we loop over all zoneids that have members in the group
1919	 * and deliver a packet to ip_fanout for each zoneid.
1920	 *
1921	 * First find any members in the lowest numeric zoneid by looking for
1922	 * first zoneid larger than -1 (ALL_ZONES).
1923	 * We terminate the loop when we receive -1 (ALL_ZONES).
1924	 */
1925	zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES);
1926	for (; zoneid != ALL_ZONES;
1927	    zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) {
1928		/*
1929		 * Avoid an extra copymsg/freemsg by skipping global zone here
1930		 * and doing that at the end.
1931		 */
1932		if (zoneid == GLOBAL_ZONEID)
1933			continue;
1934
1935		ira->ira_zoneid = zoneid;
1936
1937		/* If sender didn't want this zone to receive it, skip */
1938		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1939		    ira->ira_no_loop_zoneid == ira->ira_zoneid)
1940			continue;
1941
1942		mp1 = copymsg(mp);
1943		if (mp1 == NULL) {
1944			/* Failed to deliver to one zone */
1945			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1946			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1947			continue;
1948		}
1949		ipha1 = (ipha_t *)mp1->b_rptr;
1950		ip_fanout_v4(mp1, ipha1, ira);
1951	}
1952
1953	/* Do the main ire */
1954	ira->ira_zoneid = GLOBAL_ZONEID;
1955	/* If sender didn't want this zone to receive it, drop */
1956	if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1957	    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1958		ip_drop_input("Multicast but wrong zoneid", mp, ill);
1959		freemsg(mp);
1960	} else {
1961		ip_fanout_v4(mp, ipha, ira);
1962	}
1963}
1964
1965
1966/*
1967 * Determine the zoneid and IRAF_TX_* flags if trusted extensions
1968 * is in use. Updates ira_zoneid and ira_flags as a result.
1969 */
1970static void
1971ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol,
1972    uint_t ip_hdr_length, ip_recv_attr_t *ira)
1973{
1974	uint16_t	*up;
1975	uint16_t	lport;
1976	zoneid_t	zoneid;
1977
1978	ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
1979
1980	/*
1981	 * If the packet is unlabeled we might allow read-down
1982	 * for MAC_EXEMPT. Below we clear this if it is a multi-level
1983	 * port (MLP).
1984	 * Note that ira_tsl can be NULL here.
1985	 */
1986	if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
1987		ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
1988
1989	if (ira->ira_zoneid != ALL_ZONES)
1990		return;
1991
1992	ira->ira_flags |= IRAF_TX_SHARED_ADDR;
1993
1994	up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
1995	switch (protocol) {
1996	case IPPROTO_TCP:
1997	case IPPROTO_SCTP:
1998	case IPPROTO_UDP:
1999		/* Caller ensures this */
2000		ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr);
2001
2002		/*
2003		 * Only these transports support MLP.
2004		 * We know their destination port numbers is in
2005		 * the same place in the header.
2006		 */
2007		lport = up[1];
2008
2009		/*
2010		 * No need to handle exclusive-stack zones
2011		 * since ALL_ZONES only applies to the shared IP instance.
2012		 */
2013		zoneid = tsol_mlp_findzone(protocol, lport);
2014		/*
2015		 * If no shared MLP is found, tsol_mlp_findzone returns
2016		 * ALL_ZONES.  In that case, we assume it's SLP, and
2017		 * search for the zone based on the packet label.
2018		 *
2019		 * If there is such a zone, we prefer to find a
2020		 * connection in it.  Otherwise, we look for a
2021		 * MAC-exempt connection in any zone whose label
2022		 * dominates the default label on the packet.
2023		 */
2024		if (zoneid == ALL_ZONES)
2025			zoneid = tsol_attr_to_zoneid(ira);
2026		else
2027			ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
2028		break;
2029	default:
2030		/* Handle shared address for other protocols */
2031		zoneid = tsol_attr_to_zoneid(ira);
2032		break;
2033	}
2034	ira->ira_zoneid = zoneid;
2035}
2036
2037/*
2038 * Increment checksum failure statistics
2039 */
2040static void
2041ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
2042{
2043	ip_stack_t	*ipst = ill->ill_ipst;
2044
2045	switch (protocol) {
2046	case IPPROTO_TCP:
2047		BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
2048
2049		if (hck_flags & HCK_FULLCKSUM)
2050			IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
2051		else if (hck_flags & HCK_PARTIALCKSUM)
2052			IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
2053		else
2054			IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
2055		break;
2056	case IPPROTO_UDP:
2057		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
2058		if (hck_flags & HCK_FULLCKSUM)
2059			IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
2060		else if (hck_flags & HCK_PARTIALCKSUM)
2061			IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
2062		else
2063			IP_STAT(ipst, ip_udp_in_sw_cksum_err);
2064		break;
2065	case IPPROTO_ICMP:
2066		BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2067		break;
2068	default:
2069		ASSERT(0);
2070		break;
2071	}
2072}
2073
2074/* Calculate the IPv4 pseudo-header checksum */
2075uint32_t
2076ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
2077{
2078	uint_t		ulp_len;
2079	uint32_t	cksum;
2080	uint8_t		protocol = ira->ira_protocol;
2081	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;
2082
2083#define	iphs    ((uint16_t *)ipha)
2084
2085	switch (protocol) {
2086	case IPPROTO_TCP:
2087		ulp_len = ira->ira_pktlen - ip_hdr_length;
2088
2089		/* Protocol and length */
2090		cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
2091		/* IP addresses */
2092		cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2093		break;
2094
2095	case IPPROTO_UDP: {
2096		udpha_t		*udpha;
2097
2098		udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2099
2100		/* Protocol and length */
2101		cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
2102		/* IP addresses */
2103		cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2104		break;
2105	}
2106
2107	default:
2108		cksum = 0;
2109		break;
2110	}
2111#undef	iphs
2112	return (cksum);
2113}
2114
2115
2116/*
2117 * Software verification of the ULP checksums.
2118 * Returns B_TRUE if ok.
2119 * Increments statistics of failed.
2120 */
2121static boolean_t
2122ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2123{
2124	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
2125	uint32_t	cksum;
2126	uint8_t		protocol = ira->ira_protocol;
2127	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;
2128
2129	IP_STAT(ipst, ip_in_sw_cksum);
2130
2131	ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
2132
2133	cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2134	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
2135	if (cksum == 0)
2136		return (B_TRUE);
2137
2138	ip_input_cksum_err_v4(protocol, 0, ira->ira_ill);
2139	return (B_FALSE);
2140}
2141
/*
 * There are drivers that can't do partial checksum with IP options.
 * While this is non-zero, ip_input_cksum_v4() trusts a hardware partial
 * checksum only when the IP header length is IP_SIMPLE_HDR_LENGTH (i.e. the
 * packet carries no IP options); otherwise it falls back to the software
 * checksum.
 */
int eri_cksum_workaround = 1;
2144
2145/*
2146 * Verify the ULP checksums.
2147 * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
2148 * algorithm.
2149 * Increments statistics if failed.
2150 */
2151static boolean_t
2152ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
2153    ip_recv_attr_t *ira)
2154{
2155	ill_t		*ill = ira->ira_rill;
2156	uint16_t	hck_flags;
2157	uint32_t	cksum;
2158	mblk_t		*mp1;
2159	int32_t		len;
2160	uint8_t		protocol = ira->ira_protocol;
2161	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;
2162
2163
2164	switch (protocol) {
2165	case IPPROTO_TCP:
2166		break;
2167
2168	case IPPROTO_UDP: {
2169		udpha_t		*udpha;
2170
2171		udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2172		if (udpha->uha_checksum == 0) {
2173			/* Packet doesn't have a UDP checksum */
2174			return (B_TRUE);
2175		}
2176		break;
2177	}
2178	case IPPROTO_SCTP: {
2179		sctp_hdr_t	*sctph;
2180		uint32_t	pktsum;
2181
2182		sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length);
2183#ifdef	DEBUG
2184		if (skip_sctp_cksum)
2185			return (B_TRUE);
2186#endif
2187		pktsum = sctph->sh_chksum;
2188		sctph->sh_chksum = 0;
2189		cksum = sctp_cksum(mp, ip_hdr_length);
2190		sctph->sh_chksum = pktsum;
2191		if (cksum == pktsum)
2192			return (B_TRUE);
2193
2194		/*
2195		 * Defer until later whether a bad checksum is ok
2196		 * in order to allow RAW sockets to use Adler checksum
2197		 * with SCTP.
2198		 */
2199		ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
2200		return (B_TRUE);
2201	}
2202
2203	default:
2204		/* No ULP checksum to verify. */
2205		return (B_TRUE);
2206	}
2207	/*
2208	 * Revert to software checksum calculation if the interface
2209	 * isn't capable of checksum offload.
2210	 * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
2211	 * Note: IRAF_NO_HW_CKSUM is not currently used.
2212	 */
2213	ASSERT(!IS_IPMP(ill));
2214	if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
2215	    !dohwcksum) {
2216		return (ip_input_sw_cksum_v4(mp, ipha, ira));
2217	}
2218
2219	/*
2220	 * We apply this for all ULP protocols. Does the HW know to
2221	 * not set the flags for SCTP and other protocols.
2222	 */
2223
2224	hck_flags = DB_CKSUMFLAGS(mp);
2225
2226	if (hck_flags & HCK_FULLCKSUM) {
2227		/*
2228		 * Full checksum has been computed by the hardware
2229		 * and has been attached.  If the driver wants us to
2230		 * verify the correctness of the attached value, in
2231		 * order to protect against faulty hardware, compare
2232		 * it against -0 (0xFFFF) to see if it's valid.
2233		 */
2234		if (hck_flags & HCK_FULLCKSUM_OK)
2235			return (B_TRUE);
2236
2237		cksum = DB_CKSUM16(mp);
2238		if (cksum == 0xFFFF)
2239			return (B_TRUE);
2240		ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2241		return (B_FALSE);
2242	}
2243
2244	mp1 = mp->b_cont;
2245	if ((hck_flags & HCK_PARTIALCKSUM) &&
2246	    (mp1 == NULL || mp1->b_cont == NULL) &&
2247	    ip_hdr_length >= DB_CKSUMSTART(mp) &&
2248	    (!eri_cksum_workaround || ip_hdr_length == IP_SIMPLE_HDR_LENGTH) &&
2249	    ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
2250		uint32_t	adj;
2251		uchar_t		*cksum_start;
2252
2253		cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2254
2255		cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp));
2256
2257		/*
2258		 * Partial checksum has been calculated by hardware
2259		 * and attached to the packet; in addition, any
2260		 * prepended extraneous data is even byte aligned,
2261		 * and there are at most two mblks associated with
2262		 * the packet.  If any such data exists, we adjust
2263		 * the checksum; also take care any postpended data.
2264		 */
2265		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
2266		/*
2267		 * One's complement subtract extraneous checksum
2268		 */
2269		cksum += DB_CKSUM16(mp);
2270		if (adj >= cksum)
2271			cksum = ~(adj - cksum) & 0xFFFF;
2272		else
2273			cksum -= adj;
2274		cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2275		cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2276		if (!(~cksum & 0xFFFF))
2277			return (B_TRUE);
2278
2279		ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2280		return (B_FALSE);
2281	}
2282	return (ip_input_sw_cksum_v4(mp, ipha, ira));
2283}
2284
2285
2286/*
2287 * Handle fanout of received packets.
2288 * Unicast packets that are looped back (from ire_send_local_v4) and packets
2289 * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
2290 *
2291 * IPQoS Notes
2292 * Before sending it to the client, invoke IPPF processing. Policy processing
2293 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
2294 */
2295void
2296ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2297{
2298	ill_t		*ill = ira->ira_ill;
2299	iaflags_t	iraflags = ira->ira_flags;
2300	ip_stack_t	*ipst = ill->ill_ipst;
2301	uint8_t		protocol = ipha->ipha_protocol;
2302	conn_t		*connp;
2303#define	rptr	((uchar_t *)ipha)
2304	uint_t		ip_hdr_length;
2305	uint_t		min_ulp_header_length;
2306	int		offset;
2307	ssize_t		len;
2308	netstack_t	*ns = ipst->ips_netstack;
2309	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2310	ill_t		*rill = ira->ira_rill;
2311
2312	ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length));
2313
2314	ip_hdr_length = ira->ira_ip_hdr_length;
2315	ira->ira_protocol = protocol;
2316
2317	/*
2318	 * Time for IPP once we've done reassembly and IPsec.
2319	 * We skip this for loopback packets since we don't do IPQoS
2320	 * on loopback.
2321	 */
2322	if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
2323	    !(iraflags & IRAF_LOOPBACK) &&
2324	    (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) {
2325		/*
2326		 * Use the interface on which the packet arrived - not where
2327		 * the IP address is hosted.
2328		 */
2329		/* ip_process translates an IS_UNDER_IPMP */
2330		mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
2331		if (mp == NULL) {
2332			/* ip_drop_packet and MIB done */
2333			return;
2334		}
2335	}
2336
2337	/* Determine the minimum required size of the upper-layer header */
2338	/* Need to do this for at least the set of ULPs that TX handles. */
2339	switch (protocol) {
2340	case IPPROTO_TCP:
2341		min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
2342		break;
2343	case IPPROTO_SCTP:
2344		min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
2345		break;
2346	case IPPROTO_UDP:
2347		min_ulp_header_length = UDPH_SIZE;
2348		break;
2349	case IPPROTO_ICMP:
2350		min_ulp_header_length = ICMPH_SIZE;
2351		break;
2352	default:
2353		min_ulp_header_length = 0;
2354		break;
2355	}
2356	/* Make sure we have the min ULP header length */
2357	len = mp->b_wptr - rptr;
2358	if (len < ip_hdr_length + min_ulp_header_length) {
2359		if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) {
2360			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2361			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2362			freemsg(mp);
2363			return;
2364		}
2365		IP_STAT(ipst, ip_recv_pullup);
2366		ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
2367		    ira);
2368		if (ipha == NULL)
2369			goto discard;
2370		len = mp->b_wptr - rptr;
2371	}
2372
2373	/*
2374	 * If trusted extensions then determine the zoneid and TX specific
2375	 * ira_flags.
2376	 */
2377	if (iraflags & IRAF_SYSTEM_LABELED) {
2378		/* This can update ira->ira_flags and ira->ira_zoneid */
2379		ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira);
2380		iraflags = ira->ira_flags;
2381	}
2382
2383
2384	/* Verify ULP checksum. Handles TCP, UDP, and SCTP */
2385	if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
2386		if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) {
2387			/* Bad checksum. Stats are already incremented */
2388			ip_drop_input("Bad ULP checksum", mp, ill);
2389			freemsg(mp);
2390			return;
2391		}
2392		/* IRAF_SCTP_CSUM_ERR could have been set */
2393		iraflags = ira->ira_flags;
2394	}
2395	switch (protocol) {
2396	case IPPROTO_TCP:
2397		/* For TCP, discard broadcast and multicast packets. */
2398		if (iraflags & IRAF_MULTIBROADCAST)
2399			goto discard;
2400
2401		/* First mblk contains IP+TCP headers per above check */
2402		ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
2403
2404		/* TCP options present? */
2405		offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4;
2406		if (offset != 5) {
2407			if (offset < 5)
2408				goto discard;
2409
2410			/*
2411			 * There must be TCP options.
2412			 * Make sure we can grab them.
2413			 */
2414			offset <<= 2;
2415			offset += ip_hdr_length;
2416			if (len < offset) {
2417				if (ira->ira_pktlen < offset) {
2418					BUMP_MIB(ill->ill_ip_mib,
2419					    ipIfStatsInTruncatedPkts);
2420					ip_drop_input(
2421					    "ipIfStatsInTruncatedPkts",
2422					    mp, ill);
2423					freemsg(mp);
2424					return;
2425				}
2426				IP_STAT(ipst, ip_recv_pullup);
2427				ipha = ip_pullup(mp, offset, ira);
2428				if (ipha == NULL)
2429					goto discard;
2430				len = mp->b_wptr - rptr;
2431			}
2432		}
2433
2434		/*
2435		 * Pass up a squeue hint to tcp.
2436		 * If ira_sqp is already set (this is loopback) we leave it
2437		 * alone.
2438		 */
2439		if (ira->ira_sqp == NULL) {
2440			ira->ira_sqp = ip_squeue_get(ira->ira_ring);
2441		}
2442
2443		/* Look for AF_INET or AF_INET6 that matches */
2444		connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length,
2445		    ira, ipst);
2446		if (connp == NULL) {
2447			/* Send the TH_RST */
2448			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2449			tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2450			return;
2451		}
2452		if (connp->conn_incoming_ifindex != 0 &&
2453		    connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2454			CONN_DEC_REF(connp);
2455
2456			/* Send the TH_RST */
2457			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2458			tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2459			return;
2460		}
2461		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2462		    (iraflags & IRAF_IPSEC_SECURE)) {
2463			mp = ipsec_check_inbound_policy(mp, connp,
2464			    ipha, NULL, ira);
2465			if (mp == NULL) {
2466				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2467				/* Note that mp is NULL */
2468				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2469				CONN_DEC_REF(connp);
2470				return;
2471			}
2472		}
2473		/* Found a client; up it goes */
2474		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2475		ira->ira_ill = ira->ira_rill = NULL;
2476		if (!IPCL_IS_TCP(connp)) {
2477			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2478			(connp->conn_recv)(connp, mp, NULL, ira);
2479			CONN_DEC_REF(connp);
2480			ira->ira_ill = ill;
2481			ira->ira_rill = rill;
2482			return;
2483		}
2484
2485		/*
2486		 * We do different processing whether called from
2487		 * ip_accept_tcp and we match the target, don't match
2488		 * the target, and when we are called by ip_input.
2489		 */
2490		if (iraflags & IRAF_TARGET_SQP) {
2491			if (ira->ira_target_sqp == connp->conn_sqp) {
2492				mblk_t	*attrmp;
2493
2494				attrmp = ip_recv_attr_to_mblk(ira);
2495				if (attrmp == NULL) {
2496					BUMP_MIB(ill->ill_ip_mib,
2497					    ipIfStatsInDiscards);
2498					ip_drop_input("ipIfStatsInDiscards",
2499					    mp, ill);
2500					freemsg(mp);
2501					CONN_DEC_REF(connp);
2502				} else {
2503					SET_SQUEUE(attrmp, connp->conn_recv,
2504					    connp);
2505					attrmp->b_cont = mp;
2506					ASSERT(ira->ira_target_sqp_mp == NULL);
2507					ira->ira_target_sqp_mp = attrmp;
2508					/*
2509					 * Conn ref release when drained from
2510					 * the squeue.
2511					 */
2512				}
2513			} else {
2514				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2515				    connp->conn_recv, connp, ira, SQ_FILL,
2516				    SQTAG_IP_TCP_INPUT);
2517			}
2518		} else {
2519			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
2520			    connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT);
2521		}
2522		ira->ira_ill = ill;
2523		ira->ira_rill = rill;
2524		return;
2525
2526	case IPPROTO_SCTP: {
2527		sctp_hdr_t	*sctph;
2528		in6_addr_t	map_src, map_dst;
2529		uint32_t	ports;	/* Source and destination ports */
2530		sctp_stack_t	*sctps = ipst->ips_netstack->netstack_sctp;
2531
2532		/* For SCTP, discard broadcast and multicast packets. */
2533		if (iraflags & IRAF_MULTIBROADCAST)
2534			goto discard;
2535
2536		/*
2537		 * Since there is no SCTP h/w cksum support yet, just
2538		 * clear the flag.
2539		 */
2540		DB_CKSUMFLAGS(mp) = 0;
2541
2542		/* Length ensured above */
2543		ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
2544		sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
2545
2546		/* get the ports */
2547		ports = *(uint32_t *)&sctph->sh_sport;
2548
2549		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
2550		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
2551		if (iraflags & IRAF_SCTP_CSUM_ERR) {
2552			/*
2553			 * No potential sctp checksum errors go to the Sun
2554			 * sctp stack however they might be Adler-32 summed
2555			 * packets a userland stack bound to a raw IP socket
2556			 * could reasonably use. Note though that Adler-32 is
2557			 * a long deprecated algorithm and customer sctp
2558			 * networks should eventually migrate to CRC-32 at
2559			 * which time this facility should be removed.
2560			 */
2561			ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2562			return;
2563		}
2564		connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp, sctps);
2565		if (connp == NULL) {
2566			/* Check for raw socket or OOTB handling */
2567			ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2568			return;
2569		}
2570		if (connp->conn_incoming_ifindex != 0 &&
2571		    connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2572			CONN_DEC_REF(connp);
2573			/* Check for raw socket or OOTB handling */
2574			ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2575			return;
2576		}
2577
2578		/* Found a client; up it goes */
2579		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2580		sctp_input(connp, ipha, NULL, mp, ira);
2581		/* sctp_input does a rele of the sctp_t */
2582		return;
2583	}
2584
2585	case IPPROTO_UDP:
2586		/* First mblk contains IP+UDP headers as checked above */
2587		ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
2588
2589		if (iraflags & IRAF_MULTIBROADCAST) {
2590			uint16_t *up;	/* Pointer to ports in ULP header */
2591
2592			up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2593			ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira);
2594			return;
2595		}
2596
2597		/* Look for AF_INET or AF_INET6 that matches */
2598		connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length,
2599		    ira, ipst);
2600		if (connp == NULL) {
2601	no_udp_match:
2602			if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].
2603			    connf_head != NULL) {
2604				ASSERT(ira->ira_protocol == IPPROTO_UDP);
2605				ip_fanout_proto_v4(mp, ipha, ira);
2606			} else {
2607				ip_fanout_send_icmp_v4(mp,
2608				    ICMP_DEST_UNREACHABLE,
2609				    ICMP_PORT_UNREACHABLE, ira);
2610			}
2611			return;
2612
2613		}
2614		if (connp->conn_incoming_ifindex != 0 &&
2615		    connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2616			CONN_DEC_REF(connp);
2617			goto no_udp_match;
2618		}
2619		if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
2620		    !canputnext(connp->conn_rq)) {
2621			CONN_DEC_REF(connp);
2622			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
2623			ip_drop_input("udpIfStatsInOverflows", mp, ill);
2624			freemsg(mp);
2625			return;
2626		}
2627		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2628		    (iraflags & IRAF_IPSEC_SECURE)) {
2629			mp = ipsec_check_inbound_policy(mp, connp,
2630			    ipha, NULL, ira);
2631			if (mp == NULL) {
2632				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2633				/* Note that mp is NULL */
2634				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2635				CONN_DEC_REF(connp);
2636				return;
2637			}
2638		}
2639		/*
2640		 * Remove 0-spi if it's 0, or move everything behind
2641		 * the UDP header over it and forward to ESP via
2642		 * ip_fanout_v4().
2643		 */
2644		if (connp->conn_udp->udp_nat_t_endpoint) {
2645			if (iraflags & IRAF_IPSEC_SECURE) {
2646				ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2647				    DROPPER(ipss, ipds_esp_nat_t_ipsec),
2648				    &ipss->ipsec_dropper);
2649				CONN_DEC_REF(connp);
2650				return;
2651			}
2652
2653			mp = zero_spi_check(mp, ira);
2654			if (mp == NULL) {
2655				/*
2656				 * Packet was consumed - probably sent to
2657				 * ip_fanout_v4.
2658				 */
2659				CONN_DEC_REF(connp);
2660				return;
2661			}
2662			/* Else continue like a normal UDP packet. */
2663			ipha = (ipha_t *)mp->b_rptr;
2664			protocol = ipha->ipha_protocol;
2665			ira->ira_protocol = protocol;
2666		}
2667		/* Found a client; up it goes */
2668		IP_STAT(ipst, ip_udp_fannorm);
2669		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2670		ira->ira_ill = ira->ira_rill = NULL;
2671		(connp->conn_recv)(connp, mp, NULL, ira);
2672		CONN_DEC_REF(connp);
2673		ira->ira_ill = ill;
2674		ira->ira_rill = rill;
2675		return;
2676	default:
2677		break;
2678	}
2679
2680	/*
2681	 * Clear hardware checksumming flag as it is currently only
2682	 * used by TCP and UDP.
2683	 */
2684	DB_CKSUMFLAGS(mp) = 0;
2685
2686	switch (protocol) {
2687	case IPPROTO_ICMP:
2688		/*
2689		 * We need to accommodate icmp messages coming in clear
2690		 * until we get everything secure from the wire. If
2691		 * icmp_accept_clear_messages is zero we check with
2692		 * the global policy and act accordingly. If it is
2693		 * non-zero, we accept the message without any checks.
2694		 * But *this does not mean* that this will be delivered
2695		 * to RAW socket clients. By accepting we might send
2696		 * replies back, change our MTU value etc.,
2697		 * but delivery to the ULP/clients depends on their
2698		 * policy dispositions.
2699		 */
2700		if (ipst->ips_icmp_accept_clear_messages == 0) {
2701			mp = ipsec_check_global_policy(mp, NULL,
2702			    ipha, NULL, ira, ns);
2703			if (mp == NULL)
2704				return;
2705		}
2706
2707		/*
2708		 * On a labeled system, we have to check whether the zone
2709		 * itself is permitted to receive raw traffic.
2710		 */
2711		if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2712			if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
2713				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
2714				ip_drop_input("tsol_can_accept_raw", mp, ill);
2715				freemsg(mp);
2716				return;
2717			}
2718		}
2719
2720		/*
2721		 * ICMP header checksum, including checksum field,
2722		 * should be zero.
2723		 */
2724		if (IP_CSUM(mp, ip_hdr_length, 0)) {
2725			BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2726			ip_drop_input("icmpInCksumErrs", mp, ill);
2727			freemsg(mp);
2728			return;
2729		}
2730		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2731		mp = icmp_inbound_v4(mp, ira);
2732		if (mp == NULL) {
2733			/* No need to pass to RAW sockets */
2734			return;
2735		}
2736		break;
2737
2738	case IPPROTO_IGMP:
2739		/*
2740		 * If we are not willing to accept IGMP packets in clear,
2741		 * then check with global policy.
2742		 */
2743		if (ipst->ips_igmp_accept_clear_messages == 0) {
2744			mp = ipsec_check_global_policy(mp, NULL,
2745			    ipha, NULL, ira, ns);
2746			if (mp == NULL)
2747				return;
2748		}
2749		if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2750		    !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2751			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2752			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2753			freemsg(mp);
2754			return;
2755		}
2756		/*
2757		 * Validate checksum
2758		 */
2759		if (IP_CSUM(mp, ip_hdr_length, 0)) {
2760			++ipst->ips_igmpstat.igps_rcv_badsum;
2761			ip_drop_input("igps_rcv_badsum", mp, ill);
2762			freemsg(mp);
2763			return;
2764		}
2765
2766		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2767		mp = igmp_input(mp, ira);
2768		if (mp == NULL) {
2769			/* Bad packet - discarded by igmp_input */
2770			return;
2771		}
2772		break;
2773	case IPPROTO_PIM:
2774		/*
2775		 * If we are not willing to accept PIM packets in clear,
2776		 * then check with global policy.
2777		 */
2778		if (ipst->ips_pim_accept_clear_messages == 0) {
2779			mp = ipsec_check_global_policy(mp, NULL,
2780			    ipha, NULL, ira, ns);
2781			if (mp == NULL)
2782				return;
2783		}
2784		if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2785		    !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2786			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2787			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2788			freemsg(mp);
2789			return;
2790		}
2791		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2792
2793		/* Checksum is verified in pim_input */
2794		mp = pim_input(mp, ira);
2795		if (mp == NULL) {
2796			/* Bad packet - discarded by pim_input */
2797			return;
2798		}
2799		break;
2800	case IPPROTO_AH:
2801	case IPPROTO_ESP: {
2802		/*
2803		 * Fast path for AH/ESP.
2804		 */
2805		netstack_t *ns = ipst->ips_netstack;
2806		ipsec_stack_t *ipss = ns->netstack_ipsec;
2807
2808		IP_STAT(ipst, ipsec_proto_ahesp);
2809
2810		if (!ipsec_loaded(ipss)) {
2811			ip_proto_not_sup(mp, ira);
2812			return;
2813		}
2814
2815		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2816		/* select inbound SA and have IPsec process the pkt */
2817		if (protocol == IPPROTO_ESP) {
2818			esph_t *esph;
2819			boolean_t esp_in_udp_sa;
2820			boolean_t esp_in_udp_packet;
2821
2822			mp = ipsec_inbound_esp_sa(mp, ira, &esph);
2823			if (mp == NULL)
2824				return;
2825
2826			ASSERT(esph != NULL);
2827			ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2828			ASSERT(ira->ira_ipsec_esp_sa != NULL);
2829			ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
2830
2831			esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags &
2832			    IPSA_F_NATT) != 0);
2833			esp_in_udp_packet =
2834			    (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0;
2835
2836			/*
2837			 * The following is a fancy, but quick, way of saying:
2838			 * ESP-in-UDP SA and Raw ESP packet --> drop
2839			 *    OR
2840			 * ESP SA and ESP-in-UDP packet --> drop
2841			 */
2842			if (esp_in_udp_sa != esp_in_udp_packet) {
2843				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2844				ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2845				    DROPPER(ipss, ipds_esp_no_sa),
2846				    &ipss->ipsec_dropper);
2847				return;
2848			}
2849			mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
2850			    ira);
2851		} else {
2852			ah_t *ah;
2853
2854			mp = ipsec_inbound_ah_sa(mp, ira, &ah);
2855			if (mp == NULL)
2856				return;
2857
2858			ASSERT(ah != NULL);
2859			ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2860			ASSERT(ira->ira_ipsec_ah_sa != NULL);
2861			ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
2862			mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
2863			    ira);
2864		}
2865
2866		if (mp == NULL) {
2867			/*
2868			 * Either it failed or is pending. In the former case
2869			 * ipIfStatsInDiscards was increased.
2870			 */
2871			return;
2872		}
2873		/* we're done with IPsec processing, send it up */
2874		ip_input_post_ipsec(mp, ira);
2875		return;
2876	}
2877	case IPPROTO_ENCAP: {
2878		ipha_t		*inner_ipha;
2879
2880		/*
2881		 * Handle self-encapsulated packets (IP-in-IP where
2882		 * the inner addresses == the outer addresses).
2883		 */
2884		if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) >
2885		    mp->b_wptr) {
2886			if (ira->ira_pktlen <
2887			    ip_hdr_length + sizeof (ipha_t)) {
2888				BUMP_MIB(ill->ill_ip_mib,
2889				    ipIfStatsInTruncatedPkts);
2890				ip_drop_input("ipIfStatsInTruncatedPkts",
2891				    mp, ill);
2892				freemsg(mp);
2893				return;
2894			}
2895			ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length +
2896			    sizeof (ipha_t) - mp->b_rptr, ira);
2897			if (ipha == NULL) {
2898				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2899				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2900				freemsg(mp);
2901				return;
2902			}
2903		}
2904		inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length);
2905		/*
2906		 * Check the sanity of the inner IP header.
2907		 */
2908		if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
2909			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2910			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2911			freemsg(mp);
2912			return;
2913		}
2914		if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
2915			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2916			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2917			freemsg(mp);
2918			return;
2919		}
2920		if (inner_ipha->ipha_src != ipha->ipha_src ||
2921		    inner_ipha->ipha_dst != ipha->ipha_dst) {
2922			/* We fallthru to iptun fanout below */
2923			goto iptun;
2924		}
2925
2926		/*
2927		 * Self-encapsulated tunnel packet. Remove
2928		 * the outer IP header and fanout again.
2929		 * We also need to make sure that the inner
2930		 * header is pulled up until options.
2931		 */
2932		mp->b_rptr = (uchar_t *)inner_ipha;
2933		ipha = inner_ipha;
2934		ip_hdr_length = IPH_HDR_LENGTH(ipha);
2935		if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) {
2936			if (ira->ira_pktlen <
2937			    (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) {
2938				BUMP_MIB(ill->ill_ip_mib,
2939				    ipIfStatsInTruncatedPkts);
2940				ip_drop_input("ipIfStatsInTruncatedPkts",
2941				    mp, ill);
2942				freemsg(mp);
2943				return;
2944			}
2945			ipha = ip_pullup(mp,
2946			    (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira);
2947			if (ipha == NULL) {
2948				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2949				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2950				freemsg(mp);
2951				return;
2952			}
2953		}
2954		if (ip_hdr_length > sizeof (ipha_t)) {
2955			/* We got options on the inner packet. */
2956			ipaddr_t	dst = ipha->ipha_dst;
2957			int		error = 0;
2958
2959			dst = ip_input_options(ipha, dst, mp, ira, &error);
2960			if (error != 0) {
2961				/*
2962				 * An ICMP error has been sent and the packet
2963				 * has been dropped.
2964				 */
2965				return;
2966			}
2967			if (dst != ipha->ipha_dst) {
2968				/*
2969				 * Someone put a source-route in
2970				 * the inside header of a self-
2971				 * encapsulated packet.  Drop it
2972				 * with extreme prejudice and let
2973				 * the sender know.
2974				 */
2975				ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
2976				    mp, ill);
2977				icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
2978				    ira);
2979				return;
2980			}
2981		}
2982		if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
2983			/*
2984			 * Self-encapsulated packets without AH/ESP.
2985			 * Self-encapsualted packets without AH/ESP.
2986			 *
2987			 * Send this packet to find a tunnel endpoint.
2988			 * if I can't find one, an ICMP
2989			 * PROTOCOL_UNREACHABLE will get sent.
2990			 */
2991			protocol = ipha->ipha_protocol;
2992			ira->ira_protocol = protocol;
2993			goto iptun;
2994		}
2995
2996		/* Update based on removed IP header */
2997		ira->ira_ip_hdr_length = ip_hdr_length;
2998		ira->ira_pktlen = ntohs(ipha->ipha_length);
2999
3000		if (ira->ira_flags & IRAF_IPSEC_DECAPS) {
3001			/*
3002			 * This packet is self-encapsulated multiple
3003			 * times. We don't want to recurse infinitely.
3004			 * To keep it simple, drop the packet.
3005			 */
3006			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3007			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3008			freemsg(mp);
3009			return;
3010		}
3011		ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3012		ira->ira_flags |= IRAF_IPSEC_DECAPS;
3013
3014		ip_input_post_ipsec(mp, ira);
3015		return;
3016	}
3017
3018	iptun:	/* IPPROTO_ENCAPS that is not self-encapsulated */
3019	case IPPROTO_IPV6:
3020		/* iptun will verify trusted label */
3021		connp = ipcl_classify_v4(mp, protocol, ip_hdr_length,
3022		    ira, ipst);
3023		if (connp != NULL) {
3024			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
3025			ira->ira_ill = ira->ira_rill = NULL;
3026			(connp->conn_recv)(connp, mp, NULL, ira);
3027			CONN_DEC_REF(connp);
3028			ira->ira_ill = ill;
3029			ira->ira_rill = rill;
3030			return;
3031		}
3032		/* FALLTHRU */
3033	default:
3034		/*
3035		 * On a labeled system, we have to check whether the zone
3036		 * itself is permitted to receive raw traffic.
3037		 */
3038		if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
3039			if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
3040				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3041				ip_drop_input("ipIfStatsInDiscards", mp, ill);
3042				freemsg(mp);
3043				return;
3044			}
3045		}
3046		break;
3047	}
3048
3049	/*
3050	 * The above input functions may have returned the pulled up message.
3051	 * So ipha need to be reinitialized.
3052	 */
3053	ipha = (ipha_t *)mp->b_rptr;
3054	ira->ira_protocol = protocol = ipha->ipha_protocol;
3055	if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) {
3056		 * No user-level listener for these packets.
3057		 * No user-level listener for these packets packets.
3058		 * Check for IPPROTO_ENCAP...
3059		 */
3060		if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
3061			/*
3062			 * Check policy here,
3063			 * THEN ship off to ip_mroute_decap().
3064			 *
3065			 * BTW,  If I match a configured IP-in-IP
3066			 * tunnel above, this path will not be reached, and
3067			 * ip_mroute_decap will never be called.
3068			 */
3069			mp = ipsec_check_global_policy(mp, connp,
3070			    ipha, NULL, ira, ns);
3071			if (mp != NULL) {
3072				ip_mroute_decap(mp, ira);
3073			} /* Else we already freed everything! */
3074		} else {
3075			ip_proto_not_sup(mp, ira);
3076		}
3077		return;
3078	}
3079
3080	/*
3081	 * Handle fanout to raw sockets.  There
3082	 * can be more than one stream bound to a particular
3083	 * protocol.  When this is the case, each one gets a copy
3084	 * of any incoming packets.
3085	 */
3086	ASSERT(ira->ira_protocol == ipha->ipha_protocol);
3087	ip_fanout_proto_v4(mp, ipha, ira);
3088	return;
3089
3090discard:
3091	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3092	ip_drop_input("ipIfStatsInDiscards", mp, ill);
3093	freemsg(mp);
3094#undef rptr
3095}
3096