1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25/* Copyright (c) 1990 Mentat Inc. */
26
27#include <sys/types.h>
28#include <sys/stream.h>
29#include <sys/strsubr.h>
30#include <sys/dlpi.h>
31#include <sys/strsun.h>
32#include <sys/zone.h>
33#include <sys/ddi.h>
34#include <sys/sunddi.h>
35#include <sys/cmn_err.h>
36#include <sys/debug.h>
37#include <sys/atomic.h>
38
39#include <sys/systm.h>
40#include <sys/param.h>
41#include <sys/kmem.h>
42#include <sys/sdt.h>
43#include <sys/socket.h>
44#include <sys/mac.h>
45#include <net/if.h>
46#include <net/if_arp.h>
47#include <net/route.h>
48#include <sys/sockio.h>
49#include <netinet/in.h>
50#include <net/if_dl.h>
51
52#include <inet/common.h>
53#include <inet/mi.h>
54#include <inet/mib2.h>
55#include <inet/nd.h>
56#include <inet/arp.h>
57#include <inet/snmpcom.h>
58#include <inet/kstatcom.h>
59
60#include <netinet/igmp_var.h>
61#include <netinet/ip6.h>
62#include <netinet/icmp6.h>
63#include <netinet/sctp.h>
64
65#include <inet/ip.h>
66#include <inet/ip_impl.h>
67#include <inet/ip6.h>
68#include <inet/ip6_asp.h>
69#include <inet/tcp.h>
70#include <inet/ip_multi.h>
71#include <inet/ip_if.h>
72#include <inet/ip_ire.h>
73#include <inet/ip_ftable.h>
74#include <inet/ip_rts.h>
75#include <inet/optcom.h>
76#include <inet/ip_ndp.h>
77#include <inet/ip_listutils.h>
78#include <netinet/igmp.h>
79#include <netinet/ip_mroute.h>
80#include <inet/ipp_common.h>
81
82#include <net/pfkeyv2.h>
83#include <inet/sadb.h>
84#include <inet/ipsec_impl.h>
85#include <inet/ipdrop.h>
86#include <inet/ip_netinfo.h>
87
88#include <sys/pattr.h>
89#include <inet/ipclassifier.h>
90#include <inet/sctp_ip.h>
91#include <inet/sctp/sctp_impl.h>
92#include <inet/udp_impl.h>
93#include <sys/sunddi.h>
94
95#include <sys/tsol/label.h>
96#include <sys/tsol/tnet.h>
97
98#include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
99
100#ifdef	DEBUG
101extern boolean_t skip_sctp_cksum;
102#endif
103
104static int	ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
105static int	ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
106static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
107static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
108static void	ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
109
110/*
111 * There are two types of output functions for IP used for different
112 * purposes:
113 *  - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there
114 *     is no context in the form of a conn_t. However, there is a
115 *     ip_xmit_attr_t that the callers use to influence interface selection
116 *     (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
117 *
118 *  - conn_ip_output() is used when sending packets with a conn_t and
119 *    ip_set_destination has been called to cache information. In that case
120 *    various socket options are recorded in the ip_xmit_attr_t and should
121 *    be taken into account.
122 */
123
124/*
125 * The caller *must* have called conn_connect() or ip_attr_connect()
126 * before calling conn_ip_output(). The caller needs to redo that each time
127 * the destination IP address or port changes, as well as each time there is
128 * a change to any socket option that would modify how packets are routed out
129 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
130 *
131 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
132 * We assert for that here.
133 */
134int
135conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
136{
137	iaflags_t	ixaflags = ixa->ixa_flags;
138	ire_t		*ire;
139	nce_t		*nce;
140	dce_t		*dce;
141	ill_t		*ill;
142	ip_stack_t	*ipst = ixa->ixa_ipst;
143	int		error;
144
145	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */
146
147	ASSERT(ixa->ixa_ire != NULL);
148	/* Note there is no ixa_nce when reject and blackhole routes */
149	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */
150
151#ifdef DEBUG
152	ASSERT(ixa->ixa_curthread == NULL);
153	ixa->ixa_curthread = curthread;
154#endif
155
156	/*
157	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
158	 * for IGMP/MLD traffic.
159	 */
160
161	ire = ixa->ixa_ire;
162
163	/*
164	 * If the ULP says the (old) IRE resulted in reachability we
165	 * record this before determine whether to use a new IRE.
166	 * No locking for performance reasons.
167	 */
168	if (ixaflags & IXAF_REACH_CONF)
169		ire->ire_badcnt = 0;
170
171	/*
172	 * Has routing changed since we cached the results of the lookup?
173	 *
174	 * This check captures all of:
175	 *  - the cached ire being deleted (by means of the special
176	 *    IRE_GENERATION_CONDEMNED)
177	 *  - A potentially better ire being added (ire_generation being
178	 *    increased)
179	 *  - A deletion of the nexthop ire that was used when we did the
180	 *    lookup.
181	 *  - An addition of a potentially better nexthop ire.
182	 * The last two are handled by walking and increasing the generation
183	 * number on all dependant IREs in ire_flush_cache().
184	 *
185	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
186	 * since we ensure that each time we set ixa_ire to such an IRE we
187	 * make sure the ixa_ire_generation does not match (by using
188	 * IRE_GENERATION_VERIFY).
189	 */
190	if (ire->ire_generation != ixa->ixa_ire_generation) {
191		error = ip_verify_ire(mp, ixa);
192		if (error != 0) {
193			ip_drop_output("ipIfStatsOutDiscards - verify ire",
194			    mp, NULL);
195			goto drop;
196		}
197		ire = ixa->ixa_ire;
198		ASSERT(ire != NULL);
199		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
200#ifdef DEBUG
201			ASSERT(ixa->ixa_curthread == curthread);
202			ixa->ixa_curthread = NULL;
203#endif
204			ire->ire_ob_pkt_count++;
205			/* ixa_dce might be condemned; use default one */
206			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
207			    &ipst->ips_dce_default->dce_ident));
208		}
209		/*
210		 * If the ncec changed then ip_verify_ire already set
211		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
212		 * so we can recheck the interface mtu.
213		 */
214
215		/*
216		 * Note that ire->ire_generation could already have changed.
217		 * We catch that next time we send a packet.
218		 */
219	}
220
221	/*
222	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
223	 * is single threaded.
224	 */
225	ASSERT(ixa->ixa_nce != NULL);
226	nce = ixa->ixa_nce;
227	if (nce->nce_is_condemned) {
228		error = ip_verify_nce(mp, ixa);
229		/*
230		 * In case ZEROCOPY capability become not available, we
231		 * copy the message and free the original one. We might
232		 * be copying more data than needed but it doesn't hurt
233		 * since such change rarely happens.
234		 */
235		switch (error) {
236		case 0:
237			break;
238		case ENOTSUP: { /* ZEROCOPY */
239			mblk_t *nmp;
240
241			if ((nmp = copymsg(mp)) != NULL) {
242				freemsg(mp);
243				mp = nmp;
244
245				break;
246			}
247			/* FALLTHROUGH */
248		}
249		default:
250			ip_drop_output("ipIfStatsOutDiscards - verify nce",
251			    mp, NULL);
252			goto drop;
253		}
254		ire = ixa->ixa_ire;
255		ASSERT(ire != NULL);
256		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
257#ifdef DEBUG
258			ASSERT(ixa->ixa_curthread == curthread);
259			ixa->ixa_curthread = NULL;
260#endif
261			ire->ire_ob_pkt_count++;
262			/* ixa_dce might be condemned; use default one */
263			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
264			    ixa, &ipst->ips_dce_default->dce_ident));
265		}
266		ASSERT(ixa->ixa_nce != NULL);
267		nce = ixa->ixa_nce;
268
269		/*
270		 * Note that some other event could already have made
271		 * the new nce condemned. We catch that next time we
272		 * try to send a packet.
273		 */
274	}
275	/*
276	 * If there is no per-destination dce_t then we have a reference to
277	 * the default dce_t (which merely contains the dce_ipid).
278	 * The generation check captures both the introduction of a
279	 * per-destination dce_t (e.g., due to ICMP packet too big) and
280	 * any change to the per-destination dce (including it becoming
281	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
282	 */
283	dce = ixa->ixa_dce;
284
285	/*
286	 * To avoid a periodic timer to increase the path MTU we
287	 * look at dce_last_change_time each time we send a packet.
288	 */
289	if (dce->dce_flags & DCEF_PMTU) {
290		int64_t		now = LBOLT_FASTPATH64;
291
292		if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
293		    ipst->ips_ip_pathmtu_interval)) {
294			/*
295			 * Older than 20 minutes. Drop the path MTU information.
296			 * Since the path MTU changes as a result of this,
297			 * twiddle ixa_dce_generation to make us go through the
298			 * dce verification code in conn_ip_output.
299			 */
300			mutex_enter(&dce->dce_lock);
301			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
302			dce->dce_last_change_time = TICK_TO_SEC(now);
303			mutex_exit(&dce->dce_lock);
304			dce_increment_generation(dce);
305		}
306	}
307
308	if (dce->dce_generation != ixa->ixa_dce_generation) {
309		error = ip_verify_dce(mp, ixa);
310		if (error != 0) {
311			ip_drop_output("ipIfStatsOutDiscards - verify dce",
312			    mp, NULL);
313			goto drop;
314		}
315		dce = ixa->ixa_dce;
316
317		/*
318		 * Note that some other event could already have made the
319		 * new dce's generation number change.
320		 * We catch that next time we try to send a packet.
321		 */
322	}
323
324	ill = nce->nce_ill;
325
326	/*
327	 * An initial ixa_fragsize was set in ip_set_destination
328	 * and we update it if any routing changes above.
329	 * A change to ill_mtu with ifconfig will increase all dce_generation
330	 * so that we will detect that with the generation check. Ditto for
331	 * ill_mc_mtu.
332	 */
333
334	/*
335	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
336	 * conn_unspec_src.
337	 */
338	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
339	    ixa->ixa_src_generation != ipst->ips_src_generation) {
340		/* Check if the IP source is still assigned to the host. */
341		uint_t gen;
342
343		if (!ip_verify_src(mp, ixa, &gen)) {
344			/* Don't send a packet with a source that isn't ours */
345			error = EADDRNOTAVAIL;
346			ip_drop_output("ipIfStatsOutDiscards - invalid src",
347			    mp, NULL);
348			goto drop;
349		}
350		/* The source is still valid - update the generation number */
351		ixa->ixa_src_generation = gen;
352	}
353
354	/*
355	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
356	 * can only count the use prior to fragmentation. However the MIB
357	 * counters on the ill will be incremented in post fragmentation.
358	 */
359	ire->ire_ob_pkt_count++;
360	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
361
362	/*
363	 * Based on ire_type and ire_flags call one of:
364	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
365	 *	ire_send_multirt_v* - if RTF_MULTIRT
366	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE
367	 *	ire_send_multicast_v* - for IRE_MULTICAST
368	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
369	 *	ire_send_wire_v* - for the rest.
370	 */
371#ifdef DEBUG
372	ASSERT(ixa->ixa_curthread == curthread);
373	ixa->ixa_curthread = NULL;
374#endif
375	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));
376
377drop:
378	if (ixaflags & IXAF_IS_IPV4) {
379		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
380		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
381	} else {
382		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
383		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
384	}
385	freemsg(mp);
386#ifdef DEBUG
387	ASSERT(ixa->ixa_curthread == curthread);
388	ixa->ixa_curthread = NULL;
389#endif
390	return (error);
391}
392
393/*
394 * Handle both IPv4 and IPv6. Sets the generation number
395 * to allow the caller to know when to call us again.
396 * Returns true if the source address in the packet is a valid source.
397 * We handle callers which try to send with a zero address (since we only
398 * get here if UNSPEC_SRC is not set).
399 */
400boolean_t
401ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
402{
403	ip_stack_t	*ipst = ixa->ixa_ipst;
404
405	/*
406	 * Need to grab the generation number before we check to
407	 * avoid a race with a change to the set of local addresses.
408	 * No lock needed since the thread which updates the set of local
409	 * addresses use ipif/ill locks and exit those (hence a store memory
410	 * barrier) before doing the atomic increase of ips_src_generation.
411	 */
412	if (generationp != NULL)
413		*generationp = ipst->ips_src_generation;
414
415	if (ixa->ixa_flags & IXAF_IS_IPV4) {
416		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
417
418		if (ipha->ipha_src == INADDR_ANY)
419			return (B_FALSE);
420
421		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
422		    ipst, B_FALSE) != IPVL_BAD);
423	} else {
424		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
425		uint_t	scopeid;
426
427		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
428			return (B_FALSE);
429
430		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
431			scopeid = ixa->ixa_scopeid;
432		else
433			scopeid = 0;
434
435		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
436		    ipst, B_FALSE, scopeid) != IPVL_BAD);
437	}
438}
439
440/*
441 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
442 */
443int
444ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
445{
446	uint_t		gen;
447	ire_t		*ire;
448	nce_t		*nce;
449	int		error;
450	boolean_t	multirt = B_FALSE;
451
452	/*
453	 * Redo ip_select_route.
454	 * Need to grab generation number as part of the lookup to
455	 * avoid race.
456	 */
457	error = 0;
458	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
459	ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
460	if (error != 0) {
461		ire_refrele(ire);
462		return (error);
463	}
464
465	if (ixa->ixa_ire != NULL)
466		ire_refrele_notr(ixa->ixa_ire);
467#ifdef DEBUG
468	ire_refhold_notr(ire);
469	ire_refrele(ire);
470#endif
471	ixa->ixa_ire = ire;
472	ixa->ixa_ire_generation = gen;
473	if (multirt) {
474		if (ixa->ixa_flags & IXAF_IS_IPV4)
475			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
476		else
477			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
478		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
479	} else {
480		ixa->ixa_postfragfn = ire->ire_postfragfn;
481		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
482	}
483
484	/*
485	 * Don't look for an nce for reject or blackhole.
486	 * They have ire_generation set to IRE_GENERATION_VERIFY which
487	 * makes conn_ip_output avoid references to ixa_nce.
488	 */
489	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
490		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
491		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
492		return (0);
493	}
494
495	/* The NCE could now be different */
496	nce = ire_to_nce_pkt(ire, mp);
497	if (nce == NULL) {
498		/*
499		 * Allocation failure. Make sure we redo ire/nce selection
500		 * next time we send.
501		 */
502		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
503		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
504		return (ENOBUFS);
505	}
506	if (nce == ixa->ixa_nce) {
507		/* No change */
508		nce_refrele(nce);
509		return (0);
510	}
511
512	/*
513	 * Since the path MTU might change as a result of this
514	 * route change, we twiddle ixa_dce_generation to
515	 * make conn_ip_output go through the ip_verify_dce code.
516	 */
517	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
518
519	if (ixa->ixa_nce != NULL)
520		nce_refrele(ixa->ixa_nce);
521	ixa->ixa_nce = nce;
522	return (0);
523}
524
525/*
526 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
527 */
528static int
529ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
530{
531	ire_t		*ire = ixa->ixa_ire;
532	nce_t		*nce;
533	int		error = 0;
534	ipha_t		*ipha = NULL;
535	ip6_t		*ip6h = NULL;
536
537	if (ire->ire_ipversion == IPV4_VERSION)
538		ipha = (ipha_t *)mp->b_rptr;
539	else
540		ip6h = (ip6_t *)mp->b_rptr;
541
542	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
543	if (nce == NULL) {
544		/* Try to find a better ire */
545		return (ip_verify_ire(mp, ixa));
546	}
547
548	/*
549	 * The hardware offloading capabilities, for example LSO, of the
550	 * interface might have changed, so do sanity verification here.
551	 */
552	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
553		if (!ip_verify_lso(nce->nce_ill, ixa)) {
554			ASSERT(ixa->ixa_notify != NULL);
555			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
556			    IXAN_LSO, 0);
557			error = ENOTSUP;
558		}
559	}
560
561	/*
562	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
563	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
564	 * any more, return error so that conn_ip_output() can take care of
565	 * the ZEROCOPY message properly. It's safe to continue send the
566	 * message when ZEROCOPY newly become available.
567	 */
568	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
569		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
570			ASSERT(ixa->ixa_notify != NULL);
571			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
572			    IXAN_ZCOPY, 0);
573			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
574				error = ENOTSUP;
575		}
576	}
577
578	/*
579	 * Since the path MTU might change as a result of this
580	 * change, we twiddle ixa_dce_generation to
581	 * make conn_ip_output go through the ip_verify_dce code.
582	 */
583	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
584
585	nce_refrele(ixa->ixa_nce);
586	ixa->ixa_nce = nce;
587	return (error);
588}
589
590/*
591 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
592 */
593static int
594ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
595{
596	dce_t		*dce;
597	uint_t		gen;
598	uint_t		pmtu;
599
600	dce = dce_lookup_pkt(mp, ixa, &gen);
601	ASSERT(dce != NULL);
602
603	dce_refrele_notr(ixa->ixa_dce);
604#ifdef DEBUG
605	dce_refhold_notr(dce);
606	dce_refrele(dce);
607#endif
608	ixa->ixa_dce = dce;
609	ixa->ixa_dce_generation = gen;
610
611	/* Extract the (path) mtu from the dce, ncec_ill etc */
612	pmtu = ip_get_pmtu(ixa);
613
614	/*
615	 * Tell ULP about PMTU changes - increase or decrease - by returning
616	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
617	 * both ixa_pmtu and ixa_fragsize appropriately.
618	 *
619	 * If ULP doesn't set that flag then we need to update ixa_fragsize
620	 * since routing could have changed the ill after after ixa_fragsize
621	 * was set previously in the conn_ip_output path or in
622	 * ip_set_destination.
623	 *
624	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
625	 *
626	 * In the case of a path MTU increase we send the packet after the
627	 * notify to the ULP.
628	 */
629	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
630		if (ixa->ixa_pmtu != pmtu) {
631			uint_t oldmtu = ixa->ixa_pmtu;
632
633			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
634			    uint32_t, ixa->ixa_pmtu);
635			ASSERT(ixa->ixa_notify != NULL);
636			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
637			    IXAN_PMTU, pmtu);
638			if (pmtu < oldmtu)
639				return (EMSGSIZE);
640		}
641	} else {
642		ixa->ixa_fragsize = pmtu;
643	}
644	return (0);
645}
646
647/*
648 * Verify LSO usability. Keep the return value simple to indicate whether
649 * the LSO capability has changed. Handle both IPv4 and IPv6.
650 */
651static boolean_t
652ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
653{
654	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
655	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;
656
657	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
658		/*
659		 * Not unsable any more.
660		 */
661		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
662		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
663		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
664		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
665		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
666		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
667			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
668
669			return (B_FALSE);
670		}
671
672		/*
673		 * Capability has changed, refresh the copy in ixa.
674		 */
675		if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
676			*lsoc = *new_lsoc;
677
678			return (B_FALSE);
679		}
680	} else { /* Was not usable */
681		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
682		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
683		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
684		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
685		    ILL_LSO_TCP_IPV4_USABLE(ill) :
686		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
687			*lsoc = *new_lsoc;
688			ixa->ixa_flags |= IXAF_LSO_CAPAB;
689
690			return (B_FALSE);
691		}
692	}
693
694	return (B_TRUE);
695}
696
697/*
698 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
699 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
700 */
701static boolean_t
702ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
703{
704	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
705		/*
706		 * Not unsable any more.
707		 */
708		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
709		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
710		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
711		    !ILL_ZCOPY_USABLE(ill)) {
712			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
713
714			return (B_FALSE);
715		}
716	} else { /* Was not usable */
717		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
718		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
719		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
720		    ILL_ZCOPY_USABLE(ill)) {
721			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
722
723			return (B_FALSE);
724		}
725	}
726
727	return (B_TRUE);
728}
729
730
731/*
732 * When there is no conn_t context, this will send a packet.
733 * The caller must *not* have called conn_connect() or ip_attr_connect()
734 * before calling ip_output_simple().
735 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
736 * Honors IXAF_SET_SOURCE.
737 *
738 * We acquire the ire and after calling ire_sendfn we release
739 * the hold on the ire. Ditto for the nce and dce.
740 *
741 * This assumes that the caller has set the following in ip_xmit_attr_t:
742 *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
743 *	If ixa_ifindex is non-zero it means send out that ill. (If it is
744 *	an upper IPMP ill we load balance across the group; if a lower we send
745 *	on that lower ill without load balancing.)
746 *	IXAF_IS_IPV4 must be set correctly.
747 *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
748 *	If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
749 *	If neither of those two are set we do an IPsec policy lookup.
750 *
751 * We handle setting things like
752 *	ixa_pktlen
753 *	ixa_ip_hdr_length
754 *	ixa->ixa_protocol
755 *
756 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
757 * transmit ring selecting in GLD.
758 *
759 * The caller must do an ixa_cleanup() to release any IPsec references
760 * after we return.
761 */
762int
763ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
764{
765	ts_label_t	*effective_tsl = NULL;
766	int		err;
767
768	ASSERT(ixa->ixa_ipst != NULL);
769
770	if (is_system_labeled()) {
771		ip_stack_t *ipst = ixa->ixa_ipst;
772
773		if (ixa->ixa_flags & IXAF_IS_IPV4) {
774			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
775			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
776			    &effective_tsl);
777		} else {
778			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
779			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
780			    &effective_tsl);
781		}
782		if (err != 0) {
783			ip2dbg(("tsol_check: label check failed (%d)\n", err));
784			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
785			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
786			ip_drop_output("tsol_check_label", mp, NULL);
787			freemsg(mp);
788			return (err);
789		}
790		if (effective_tsl != NULL) {
791			/* Update the label */
792			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
793		}
794	}
795
796	if (ixa->ixa_flags & IXAF_IS_IPV4)
797		return (ip_output_simple_v4(mp, ixa));
798	else
799		return (ip_output_simple_v6(mp, ixa));
800}
801
802int
803ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
804{
805	ipha_t		*ipha;
806	ipaddr_t	firsthop; /* In IP header */
807	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
808	ire_t		*ire;
809	ipaddr_t	setsrc;	/* RTF_SETSRC */
810	int		error;
811	ill_t		*ill = NULL;
812	dce_t		*dce = NULL;
813	nce_t		*nce;
814	iaflags_t	ixaflags = ixa->ixa_flags;
815	ip_stack_t	*ipst = ixa->ixa_ipst;
816	boolean_t	repeat = B_FALSE;
817	boolean_t	multirt = B_FALSE;
818	int64_t		now;
819
820	ipha = (ipha_t *)mp->b_rptr;
821	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
822
823	/*
824	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
825	 * for IGMP/MLD traffic.
826	 */
827
828	/* Caller already set flags */
829	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
830
831	ASSERT(ixa->ixa_nce == NULL);
832
833	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
834	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
835	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
836	ixa->ixa_protocol = ipha->ipha_protocol;
837
838	/*
839	 * Assumes that source routed packets have already been massaged by
840	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
841	 * hop in the source route. The final destination is used for IPsec
842	 * policy and DCE lookup.
843	 */
844	firsthop = ipha->ipha_dst;
845	dst = ip_get_dst(ipha);
846
847repeat_ire:
848	error = 0;
849	setsrc = INADDR_ANY;
850	ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
851	    &setsrc, &error, &multirt);
852	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
853	if (error != 0) {
854		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
855		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
856		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
857		freemsg(mp);
858		goto done;
859	}
860
861	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
862		/* ire_ill might be NULL hence need to skip some code */
863		if (ixaflags & IXAF_SET_SOURCE)
864			ipha->ipha_src = htonl(INADDR_LOOPBACK);
865		ixa->ixa_fragsize = IP_MAXPACKET;
866		ill = NULL;
867		nce = NULL;
868		ire->ire_ob_pkt_count++;
869		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
870		/* No dce yet; use default one */
871		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
872		    &ipst->ips_dce_default->dce_ident);
873		goto done;
874	}
875
876	/* Note that ipha_dst is only used for IRE_MULTICAST */
877	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
878	if (nce == NULL) {
879		/* Allocation failure? */
880		ip_drop_output("ire_to_nce", mp, ill);
881		freemsg(mp);
882		error = ENOBUFS;
883		goto done;
884	}
885	if (nce->nce_is_condemned) {
886		nce_t *nce1;
887
888		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
889		nce_refrele(nce);
890		if (nce1 == NULL) {
891			if (!repeat) {
892				/* Try finding a better IRE */
893				repeat = B_TRUE;
894				ire_refrele(ire);
895				goto repeat_ire;
896			}
897			/* Tried twice - drop packet */
898			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
899			ip_drop_output("No nce", mp, ill);
900			freemsg(mp);
901			error = ENOBUFS;
902			goto done;
903		}
904		nce = nce1;
905	}
906
907	/*
908	 * For multicast with multirt we have a flag passed back from
909	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
910	 * possible multicast address.
911	 * We also need a flag for multicast since we can't check
912	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
913	 */
914	if (multirt) {
915		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
916		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
917	} else {
918		ixa->ixa_postfragfn = ire->ire_postfragfn;
919		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
920	}
921	ASSERT(ixa->ixa_nce == NULL);
922	ixa->ixa_nce = nce;
923
924	/*
925	 * Check for a dce_t with a path mtu.
926	 */
927	dce = dce_lookup_v4(dst, ipst, NULL);
928	ASSERT(dce != NULL);
929
930	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
931		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
932	} else if (dce->dce_flags & DCEF_PMTU) {
933		/*
934		 * To avoid a periodic timer to increase the path MTU we
935		 * look at dce_last_change_time each time we send a packet.
936		 */
937		now = ddi_get_lbolt64();
938		if (TICK_TO_SEC(now) - dce->dce_last_change_time >
939		    ipst->ips_ip_pathmtu_interval) {
940			/*
941			 * Older than 20 minutes. Drop the path MTU information.
942			 */
943			mutex_enter(&dce->dce_lock);
944			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
945			dce->dce_last_change_time = TICK_TO_SEC(now);
946			mutex_exit(&dce->dce_lock);
947			dce_increment_generation(dce);
948			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
949		} else {
950			uint_t fragsize;
951
952			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
953			if (fragsize > dce->dce_pmtu)
954				fragsize = dce->dce_pmtu;
955			ixa->ixa_fragsize = fragsize;
956		}
957	} else {
958		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
959	}
960
961	/*
962	 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
963	 * interface for source address selection.
964	 */
965	ill = ire_nexthop_ill(ire);
966
967	if (ixaflags & IXAF_SET_SOURCE) {
968		ipaddr_t	src;
969
970		/*
971		 * We use the final destination to get
972		 * correct selection for source routed packets
973		 */
974
975		/* If unreachable we have no ill but need some source */
976		if (ill == NULL) {
977			src = htonl(INADDR_LOOPBACK);
978			error = 0;
979		} else {
980			error = ip_select_source_v4(ill, setsrc, dst,
981			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
982			    &src, NULL, NULL);
983		}
984		if (error != 0) {
985			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
986			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
987			ip_drop_output("ipIfStatsOutDiscards - no source",
988			    mp, ill);
989			freemsg(mp);
990			goto done;
991		}
992		ipha->ipha_src = src;
993	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
994		/* Check if the IP source is assigned to the host. */
995		if (!ip_verify_src(mp, ixa, NULL)) {
996			/* Don't send a packet with a source that isn't ours */
997			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
998			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
999			ip_drop_output("ipIfStatsOutDiscards - invalid source",
1000			    mp, ill);
1001			freemsg(mp);
1002			error = EADDRNOTAVAIL;
1003			goto done;
1004		}
1005	}
1006
1007
1008	/*
1009	 * Check against global IPsec policy to set the AH/ESP attributes.
1010	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1011	 */
1012	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1013		ASSERT(ixa->ixa_ipsec_policy == NULL);
1014		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1015		if (mp == NULL) {
1016			/* MIB and ip_drop_packet already done */
1017			return (EHOSTUNREACH);	/* IPsec policy failure */
1018		}
1019	}
1020
1021	if (ill != NULL) {
1022		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1023	} else {
1024		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1025	}
1026
1027	/*
1028	 * We update the statistics on the most specific IRE i.e., the first
1029	 * one we found.
1030	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1031	 * can only count the use prior to fragmentation. However the MIB
1032	 * counters on the ill will be incremented in post fragmentation.
1033	 */
1034	ire->ire_ob_pkt_count++;
1035
1036	/*
1037	 * Based on ire_type and ire_flags call one of:
1038	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1039	 *	ire_send_multirt_v4 - if RTF_MULTIRT
	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
1041	 *	ire_send_multicast_v4 - for IRE_MULTICAST
1042	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
1043	 *	ire_send_wire_v4 - for the rest.
1044	 */
1045	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1046done:
1047	ire_refrele(ire);
1048	if (dce != NULL)
1049		dce_refrele(dce);
1050	if (ill != NULL)
1051		ill_refrele(ill);
1052	if (ixa->ixa_nce != NULL)
1053		nce_refrele(ixa->ixa_nce);
1054	ixa->ixa_nce = NULL;
1055	return (error);
1056}
1057
1058/*
1059 * ire_sendfn() functions.
1060 * These functions use the following xmit_attr:
1061 *  - ixa_fragsize - read to determine whether or not to fragment
1062 *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1063 *  - ixa_ipsec_*  are used inside IPsec
1064 *  - IXAF_SET_SOURCE - replace IP source in broadcast case.
1065 *  - IXAF_LOOPBACK_COPY - for multicast and broadcast
1066 */
1067
1068
1069/*
1070 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
1071 *
1072 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
1073 */
1074/* ARGSUSED4 */
1075int
1076ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1077    ip_xmit_attr_t *ixa, uint32_t *identp)
1078{
1079	ipha_t		*ipha = (ipha_t *)iph_arg;
1080	ip_stack_t	*ipst = ixa->ixa_ipst;
1081	ill_t		*ill = ire->ire_ill;
1082	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
1083	uint_t		pktlen = ixa->ixa_pktlen;
1084
1085	/*
1086	 * No fragmentation, no nce, no application of IPsec,
1087	 * and no ipha_ident assignment.
1088	 *
1089	 * Note different order between IP provider and FW_HOOKS than in
1090	 * send_wire case.
1091	 */
1092
1093	/*
1094	 * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
1095	 * send probe, but not the receive probe.
1096	 */
1097	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1098	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1099	    int, 1);
1100
1101	if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
1102		int error;
1103
1104		DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
1105		    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
1106		FW_HOOKS(ipst->ips_ip4_loopback_out_event,
1107		    ipst->ips_ipv4firewall_loopback_out,
1108		    NULL, ill, ipha, mp, mp, 0, ipst, error);
1109		DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
1110		if (mp == NULL)
1111			return (error);
1112
1113		/*
1114		 * Even if the destination was changed by the filter we use the
1115		 * forwarding decision that was made based on the address
1116		 * in ip_output/ip_set_destination.
1117		 */
1118		/* Length could be different */
1119		ipha = (ipha_t *)mp->b_rptr;
1120		pktlen = ntohs(ipha->ipha_length);
1121	}
1122
1123	/*
1124	 * If a callback is enabled then we need to know the
1125	 * source and destination zoneids for the packet. We already
1126	 * have those handy.
1127	 */
1128	if (ipst->ips_ip4_observe.he_interested) {
1129		zoneid_t szone, dzone;
1130		zoneid_t stackzoneid;
1131
1132		stackzoneid = netstackid_to_zoneid(
1133		    ipst->ips_netstack->netstack_stackid);
1134
1135		if (stackzoneid == GLOBAL_ZONEID) {
1136			/* Shared-IP zone */
1137			dzone = ire->ire_zoneid;
1138			szone = ixa->ixa_zoneid;
1139		} else {
1140			szone = dzone = stackzoneid;
1141		}
1142		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
1143	}
1144
1145	/* Handle lo0 stats */
1146	ipst->ips_loopback_packets++;
1147
1148	/* Map ixa to ira including IPsec policies */
1149	ipsec_out_to_in(ixa, ill, &iras);
1150	iras.ira_pktlen = pktlen;
1151
1152	if (!IS_SIMPLE_IPH(ipha)) {
1153		ip_output_local_options(ipha, ipst);
1154		iras.ira_flags |= IRAF_IPV4_OPTIONS;
1155	}
1156
1157	if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
1158		int error;
1159
1160		DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
1161		    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
1162		FW_HOOKS(ipst->ips_ip4_loopback_in_event,
1163		    ipst->ips_ipv4firewall_loopback_in,
1164		    ill, NULL, ipha, mp, mp, 0, ipst, error);
1165
1166		DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
1167		if (mp == NULL) {
1168			ira_cleanup(&iras, B_FALSE);
1169			return (error);
1170		}
1171		/*
1172		 * Even if the destination was changed by the filter we use the
1173		 * forwarding decision that was made based on the address
1174		 * in ip_output/ip_set_destination.
1175		 */
1176		/* Length could be different */
1177		ipha = (ipha_t *)mp->b_rptr;
1178		pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
1179	}
1180
1181	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1182	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1183	    int, 1);
1184
1185	ire->ire_ib_pkt_count++;
1186	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
1187	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
1188
1189	/* Destined to ire_zoneid - use that for fanout */
1190	iras.ira_zoneid = ire->ire_zoneid;
1191
1192	if (is_system_labeled()) {
1193		iras.ira_flags |= IRAF_SYSTEM_LABELED;
1194
1195		/*
1196		 * This updates ira_cred, ira_tsl and ira_free_flags based
1197		 * on the label. We don't expect this to ever fail for
1198		 * loopback packets, so we silently drop the packet should it
1199		 * fail.
1200		 */
1201		if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
1202			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1203			ip_drop_input("tsol_get_pkt_label", mp, ill);
1204			freemsg(mp);
1205			return (0);
1206		}
1207		ASSERT(iras.ira_tsl != NULL);
1208
1209		/* tsol_get_pkt_label sometimes does pullupmsg */
1210		ipha = (ipha_t *)mp->b_rptr;
1211	}
1212
1213	ip_fanout_v4(mp, ipha, &iras);
1214
1215	/* We moved any IPsec refs from ixa to iras */
1216	ira_cleanup(&iras, B_FALSE);
1217	return (0);
1218}
1219
1220/*
1221 * ire_sendfn for IRE_BROADCAST
1222 * If the broadcast address is present on multiple ills and ixa_ifindex
1223 * isn't set, then we generate
1224 * a separate datagram (potentially with different source address) for
1225 * those ills. In any case, only one copy is looped back to ip_input_v4.
1226 */
1227int
1228ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1229    ip_xmit_attr_t *ixa, uint32_t *identp)
1230{
1231	ipha_t		*ipha = (ipha_t *)iph_arg;
1232	ip_stack_t	*ipst = ixa->ixa_ipst;
1233	irb_t		*irb = ire->ire_bucket;
1234	ire_t		*ire1;
1235	mblk_t		*mp1;
1236	ipha_t		*ipha1;
1237	iaflags_t	ixaflags = ixa->ixa_flags;
1238	nce_t		*nce1, *nce_orig;
1239
1240	/*
1241	 * Unless ire_send_multirt_v4 already set a ttl, force the
1242	 * ttl to a smallish value.
1243	 */
1244	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
1245		/*
1246		 * To avoid broadcast storms, we usually set the TTL to 1 for
1247		 * broadcasts.  This can
1248		 * be overridden stack-wide through the ip_broadcast_ttl
1249		 * ndd tunable, or on a per-connection basis through the
1250		 * IP_BROADCAST_TTL socket option.
1251		 *
1252		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
1253		 * will force ttl to one after we've set this.
1254		 */
1255		if (ixaflags & IXAF_BROADCAST_TTL_SET)
1256			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
1257		else
1258			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
1259	}
1260	/*
1261	 * Make sure we get a loopback copy (after IPsec and frag)
1262	 * Skip hardware checksum so that loopback copy is checksumed.
1263	 */
1264	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1265
1266	/* Do we need to potentially generate multiple copies? */
1267	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
1268		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1269
1270	/*
1271	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
1272	 * Note that everything in the bucket has the same destination address.
1273	 */
1274	irb_refhold(irb);
1275	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1276		/* We do the main IRE after the end of the loop */
1277		if (ire1 == ire)
1278			continue;
1279
1280		/*
1281		 * Only IREs for the same IP address should be in the same
1282		 * bucket.
1283		 * But could have IRE_HOSTs in the case of CGTP.
1284		 * If we find any multirt routes we bail out of the loop
1285		 * and just do the single packet at the end; ip_postfrag_multirt
1286		 * will duplicate the packet.
1287		 */
1288		ASSERT(ire1->ire_addr == ire->ire_addr);
1289		if (!(ire1->ire_type & IRE_BROADCAST))
1290			continue;
1291
1292		if (IRE_IS_CONDEMNED(ire1))
1293			continue;
1294
1295		if (ixa->ixa_zoneid != ALL_ZONES &&
1296		    ire->ire_zoneid != ire1->ire_zoneid)
1297			continue;
1298
1299		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);
1300
1301		if (ire1->ire_flags & RTF_MULTIRT)
1302			break;
1303
1304		/*
1305		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
1306		 * ensure that this goes out on the cast_ill.
1307		 */
1308		if (IS_UNDER_IPMP(ire1->ire_ill))
1309			continue;
1310
1311		mp1 = copymsg(mp);
1312		if (mp1 == NULL) {
1313			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1314			    ipIfStatsOutDiscards);
1315			ip_drop_output("ipIfStatsOutDiscards",
1316			    mp, ire1->ire_ill);
1317			continue;
1318		}
1319
1320		ipha1 = (ipha_t *)mp1->b_rptr;
1321		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
1322			/*
1323			 * Need to pick a different source address for each
1324			 * interface. If we have a global IPsec policy and
1325			 * no per-socket policy then we punt to
1326			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
1327			 */
1328			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
1329				ip_output_simple_broadcast(ixa, mp1);
1330				continue;
1331			}
1332			/* Pick a new source address for each interface */
1333			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
1334			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
1335			    &ipha1->ipha_src, NULL, NULL) != 0) {
1336				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1337				    ipIfStatsOutDiscards);
1338				ip_drop_output("ipIfStatsOutDiscards - select "
1339				    "broadcast source", mp1, ire1->ire_ill);
1340				freemsg(mp1);
1341				continue;
1342			}
1343			/*
1344			 * Check against global IPsec policy to set the AH/ESP
1345			 * attributes. IPsec will set IXAF_IPSEC_* and
1346			 * ixa_ipsec_* as appropriate.
1347			 */
1348			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1349				ASSERT(ixa->ixa_ipsec_policy == NULL);
1350				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
1351				    NULL, ixa);
1352				if (mp1 == NULL) {
1353					/*
1354					 * MIB and ip_drop_packet already
1355					 * done
1356					 */
1357					continue;
1358				}
1359			}
1360		}
1361		/* Make sure we have an NCE on this ill */
1362		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
1363		    ire1->ire_type);
1364		if (nce1 == NULL) {
1365			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1366			    ipIfStatsOutDiscards);
1367			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
1368			    mp1, ire1->ire_ill);
1369			freemsg(mp1);
1370			continue;
1371		}
1372		nce_orig = ixa->ixa_nce;
1373		ixa->ixa_nce = nce1;
1374
1375		ire_refhold(ire1);
1376		/*
1377		 * Ignore any errors here. We just collect the errno for
1378		 * the main ire below
1379		 */
1380		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
1381		ire_refrele(ire1);
1382
1383		ixa->ixa_nce = nce_orig;
1384		nce_refrele(nce1);
1385
1386		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
1387	}
1388	irb_refrele(irb);
1389	/* Finally, the main one */
1390
1391	/*
1392	 * For IPMP we only send broadcasts on the ipmp_ill.
1393	 */
1394	if (IS_UNDER_IPMP(ire->ire_ill)) {
1395		freemsg(mp);
1396		return (0);
1397	}
1398
1399	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1400}
1401
1402/*
1403 * Send a packet using a different source address and different
1404 * IPsec policy.
1405 */
1406static void
1407ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1408{
1409	ip_xmit_attr_t ixas;
1410
1411	bzero(&ixas, sizeof (ixas));
1412	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1413	ixas.ixa_zoneid = ixa->ixa_zoneid;
1414	ixas.ixa_ifindex = 0;
1415	ixas.ixa_ipst = ixa->ixa_ipst;
1416	ixas.ixa_cred = ixa->ixa_cred;
1417	ixas.ixa_cpid = ixa->ixa_cpid;
1418	ixas.ixa_tsl = ixa->ixa_tsl;
1419	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1420
1421	(void) ip_output_simple(mp, &ixas);
1422	ixa_cleanup(&ixas);
1423}
1424
1425
1426static void
1427multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1428{
1429	ip_stack_t	*ipst = ixa->ixa_ipst;
1430
1431	/* Limit the TTL on multirt packets */
1432	if (ire->ire_type & IRE_MULTICAST) {
1433		if (ipha->ipha_ttl > 1) {
1434			ip2dbg(("ire_send_multirt_v4: forcing multicast "
1435			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
1436			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
1437			ipha->ipha_ttl = 1;
1438		}
1439		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1440	} else if ((ipst->ips_ip_multirt_ttl > 0) &&
1441	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1442		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1443		/*
1444		 * Need to ensure we don't increase the ttl should we go through
1445		 * ire_send_broadcast or multicast.
1446		 */
1447		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1448	}
1449}
1450
1451/*
1452 * ire_sendfn for IRE_MULTICAST
1453 */
1454int
1455ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1456    ip_xmit_attr_t *ixa, uint32_t *identp)
1457{
1458	ipha_t		*ipha = (ipha_t *)iph_arg;
1459	ip_stack_t	*ipst = ixa->ixa_ipst;
1460	ill_t		*ill = ire->ire_ill;
1461	iaflags_t	ixaflags = ixa->ixa_flags;
1462
1463	/*
1464	 * The IRE_MULTICAST is the same whether or not multirt is in use.
1465	 * Hence we need special-case code.
1466	 */
1467	if (ixaflags & IXAF_MULTIRT_MULTICAST)
1468		multirt_check_v4(ire, ipha, ixa);
1469
1470	/*
1471	 * Check if anything in ip_input_v4 wants a copy of the transmitted
1472	 * packet (after IPsec and fragmentation)
1473	 *
1474	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
1475	 *    RSVP and the rsvp daemon is an example of a
1476	 *    protocol and user level process that
1477	 *    handles it's own routing. Hence, it uses the
1478	 *    SO_DONTROUTE option to accomplish this.
1479	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
1480	 *    check whether there are any receivers for the group on the ill
1481	 *    (ignoring the zoneid).
1482	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
1483	 *    any members in other shared-IP zones.
1484	 *    If such members exist, then we indicate that the sending zone
1485	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
1486	 *    behavior.
1487	 *
1488	 * When we loopback we skip hardware checksum to make sure loopback
1489	 * copy is checksumed.
1490	 *
1491	 * Note that ire_ill is the upper in the case of IPMP.
1492	 */
1493	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
1494	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
1495	    !(ixaflags & IXAF_DONTROUTE)) {
1496		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1497	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
1498		/*
1499		 * If this zone or any other zone has members then loopback
1500		 * a copy.
1501		 */
1502		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
1503			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1504	} else if (ipst->ips_netstack->netstack_numzones > 1) {
1505		/*
1506		 * This zone should not have a copy. But there are some other
1507		 * zones which might have members.
1508		 */
1509		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1510		    ixa->ixa_zoneid)) {
1511			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
1512			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
1513			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1514		}
1515	}
1516
1517	/*
1518	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
1519	 * force the ttl to the IP_MULTICAST_TTL value
1520	 */
1521	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
1522		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
1523	}
1524
1525	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1526}
1527
1528/*
1529 * ire_sendfn for IREs with RTF_MULTIRT
1530 */
1531int
1532ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1533    ip_xmit_attr_t *ixa, uint32_t *identp)
1534{
1535	ipha_t		*ipha = (ipha_t *)iph_arg;
1536
1537	multirt_check_v4(ire, ipha, ixa);
1538
1539	if (ire->ire_type & IRE_MULTICAST)
1540		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1541	else if (ire->ire_type & IRE_BROADCAST)
1542		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1543	else
1544		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1545}
1546
1547/*
1548 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1549 */
1550int
1551ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1552    ip_xmit_attr_t *ixa, uint32_t *identp)
1553{
1554	ip_stack_t	*ipst = ixa->ixa_ipst;
1555	ipha_t		*ipha = (ipha_t *)iph_arg;
1556	ill_t		*ill;
1557	ip_recv_attr_t	iras;
1558	boolean_t	dummy;
1559
1560	/* We assign an IP ident for nice errors */
1561	ipha->ipha_ident = atomic_add_32_nv(identp, 1);
1562
1563	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1564
1565	if (ire->ire_type & IRE_NOROUTE) {
1566		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1567		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1568		    RTA_DST, ipst);
1569	}
1570
1571	if (ire->ire_flags & RTF_BLACKHOLE) {
1572		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1573		freemsg(mp);
1574		/* No error even for local senders - silent blackhole */
1575		return (0);
1576	}
1577	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1578
1579	/*
1580	 * We need an ill_t for the ip_recv_attr_t even though this packet
1581	 * was never received and icmp_unreachable doesn't currently use
1582	 * ira_ill.
1583	 */
1584	ill = ill_lookup_on_name("lo0", B_FALSE,
1585	    !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1586	if (ill == NULL) {
1587		freemsg(mp);
1588		return (EHOSTUNREACH);
1589	}
1590
1591	bzero(&iras, sizeof (iras));
1592	/* Map ixa to ira including IPsec policies */
1593	ipsec_out_to_in(ixa, ill, &iras);
1594
1595	if (ip_source_routed(ipha, ipst)) {
1596		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1597	} else {
1598		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1599	}
1600	/* We moved any IPsec refs from ixa to iras */
1601	ira_cleanup(&iras, B_FALSE);
1602	ill_refrele(ill);
1603	return (EHOSTUNREACH);
1604}
1605
1606/*
1607 * Calculate a checksum ignoring any hardware capabilities
1608 *
1609 * Returns B_FALSE if the packet was too short for the checksum. Caller
1610 * should free and do stats.
1611 */
1612static boolean_t
1613ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
1614{
1615	ip_stack_t	*ipst = ixa->ixa_ipst;
1616	uint_t		pktlen = ixa->ixa_pktlen;
1617	uint16_t	*cksump;
1618	uint32_t	cksum;
1619	uint8_t		protocol = ixa->ixa_protocol;
1620	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
1621	ipaddr_t	dst = ipha->ipha_dst;
1622	ipaddr_t	src = ipha->ipha_src;
1623
1624	/* Just in case it contained garbage */
1625	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1626
1627	/*
1628	 * Calculate ULP checksum
1629	 */
1630	if (protocol == IPPROTO_TCP) {
1631		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1632		cksum = IP_TCP_CSUM_COMP;
1633	} else if (protocol == IPPROTO_UDP) {
1634		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1635		cksum = IP_UDP_CSUM_COMP;
1636	} else if (protocol == IPPROTO_SCTP) {
1637		sctp_hdr_t	*sctph;
1638
1639		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1640		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1641		/*
1642		 * Zero out the checksum field to ensure proper
1643		 * checksum calculation.
1644		 */
1645		sctph->sh_chksum = 0;
1646#ifdef	DEBUG
1647		if (!skip_sctp_cksum)
1648#endif
1649			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1650		goto ip_hdr_cksum;
1651	} else {
1652		goto ip_hdr_cksum;
1653	}
1654
1655	/* ULP puts the checksum field is in the first mblk */
1656	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1657
1658	/*
1659	 * We accumulate the pseudo header checksum in cksum.
1660	 * This is pretty hairy code, so watch close.  One
1661	 * thing to keep in mind is that UDP and TCP have
1662	 * stored their respective datagram lengths in their
1663	 * checksum fields.  This lines things up real nice.
1664	 */
1665	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
1666
1667	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
1668	/*
1669	 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
1670	 * Change to 0xffff
1671	 */
1672	if (protocol == IPPROTO_UDP && cksum == 0)
1673		*cksump = ~cksum;
1674	else
1675		*cksump = cksum;
1676
1677	IP_STAT(ipst, ip_out_sw_cksum);
1678	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);
1679
1680ip_hdr_cksum:
1681	/* Calculate IPv4 header checksum */
1682	ipha->ipha_hdr_checksum = 0;
1683	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1684	return (B_TRUE);
1685}
1686
1687/*
1688 * Calculate the ULP checksum - try to use hardware.
1689 * In the case of MULTIRT, broadcast or multicast the
1690 * IXAF_NO_HW_CKSUM is set in which case we use software.
1691 *
1692 * If the hardware supports IP header checksum offload; then clear the
1693 * contents of IP header checksum field as expected by NIC.
1694 * Do this only if we offloaded either full or partial sum.
1695 *
1696 * Returns B_FALSE if the packet was too short for the checksum. Caller
1697 * should free and do stats.
1698 */
1699static boolean_t
1700ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
1701    ip_xmit_attr_t *ixa, ill_t *ill)
1702{
1703	uint_t		pktlen = ixa->ixa_pktlen;
1704	uint16_t	*cksump;
1705	uint16_t	hck_flags;
1706	uint32_t	cksum;
1707	uint8_t		protocol = ixa->ixa_protocol;
1708	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
1709
1710	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
1711	    !dohwcksum) {
1712		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1713	}
1714
1715	/*
1716	 * Calculate ULP checksum. Note that we don't use cksump and cksum
1717	 * if the ill has FULL support.
1718	 */
1719	if (protocol == IPPROTO_TCP) {
1720		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1721		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
1722	} else if (protocol == IPPROTO_UDP) {
1723		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1724		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
1725	} else if (protocol == IPPROTO_SCTP) {
1726		sctp_hdr_t	*sctph;
1727
1728		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1729		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1730		/*
1731		 * Zero out the checksum field to ensure proper
1732		 * checksum calculation.
1733		 */
1734		sctph->sh_chksum = 0;
1735#ifdef	DEBUG
1736		if (!skip_sctp_cksum)
1737#endif
1738			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1739		goto ip_hdr_cksum;
1740	} else {
1741	ip_hdr_cksum:
1742		/* Calculate IPv4 header checksum */
1743		ipha->ipha_hdr_checksum = 0;
1744		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1745		return (B_TRUE);
1746	}
1747
1748	/* ULP puts the checksum field is in the first mblk */
1749	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1750
1751	/*
1752	 * Underlying interface supports hardware checksum offload for
1753	 * the payload; leave the payload checksum for the hardware to
1754	 * calculate.  N.B: We only need to set up checksum info on the
1755	 * first mblk.
1756	 */
1757	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
1758
1759	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1760	if (hck_flags & HCKSUM_INET_FULL_V4) {
1761		/*
1762		 * Hardware calculates pseudo-header, header and the
1763		 * payload checksums, so clear the checksum field in
1764		 * the protocol header.
1765		 */
1766		*cksump = 0;
1767		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
1768
1769		ipha->ipha_hdr_checksum = 0;
1770		if (hck_flags & HCKSUM_IPHDRCKSUM) {
1771			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1772		} else {
1773			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1774		}
1775		return (B_TRUE);
1776	}
1777	if ((hck_flags) & HCKSUM_INET_PARTIAL)  {
1778		ipaddr_t	dst = ipha->ipha_dst;
1779		ipaddr_t	src = ipha->ipha_src;
1780		/*
1781		 * Partial checksum offload has been enabled.  Fill
1782		 * the checksum field in the protocol header with the
1783		 * pseudo-header checksum value.
1784		 *
1785		 * We accumulate the pseudo header checksum in cksum.
1786		 * This is pretty hairy code, so watch close.  One
1787		 * thing to keep in mind is that UDP and TCP have
1788		 * stored their respective datagram lengths in their
1789		 * checksum fields.  This lines things up real nice.
1790		 */
1791		cksum += (dst >> 16) + (dst & 0xFFFF) +
1792		    (src >> 16) + (src & 0xFFFF);
1793		cksum += *(cksump);
1794		cksum = (cksum & 0xFFFF) + (cksum >> 16);
1795		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
1796
1797		/*
1798		 * Offsets are relative to beginning of IP header.
1799		 */
1800		DB_CKSUMSTART(mp) = ip_hdr_length;
1801		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
1802		DB_CKSUMEND(mp) = pktlen;
1803		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
1804
1805		ipha->ipha_hdr_checksum = 0;
1806		if (hck_flags & HCKSUM_IPHDRCKSUM) {
1807			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1808		} else {
1809			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1810		}
1811		return (B_TRUE);
1812	}
1813	/* Hardware capabilities include neither full nor partial IPv4 */
1814	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1815}
1816
1817/*
1818 * ire_sendfn for offlink and onlink destinations.
1819 * Also called from the multicast, broadcast, multirt send functions.
1820 *
1821 * Assumes that the caller has a hold on the ire.
1822 *
1823 * This function doesn't care if the IRE just became condemned since that
1824 * can happen at any time.
1825 */
1826/* ARGSUSED */
1827int
1828ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1829    ip_xmit_attr_t *ixa, uint32_t *identp)
1830{
1831	ip_stack_t	*ipst = ixa->ixa_ipst;
1832	ipha_t		*ipha = (ipha_t *)iph_arg;
1833	iaflags_t	ixaflags = ixa->ixa_flags;
1834	ill_t		*ill;
1835
1836	ASSERT(ixa->ixa_nce != NULL);
1837	ill = ixa->ixa_nce->nce_ill;
1838
1839	if (ixaflags & IXAF_DONTROUTE)
1840		ipha->ipha_ttl = 1;
1841
1842	/*
1843	 * Assign an ident value for this packet. There could be other
1844	 * threads targeting the same destination, so we have to arrange
1845	 * for a atomic increment.  Note that we use a 32-bit atomic add
1846	 * because it has better performance than its 16-bit sibling.
1847	 *
1848	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
1849	 * be the number of TCP segments  that the driver/hardware will
1850	 * extraly construct.
1851	 *
1852	 * If running in cluster mode and if the source address
1853	 * belongs to a replicated service then vector through
1854	 * cl_inet_ipident vector to allocate ip identifier
1855	 * NOTE: This is a contract private interface with the
1856	 * clustering group.
1857	 */
1858	if (cl_inet_ipident != NULL) {
1859		ipaddr_t src = ipha->ipha_src;
1860		ipaddr_t dst = ipha->ipha_dst;
1861		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1862
1863		ASSERT(cl_inet_isclusterwide != NULL);
1864		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1865		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1866			/*
1867			 * Note: not correct with LSO since we can't allocate
1868			 * ixa_extra_ident+1 consecutive values.
1869			 */
1870			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1871			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1872			    (uint8_t *)(uintptr_t)dst, NULL);
1873		} else {
1874			ipha->ipha_ident = atomic_add_32_nv(identp,
1875			    ixa->ixa_extra_ident + 1);
1876		}
1877	} else {
1878		ipha->ipha_ident = atomic_add_32_nv(identp,
1879		    ixa->ixa_extra_ident + 1);
1880	}
1881#ifndef _BIG_ENDIAN
1882	ipha->ipha_ident = htons(ipha->ipha_ident);
1883#endif
1884
1885	/*
1886	 * This might set b_band, thus the IPsec and fragmentation
1887	 * code in IP ensures that b_band is updated in the first mblk.
1888	 */
1889	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1890		/* ip_process translates an IS_UNDER_IPMP */
1891		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1892		if (mp == NULL) {
1893			/* ip_drop_packet and MIB done */
1894			return (0);	/* Might just be delayed */
1895		}
1896	}
1897
1898	/*
1899	 * Verify any IPv4 options.
1900	 *
	 * The presence of IP options also forces the network stack to
1902	 * calculate the checksum in software.  This is because:
1903	 *
1904	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
1905	 * the size of "start offset" width to 6-bit.  This effectively
1906	 * sets the largest value of the offset to 64-bytes, starting
1907	 * from the MAC header.  When the cumulative MAC and IP headers
1908	 * exceed such limit, the offset will wrap around.  This causes
1909	 * the checksum to be calculated at the wrong place.
1910	 *
1911	 * IPv4 source routing: none of the full-checksum capable NICs
1912	 * is capable of correctly handling the	IPv4 source-routing
1913	 * option for purposes of calculating the pseudo-header; the
1914	 * actual destination is different from the destination in the
1915	 * header which is that of the next-hop.  (This case may not be
1916	 * true for NICs which can parse IPv6 extension headers, but
1917	 * we choose to simplify the implementation by not offloading
1918	 * checksum when they are present.)
1919	 */
1920	if (!IS_SIMPLE_IPH(ipha)) {
1921		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1922		/* An IS_UNDER_IPMP ill is ok here */
1923		if (ip_output_options(mp, ipha, ixa, ill)) {
1924			/* Packet has been consumed and ICMP error sent */
1925			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1926			return (EINVAL);
1927		}
1928	}
1929
1930	/*
1931	 * To handle IPsec/iptun's labeling needs we need to tag packets
1932	 * while we still have ixa_tsl
1933	 */
1934	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1935	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1936	    ill->ill_mactype == DL_IPV6)) {
1937		cred_t *newcr;
1938
1939		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1940		    KM_NOSLEEP);
1941		if (newcr == NULL) {
1942			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1943			ip_drop_output("ipIfStatsOutDiscards - newcr",
1944			    mp, ill);
1945			freemsg(mp);
1946			return (ENOBUFS);
1947		}
1948		mblk_setcred(mp, newcr, NOPID);
1949		crfree(newcr);	/* mblk_setcred did its own crhold */
1950	}
1951
1952	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1953	    (ixaflags & IXAF_IPSEC_SECURE)) {
1954		uint32_t pktlen;
1955
1956		pktlen = ixa->ixa_pktlen;
1957		if (ixaflags & IXAF_IPSEC_SECURE)
1958			pktlen += ipsec_out_extra_length(ixa);
1959
1960		if (pktlen > IP_MAXPACKET)
1961			return (EMSGSIZE);
1962
1963		if (ixaflags & IXAF_SET_ULP_CKSUM) {
1964			/*
1965			 * Compute ULP checksum and IP header checksum
1966			 * using software
1967			 */
1968			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1969				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1970				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1971				freemsg(mp);
1972				return (EINVAL);
1973			}
1974		} else {
1975			/* Calculate IPv4 header checksum */
1976			ipha->ipha_hdr_checksum = 0;
1977			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1978		}
1979
1980		/*
1981		 * If this packet would generate a icmp_frag_needed
1982		 * message, we need to handle it before we do the IPsec
1983		 * processing. Otherwise, we need to strip the IPsec
1984		 * headers before we send up the message to the ULPs
1985		 * which becomes messy and difficult.
1986		 *
1987		 * We check using IXAF_DONTFRAG. The DF bit in the header
1988		 * is not inspected - it will be copied to any generated
1989		 * fragments.
1990		 */
1991		if ((pktlen > ixa->ixa_fragsize) &&
1992		    (ixaflags & IXAF_DONTFRAG)) {
1993			/* Generate ICMP and return error */
1994			ip_recv_attr_t	iras;
1995
1996			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
1997			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
1998			    uint_t, ixa->ixa_pmtu);
1999
2000			bzero(&iras, sizeof (iras));
2001			/* Map ixa to ira including IPsec policies */
2002			ipsec_out_to_in(ixa, ill, &iras);
2003
2004			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
2005			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
2006			/* We moved any IPsec refs from ixa to iras */
2007			ira_cleanup(&iras, B_FALSE);
2008			return (EMSGSIZE);
2009		}
2010		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2011		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2012		    uint_t, ixa->ixa_pmtu);
2013
2014		if (ixaflags & IXAF_IPSEC_SECURE) {
2015			/*
2016			 * Pass in sufficient information so that
2017			 * IPsec can determine whether to fragment, and
2018			 * which function to call after fragmentation.
2019			 */
2020			return (ipsec_out_process(mp, ixa));
2021		}
2022		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2023		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2024		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2025		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
2026	}
2027	if (ixaflags & IXAF_SET_ULP_CKSUM) {
2028		/* Compute ULP checksum and IP header checksum */
2029		/* An IS_UNDER_IPMP ill is ok here */
2030		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2031			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2032			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2033			freemsg(mp);
2034			return (EINVAL);
2035		}
2036	} else {
2037		/* Calculate IPv4 header checksum */
2038		ipha->ipha_hdr_checksum = 0;
2039		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2040	}
2041	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2042	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2043	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2044}
2045
2046/*
2047 * Send mp into ip_input
2048 * Common for IPv4 and IPv6
2049 */
void
ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
    uint_t pkt_len, zoneid_t nolzid)
{
	rtc_t		rtc;
	ill_t		*ill = nce->nce_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	ncec_t		*ncec;

	ncec = nce->nce_common;
	/*
	 * Since iras is not zeroed, every field the input path may read
	 * must be explicitly initialized below.  The IRAF_VERIFY_* flags
	 * request checksum verification, and the loopback flags mark the
	 * packet as locally looped back with a loopback L2 source.
	 */
	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
	/* Record the L2 destination class based on the ncec flags */
	if (ncec->ncec_flags & NCE_F_BCAST)
		iras.ira_flags |= IRAF_L2DST_BROADCAST;
	else if (ncec->ncec_flags & NCE_F_MCAST)
		iras.ira_flags |= IRAF_L2DST_MULTICAST;

	iras.ira_free_flags = 0;
	iras.ira_cred = NULL;
	iras.ira_cpid = NOPID;
	iras.ira_tsl = NULL;
	iras.ira_zoneid = ALL_ZONES;
	iras.ira_pktlen = pkt_len;
	/* Account the looped-back packet as received on this ill */
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);

	if (ixaflags & IXAF_IS_IPV4)
		iras.ira_flags |= IRAF_IS_IPV4;

	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;
	iras.ira_mhip = NULL;

	/* Copy the transmit attributes that are shared with receive */
	iras.ira_flags |= ixaflags & IAF_MASK;
	iras.ira_no_loop_zoneid = nolzid;

	/* Broadcast and multicast doesn't care about the squeue */
	iras.ira_sqp = NULL;

	rtc.rtc_ire = NULL;
	if (ixaflags & IXAF_IS_IPV4) {
		ipha_t		*ipha = (ipha_t *)mp->b_rptr;

		rtc.rtc_ipaddr = INADDR_ANY;

		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
		/* Release any ire the input function cached in rtc */
		if (rtc.rtc_ire != NULL) {
			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
			ire_refrele(rtc.rtc_ire);
		}
	} else {
		ip6_t		*ip6h = (ip6_t *)mp->b_rptr;

		rtc.rtc_ip6addr = ipv6_all_zeros;

		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
		/* Release any ire the input function cached in rtc */
		if (rtc.rtc_ire != NULL) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
			ire_refrele(rtc.rtc_ire);
		}
	}
	/* Any references to clean up? No hold on ira */
	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
		ira_cleanup(&iras, B_FALSE);
}
2116
2117/*
2118 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2119 * looks at the IXAF_LOOPBACK_COPY flag.
2120 * Common for IPv4 and IPv6.
2121 *
2122 * If the loopback copy fails (due to no memory) but we send the packet out
 * on the wire we return no failure. Only in the case we suppress the wire
2124 * sending do we take the loopback failure into account.
2125 *
2126 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2127 * Those operations are performed on this packet in ip_xmit() and it would
2128 * be odd to do it twice for the same packet.
2129 */
2130int
2131ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2132    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2133    uintptr_t *ixacookie)
2134{
2135	ill_t		*ill = nce->nce_ill;
2136	int		error = 0;
2137
2138	/*
2139	 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2140	 * had looped it back
2141	 */
2142	if (ixaflags & IXAF_LOOPBACK_COPY) {
2143		mblk_t		*mp1;
2144
2145		mp1 = copymsg(mp);
2146		if (mp1 == NULL) {
2147			/* Failed to deliver the loopback copy. */
2148			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2149			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2150			error = ENOBUFS;
2151		} else {
2152			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2153			    nolzid);
2154		}
2155	}
2156
2157	/*
2158	 * If TTL = 0 then only do the loopback to this host i.e. we are
2159	 * done. We are also done if this was the
2160	 * loopback interface since it is sufficient
2161	 * to loopback one copy of a multicast packet.
2162	 */
2163	if (ixaflags & IXAF_IS_IPV4) {
2164		ipha_t *ipha = (ipha_t *)mp->b_rptr;
2165
2166		if (ipha->ipha_ttl == 0) {
2167			ip_drop_output("multicast ipha_ttl not sent to wire",
2168			    mp, ill);
2169			freemsg(mp);
2170			return (error);
2171		}
2172	} else {
2173		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2174
2175		if (ip6h->ip6_hops == 0) {
2176			ip_drop_output("multicast ipha_ttl not sent to wire",
2177			    mp, ill);
2178			freemsg(mp);
2179			return (error);
2180		}
2181	}
2182	if (nce->nce_ill->ill_wq == NULL) {
2183		/* Loopback interface */
2184		ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2185		freemsg(mp);
2186		return (error);
2187	}
2188
2189	return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2190	    ixacookie));
2191}
2192
2193/*
2194 * Post fragmentation function for RTF_MULTIRT routes.
2195 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2196 * checks IXAF_LOOPBACK_COPY.
2197 *
2198 * If no packet is sent due to failures then we return an errno, but if at
2199 * least one succeeded we return zero.
2200 */
2201int
2202ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2203    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2204    uintptr_t *ixacookie)
2205{
2206	irb_t		*irb;
2207	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2208	ire_t		*ire;
2209	ire_t		*ire1;
2210	mblk_t		*mp1;
2211	nce_t		*nce1;
2212	ill_t		*ill = nce->nce_ill;
2213	ill_t		*ill1;
2214	ip_stack_t	*ipst = ill->ill_ipst;
2215	int		error = 0;
2216	int		num_sent = 0;
2217	int		err;
2218	uint_t		ire_type;
2219	ipaddr_t	nexthop;
2220
2221	ASSERT(ixaflags & IXAF_IS_IPV4);
2222
2223	/* Check for IXAF_LOOPBACK_COPY */
2224	if (ixaflags & IXAF_LOOPBACK_COPY) {
2225		mblk_t *mp1;
2226
2227		mp1 = copymsg(mp);
2228		if (mp1 == NULL) {
2229			/* Failed to deliver the loopback copy. */
2230			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2231			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2232			error = ENOBUFS;
2233		} else {
2234			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2235			    nolzid);
2236		}
2237	}
2238
2239	/*
2240	 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2241	 * a copy to each one.
2242	 * Use the nce (nexthop) and ipha_dst to find the ire.
2243	 *
2244	 * MULTIRT is not designed to work with shared-IP zones thus we don't
2245	 * need to pass a zoneid or a label to the IRE lookup.
2246	 */
2247	if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2248		/* Broadcast and multicast case */
2249		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2250		    NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2251	} else {
2252		ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2253
2254		/* Unicast case */
2255		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2256		    NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2257	}
2258
2259	if (ire == NULL ||
2260	    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2261	    !(ire->ire_flags & RTF_MULTIRT)) {
2262		/* Drop */
2263		ip_drop_output("ip_postfrag_multirt didn't find route",
2264		    mp, nce->nce_ill);
2265		if (ire != NULL)
2266			ire_refrele(ire);
2267		return (ENETUNREACH);
2268	}
2269
2270	irb = ire->ire_bucket;
2271	irb_refhold(irb);
2272	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2273		/*
2274		 * For broadcast we can have a mixture of IRE_BROADCAST and
2275		 * IRE_HOST due to the manually added IRE_HOSTs that are used
2276		 * to trigger the creation of the special CGTP broadcast routes.
2277		 * Thus we have to skip if ire_type doesn't match the original.
2278		 */
2279		if (IRE_IS_CONDEMNED(ire1) ||
2280		    !(ire1->ire_flags & RTF_MULTIRT) ||
2281		    ire1->ire_type != ire->ire_type)
2282			continue;
2283
2284		/* Do the ire argument one after the loop */
2285		if (ire1 == ire)
2286			continue;
2287
2288		ill1 = ire_nexthop_ill(ire1);
2289		if (ill1 == NULL) {
2290			/*
2291			 * This ire might not have been picked by
2292			 * ire_route_recursive, in which case ire_dep might
2293			 * not have been setup yet.
2294			 * We kick ire_route_recursive to try to resolve
2295			 * starting at ire1.
2296			 */
2297			ire_t *ire2;
2298			uint_t	match_flags = MATCH_IRE_DSTONLY;
2299
2300			if (ire1->ire_ill != NULL)
2301				match_flags |= MATCH_IRE_ILL;
2302			ire2 = ire_route_recursive_impl_v4(ire1,
2303			    ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
2304			    ire1->ire_zoneid, NULL, match_flags,
2305			    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2306			if (ire2 != NULL)
2307				ire_refrele(ire2);
2308			ill1 = ire_nexthop_ill(ire1);
2309		}
2310
2311		if (ill1 == NULL) {
2312			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2313			ip_drop_output("ipIfStatsOutDiscards - no ill",
2314			    mp, ill);
2315			error = ENETUNREACH;
2316			continue;
2317		}
2318
2319		/* Pick the addr and type to use for arp_nce_init */
2320		if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2321			ire_type = IRE_BROADCAST;
2322			nexthop = ire1->ire_gateway_addr;
2323		} else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2324			ire_type = IRE_MULTICAST;
2325			nexthop = ipha->ipha_dst;
2326		} else {
2327			ire_type = ire1->ire_type;	/* Doesn't matter */
2328			nexthop = ire1->ire_gateway_addr;
2329		}
2330
2331		/* If IPMP meta or under, then we just drop */
2332		if (ill1->ill_grp != NULL) {
2333			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2334			ip_drop_output("ipIfStatsOutDiscards - IPMP",
2335			    mp, ill1);
2336			ill_refrele(ill1);
2337			error = ENETUNREACH;
2338			continue;
2339		}
2340
2341		nce1 = arp_nce_init(ill1, nexthop, ire_type);
2342		if (nce1 == NULL) {
2343			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2344			ip_drop_output("ipIfStatsOutDiscards - no nce",
2345			    mp, ill1);
2346			ill_refrele(ill1);
2347			error = ENETUNREACH;
2348			continue;
2349		}
2350		mp1 = copymsg(mp);
2351		if (mp1 == NULL) {
2352			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2353			ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2354			nce_refrele(nce1);
2355			ill_refrele(ill1);
2356			error = ENOBUFS;
2357			continue;
2358		}
2359		/* Preserve HW checksum for this copy */
2360		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2361		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2362		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2363		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2364		DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2365
2366		ire1->ire_ob_pkt_count++;
2367		err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2368		    0, ixacookie);
2369		if (err == 0)
2370			num_sent++;
2371		else
2372			error = err;
2373		nce_refrele(nce1);
2374		ill_refrele(ill1);
2375	}
2376	irb_refrele(irb);
2377	ire_refrele(ire);
2378	/* Finally, the main one */
2379	err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2380	    ixacookie);
2381	if (err == 0)
2382		num_sent++;
2383	else
2384		error = err;
2385	if (num_sent > 0)
2386		return (0);
2387	else
2388		return (error);
2389}
2390
2391/*
2392 * Verify local connectivity. This check is called by ULP fusion code.
2393 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2394 * the interface is brought down and back up. So we simply fail the local
2395 * process. The caller, TCP Fusion, should unfuse the connection.
2396 */
2397boolean_t
2398ip_output_verify_local(ip_xmit_attr_t *ixa)
2399{
2400	ire_t		*ire = ixa->ixa_ire;
2401
2402	if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2403		return (B_FALSE);
2404
2405	return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2406}
2407
2408/*
2409 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2410 *
2411 * The caller must call ip_output_verify_local() first. This function handles
2412 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2413 */
mblk_t *
ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
    boolean_t hooks_in, conn_t *peer_connp)
{
	ill_t		*ill = ixa->ixa_ire->ire_ill;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_recv_attr_t	iras;	/* Only initialized via ipsec_out_to_in */
	int		error;

	ASSERT(mp != NULL);

	if (ixaflags & IXAF_IS_IPV4) {
		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip4_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT.  Note: may consume mp (set NULL) */
		if (hooks_out) {
			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
			    ipst->ips_ipv4firewall_loopback_out,
			    NULL, ill, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN.  Note: may consume mp (set NULL) */
		if (hooks_in) {
			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
			    ipst->ips_ipv4firewall_loopback_in,
			    ill, NULL, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
			    NULL, &iras);
		}
	} else {
		ip6h = (ip6_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip6_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT.  Note: may consume mp (set NULL) */
		if (hooks_out) {
			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
			    ipst->ips_ipv6firewall_loopback_out,
			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN.  Note: may consume mp (set NULL) */
		if (hooks_in) {
			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
			    ipst->ips_ipv6firewall_loopback_in,
			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
			    ip6h, &iras);
		}
	}

	/* NULL here means the packet was dropped by the policy check */
	if (mp == NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
	}

	return (mp);
}
2565