ip_output.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26/* Copyright (c) 1990 Mentat Inc. */
27
28#include <sys/types.h>
29#include <sys/stream.h>
30#include <sys/strsubr.h>
31#include <sys/dlpi.h>
32#include <sys/strsun.h>
33#include <sys/zone.h>
34#include <sys/ddi.h>
35#include <sys/sunddi.h>
36#include <sys/cmn_err.h>
37#include <sys/debug.h>
38#include <sys/atomic.h>
39
40#include <sys/systm.h>
41#include <sys/param.h>
42#include <sys/kmem.h>
43#include <sys/sdt.h>
44#include <sys/socket.h>
45#include <sys/mac.h>
46#include <net/if.h>
47#include <net/if_arp.h>
48#include <net/route.h>
49#include <sys/sockio.h>
50#include <netinet/in.h>
51#include <net/if_dl.h>
52
53#include <inet/common.h>
54#include <inet/mi.h>
55#include <inet/mib2.h>
56#include <inet/nd.h>
57#include <inet/arp.h>
58#include <inet/snmpcom.h>
59#include <inet/kstatcom.h>
60
61#include <netinet/igmp_var.h>
62#include <netinet/ip6.h>
63#include <netinet/icmp6.h>
64#include <netinet/sctp.h>
65
66#include <inet/ip.h>
67#include <inet/ip_impl.h>
68#include <inet/ip6.h>
69#include <inet/ip6_asp.h>
70#include <inet/tcp.h>
71#include <inet/ip_multi.h>
72#include <inet/ip_if.h>
73#include <inet/ip_ire.h>
74#include <inet/ip_ftable.h>
75#include <inet/ip_rts.h>
76#include <inet/optcom.h>
77#include <inet/ip_ndp.h>
78#include <inet/ip_listutils.h>
79#include <netinet/igmp.h>
80#include <netinet/ip_mroute.h>
81#include <inet/ipp_common.h>
82
83#include <net/pfkeyv2.h>
84#include <inet/sadb.h>
85#include <inet/ipsec_impl.h>
86#include <inet/ipdrop.h>
87#include <inet/ip_netinfo.h>
88
89#include <sys/pattr.h>
90#include <inet/ipclassifier.h>
91#include <inet/sctp_ip.h>
92#include <inet/sctp/sctp_impl.h>
93#include <inet/udp_impl.h>
94#include <sys/sunddi.h>
95
96#include <sys/tsol/label.h>
97#include <sys/tsol/tnet.h>
98
99#ifdef	DEBUG
100extern boolean_t skip_sctp_cksum;
101#endif
102
103static int	ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
104static int	ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
105static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
106static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
107static void	ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
108
109/*
110 * There are two types of output functions for IP used for different
111 * purposes:
112 *  - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there
113 *     is no context in the form of a conn_t. However, there is a
114 *     ip_xmit_attr_t that the callers use to influence interface selection
115 *     (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
116 *
117 *  - conn_ip_output() is used when sending packets with a conn_t and
118 *    ip_set_destination has been called to cache information. In that case
119 *    various socket options are recorded in the ip_xmit_attr_t and should
120 *    be taken into account.
121 */
122
123/*
124 * The caller *must* have called conn_connect() or ip_attr_connect()
125 * before calling conn_ip_output(). The caller needs to redo that each time
126 * the destination IP address or port changes, as well as each time there is
127 * a change to any socket option that would modify how packets are routed out
128 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
129 *
130 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
131 * We assert for that here.
132 */
int
conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	iaflags_t	ixaflags = ixa->ixa_flags;
	ire_t		*ire;
	nce_t		*nce;
	dce_t		*dce;
	ill_t		*ill;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	int		error;

	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */

	ASSERT(ixa->ixa_ire != NULL);
	/* Note there is no ixa_nce when reject and blackhole routes */
	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */

#ifdef DEBUG
	/*
	 * The ULP must serialize use of a single ip_xmit_attr_t; catch
	 * concurrent callers here by tracking the owning thread.
	 */
	ASSERT(ixa->ixa_curthread == NULL);
	ixa->ixa_curthread = curthread;
#endif

	/*
	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
	 * for IGMP/MLD traffic.
	 */

	ire = ixa->ixa_ire;

	/*
	 * If the ULP says the (old) IRE resulted in reachability we
	 * record this before determining whether to use a new IRE.
	 * No locking for performance reasons.
	 */
	if (ixaflags & IXAF_REACH_CONF)
		ire->ire_badcnt = 0;

	/*
	 * Has routing changed since we cached the results of the lookup?
	 *
	 * This check captures all of:
	 *  - the cached ire being deleted (by means of the special
	 *    IRE_GENERATION_CONDEMNED)
	 *  - A potentially better ire being added (ire_generation being
	 *    increased)
	 *  - A deletion of the nexthop ire that was used when we did the
	 *    lookup.
	 *  - An addition of a potentially better nexthop ire.
	 * The last two are handled by walking and increasing the generation
	 * number on all dependent IREs in ire_flush_cache().
	 *
	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
	 * since we ensure that each time we set ixa_ire to such an IRE we
	 * make sure the ixa_ire_generation does not match (by using
	 * IRE_GENERATION_VERIFY).
	 */
	if (ire->ire_generation != ixa->ixa_ire_generation) {
		error = ip_verify_ire(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify ire",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
			    &ipst->ips_dce_default->dce_ident));
		}
		/*
		 * If the ncec changed then ip_verify_ire already set
		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		 * so we can recheck the interface mtu.
		 */

		/*
		 * Note that ire->ire_generation could already have changed.
		 * We catch that next time we send a packet.
		 */
	}

	/*
	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
	 * is single threaded.
	 */
	ASSERT(ixa->ixa_nce != NULL);
	nce = ixa->ixa_nce;
	if (nce->nce_is_condemned) {
		error = ip_verify_nce(mp, ixa);
		/*
		 * In case the ZEROCOPY capability becomes unavailable, we
		 * copy the message and free the original one. We might
		 * be copying more data than needed but it doesn't hurt
		 * since such change rarely happens.
		 */
		switch (error) {
		case 0:
			break;
		case ENOTSUP: { /* ZEROCOPY */
			mblk_t *nmp;

			if ((nmp = copymsg(mp)) != NULL) {
				freemsg(mp);
				mp = nmp;

				break;
			}
			/* copymsg failed; drop below */
			/* FALLTHROUGH */
		}
		default:
			ip_drop_output("ipIfStatsOutDiscards - verify nce",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
			    ixa, &ipst->ips_dce_default->dce_ident));
		}
		ASSERT(ixa->ixa_nce != NULL);
		nce = ixa->ixa_nce;

		/*
		 * Note that some other event could already have made
		 * the new nce condemned. We catch that next time we
		 * try to send a packet.
		 */
	}
	/*
	 * If there is no per-destination dce_t then we have a reference to
	 * the default dce_t (which merely contains the dce_ipid).
	 * The generation check captures both the introduction of a
	 * per-destination dce_t (e.g., due to ICMP packet too big) and
	 * any change to the per-destination dce (including it becoming
	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
	 */
	dce = ixa->ixa_dce;

	/*
	 * To avoid a periodic timer to increase the path MTU we
	 * look at dce_last_change_time each time we send a packet.
	 */
	if ((dce->dce_flags & DCEF_PMTU) &&
	    (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
	    ipst->ips_ip_pathmtu_interval)) {
		/*
		 * Older than 20 minutes. Drop the path MTU information.
		 * Since the path MTU changes as a result of this, twiddle
		 * ixa_dce_generation to make us go through the dce
		 * verification code in conn_ip_output.
		 */
		mutex_enter(&dce->dce_lock);
		dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
		mutex_exit(&dce->dce_lock);
		dce_increment_generation(dce);
	}

	if (dce->dce_generation != ixa->ixa_dce_generation) {
		error = ip_verify_dce(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify dce",
			    mp, NULL);
			goto drop;
		}
		dce = ixa->ixa_dce;

		/*
		 * Note that some other event could already have made the
		 * new dce's generation number change.
		 * We catch that next time we try to send a packet.
		 */
	}

	ill = nce->nce_ill;

	/*
	 * An initial ixa_fragsize was set in ip_set_destination
	 * and we update it if any routing changes above.
	 * A change to ill_mtu with ifconfig will increase all dce_generation
	 * so that we will detect that with the generation check.
	 */

	/*
	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
	 * conn_unspec_src.
	 */
	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
	    ixa->ixa_src_generation != ipst->ips_src_generation) {
		/* Check if the IP source is still assigned to the host. */
		uint_t gen;

		if (!ip_verify_src(mp, ixa, &gen)) {
			/* Don't send a packet with a source that isn't ours */
			error = EADDRNOTAVAIL;
			ip_drop_output("ipIfStatsOutDiscards - invalid src",
			    mp, NULL);
			goto drop;
		}
		/* The source is still valid - update the generation number */
		ixa->ixa_src_generation = gen;
	}

	/*
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_multirt_v* - if RTF_MULTIRT
	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v* - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v* - for the rest.
	 */
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));

drop:
	/* Account the drop against the right (v4 vs v6) MIB and free mp */
	if (ixaflags & IXAF_IS_IPV4) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
	} else {
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
	}
	freemsg(mp);
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return (error);
}
387
388/*
389 * Handle both IPv4 and IPv6. Sets the generation number
390 * to allow the caller to know when to call us again.
391 * Returns true if the source address in the packet is a valid source.
392 * We handle callers which try to send with a zero address (since we only
393 * get here if UNSPEC_SRC is not set).
394 */
395boolean_t
396ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
397{
398	ip_stack_t	*ipst = ixa->ixa_ipst;
399
400	/*
401	 * Need to grab the generation number before we check to
402	 * avoid a race with a change to the set of local addresses.
403	 * No lock needed since the thread which updates the set of local
404	 * addresses use ipif/ill locks and exit those (hence a store memory
405	 * barrier) before doing the atomic increase of ips_src_generation.
406	 */
407	if (generationp != NULL)
408		*generationp = ipst->ips_src_generation;
409
410	if (ixa->ixa_flags & IXAF_IS_IPV4) {
411		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
412
413		if (ipha->ipha_src == INADDR_ANY)
414			return (B_FALSE);
415
416		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
417		    ipst, B_FALSE) != IPVL_BAD);
418	} else {
419		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
420		uint_t	scopeid;
421
422		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
423			return (B_FALSE);
424
425		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
426			scopeid = ixa->ixa_scopeid;
427		else
428			scopeid = 0;
429
430		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
431		    ipst, B_FALSE, scopeid) != IPVL_BAD);
432	}
433}
434
435/*
436 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
437 */
int
ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	uint_t		gen;
	ire_t		*ire;
	nce_t		*nce;
	int		error;
	boolean_t	multirt = B_FALSE;

	/*
	 * Redo ip_select_route.
	 * Need to grab generation number as part of the lookup to
	 * avoid race.
	 */
	error = 0;
	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
	ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
	if (error != 0) {
		ire_refrele(ire);
		return (error);
	}

	/* Swap the old cached ire for the new one */
	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/*
	 * Convert the traced reference from the lookup into an untraced
	 * one for the long-lived ixa_ire cache (no-op on non-DEBUG).
	 */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = gen;
	if (multirt) {
		/* Pick the multirt post-fragmentation function per family */
		if (ixa->ixa_flags & IXAF_IS_IPV4)
			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
		else
			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}

	/*
	 * Don't look for an nce for reject or blackhole.
	 * They have ire_generation set to IRE_GENERATION_VERIFY which
	 * makes conn_ip_output avoid references to ixa_nce.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (0);
	}

	/* The NCE could now be different */
	nce = ire_to_nce_pkt(ire, mp);
	if (nce == NULL) {
		/*
		 * Allocation failure. Make sure we redo ire/nce selection
		 * next time we send.
		 */
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (ENOBUFS);
	}
	if (nce == ixa->ixa_nce) {
		/* No change */
		nce_refrele(nce);
		return (0);
	}

	/*
	 * Since the path MTU might change as a result of this
	 * route change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	/* Cache the new nce, dropping the reference on any old one */
	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (0);
}
519
520/*
521 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
522 */
static int
ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ire_t		*ire = ixa->ixa_ire;
	nce_t		*nce;
	int		error = 0;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;

	/* Only one of ipha/ip6h is set, matching the IRE's IP version */
	if (ire->ire_ipversion == IPV4_VERSION)
		ipha = (ipha_t *)mp->b_rptr;
	else
		ip6h = (ip6_t *)mp->b_rptr;

	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
	if (nce == NULL) {
		/* Try to find a better ire */
		return (ip_verify_ire(mp, ixa));
	}

	/*
	 * The hardware offloading capabilities, for example LSO, of the
	 * interface might have changed, so do sanity verification here.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
		if (!ip_verify_lso(nce->nce_ill, ixa)) {
			/* Tell the ULP that the LSO capability changed */
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_LSO, 0);
			error = ENOTSUP;
		}
	}

	/*
	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
	 * any more, return error so that conn_ip_output() can take care of
	 * the ZEROCOPY message properly. It's safe to continue send the
	 * message when ZEROCOPY newly become available.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_ZCOPY, 0);
			/* Only an error when the capability was lost */
			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
				error = ENOTSUP;
		}
	}

	/*
	 * Since the path MTU might change as a result of this
	 * change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	/* Replace the cached (condemned) nce with the new one */
	nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (error);
}
584
585/*
586 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
587 */
static int
ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	dce_t		*dce;
	uint_t		gen;
	uint_t		pmtu;

	dce = dce_lookup_pkt(mp, ixa, &gen);
	ASSERT(dce != NULL);

	/* Swap the cached dce; convert to an untraced reference on DEBUG */
	dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = gen;

	/* Extract the (path) mtu from the dce, ncec_ill etc */
	pmtu = ip_get_pmtu(ixa);

	/*
	 * Tell ULP about PMTU changes - increase or decrease - by returning
	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
	 * both ixa_pmtu and ixa_fragsize appropriately.
	 *
	 * If ULP doesn't set that flag then we need to update ixa_fragsize
	 * since routing could have changed the ill after ixa_fragsize
	 * was set previously in the conn_ip_output path or in
	 * ip_set_destination.
	 *
	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
	 *
	 * In the case of a path MTU increase we send the packet after the
	 * notify to the ULP.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
		if (ixa->ixa_pmtu != pmtu) {
			uint_t oldmtu = ixa->ixa_pmtu;

			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
			    uint32_t, ixa->ixa_pmtu);
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_PMTU, pmtu);
			/* A shrinking PMTU means the caller must refragment */
			if (pmtu < oldmtu)
				return (EMSGSIZE);
		}
	} else {
		ixa->ixa_fragsize = pmtu;
	}
	return (0);
}
641
642/*
643 * Verify LSO usability. Keep the return value simple to indicate whether
644 * the LSO capability has changed. Handle both IPv4 and IPv6.
645 */
static boolean_t
ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
{
	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;

	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
		/*
		 * Check whether LSO is no longer usable: IPsec, loopback/
		 * local delivery and multirt preclude LSO, as does the ill
		 * losing the per-family LSO capability.
		 */
		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;

			return (B_FALSE);
		}

		/*
		 * Capability has changed, refresh the copy in ixa.
		 */
		if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
			*lsoc = *new_lsoc;

			return (B_FALSE);
		}
	} else { /* Was not usable */
		/* Check whether LSO has newly become usable */
		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
		    ILL_LSO_TCP_IPV4_USABLE(ill) :
		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
			*lsoc = *new_lsoc;
			ixa->ixa_flags |= IXAF_LSO_CAPAB;

			return (B_FALSE);
		}
	}

	/* No change in LSO usability */
	return (B_TRUE);
}
691
692/*
693 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
694 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
695 */
696static boolean_t
697ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
698{
699	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
700		/*
701		 * Not unsable any more.
702		 */
703		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
704		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
705		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
706		    !ILL_ZCOPY_USABLE(ill)) {
707			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
708
709			return (B_FALSE);
710		}
711	} else { /* Was not usable */
712		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
713		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
714		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
715		    ILL_ZCOPY_USABLE(ill)) {
716			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
717
718			return (B_FALSE);
719		}
720	}
721
722	return (B_TRUE);
723}
724
725
726/*
727 * When there is no conn_t context, this will send a packet.
728 * The caller must *not* have called conn_connect() or ip_attr_connect()
729 * before calling ip_output_simple().
730 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
731 * Honors IXAF_SET_SOURCE.
732 *
733 * We acquire the ire and after calling ire_sendfn we release
734 * the hold on the ire. Ditto for the nce and dce.
735 *
736 * This assumes that the caller has set the following in ip_xmit_attr_t:
737 *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
738 *	If ixa_ifindex is non-zero it means send out that ill. (If it is
739 *	an upper IPMP ill we load balance across the group; if a lower we send
740 *	on that lower ill without load balancing.)
741 *	IXAF_IS_IPV4 must be set correctly.
742 *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
743 *	If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
744 *	If neither of those two are set we do an IPsec policy lookup.
745 *
746 * We handle setting things like
747 *	ixa_pktlen
748 *	ixa_ip_hdr_length
749 *	ixa->ixa_protocol
750 *
751 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
752 * transmit ring selecting in GLD.
753 *
754 * The caller must do an ixa_cleanup() to release any IPsec references
755 * after we return.
756 */
int
ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ts_label_t	*effective_tsl = NULL;
	int		err;

	ASSERT(ixa->ixa_ipst != NULL);

	if (is_system_labeled()) {
		ip_stack_t *ipst = ixa->ixa_ipst;

		/*
		 * On a labeled (Trusted Extensions) system, validate the
		 * packet's label; tsol_check_label_v* may prepend label
		 * options (possibly reallocating mp) and return an
		 * effective label to apply to the ixa.
		 */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
			    &effective_tsl);
		} else {
			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
			    &effective_tsl);
		}
		if (err != 0) {
			ip2dbg(("tsol_check: label check failed (%d)\n", err));
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("tsol_check_label", mp, NULL);
			freemsg(mp);
			return (err);
		}
		if (effective_tsl != NULL) {
			/* Update the label */
			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
		}
	}

	/* Dispatch to the per-family implementation */
	if (ixa->ixa_flags & IXAF_IS_IPV4)
		return (ip_output_simple_v4(mp, ixa));
	else
		return (ip_output_simple_v6(mp, ixa));
}
796
797int
798ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
799{
800	ipha_t		*ipha;
801	ipaddr_t	firsthop; /* In IP header */
802	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
803	ire_t		*ire;
804	ipaddr_t	setsrc;	/* RTF_SETSRC */
805	int		error;
806	ill_t		*ill = NULL;
807	dce_t		*dce = NULL;
808	nce_t		*nce;
809	iaflags_t	ixaflags = ixa->ixa_flags;
810	ip_stack_t	*ipst = ixa->ixa_ipst;
811	boolean_t	repeat = B_FALSE;
812	boolean_t	multirt = B_FALSE;
813
814	ipha = (ipha_t *)mp->b_rptr;
815	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
816
817	/*
818	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
819	 * for IGMP/MLD traffic.
820	 */
821
822	/* Caller already set flags */
823	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
824
825	ASSERT(ixa->ixa_nce == NULL);
826
827	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
828	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
829	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
830	ixa->ixa_protocol = ipha->ipha_protocol;
831
832	/*
833	 * Assumes that source routed packets have already been massaged by
834	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
835	 * hop in the source route. The final destination is used for IPsec
836	 * policy and DCE lookup.
837	 */
838	firsthop = ipha->ipha_dst;
839	dst = ip_get_dst(ipha);
840
841repeat_ire:
842	error = 0;
843	setsrc = INADDR_ANY;
844	ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error,
845	    &multirt);
846	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
847	if (error != 0) {
848		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
849		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
850		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
851		freemsg(mp);
852		goto done;
853	}
854
855	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
856		/* ire_ill might be NULL hence need to skip some code */
857		if (ixaflags & IXAF_SET_SOURCE)
858			ipha->ipha_src = htonl(INADDR_LOOPBACK);
859		ixa->ixa_fragsize = IP_MAXPACKET;
860		ill = NULL;
861		nce = NULL;
862		ire->ire_ob_pkt_count++;
863		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
864		/* No dce yet; use default one */
865		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
866		    &ipst->ips_dce_default->dce_ident);
867		goto done;
868	}
869
870	/* Note that ipha_dst is only used for IRE_MULTICAST */
871	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
872	if (nce == NULL) {
873		/* Allocation failure? */
874		ip_drop_output("ire_to_nce", mp, ill);
875		freemsg(mp);
876		error = ENOBUFS;
877		goto done;
878	}
879	if (nce->nce_is_condemned) {
880		nce_t *nce1;
881
882		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
883		nce_refrele(nce);
884		if (nce1 == NULL) {
885			if (!repeat) {
886				/* Try finding a better IRE */
887				repeat = B_TRUE;
888				ire_refrele(ire);
889				goto repeat_ire;
890			}
891			/* Tried twice - drop packet */
892			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
893			ip_drop_output("No nce", mp, ill);
894			freemsg(mp);
895			error = ENOBUFS;
896			goto done;
897		}
898		nce = nce1;
899	}
900
901	/*
902	 * For multicast with multirt we have a flag passed back from
903	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
904	 * possible multicast address.
905	 * We also need a flag for multicast since we can't check
906	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
907	 */
908	if (multirt) {
909		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
910		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
911	} else {
912		ixa->ixa_postfragfn = ire->ire_postfragfn;
913		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
914	}
915	ASSERT(ixa->ixa_nce == NULL);
916	ixa->ixa_nce = nce;
917
918	/*
919	 * Check for a dce_t with a path mtu.
920	 */
921	dce = dce_lookup_v4(dst, ipst, NULL);
922	ASSERT(dce != NULL);
923
924	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
925		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
926	} else if (dce->dce_flags & DCEF_PMTU) {
927		/*
928		 * To avoid a periodic timer to increase the path MTU we
929		 * look at dce_last_change_time each time we send a packet.
930		 */
931		if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
932		    ipst->ips_ip_pathmtu_interval) {
933			/*
934			 * Older than 20 minutes. Drop the path MTU information.
935			 */
936			mutex_enter(&dce->dce_lock);
937			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
938			dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
939			mutex_exit(&dce->dce_lock);
940			dce_increment_generation(dce);
941			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
942		} else {
943			uint_t fragsize;
944
945			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
946			if (fragsize > dce->dce_pmtu)
947				fragsize = dce->dce_pmtu;
948			ixa->ixa_fragsize = fragsize;
949		}
950	} else {
951		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
952	}
953
954	/*
955	 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
956	 * interface for source address selection.
957	 */
958	ill = ire_nexthop_ill(ire);
959
960	if (ixaflags & IXAF_SET_SOURCE) {
961		ipaddr_t	src;
962
963		/*
964		 * We use the final destination to get
965		 * correct selection for source routed packets
966		 */
967
968		/* If unreachable we have no ill but need some source */
969		if (ill == NULL) {
970			src = htonl(INADDR_LOOPBACK);
971			error = 0;
972		} else {
973			error = ip_select_source_v4(ill, setsrc, dst,
974			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
975			    &src, NULL, NULL);
976		}
977		if (error != 0) {
978			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
979			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
980			ip_drop_output("ipIfStatsOutDiscards - no source",
981			    mp, ill);
982			freemsg(mp);
983			goto done;
984		}
985		ipha->ipha_src = src;
986	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
987		/* Check if the IP source is assigned to the host. */
988		if (!ip_verify_src(mp, ixa, NULL)) {
989			/* Don't send a packet with a source that isn't ours */
990			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
991			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
992			ip_drop_output("ipIfStatsOutDiscards - invalid source",
993			    mp, ill);
994			freemsg(mp);
995			error = EADDRNOTAVAIL;
996			goto done;
997		}
998	}
999
1000
1001	/*
1002	 * Check against global IPsec policy to set the AH/ESP attributes.
1003	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1004	 */
1005	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1006		ASSERT(ixa->ixa_ipsec_policy == NULL);
1007		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1008		if (mp == NULL) {
1009			/* MIB and ip_drop_packet already done */
1010			return (EHOSTUNREACH);	/* IPsec policy failure */
1011		}
1012	}
1013
1014	if (ill != NULL) {
1015		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1016	} else {
1017		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1018	}
1019
1020	/*
1021	 * We update the statistics on the most specific IRE i.e., the first
1022	 * one we found.
1023	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1024	 * can only count the use prior to fragmentation. However the MIB
1025	 * counters on the ill will be incremented in post fragmentation.
1026	 */
1027	ire->ire_ob_pkt_count++;
1028
1029	/*
1030	 * Based on ire_type and ire_flags call one of:
1031	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1032	 *	ire_send_multirt_v4 - if RTF_MULTIRT
	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
1034	 *	ire_send_multicast_v4 - for IRE_MULTICAST
1035	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
1036	 *	ire_send_wire_v4 - for the rest.
1037	 */
1038	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1039done:
1040	ire_refrele(ire);
1041	if (dce != NULL)
1042		dce_refrele(dce);
1043	if (ill != NULL)
1044		ill_refrele(ill);
1045	if (ixa->ixa_nce != NULL)
1046		nce_refrele(ixa->ixa_nce);
1047	ixa->ixa_nce = NULL;
1048	return (error);
1049}
1050
1051/*
1052 * ire_sendfn() functions.
1053 * These functions use the following xmit_attr:
1054 *  - ixa_fragsize - read to determine whether or not to fragment
1055 *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1056 *  - ixa_ipsec_*  are used inside IPsec
1057 *  - IXAF_SET_SOURCE - replace IP source in broadcast case.
1058 *  - IXAF_LOOPBACK_COPY - for multicast and broadcast
1059 */
1060
1061
/*
 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
 *
 * Loops the packet back into the local stack: it fires the ip:::send and
 * ip:::receive DTrace probes, runs the loopback-out and loopback-in
 * firewall hooks and the ipobs observability hook, updates loopback and
 * ill MIB counters, maps the transmit attributes to receive attributes
 * (including any IPsec state), and hands the packet to ip_fanout_v4.
 *
 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
 */
/* ARGSUSED4 */
int
ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	uint_t		pktlen = ixa->ixa_pktlen;

	/*
	 * No fragmentation, no nce, no application of IPsec,
	 * and no ipha_ident assignment.
	 *
	 * Note different order between IP provider and FW_HOOKS than in
	 * send_wire case.
	 */

	/*
	 * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
	 * send probe, but not the receive probe.
	 */
	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
	    int, 1);

	if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
		int error;

		DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
		    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
		FW_HOOKS(ipst->ips_ip4_loopback_out_event,
		    ipst->ips_ipv4firewall_loopback_out,
		    NULL, ill, ipha, mp, mp, 0, ipst, error);
		DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		/* The hook may have consumed the packet; its verdict is in error */
		if (mp == NULL)
			return (error);

		/*
		 * Even if the destination was changed by the filter we use the
		 * forwarding decision that was made based on the address
		 * in ip_output/ip_set_destination.
		 */
		/* Length could be different */
		ipha = (ipha_t *)mp->b_rptr;
		pktlen = ntohs(ipha->ipha_length);
	}

	/*
	 * If a callback is enabled then we need to know the
	 * source and destination zoneids for the packet. We already
	 * have those handy.
	 */
	if (ipst->ips_ip4_observe.he_interested) {
		zoneid_t szone, dzone;
		zoneid_t stackzoneid;

		stackzoneid = netstackid_to_zoneid(
		    ipst->ips_netstack->netstack_stackid);

		if (stackzoneid == GLOBAL_ZONEID) {
			/* Shared-IP zone */
			dzone = ire->ire_zoneid;
			szone = ixa->ixa_zoneid;
		} else {
			/* Exclusive-IP zone: both endpoints are that zone */
			szone = dzone = stackzoneid;
		}
		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
	}

	/* Handle lo0 stats */
	ipst->ips_loopback_packets++;

	/* Map ixa to ira including IPsec policies */
	ipsec_out_to_in(ixa, ill, &iras);
	iras.ira_pktlen = pktlen;

	if (!IS_SIMPLE_IPH(ipha)) {
		/* Process IPv4 options locally (e.g. before fanout) */
		ip_output_local_options(ipha, ipst);
		iras.ira_flags |= IRAF_IPV4_OPTIONS;
	}

	if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
		int error;

		DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
		    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
		FW_HOOKS(ipst->ips_ip4_loopback_in_event,
		    ipst->ips_ipv4firewall_loopback_in,
		    ill, NULL, ipha, mp, mp, 0, ipst, error);

		DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		if (mp == NULL) {
			/* iras now owns the IPsec refs; release them */
			ira_cleanup(&iras, B_FALSE);
			return (error);
		}
		/*
		 * Even if the destination was changed by the filter we use the
		 * forwarding decision that was made based on the address
		 * in ip_output/ip_set_destination.
		 */
		/* Length could be different */
		ipha = (ipha_t *)mp->b_rptr;
		pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
	}

	/* Fire ip:::receive now that the packet has passed the in-hooks */
	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
	    int, 1);

	ire->ire_ib_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);

	/* Destined to ire_zoneid - use that for fanout */
	iras.ira_zoneid = ire->ire_zoneid;

	if (is_system_labeled()) {
		iras.ira_flags |= IRAF_SYSTEM_LABELED;

		/*
		 * This updates ira_cred, ira_tsl and ira_free_flags based
		 * on the label. We don't expect this to ever fail for
		 * loopback packets, so we silently drop the packet should it
		 * fail.
		 */
		if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("tsol_get_pkt_label", mp, ill);
			freemsg(mp);
			return (0);
		}
		ASSERT(iras.ira_tsl != NULL);

		/* tsol_get_pkt_label sometimes does pullupmsg */
		ipha = (ipha_t *)mp->b_rptr;
	}

	ip_fanout_v4(mp, ipha, &iras);

	/* We moved any IPsec refs from ixa to iras */
	ira_cleanup(&iras, B_FALSE);
	return (0);
}
1212
/*
 * ire_sendfn for IRE_BROADCAST
 * If the broadcast address is present on multiple ills and ixa_ifindex
 * isn't set, then we generate
 * a separate datagram (potentially with different source address) for
 * those ills. In any case, only one copy is looped back to ip_input_v4.
 */
int
ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	irb_t		*irb = ire->ire_bucket;
	ire_t		*ire1;
	mblk_t		*mp1;
	ipha_t		*ipha1;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce1, *nce_orig;

	/*
	 * Unless ire_send_multirt_v4 already set a ttl, force the
	 * ttl to a smallish value.
	 */
	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
		/*
		 * To avoid broadcast storms, we usually set the TTL to 1 for
		 * broadcasts.  This can
		 * be overridden stack-wide through the ip_broadcast_ttl
		 * ndd tunable, or on a per-connection basis through the
		 * IP_BROADCAST_TTL socket option.
		 *
		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
		 * will force ttl to one after we've set this.
		 */
		if (ixaflags & IXAF_BROADCAST_TTL_SET)
			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
		else
			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	}
	/*
	 * Make sure we get a loopback copy (after IPsec and frag)
	 * Skip hardware checksum so that loopback copy is checksumed.
	 */
	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;

	/* Do we need to potentially generate multiple copies? */
	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));

	/*
	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
	 * Note that everything in the bucket has the same destination address.
	 */
	irb_refhold(irb);
	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		/* We do the main IRE after the end of the loop */
		if (ire1 == ire)
			continue;

		/*
		 * Only IREs for the same IP address should be in the same
		 * bucket.
		 * But could have IRE_HOSTs in the case of CGTP.
		 * If we find any multirt routes we bail out of the loop
		 * and just do the single packet at the end; ip_postfrag_multirt
		 * will duplicate the packet.
		 */
		ASSERT(ire1->ire_addr == ire->ire_addr);
		if (!(ire1->ire_type & IRE_BROADCAST))
			continue;

		if (IRE_IS_CONDEMNED(ire1))
			continue;

		/* Restrict to the sending zone unless sending from all zones */
		if (ixa->ixa_zoneid != ALL_ZONES &&
		    ire->ire_zoneid != ire1->ire_zoneid)
			continue;

		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);

		if (ire1->ire_flags & RTF_MULTIRT)
			break;

		/*
		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
		 * ensure that this goes out on the cast_ill.
		 */
		if (IS_UNDER_IPMP(ire1->ire_ill))
			continue;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Allocation failed; skip this ill but keep going */
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards",
			    mp, ire1->ire_ill);
			continue;
		}

		ipha1 = (ipha_t *)mp1->b_rptr;
		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
			/*
			 * Need to pick a different source address for each
			 * interface. If we have a global IPsec policy and
			 * no per-socket policy then we punt to
			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
			 */
			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
				ip_output_simple_broadcast(ixa, mp1);
				continue;
			}
			/* Pick a new source address for each interface */
			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
			    &ipha1->ipha_src, NULL, NULL) != 0) {
				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards - select "
				    "broadcast source", mp1, ire1->ire_ill);
				freemsg(mp1);
				continue;
			}
			/*
			 * Check against global IPsec policy to set the AH/ESP
			 * attributes. IPsec will set IXAF_IPSEC_* and
			 * ixa_ipsec_* as appropriate.
			 *
			 * NOTE(review): this passes ipha (the original
			 * datagram's header) rather than ipha1, the header
			 * of the copy whose source was just rewritten —
			 * confirm whether the policy check should see ipha1.
			 */
			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
				ASSERT(ixa->ixa_ipsec_policy == NULL);
				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
				    NULL, ixa);
				if (mp1 == NULL) {
					/*
					 * MIB and ip_drop_packet already
					 * done
					 */
					continue;
				}
			}
		}
		/* Make sure we have an NCE on this ill */
		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
		    ire1->ire_type);
		if (nce1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
			    mp1, ire1->ire_ill);
			freemsg(mp1);
			continue;
		}
		/* Temporarily swap in this ill's nce for the send */
		nce_orig = ixa->ixa_nce;
		ixa->ixa_nce = nce1;

		ire_refhold(ire1);
		/*
		 * Ignore any errors here. We just collect the errno for
		 * the main ire below
		 */
		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
		ire_refrele(ire1);

		ixa->ixa_nce = nce_orig;
		nce_refrele(nce1);

		/* Only the first copy sent also generates a loopback copy */
		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
	}
	irb_refrele(irb);
	/* Finally, the main one */

	/*
	 * For IPMP we only send broadcasts on the ipmp_ill.
	 */
	if (IS_UNDER_IPMP(ire->ire_ill)) {
		freemsg(mp);
		return (0);
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}
1394
1395/*
1396 * Send a packet using a different source address and different
1397 * IPsec policy.
1398 */
1399static void
1400ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1401{
1402	ip_xmit_attr_t ixas;
1403
1404	bzero(&ixas, sizeof (ixas));
1405	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1406	ixas.ixa_zoneid = ixa->ixa_zoneid;
1407	ixas.ixa_ifindex = 0;
1408	ixas.ixa_ipst = ixa->ixa_ipst;
1409	ixas.ixa_cred = ixa->ixa_cred;
1410	ixas.ixa_cpid = ixa->ixa_cpid;
1411	ixas.ixa_tsl = ixa->ixa_tsl;
1412	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1413
1414	(void) ip_output_simple(mp, &ixas);
1415	ixa_cleanup(&ixas);
1416}
1417
1418
1419static void
1420multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1421{
1422	ip_stack_t	*ipst = ixa->ixa_ipst;
1423
1424	/* Limit the TTL on multirt packets */
1425	if (ire->ire_type & IRE_MULTICAST) {
1426		if (ipha->ipha_ttl > 1) {
1427			ip2dbg(("ire_send_multirt_v4: forcing multicast "
1428			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
1429			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
1430			ipha->ipha_ttl = 1;
1431		}
1432		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1433	} else if ((ipst->ips_ip_multirt_ttl > 0) &&
1434	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1435		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1436		/*
1437		 * Need to ensure we don't increase the ttl should we go through
1438		 * ire_send_broadcast or multicast.
1439		 */
1440		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1441	}
1442}
1443
/*
 * ire_sendfn for IRE_MULTICAST
 *
 * Applies the multirt TTL clamp when needed, decides whether the
 * transmitted packet must also be looped back into ip_input_v4, forces
 * the TTL to the IP_MULTICAST_TTL value unless already constrained, and
 * hands the packet to ire_send_wire_v4.
 */
int
ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	iaflags_t	ixaflags = ixa->ixa_flags;

	/*
	 * The IRE_MULTICAST is the same whether or not multirt is in use.
	 * Hence we need special-case code.
	 */
	if (ixaflags & IXAF_MULTIRT_MULTICAST)
		multirt_check_v4(ire, ipha, ixa);

	/*
	 * Check if anything in ip_input_v4 wants a copy of the transmitted
	 * packet (after IPsec and fragmentation)
	 *
	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
	 *    RSVP and the rsvp daemon is an example of a
	 *    protocol and user level process that
	 *    handles it's own routing. Hence, it uses the
	 *    SO_DONTROUTE option to accomplish this.
	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
	 *    check whether there are any receivers for the group on the ill
	 *    (ignoring the zoneid).
	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
	 *    any members in other shared-IP zones.
	 *    If such members exist, then we indicate that the sending zone
	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
	 *    behavior.
	 *
	 * When we loopback we skip hardware checksum to make sure loopback
	 * copy is checksumed.
	 *
	 * Note that ire_ill is the upper in the case of IPMP.
	 */
	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
	    !(ixaflags & IXAF_DONTROUTE)) {
		/* Case 1: a multicast router is present on this ill */
		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
		/*
		 * If this zone or any other zone has members then loopback
		 * a copy.
		 */
		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ipst->ips_netstack->netstack_numzones > 1) {
		/*
		 * This zone should not have a copy. But there are some other
		 * zones which might have members.
		 */
		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
		    ixa->ixa_zoneid)) {
			/* Exclude the sending zone from the loopback copy */
			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
		}
	}

	/*
	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
	 * force the ttl to the IP_MULTICAST_TTL value
	 */
	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}
1520
1521/*
1522 * ire_sendfn for IREs with RTF_MULTIRT
1523 */
1524int
1525ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1526    ip_xmit_attr_t *ixa, uint32_t *identp)
1527{
1528	ipha_t		*ipha = (ipha_t *)iph_arg;
1529
1530	multirt_check_v4(ire, ipha, ixa);
1531
1532	if (ire->ire_type & IRE_MULTICAST)
1533		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1534	else if (ire->ire_type & IRE_BROADCAST)
1535		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1536	else
1537		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1538}
1539
1540/*
1541 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1542 */
1543int
1544ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1545    ip_xmit_attr_t *ixa, uint32_t *identp)
1546{
1547	ip_stack_t	*ipst = ixa->ixa_ipst;
1548	ipha_t		*ipha = (ipha_t *)iph_arg;
1549	ill_t		*ill;
1550	ip_recv_attr_t	iras;
1551	boolean_t	dummy;
1552
1553	/* We assign an IP ident for nice errors */
1554	ipha->ipha_ident = atomic_add_32_nv(identp, 1);
1555
1556	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1557
1558	if (ire->ire_type & IRE_NOROUTE) {
1559		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1560		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1561		    RTA_DST, ipst);
1562	}
1563
1564	if (ire->ire_flags & RTF_BLACKHOLE) {
1565		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1566		freemsg(mp);
1567		/* No error even for local senders - silent blackhole */
1568		return (0);
1569	}
1570	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1571
1572	/*
1573	 * We need an ill_t for the ip_recv_attr_t even though this packet
1574	 * was never received and icmp_unreachable doesn't currently use
1575	 * ira_ill.
1576	 */
1577	ill = ill_lookup_on_name("lo0", B_FALSE,
1578	    !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1579	if (ill == NULL) {
1580		freemsg(mp);
1581		return (EHOSTUNREACH);
1582	}
1583
1584	bzero(&iras, sizeof (iras));
1585	/* Map ixa to ira including IPsec policies */
1586	ipsec_out_to_in(ixa, ill, &iras);
1587
1588	if (ip_source_routed(ipha, ipst)) {
1589		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1590	} else {
1591		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1592	}
1593	/* We moved any IPsec refs from ixa to iras */
1594	ira_cleanup(&iras, B_FALSE);
1595	ill_refrele(ill);
1596	return (EHOSTUNREACH);
1597}
1598
/*
 * Calculate a checksum ignoring any hardware capabilities
 *
 * Computes the ULP (TCP/UDP/SCTP) checksum in software, then the IPv4
 * header checksum.  Any stale hardware-checksum flags on the mblk are
 * cleared first.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.  (As written, every path in this function
 * returns B_TRUE; the B_FALSE case does not occur here.)
 */
static boolean_t
ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
	ipaddr_t	dst = ipha->ipha_dst;
	ipaddr_t	src = ipha->ipha_src;

	/* Just in case it contained garbage */
	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;

	/*
	 * Calculate ULP checksum
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
		/* No ULP checksum needed for other protocols */
		goto ip_hdr_cksum;
	}

	/* ULP puts the checksum field in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * We accumulate the pseudo header checksum in cksum.
	 * This is pretty hairy code, so watch close.  One
	 * thing to keep in mind is that UDP and TCP have
	 * stored their respective datagram lengths in their
	 * checksum fields.  This lines things up real nice.
	 */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);

	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
	/*
	 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
	 * Change to 0xffff
	 */
	if (protocol == IPPROTO_UDP && cksum == 0)
		*cksump = ~cksum;
	else
		*cksump = cksum;

	IP_STAT(ipst, ip_out_sw_cksum);
	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);

ip_hdr_cksum:
	/* Calculate IPv4 header checksum */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (B_TRUE);
}
1679
/*
 * Calculate the ULP checksum - try to use hardware.
 * In the case of MULTIRT, broadcast or multicast the
 * IXAF_NO_HW_CKSUM is set in which case we use software.
 *
 * If the hardware supports IP header checksum offload; then clear the
 * contents of IP header checksum field as expected by NIC.
 * Do this only if we offloaded either full or partial sum.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 */
static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
    ip_xmit_attr_t *ixa, ill_t *ill)
{
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint16_t	hck_flags;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;

	/* Fall back to software when offload is disallowed or unavailable */
	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
	    !dohwcksum) {
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	}

	/*
	 * Calculate ULP checksum. Note that we don't use cksump and cksum
	 * if the ill has FULL support.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		/* SCTP's CRC checksum is always computed in software */
		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
	ip_hdr_cksum:
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		return (B_TRUE);
	}

	/* ULP puts the checksum field in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * Underlying interface supports hardware checksum offload for
	 * the payload; leave the payload checksum for the hardware to
	 * calculate.  N.B: We only need to set up checksum info on the
	 * first mblk.
	 */
	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;

	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
	if (hck_flags & HCKSUM_INET_FULL_V4) {
		/*
		 * Hardware calculates pseudo-header, header and the
		 * payload checksums, so clear the checksum field in
		 * the protocol header.
		 */
		*cksump = 0;
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			/* NIC also computes the IPv4 header checksum */
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	if ((hck_flags) & HCKSUM_INET_PARTIAL)  {
		ipaddr_t	dst = ipha->ipha_dst;
		ipaddr_t	src = ipha->ipha_src;
		/*
		 * Partial checksum offload has been enabled.  Fill
		 * the checksum field in the protocol header with the
		 * pseudo-header checksum value.
		 *
		 * We accumulate the pseudo header checksum in cksum.
		 * This is pretty hairy code, so watch close.  One
		 * thing to keep in mind is that UDP and TCP have
		 * stored their respective datagram lengths in their
		 * checksum fields.  This lines things up real nice.
		 */
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum += *(cksump);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		/* Second fold handles any carry from the first */
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Offsets are relative to beginning of IP header.
		 */
		DB_CKSUMSTART(mp) = ip_hdr_length;
		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
		DB_CKSUMEND(mp) = pktlen;
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	/* Hardware capabilities include neither full nor partial IPv4 */
	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
}
1809
1810/*
1811 * ire_sendfn for offlink and onlink destinations.
1812 * Also called from the multicast, broadcast, multirt send functions.
1813 *
1814 * Assumes that the caller has a hold on the ire.
1815 *
1816 * This function doesn't care if the IRE just became condemned since that
1817 * can happen at any time.
1818 */
1819/* ARGSUSED */
1820int
1821ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1822    ip_xmit_attr_t *ixa, uint32_t *identp)
1823{
1824	ip_stack_t	*ipst = ixa->ixa_ipst;
1825	ipha_t		*ipha = (ipha_t *)iph_arg;
1826	iaflags_t	ixaflags = ixa->ixa_flags;
1827	ill_t		*ill;
1828
1829	ASSERT(ixa->ixa_nce != NULL);
1830	ill = ixa->ixa_nce->nce_ill;
1831
1832	if (ixaflags & IXAF_DONTROUTE)
1833		ipha->ipha_ttl = 1;
1834
1835	/*
1836	 * Assign an ident value for this packet. There could be other
1837	 * threads targeting the same destination, so we have to arrange
1838	 * for a atomic increment.  Note that we use a 32-bit atomic add
1839	 * because it has better performance than its 16-bit sibling.
1840	 *
	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
	 * be the number of TCP segments that the driver/hardware will
	 * additionally construct.
1844	 *
1845	 * If running in cluster mode and if the source address
1846	 * belongs to a replicated service then vector through
1847	 * cl_inet_ipident vector to allocate ip identifier
1848	 * NOTE: This is a contract private interface with the
1849	 * clustering group.
1850	 */
1851	if (cl_inet_ipident != NULL) {
1852		ipaddr_t src = ipha->ipha_src;
1853		ipaddr_t dst = ipha->ipha_dst;
1854		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1855
1856		ASSERT(cl_inet_isclusterwide != NULL);
1857		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1858		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1859			/*
1860			 * Note: not correct with LSO since we can't allocate
1861			 * ixa_extra_ident+1 consecutive values.
1862			 */
1863			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1864			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1865			    (uint8_t *)(uintptr_t)dst, NULL);
1866		} else {
1867			ipha->ipha_ident = atomic_add_32_nv(identp,
1868			    ixa->ixa_extra_ident + 1);
1869		}
1870	} else {
1871		ipha->ipha_ident = atomic_add_32_nv(identp,
1872		    ixa->ixa_extra_ident + 1);
1873	}
1874#ifndef _BIG_ENDIAN
1875	ipha->ipha_ident = htons(ipha->ipha_ident);
1876#endif
1877
1878	/*
1879	 * This might set b_band, thus the IPsec and fragmentation
1880	 * code in IP ensures that b_band is updated in the first mblk.
1881	 */
1882	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1883		/* ip_process translates an IS_UNDER_IPMP */
1884		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1885		if (mp == NULL) {
1886			/* ip_drop_packet and MIB done */
1887			return (0);	/* Might just be delayed */
1888		}
1889	}
1890
1891	/*
1892	 * Verify any IPv4 options.
1893	 *
	 * The presence of IP options also forces the network stack to
1895	 * calculate the checksum in software.  This is because:
1896	 *
1897	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
1898	 * the size of "start offset" width to 6-bit.  This effectively
1899	 * sets the largest value of the offset to 64-bytes, starting
1900	 * from the MAC header.  When the cumulative MAC and IP headers
1901	 * exceed such limit, the offset will wrap around.  This causes
1902	 * the checksum to be calculated at the wrong place.
1903	 *
1904	 * IPv4 source routing: none of the full-checksum capable NICs
1905	 * is capable of correctly handling the	IPv4 source-routing
1906	 * option for purposes of calculating the pseudo-header; the
1907	 * actual destination is different from the destination in the
1908	 * header which is that of the next-hop.  (This case may not be
1909	 * true for NICs which can parse IPv6 extension headers, but
1910	 * we choose to simplify the implementation by not offloading
1911	 * checksum when they are present.)
1912	 */
1913	if (!IS_SIMPLE_IPH(ipha)) {
1914		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1915		/* An IS_UNDER_IPMP ill is ok here */
1916		if (ip_output_options(mp, ipha, ixa, ill)) {
1917			/* Packet has been consumed and ICMP error sent */
1918			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1919			return (EINVAL);
1920		}
1921	}
1922
1923	/*
1924	 * To handle IPsec/iptun's labeling needs we need to tag packets
1925	 * while we still have ixa_tsl
1926	 */
1927	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1928	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1929	    ill->ill_mactype == DL_IPV6)) {
1930		cred_t *newcr;
1931
1932		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1933		    KM_NOSLEEP);
1934		if (newcr == NULL) {
1935			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1936			ip_drop_output("ipIfStatsOutDiscards - newcr",
1937			    mp, ill);
1938			freemsg(mp);
1939			return (ENOBUFS);
1940		}
1941		mblk_setcred(mp, newcr, NOPID);
1942		crfree(newcr);	/* mblk_setcred did its own crhold */
1943	}
1944
1945	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1946	    (ixaflags & IXAF_IPSEC_SECURE)) {
1947		uint32_t pktlen;
1948
1949		pktlen = ixa->ixa_pktlen;
1950		if (ixaflags & IXAF_IPSEC_SECURE)
1951			pktlen += ipsec_out_extra_length(ixa);
1952
1953		if (pktlen > IP_MAXPACKET)
1954			return (EMSGSIZE);
1955
1956		if (ixaflags & IXAF_SET_ULP_CKSUM) {
1957			/*
1958			 * Compute ULP checksum and IP header checksum
1959			 * using software
1960			 */
1961			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1962				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1963				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1964				freemsg(mp);
1965				return (EINVAL);
1966			}
1967		} else {
1968			/* Calculate IPv4 header checksum */
1969			ipha->ipha_hdr_checksum = 0;
1970			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1971		}
1972
1973		/*
1974		 * If this packet would generate a icmp_frag_needed
1975		 * message, we need to handle it before we do the IPsec
1976		 * processing. Otherwise, we need to strip the IPsec
1977		 * headers before we send up the message to the ULPs
1978		 * which becomes messy and difficult.
1979		 *
1980		 * We check using IXAF_DONTFRAG. The DF bit in the header
1981		 * is not inspected - it will be copied to any generated
1982		 * fragments.
1983		 */
1984		if ((pktlen > ixa->ixa_fragsize) &&
1985		    (ixaflags & IXAF_DONTFRAG)) {
1986			/* Generate ICMP and return error */
1987			ip_recv_attr_t	iras;
1988
1989			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
1990			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
1991			    uint_t, ixa->ixa_pmtu);
1992
1993			bzero(&iras, sizeof (iras));
1994			/* Map ixa to ira including IPsec policies */
1995			ipsec_out_to_in(ixa, ill, &iras);
1996
1997			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
1998			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
1999			/* We moved any IPsec refs from ixa to iras */
2000			ira_cleanup(&iras, B_FALSE);
2001			return (EMSGSIZE);
2002		}
2003		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2004		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2005		    uint_t, ixa->ixa_pmtu);
2006
2007		if (ixaflags & IXAF_IPSEC_SECURE) {
2008			/*
2009			 * Pass in sufficient information so that
2010			 * IPsec can determine whether to fragment, and
2011			 * which function to call after fragmentation.
2012			 */
2013			return (ipsec_out_process(mp, ixa));
2014		}
2015		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2016		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2017		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2018		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
2019	}
2020	if (ixaflags & IXAF_SET_ULP_CKSUM) {
2021		/* Compute ULP checksum and IP header checksum */
2022		/* An IS_UNDER_IPMP ill is ok here */
2023		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2024			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2025			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2026			freemsg(mp);
2027			return (EINVAL);
2028		}
2029	} else {
2030		/* Calculate IPv4 header checksum */
2031		ipha->ipha_hdr_checksum = 0;
2032		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2033	}
2034	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2035	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2036	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2037}
2038
2039/*
2040 * Send mp into ip_input
2041 * Common for IPv4 and IPv6
2042 */
2043void
2044ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2045    uint_t pkt_len, zoneid_t nolzid)
2046{
2047	rtc_t		rtc;
2048	ill_t		*ill = nce->nce_ill;
2049	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
2050	ncec_t		*ncec;
2051
2052	ncec = nce->nce_common;
2053	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
2054	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
2055	if (ncec->ncec_flags & NCE_F_BCAST)
2056		iras.ira_flags |= IRAF_L2DST_BROADCAST;
2057	else if (ncec->ncec_flags & NCE_F_MCAST)
2058		iras.ira_flags |= IRAF_L2DST_MULTICAST;
2059
2060	iras.ira_free_flags = 0;
2061	iras.ira_cred = NULL;
2062	iras.ira_cpid = NOPID;
2063	iras.ira_tsl = NULL;
2064	iras.ira_zoneid = ALL_ZONES;
2065	iras.ira_pktlen = pkt_len;
2066	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
2067	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
2068
2069	if (ixaflags & IXAF_IS_IPV4)
2070		iras.ira_flags |= IRAF_IS_IPV4;
2071
2072	iras.ira_ill = iras.ira_rill = ill;
2073	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2074	iras.ira_rifindex = iras.ira_ruifindex;
2075	iras.ira_mhip = NULL;
2076
2077	iras.ira_flags |= ixaflags & IAF_MASK;
2078	iras.ira_no_loop_zoneid = nolzid;
2079
2080	/* Broadcast and multicast doesn't care about the squeue */
2081	iras.ira_sqp = NULL;
2082
2083	rtc.rtc_ire = NULL;
2084	if (ixaflags & IXAF_IS_IPV4) {
2085		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2086
2087		rtc.rtc_ipaddr = INADDR_ANY;
2088
2089		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
2090		if (rtc.rtc_ire != NULL) {
2091			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
2092			ire_refrele(rtc.rtc_ire);
2093		}
2094	} else {
2095		ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
2096
2097		rtc.rtc_ip6addr = ipv6_all_zeros;
2098
2099		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
2100		if (rtc.rtc_ire != NULL) {
2101			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
2102			ire_refrele(rtc.rtc_ire);
2103		}
2104	}
2105	/* Any references to clean up? No hold on ira */
2106	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
2107		ira_cleanup(&iras, B_FALSE);
2108}
2109
2110/*
2111 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2112 * looks at the IXAF_LOOPBACK_COPY flag.
2113 * Common for IPv4 and IPv6.
2114 *
2115 * If the loopback copy fails (due to no memory) but we send the packet out
2116 * on the wire we return no failure. Only in the case we supress the wire
2117 * sending do we take the loopback failure into account.
2118 *
2119 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2120 * Those operations are performed on this packet in ip_xmit() and it would
2121 * be odd to do it twice for the same packet.
2122 */
2123int
2124ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2125    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2126    uintptr_t *ixacookie)
2127{
2128	ill_t		*ill = nce->nce_ill;
2129	int		error = 0;
2130
2131	/*
2132	 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2133	 * had looped it back
2134	 */
2135	if (ixaflags & IXAF_LOOPBACK_COPY) {
2136		mblk_t		*mp1;
2137
2138		mp1 = copymsg(mp);
2139		if (mp1 == NULL) {
2140			/* Failed to deliver the loopback copy. */
2141			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2142			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2143			error = ENOBUFS;
2144		} else {
2145			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2146			    nolzid);
2147		}
2148	}
2149
2150	/*
2151	 * If TTL = 0 then only do the loopback to this host i.e. we are
2152	 * done. We are also done if this was the
2153	 * loopback interface since it is sufficient
2154	 * to loopback one copy of a multicast packet.
2155	 */
2156	if (ixaflags & IXAF_IS_IPV4) {
2157		ipha_t *ipha = (ipha_t *)mp->b_rptr;
2158
2159		if (ipha->ipha_ttl == 0) {
2160			ip_drop_output("multicast ipha_ttl not sent to wire",
2161			    mp, ill);
2162			freemsg(mp);
2163			return (error);
2164		}
2165	} else {
2166		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2167
2168		if (ip6h->ip6_hops == 0) {
2169			ip_drop_output("multicast ipha_ttl not sent to wire",
2170			    mp, ill);
2171			freemsg(mp);
2172			return (error);
2173		}
2174	}
2175	if (nce->nce_ill->ill_wq == NULL) {
2176		/* Loopback interface */
2177		ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2178		freemsg(mp);
2179		return (error);
2180	}
2181
2182	return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2183	    ixacookie));
2184}
2185
2186/*
2187 * Post fragmentation function for RTF_MULTIRT routes.
2188 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2189 * checks IXAF_LOOPBACK_COPY.
2190 *
2191 * If no packet is sent due to failures then we return an errno, but if at
2192 * least one succeeded we return zero.
2193 */
2194int
2195ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2196    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2197    uintptr_t *ixacookie)
2198{
2199	irb_t		*irb;
2200	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2201	ire_t		*ire;
2202	ire_t		*ire1;
2203	mblk_t		*mp1;
2204	nce_t		*nce1;
2205	ill_t		*ill = nce->nce_ill;
2206	ill_t		*ill1;
2207	ip_stack_t	*ipst = ill->ill_ipst;
2208	int		error = 0;
2209	int		num_sent = 0;
2210	int		err;
2211	uint_t		ire_type;
2212	ipaddr_t	nexthop;
2213
2214	ASSERT(ixaflags & IXAF_IS_IPV4);
2215
2216	/* Check for IXAF_LOOPBACK_COPY */
2217	if (ixaflags & IXAF_LOOPBACK_COPY) {
2218		mblk_t *mp1;
2219
2220		mp1 = copymsg(mp);
2221		if (mp1 == NULL) {
2222			/* Failed to deliver the loopback copy. */
2223			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2224			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2225			error = ENOBUFS;
2226		} else {
2227			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2228			    nolzid);
2229		}
2230	}
2231
2232	/*
2233	 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2234	 * a copy to each one.
2235	 * Use the nce (nexthop) and ipha_dst to find the ire.
2236	 *
2237	 * MULTIRT is not designed to work with shared-IP zones thus we don't
2238	 * need to pass a zoneid or a label to the IRE lookup.
2239	 */
2240	if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2241		/* Broadcast and multicast case */
2242		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2243		    NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2244	} else {
2245		ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2246
2247		/* Unicast case */
2248		ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2249		    NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2250	}
2251
2252	if (ire == NULL ||
2253	    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2254	    !(ire->ire_flags & RTF_MULTIRT)) {
2255		/* Drop */
2256		ip_drop_output("ip_postfrag_multirt didn't find route",
2257		    mp, nce->nce_ill);
2258		if (ire != NULL)
2259			ire_refrele(ire);
2260		return (ENETUNREACH);
2261	}
2262
2263	irb = ire->ire_bucket;
2264	irb_refhold(irb);
2265	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2266		/*
2267		 * For broadcast we can have a mixture of IRE_BROADCAST and
2268		 * IRE_HOST due to the manually added IRE_HOSTs that are used
2269		 * to trigger the creation of the special CGTP broadcast routes.
2270		 * Thus we have to skip if ire_type doesn't match the original.
2271		 */
2272		if (IRE_IS_CONDEMNED(ire1) ||
2273		    !(ire1->ire_flags & RTF_MULTIRT) ||
2274		    ire1->ire_type != ire->ire_type)
2275			continue;
2276
2277		/* Do the ire argument one after the loop */
2278		if (ire1 == ire)
2279			continue;
2280
2281		ill1 = ire_nexthop_ill(ire1);
2282		if (ill1 == NULL) {
2283			/*
2284			 * This ire might not have been picked by
2285			 * ire_route_recursive, in which case ire_dep might
2286			 * not have been setup yet.
2287			 * We kick ire_route_recursive to try to resolve
2288			 * starting at ire1.
2289			 */
2290			ire_t *ire2;
2291
2292			ire2 = ire_route_recursive_impl_v4(ire1,
2293			    ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
2294			    ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
2295			    B_TRUE, 0, ipst, NULL, NULL, NULL);
2296			if (ire2 != NULL)
2297				ire_refrele(ire2);
2298			ill1 = ire_nexthop_ill(ire1);
2299		}
2300
2301		if (ill1 == NULL) {
2302			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2303			ip_drop_output("ipIfStatsOutDiscards - no ill",
2304			    mp, ill);
2305			error = ENETUNREACH;
2306			continue;
2307		}
2308
2309		/* Pick the addr and type to use for arp_nce_init */
2310		if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2311			ire_type = IRE_BROADCAST;
2312			nexthop = ire1->ire_gateway_addr;
2313		} else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2314			ire_type = IRE_MULTICAST;
2315			nexthop = ipha->ipha_dst;
2316		} else {
2317			ire_type = ire1->ire_type;	/* Doesn't matter */
2318			nexthop = ire1->ire_gateway_addr;
2319		}
2320
2321		/* If IPMP meta or under, then we just drop */
2322		if (ill1->ill_grp != NULL) {
2323			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2324			ip_drop_output("ipIfStatsOutDiscards - IPMP",
2325			    mp, ill1);
2326			ill_refrele(ill1);
2327			error = ENETUNREACH;
2328			continue;
2329		}
2330
2331		nce1 = arp_nce_init(ill1, nexthop, ire_type);
2332		if (nce1 == NULL) {
2333			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2334			ip_drop_output("ipIfStatsOutDiscards - no nce",
2335			    mp, ill1);
2336			ill_refrele(ill1);
2337			error = ENETUNREACH;
2338			continue;
2339		}
2340		mp1 = copymsg(mp);
2341		if (mp1 == NULL) {
2342			BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2343			ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2344			nce_refrele(nce1);
2345			ill_refrele(ill1);
2346			error = ENOBUFS;
2347			continue;
2348		}
2349		/* Preserve HW checksum for this copy */
2350		DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2351		DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2352		DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2353		DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2354		DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2355
2356		ire1->ire_ob_pkt_count++;
2357		err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2358		    0, ixacookie);
2359		if (err == 0)
2360			num_sent++;
2361		else
2362			error = err;
2363		nce_refrele(nce1);
2364		ill_refrele(ill1);
2365	}
2366	irb_refrele(irb);
2367	ire_refrele(ire);
2368	/* Finally, the main one */
2369	err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2370	    ixacookie);
2371	if (err == 0)
2372		num_sent++;
2373	else
2374		error = err;
2375	if (num_sent > 0)
2376		return (0);
2377	else
2378		return (error);
2379}
2380
2381/*
2382 * Verify local connectivity. This check is called by ULP fusion code.
2383 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2384 * the interface is brought down and back up. So we simply fail the local
2385 * process. The caller, TCP Fusion, should unfuse the connection.
2386 */
2387boolean_t
2388ip_output_verify_local(ip_xmit_attr_t *ixa)
2389{
2390	ire_t		*ire = ixa->ixa_ire;
2391
2392	if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2393		return (B_FALSE);
2394
2395	return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2396}
2397
2398/*
2399 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2400 *
2401 * The caller must call ip_output_verify_local() first. This function handles
2402 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2403 */
2404mblk_t *
2405ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
2406    boolean_t hooks_in, conn_t *peer_connp)
2407{
2408	ill_t		*ill = ixa->ixa_ire->ire_ill;
2409	ipha_t		*ipha = NULL;
2410	ip6_t		*ip6h = NULL;
2411	ip_stack_t	*ipst = ixa->ixa_ipst;
2412	iaflags_t	ixaflags = ixa->ixa_flags;
2413	ip_recv_attr_t	iras;
2414	int		error;
2415
2416	ASSERT(mp != NULL);
2417
2418	if (ixaflags & IXAF_IS_IPV4) {
2419		ipha = (ipha_t *)mp->b_rptr;
2420
2421		/*
2422		 * If a callback is enabled then we need to know the
2423		 * source and destination zoneids for the packet. We already
2424		 * have those handy.
2425		 */
2426		if (ipst->ips_ip4_observe.he_interested) {
2427			zoneid_t szone, dzone;
2428			zoneid_t stackzoneid;
2429
2430			stackzoneid = netstackid_to_zoneid(
2431			    ipst->ips_netstack->netstack_stackid);
2432
2433			if (stackzoneid == GLOBAL_ZONEID) {
2434				/* Shared-IP zone */
2435				dzone = ixa->ixa_ire->ire_zoneid;
2436				szone = ixa->ixa_zoneid;
2437			} else {
2438				szone = dzone = stackzoneid;
2439			}
2440			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2441			    ipst);
2442		}
2443		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2444		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2445		    NULL, int, 1);
2446
2447		/* FW_HOOKS: LOOPBACK_OUT */
2448		if (hooks_out) {
2449			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
2450			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
2451			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
2452			    ipst->ips_ipv4firewall_loopback_out,
2453			    NULL, ill, ipha, mp, mp, 0, ipst, error);
2454			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
2455		}
2456		if (mp == NULL)
2457			return (NULL);
2458
2459		/* FW_HOOKS: LOOPBACK_IN */
2460		if (hooks_in) {
2461			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
2462			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
2463			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
2464			    ipst->ips_ipv4firewall_loopback_in,
2465			    ill, NULL, ipha, mp, mp, 0, ipst, error);
2466			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
2467		}
2468		if (mp == NULL)
2469			return (NULL);
2470
2471		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2472		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2473		    NULL, int, 1);
2474
2475		/* Inbound IPsec polocies */
2476		if (peer_connp != NULL) {
2477			/* Map ixa to ira including IPsec policies. */
2478			ipsec_out_to_in(ixa, ill, &iras);
2479			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
2480			    NULL, &iras);
2481		}
2482	} else {
2483		ip6h = (ip6_t *)mp->b_rptr;
2484
2485		/*
2486		 * If a callback is enabled then we need to know the
2487		 * source and destination zoneids for the packet. We already
2488		 * have those handy.
2489		 */
2490		if (ipst->ips_ip6_observe.he_interested) {
2491			zoneid_t szone, dzone;
2492			zoneid_t stackzoneid;
2493
2494			stackzoneid = netstackid_to_zoneid(
2495			    ipst->ips_netstack->netstack_stackid);
2496
2497			if (stackzoneid == GLOBAL_ZONEID) {
2498				/* Shared-IP zone */
2499				dzone = ixa->ixa_ire->ire_zoneid;
2500				szone = ixa->ixa_zoneid;
2501			} else {
2502				szone = dzone = stackzoneid;
2503			}
2504			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2505			    ipst);
2506		}
2507		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2508		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2509		    ip6h, int, 1);
2510
2511		/* FW_HOOKS: LOOPBACK_OUT */
2512		if (hooks_out) {
2513			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
2514			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
2515			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
2516			    ipst->ips_ipv6firewall_loopback_out,
2517			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
2518			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
2519		}
2520		if (mp == NULL)
2521			return (NULL);
2522
2523		/* FW_HOOKS: LOOPBACK_IN */
2524		if (hooks_in) {
2525			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
2526			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
2527			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
2528			    ipst->ips_ipv6firewall_loopback_in,
2529			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
2530			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
2531		}
2532		if (mp == NULL)
2533			return (NULL);
2534
2535		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2536		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2537		    ip6h, int, 1);
2538
2539		/* Inbound IPsec polocies */
2540		if (peer_connp != NULL) {
2541			/* Map ixa to ira including IPsec policies. */
2542			ipsec_out_to_in(ixa, ill, &iras);
2543			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
2544			    ip6h, &iras);
2545		}
2546	}
2547
2548	if (mp == NULL) {
2549		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2550		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
2551	}
2552
2553	return (mp);
2554}
2555