ip_input.c revision 11042:2d6e217af1b4
1203954Srdivacky/* 2203954Srdivacky * CDDL HEADER START 3203954Srdivacky * 4203954Srdivacky * The contents of this file are subject to the terms of the 5203954Srdivacky * Common Development and Distribution License (the "License"). 6203954Srdivacky * You may not use this file except in compliance with the License. 7203954Srdivacky * 8203954Srdivacky * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9203954Srdivacky * or http://www.opensolaris.org/os/licensing. 10203954Srdivacky * See the License for the specific language governing permissions 11203954Srdivacky * and limitations under the License. 12221345Sdim * 13203954Srdivacky * When distributing Covered Code, include this CDDL HEADER in each 14221345Sdim * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15204642Srdivacky * If applicable, add the following below this CDDL HEADER, with the 16203954Srdivacky * fields enclosed by brackets "[]" replaced with your own identifying 17204642Srdivacky * information: Portions Copyright [yyyy] [name of copyright owner] 18203954Srdivacky * 19203954Srdivacky * CDDL HEADER END 20204642Srdivacky */ 21204642Srdivacky 22204642Srdivacky/* 23204642Srdivacky * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24204642Srdivacky * Use is subject to license terms. 25204642Srdivacky */ 26204642Srdivacky/* Copyright (c) 1990 Mentat Inc. */ 27204642Srdivacky 28204642Srdivacky#include <sys/types.h> 29204642Srdivacky#include <sys/stream.h> 30218893Sdim#include <sys/dlpi.h> 31204642Srdivacky#include <sys/stropts.h> 32204642Srdivacky#include <sys/sysmacros.h> 33204642Srdivacky#include <sys/strsubr.h> 34204642Srdivacky#include <sys/strlog.h> 35218893Sdim#include <sys/strsun.h> 36204642Srdivacky#include <sys/zone.h> 37204642Srdivacky#define _SUN_TPI_VERSION 2 38204642Srdivacky#include <sys/tihdr.h> 39204642Srdivacky#include <sys/xti_inet.h> 40204642Srdivacky#include <sys/ddi.h> 41204642Srdivacky#include <sys/sunddi.h> 42204642Srdivacky#include <sys/cmn_err.h> 43204642Srdivacky#include <sys/debug.h> 44204642Srdivacky#include <sys/kobj.h> 45204642Srdivacky#include <sys/modctl.h> 46204642Srdivacky#include <sys/atomic.h> 47204642Srdivacky#include <sys/policy.h> 48204642Srdivacky#include <sys/priv.h> 49203954Srdivacky 50203954Srdivacky#include <sys/systm.h> 51203954Srdivacky#include <sys/param.h> 52203954Srdivacky#include <sys/kmem.h> 53218893Sdim#include <sys/sdt.h> 54203954Srdivacky#include <sys/socket.h> 55203954Srdivacky#include <sys/vtrace.h> 56203954Srdivacky#include <sys/isa_defs.h> 57203954Srdivacky#include <sys/mac.h> 58218893Sdim#include <net/if.h> 59203954Srdivacky#include <net/if_arp.h> 60203954Srdivacky#include <net/route.h> 61203954Srdivacky#include <sys/sockio.h> 62203954Srdivacky#include <netinet/in.h> 63218893Sdim#include <net/if_dl.h> 64204642Srdivacky 65204642Srdivacky#include <inet/common.h> 66204642Srdivacky#include <inet/mi.h> 67203954Srdivacky#include <inet/mib2.h> 68218893Sdim#include <inet/nd.h> 69204642Srdivacky#include <inet/arp.h> 70204642Srdivacky#include <inet/snmpcom.h> 71204642Srdivacky#include <inet/kstatcom.h> 72204642Srdivacky 73218893Sdim#include <netinet/igmp_var.h> 74218893Sdim#include <netinet/ip6.h> 75218893Sdim#include <netinet/icmp6.h> 76218893Sdim#include <netinet/sctp.h> 77204792Srdivacky 78204792Srdivacky#include <inet/ip.h> 79204792Srdivacky#include <inet/ip_impl.h> 80204792Srdivacky#include <inet/ip6.h> 81204792Srdivacky#include <inet/ip6_asp.h> 82204792Srdivacky#include <inet/optcom.h> 83204792Srdivacky#include <inet/tcp.h> 84218893Sdim#include <inet/tcp_impl.h> 85204642Srdivacky#include <inet/ip_multi.h> 86204642Srdivacky#include <inet/ip_if.h> 87204642Srdivacky#include <inet/ip_ire.h> 88204642Srdivacky#include <inet/ip_ftable.h> 89218893Sdim#include <inet/ip_rts.h> 90204642Srdivacky#include <inet/ip_ndp.h> 91204642Srdivacky#include <inet/ip_listutils.h> 92218893Sdim#include <netinet/igmp.h> 93204642Srdivacky#include <netinet/ip_mroute.h> 94204642Srdivacky#include <inet/ipp_common.h> 95204642Srdivacky 96221345Sdim#include <net/pfkeyv2.h> 97221345Sdim#include <inet/sadb.h> 98221345Sdim#include <inet/ipsec_impl.h> 99221345Sdim#include <inet/ipdrop.h> 100203954Srdivacky#include <inet/ip_netinfo.h> 101203954Srdivacky#include <inet/ilb_ip.h> 102218893Sdim#include <sys/squeue_impl.h> 103203954Srdivacky#include <sys/squeue.h> 104203954Srdivacky 105203954Srdivacky#include <sys/ethernet.h> 106218893Sdim#include <net/if_types.h> 107204642Srdivacky#include <sys/cpuvar.h> 108204642Srdivacky 109218893Sdim#include <ipp/ipp.h> 110204642Srdivacky#include <ipp/ipp_impl.h> 111203954Srdivacky#include <ipp/ipgpc/ipgpc.h> 112204642Srdivacky 113203954Srdivacky#include <sys/pattr.h> 114218893Sdim#include <inet/ipclassifier.h> 115204642Srdivacky#include <inet/sctp_ip.h> 116203954Srdivacky#include <inet/sctp/sctp_impl.h> 117203954Srdivacky#include <inet/udp_impl.h> 118203954Srdivacky#include <sys/sunddi.h> 119203954Srdivacky 120218893Sdim#include <sys/tsol/label.h> 121204642Srdivacky#include <sys/tsol/tnet.h> 122204642Srdivacky 123204642Srdivacky#include <rpc/pmap_prot.h> 124204642Srdivacky 125204642Srdivacky#ifdef DEBUG 126204642Srdivackyextern boolean_t skip_sctp_cksum; 127204642Srdivacky#endif 128204642Srdivacky 129204642Srdivackystatic void ip_input_local_v4(ire_t *, mblk_t *, ipha_t *, 130204642Srdivacky ip_recv_attr_t *); 131204642Srdivacky 132218893Sdimstatic void ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *, 133204642Srdivacky ip_recv_attr_t *); 134204642Srdivackystatic void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *, 135204642Srdivacky ip_recv_attr_t *); 136204642Srdivacky 137204642Srdivacky#pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4) 138204642Srdivacky 139204642Srdivacky/* 140204642Srdivacky * Direct read side procedure capable of dealing with chains. GLDv3 based 141204642Srdivacky * drivers call this function directly with mblk chains while STREAMS 142204642Srdivacky * read side procedure ip_rput() calls this for single packet with ip_ring 143204642Srdivacky * set to NULL to process one packet at a time. 144218893Sdim * 145203954Srdivacky * The ill will always be valid if this function is called directly from 146203954Srdivacky * the driver. 147203954Srdivacky * 148203954Srdivacky * If ip_input() is called from GLDv3: 149203954Srdivacky * 150204642Srdivacky * - This must be a non-VLAN IP stream. 151203954Srdivacky * - 'mp' is either an untagged or a special priority-tagged packet. 152203954Srdivacky * - Any VLAN tag that was in the MAC header has been stripped. 153203954Srdivacky * 154203954Srdivacky * If the IP header in packet is not 32-bit aligned, every message in the 155203954Srdivacky * chain will be aligned before further operations. This is required on SPARC 156203954Srdivacky * platform. 157203954Srdivacky */ 158203954Srdivackyvoid 159203954Srdivackyip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 160203954Srdivacky struct mac_header_info_s *mhip) 161203954Srdivacky{ 162203954Srdivacky (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL, 163203954Srdivacky NULL); 164203954Srdivacky} 165218893Sdim 166203954Srdivacky/* 167203954Srdivacky * ip_accept_tcp() - This function is called by the squeue when it retrieves 168221345Sdim * a chain of packets in the poll mode. The packets have gone through the 169221345Sdim * data link processing but not IP processing. For performance and latency 170221345Sdim * reasons, the squeue wants to process the chain in line instead of feeding 171221345Sdim * it back via ip_input path. 172221345Sdim * 173221345Sdim * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4 174203954Srdivacky * will pass back any TCP packets matching the target sqp to 175203954Srdivacky * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by 176203954Srdivacky * ip_input_v4 and ip_fanout_v4 as normal. 177203954Srdivacky * The TCP packets that match the target squeue are returned to the caller 178203954Srdivacky * as a b_next chain after each packet has been prepend with an mblk 179203954Srdivacky * from ip_recv_attr_to_mblk. 180203954Srdivacky */ 181203954Srdivackymblk_t * 182203954Srdivackyip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, 183203954Srdivacky mblk_t *mp_chain, mblk_t **last, uint_t *cnt) 184218893Sdim{ 185203954Srdivacky return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp, 186203954Srdivacky last, cnt)); 187203954Srdivacky} 188203954Srdivacky 189203954Srdivacky/* 190203954Srdivacky * Used by ip_input and ip_accept_tcp 191203954Srdivacky * The last three arguments are only used by ip_accept_tcp, and mhip is 192203954Srdivacky * only used by ip_input. 193203954Srdivacky */ 194203954Srdivackymblk_t * 195203954Srdivackyip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, 196203954Srdivacky struct mac_header_info_s *mhip, squeue_t *target_sqp, 197218893Sdim mblk_t **last, uint_t *cnt) 198204642Srdivacky{ 199203954Srdivacky mblk_t *mp; 200204642Srdivacky ipha_t *ipha; 201203954Srdivacky ip_recv_attr_t iras; /* Receive attributes */ 202204642Srdivacky rtc_t rtc; 203203954Srdivacky iaflags_t chain_flags = 0; /* Fixed for chain */ 204203954Srdivacky mblk_t *ahead = NULL; /* Accepted head */ 205203954Srdivacky mblk_t *atail = NULL; /* Accepted tail */ 206203954Srdivacky uint_t acnt = 0; /* Accepted count */ 207204642Srdivacky 208204642Srdivacky ASSERT(mp_chain != NULL); 209204642Srdivacky ASSERT(ill != NULL); 210203954Srdivacky 211203954Srdivacky /* These ones do not change as we loop over packets */ 212203954Srdivacky iras.ira_ill = iras.ira_rill = ill; 213203954Srdivacky iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 214218893Sdim iras.ira_rifindex = iras.ira_ruifindex; 215203954Srdivacky iras.ira_sqp = NULL; 216204642Srdivacky iras.ira_ring = ip_ring; 217204642Srdivacky /* For ECMP and outbound transmit ring selection */ 218204642Srdivacky iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring); 219204642Srdivacky 220204642Srdivacky iras.ira_target_sqp = target_sqp; 221204642Srdivacky iras.ira_target_sqp_mp = NULL; 222204642Srdivacky if (target_sqp != NULL) 223204642Srdivacky chain_flags |= IRAF_TARGET_SQP; 224204642Srdivacky 225204642Srdivacky /* 226204642Srdivacky * We try to have a mhip pointer when possible, but 227218893Sdim * it might be NULL in some cases. In those cases we 228203954Srdivacky * have to assume unicast. 229203954Srdivacky */ 230203954Srdivacky iras.ira_mhip = mhip; 231203954Srdivacky iras.ira_flags = 0; 232203954Srdivacky if (mhip != NULL) { 233218893Sdim switch (mhip->mhi_dsttype) { 234203954Srdivacky case MAC_ADDRTYPE_MULTICAST : 235203954Srdivacky chain_flags |= IRAF_L2DST_MULTICAST; 236218893Sdim break; 237203954Srdivacky case MAC_ADDRTYPE_BROADCAST : 238208599Srdivacky chain_flags |= IRAF_L2DST_BROADCAST; 239203954Srdivacky break; 240203954Srdivacky } 241203954Srdivacky } 242204642Srdivacky 243204642Srdivacky /* 244218893Sdim * Initialize the one-element route cache. 245204642Srdivacky * 246204642Srdivacky * We do ire caching from one iteration to 247204642Srdivacky * another. In the event the packet chain contains 248204642Srdivacky * all packets from the same dst, this caching saves 249204642Srdivacky * an ire_route_recursive for each of the succeeding 250204642Srdivacky * packets in a packet chain. 251218893Sdim */ 252203954Srdivacky rtc.rtc_ire = NULL; 253204642Srdivacky rtc.rtc_ipaddr = INADDR_ANY; 254218893Sdim 255203954Srdivacky /* Loop over b_next */ 256204642Srdivacky for (mp = mp_chain; mp != NULL; mp = mp_chain) { 257218893Sdim mp_chain = mp->b_next; 258203954Srdivacky mp->b_next = NULL; 259204642Srdivacky 260204642Srdivacky ASSERT(DB_TYPE(mp) == M_DATA); 261204642Srdivacky 262204642Srdivacky 263204642Srdivacky /* 264204642Srdivacky * if db_ref > 1 then copymsg and free original. Packet 265204642Srdivacky * may be changed and we do not want the other entity 266204792Srdivacky * who has a reference to this message to trip over the 267204792Srdivacky * changes. This is a blind change because trying to 268204792Srdivacky * catch all places that might change the packet is too 269204642Srdivacky * difficult. 270203954Srdivacky * 271218893Sdim * This corresponds to the fast path case, where we have 272203954Srdivacky * a chain of M_DATA mblks. We check the db_ref count 273203954Srdivacky * of only the 1st data block in the mblk chain. There 274203954Srdivacky * doesn't seem to be a reason why a device driver would 275203954Srdivacky * send up data with varying db_ref counts in the mblk 276203954Srdivacky * chain. In any case the Fast path is a private 277203954Srdivacky * interface, and our drivers don't do such a thing. 278203954Srdivacky * Given the above assumption, there is no need to walk 279203954Srdivacky * down the entire mblk chain (which could have a 280218893Sdim * potential performance problem) 281203954Srdivacky * 282203954Srdivacky * The "(DB_REF(mp) > 1)" check was moved from ip_rput() 283203954Srdivacky * to here because of exclusive ip stacks and vnics. 284203954Srdivacky * Packets transmitted from exclusive stack over vnic 285203954Srdivacky * can have db_ref > 1 and when it gets looped back to 286203954Srdivacky * another vnic in a different zone, you have ip_input() 287203954Srdivacky * getting dblks with db_ref > 1. So if someone 288203954Srdivacky * complains of TCP performance under this scenario, 289203954Srdivacky * take a serious look here on the impact of copymsg(). 290203954Srdivacky */ 291218893Sdim if (DB_REF(mp) > 1) { 292203954Srdivacky if ((mp = ip_fix_dbref(mp, &iras)) == NULL) { 293204642Srdivacky /* mhip might point into 1st packet in chain */ 294204642Srdivacky iras.ira_mhip = NULL; 295203954Srdivacky continue; 296203954Srdivacky } 297204642Srdivacky } 298204642Srdivacky 299204642Srdivacky /* 300204642Srdivacky * IP header ptr not aligned? 301204642Srdivacky * OR IP header not complete in first mblk 302204642Srdivacky */ 303204642Srdivacky ipha = (ipha_t *)mp->b_rptr; 304203954Srdivacky if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) { 305204642Srdivacky mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH, 306203954Srdivacky &iras); 307204642Srdivacky if (mp == NULL) { 308203954Srdivacky /* mhip might point into 1st packet in chain */ 309203954Srdivacky iras.ira_mhip = NULL; 310204642Srdivacky continue; 311203954Srdivacky } 312204642Srdivacky ipha = (ipha_t *)mp->b_rptr; 313203954Srdivacky } 314203954Srdivacky 315203954Srdivacky /* Protect against a mix of Ethertypes and IP versions */ 316203954Srdivacky if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 317218893Sdim BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); 318203954Srdivacky ip_drop_input("ipIfStatsInHdrErrors", mp, ill); 319204642Srdivacky freemsg(mp); 320218893Sdim /* mhip might point into 1st packet in the chain. */ 321204642Srdivacky iras.ira_mhip = NULL; 322204642Srdivacky continue; 323204642Srdivacky } 324204642Srdivacky 325218893Sdim /* 326203954Srdivacky * Check for Martian addrs; we have to explicitly 327203954Srdivacky * test for for zero dst since this is also used as 328203954Srdivacky * an indication that the rtc is not used. 329204642Srdivacky */ 330204642Srdivacky if (ipha->ipha_dst == INADDR_ANY) { 331204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 332204642Srdivacky ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 333204642Srdivacky freemsg(mp); 334204642Srdivacky /* mhip might point into 1st packet in the chain. */ 335204642Srdivacky iras.ira_mhip = NULL; 336218893Sdim continue; 337204642Srdivacky } 338204642Srdivacky 339203954Srdivacky /* 340203954Srdivacky * Keep L2SRC from a previous packet in chain since mhip 341204642Srdivacky * might point into an earlier packet in the chain. 342204642Srdivacky * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast 343204642Srdivacky * source check in forwarding path. 344204642Srdivacky */ 345204642Srdivacky chain_flags |= (iras.ira_flags & 346204642Srdivacky (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC)); 347204642Srdivacky 348204642Srdivacky iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM | 349204642Srdivacky IRAF_VERIFY_ULP_CKSUM | chain_flags; 350204642Srdivacky iras.ira_free_flags = 0; 351204642Srdivacky iras.ira_cred = NULL; 352204642Srdivacky iras.ira_cpid = NOPID; 353204642Srdivacky iras.ira_tsl = NULL; 354204642Srdivacky iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */ 355204642Srdivacky 356204642Srdivacky /* 357204642Srdivacky * We must count all incoming packets, even if they end 358204642Srdivacky * up being dropped later on. Defer counting bytes until 359204642Srdivacky * we have the whole IP header in first mblk. 360204642Srdivacky */ 361204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); 362204642Srdivacky 363204642Srdivacky iras.ira_pktlen = ntohs(ipha->ipha_length); 364203954Srdivacky UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, 365203954Srdivacky iras.ira_pktlen); 366203954Srdivacky 367218893Sdim /* 368203954Srdivacky * Call one of: 369203954Srdivacky * ill_input_full_v4 370203954Srdivacky * ill_input_short_v4 371218893Sdim * The former is used in unusual cases. See ill_set_inputfn(). 372203954Srdivacky */ 373203954Srdivacky (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); 374203954Srdivacky 375203954Srdivacky /* Any references to clean up? No hold on ira_ill */ 376203954Srdivacky if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) 377203954Srdivacky ira_cleanup(&iras, B_FALSE); 378203954Srdivacky 379203954Srdivacky if (iras.ira_target_sqp_mp != NULL) { 380218893Sdim /* Better be called from ip_accept_tcp */ 381218893Sdim ASSERT(target_sqp != NULL); 382203954Srdivacky 383218893Sdim /* Found one packet to accept */ 384203954Srdivacky mp = iras.ira_target_sqp_mp; 385204642Srdivacky iras.ira_target_sqp_mp = NULL; 386203954Srdivacky ASSERT(ip_recv_attr_is_mblk(mp)); 387203954Srdivacky 388204642Srdivacky if (atail != NULL) 389218893Sdim atail->b_next = mp; 390218893Sdim else 391204642Srdivacky ahead = mp; 392218893Sdim atail = mp; 393218893Sdim acnt++; 394204642Srdivacky mp = NULL; 395204642Srdivacky } 396218893Sdim /* mhip might point into 1st packet in the chain. */ 397204642Srdivacky iras.ira_mhip = NULL; 398218893Sdim } 399218893Sdim /* Any remaining references to the route cache? */ 400204642Srdivacky if (rtc.rtc_ire != NULL) { 401218893Sdim ASSERT(rtc.rtc_ipaddr != INADDR_ANY); 402218893Sdim ire_refrele(rtc.rtc_ire); 403218893Sdim } 404218893Sdim 405218893Sdim if (ahead != NULL) { 406218893Sdim /* Better be called from ip_accept_tcp */ 407218893Sdim ASSERT(target_sqp != NULL); 408203954Srdivacky *last = atail; 409203954Srdivacky *cnt = acnt; 410203954Srdivacky return (ahead); 411204642Srdivacky } 412203954Srdivacky 413204642Srdivacky return (NULL); 414203954Srdivacky} 415203954Srdivacky 416203954Srdivacky/* 417203954Srdivacky * This input function is used when 418203954Srdivacky * - is_system_labeled() 419203954Srdivacky * - CGTP filtering 420203954Srdivacky * - DHCP unicast before we have an IP address configured 421203954Srdivacky * - there is an listener for IPPROTO_RSVP 422203954Srdivacky */ 423206083Srdivackyvoid 424218893Sdimill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, 425206083Srdivacky ip_recv_attr_t *ira, rtc_t *rtc) 426206083Srdivacky{ 427206083Srdivacky ipha_t *ipha = (ipha_t *)iph_arg; 428203954Srdivacky ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; 429206083Srdivacky ill_t *ill = ira->ira_ill; 430203954Srdivacky ip_stack_t *ipst = ill->ill_ipst; 431218893Sdim int cgtp_flt_pkt; 432203954Srdivacky 433203954Srdivacky ASSERT(ira->ira_tsl == NULL); 434203954Srdivacky 435203954Srdivacky /* 436203954Srdivacky * Attach any necessary label information to 437204642Srdivacky * this packet 438204642Srdivacky */ 439203954Srdivacky if (is_system_labeled()) { 440203954Srdivacky ira->ira_flags |= IRAF_SYSTEM_LABELED; 441203954Srdivacky 442203954Srdivacky /* 443203954Srdivacky * This updates ira_cred, ira_tsl and ira_free_flags based 444203954Srdivacky * on the label. 445204642Srdivacky */ 446203954Srdivacky if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) { 447203954Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 448203954Srdivacky ip_drop_input("ipIfStatsInDiscards", mp, ill); 449218893Sdim freemsg(mp); 450203954Srdivacky return; 451203954Srdivacky } 452203954Srdivacky /* Note that ira_tsl can be NULL here. */ 453203954Srdivacky 454218893Sdim /* tsol_get_pkt_label sometimes does pullupmsg */ 455204961Srdivacky ipha = (ipha_t *)mp->b_rptr; 456204961Srdivacky } 457204961Srdivacky 458218893Sdim /* 459206083Srdivacky * Invoke the CGTP (multirouting) filtering module to process 460206083Srdivacky * the incoming packet. Packets identified as duplicates 461206083Srdivacky * must be discarded. Filtering is active only if the 462203954Srdivacky * the ip_cgtp_filter ndd variable is non-zero. 463203954Srdivacky */ 464204642Srdivacky cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; 465204642Srdivacky if (ipst->ips_ip_cgtp_filter && 466204642Srdivacky ipst->ips_ip_cgtp_filter_ops != NULL) { 467204642Srdivacky netstackid_t stackid; 468204642Srdivacky 469204642Srdivacky stackid = ipst->ips_netstack->netstack_stackid; 470204642Srdivacky /* 471204642Srdivacky * CGTP and IPMP are mutually exclusive so 472204642Srdivacky * phyint_ifindex is fine here. 473204642Srdivacky */ 474204642Srdivacky cgtp_flt_pkt = 475204642Srdivacky ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, 476204642Srdivacky ill->ill_phyint->phyint_ifindex, mp); 477218893Sdim if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { 478204642Srdivacky ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill); 479204642Srdivacky freemsg(mp); 480204642Srdivacky return; 481204642Srdivacky } 482218893Sdim } 483204792Srdivacky 484204792Srdivacky /* 485218893Sdim * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP 486203954Srdivacky * server to unicast DHCP packets to a DHCP client using the 487203954Srdivacky * IP address it is offering to the client. This can be 488203954Srdivacky * disabled through the "broadcast bit", but not all DHCP 489204642Srdivacky * servers honor that bit. Therefore, to interoperate with as 490218893Sdim * many DHCP servers as possible, the DHCP client allows the 491204792Srdivacky * server to unicast, but we treat those packets as broadcast 492204792Srdivacky * here. Note that we don't rewrite the packet itself since 493204792Srdivacky * (a) that would mess up the checksums and (b) the DHCP 494204792Srdivacky * client conn is bound to INADDR_ANY so ip_fanout_udp() will 495204792Srdivacky * hand it the packet regardless. 496204792Srdivacky */ 497218893Sdim if (ill->ill_dhcpinit != 0 && 498204792Srdivacky ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION && 499204792Srdivacky ipha->ipha_protocol == IPPROTO_UDP) { 500204642Srdivacky udpha_t *udpha; 501204792Srdivacky 502204792Srdivacky ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira); 503204792Srdivacky if (ipha == NULL) { 504204792Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 505204792Srdivacky ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill); 506218893Sdim freemsg(mp); 507204792Srdivacky return; 508204792Srdivacky } 509218893Sdim /* Reload since pullupmsg() can change b_rptr. */ 510204792Srdivacky udpha = (udpha_t *)&ipha[1]; 511204792Srdivacky 512204792Srdivacky if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { 513204792Srdivacky DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, 514218893Sdim mblk_t *, mp); 515204792Srdivacky /* 516204792Srdivacky * This assumes that we deliver to all conns for 517204792Srdivacky * multicast and broadcast packets. 518204792Srdivacky */ 519204792Srdivacky nexthop = INADDR_BROADCAST; 520204792Srdivacky ira->ira_flags |= IRAF_DHCP_UNICAST; 521204792Srdivacky } 522218893Sdim } 523204792Srdivacky 524204792Srdivacky /* 525204792Srdivacky * If rsvpd is running, let RSVP daemon handle its processing 526204792Srdivacky * and forwarding of RSVP multicast/unicast packets. 527204792Srdivacky * If rsvpd is not running but mrouted is running, RSVP 528218893Sdim * multicast packets are forwarded as multicast traffic 529218893Sdim * and RSVP unicast packets are forwarded by unicast router. 530204792Srdivacky * If neither rsvpd nor mrouted is running, RSVP multicast 531204792Srdivacky * packets are not forwarded, but the unicast packets are 532218893Sdim * forwarded like unicast traffic. 533204642Srdivacky */ 534203954Srdivacky if (ipha->ipha_protocol == IPPROTO_RSVP && 535203954Srdivacky ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { 536203954Srdivacky /* RSVP packet and rsvpd running. Treat as ours */ 537204642Srdivacky ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop))); 538204642Srdivacky /* 539204642Srdivacky * We use a multicast address to get the packet to 540204642Srdivacky * ire_recv_multicast_v4. There will not be a membership 541204642Srdivacky * check since we set IRAF_RSVP 542204642Srdivacky */ 543204642Srdivacky nexthop = htonl(INADDR_UNSPEC_GROUP); 544218893Sdim ira->ira_flags |= IRAF_RSVP; 545204642Srdivacky } 546204642Srdivacky 547204642Srdivacky ill_input_short_v4(mp, ipha, &nexthop, ira, rtc); 548204792Srdivacky} 549204792Srdivacky 550204792Srdivacky/* 551204792Srdivacky * This is the tail-end of the full receive side packet handling. 552204792Srdivacky * It can be used directly when the configuration is simple. 553204792Srdivacky */ 554204792Srdivackyvoid 555218893Sdimill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, 556204642Srdivacky ip_recv_attr_t *ira, rtc_t *rtc) 557204642Srdivacky{ 558204642Srdivacky ire_t *ire; 559204792Srdivacky uint_t opt_len; 560204642Srdivacky ill_t *ill = ira->ira_ill; 561204642Srdivacky ip_stack_t *ipst = ill->ill_ipst; 562204642Srdivacky uint_t pkt_len; 563204792Srdivacky ssize_t len; 564204792Srdivacky ipha_t *ipha = (ipha_t *)iph_arg; 565204642Srdivacky ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; 566204642Srdivacky ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; 567204642Srdivacky#define rptr ((uchar_t *)ipha) 568204642Srdivacky 569204642Srdivacky ASSERT(DB_TYPE(mp) == M_DATA); 570204642Srdivacky 571204642Srdivacky /* 572204642Srdivacky * The following test for loopback is faster than 573204642Srdivacky * IP_LOOPBACK_ADDR(), because it avoids any bitwise 574204642Srdivacky * operations. 575204642Srdivacky * Note that these addresses are always in network byte order 576218893Sdim */ 577204642Srdivacky if (((*(uchar_t *)&ipha->ipha_dst) == 127) || 578204642Srdivacky ((*(uchar_t *)&ipha->ipha_src) == 127)) { 579204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 580204642Srdivacky ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 581204642Srdivacky freemsg(mp); 582204642Srdivacky return; 583218893Sdim } 584204642Srdivacky 585205407Srdivacky len = mp->b_wptr - rptr; 586204642Srdivacky pkt_len = ira->ira_pktlen; 587204642Srdivacky 588204642Srdivacky /* multiple mblk or too short */ 589218893Sdim len -= pkt_len; 590204642Srdivacky if (len != 0) { 591204642Srdivacky mp = ip_check_length(mp, rptr, len, pkt_len, 592204642Srdivacky IP_SIMPLE_HDR_LENGTH, ira); 593221345Sdim if (mp == NULL) 594221345Sdim return; 595204642Srdivacky ipha = (ipha_t *)mp->b_rptr; 596204642Srdivacky } 597204642Srdivacky 598218893Sdim DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, 599204642Srdivacky ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, 600205407Srdivacky int, 0); 601204642Srdivacky 602204642Srdivacky /* 603204642Srdivacky * The event for packets being received from a 'physical' 604218893Sdim * interface is placed after validation of the source and/or 605204642Srdivacky * destination address as being local so that packets can be 606204642Srdivacky * redirected to loopback addresses using ipnat. 607204642Srdivacky */ 608204642Srdivacky DTRACE_PROBE4(ip4__physical__in__start, 609204642Srdivacky ill_t *, ill, ill_t *, NULL, 610204642Srdivacky ipha_t *, ipha, mblk_t *, mp); 611204642Srdivacky 612204642Srdivacky if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) { 613208599Srdivacky int ll_multicast = 0; 614208599Srdivacky int error; 615208599Srdivacky ipaddr_t orig_dst = ipha->ipha_dst; 616208599Srdivacky 617208599Srdivacky if (ira->ira_flags & IRAF_L2DST_MULTICAST) 618208599Srdivacky ll_multicast = HPE_MULTICAST; 619208599Srdivacky else if (ira->ira_flags & IRAF_L2DST_BROADCAST) 620208599Srdivacky ll_multicast = HPE_BROADCAST; 621204642Srdivacky 622218893Sdim FW_HOOKS(ipst->ips_ip4_physical_in_event, 623204642Srdivacky ipst->ips_ipv4firewall_physical_in, 624204642Srdivacky ill, NULL, ipha, mp, mp, ll_multicast, ipst, error); 625204642Srdivacky 626204642Srdivacky DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); 627204642Srdivacky 628218893Sdim if (mp == NULL) 629204642Srdivacky return; 630204642Srdivacky /* The length could have changed */ 631204642Srdivacky ipha = (ipha_t *)mp->b_rptr; 632218893Sdim ira->ira_pktlen = ntohs(ipha->ipha_length); 633204642Srdivacky pkt_len = ira->ira_pktlen; 634204642Srdivacky 635204642Srdivacky /* 636204642Srdivacky * In case the destination changed we override any previous 637204642Srdivacky * change to nexthop. 638204642Srdivacky */ 639204642Srdivacky if (orig_dst != ipha->ipha_dst) 640204642Srdivacky nexthop = ipha->ipha_dst; 641218893Sdim if (nexthop == INADDR_ANY) { 642204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); 643204642Srdivacky ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 644204642Srdivacky freemsg(mp); 645218893Sdim return; 646204642Srdivacky } 647204642Srdivacky } 648204642Srdivacky 649223017Sdim if (ipst->ips_ip4_observe.he_interested) { 650223017Sdim zoneid_t dzone; 651223017Sdim 652223017Sdim /* 653223017Sdim * On the inbound path the src zone will be unknown as 654223017Sdim * this packet has come from the wire. 655223017Sdim */ 656223017Sdim dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES); 657223017Sdim ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst); 658223017Sdim } 659223017Sdim 660223017Sdim /* 661223017Sdim * If there is a good HW IP header checksum we clear the need 662223017Sdim * look at the IP header checksum. 663223017Sdim */ 664223017Sdim if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && 665223017Sdim ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 666223017Sdim /* Header checksum was ok. Clear the flag */ 667223017Sdim DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; 668223017Sdim ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; 669223017Sdim } 670223017Sdim 671223017Sdim /* 672223017Sdim * Here we check to see if we machine is setup as 673223017Sdim * L3 loadbalancer and if the incoming packet is for a VIP 674223017Sdim * 675223017Sdim * Check the following: 676223017Sdim * - there is at least a rule 677223017Sdim * - protocol of the packet is supported 678204642Srdivacky */ 679204642Srdivacky if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) { 680204642Srdivacky ipaddr_t lb_dst; 681204642Srdivacky int lb_ret; 682204642Srdivacky 683205407Srdivacky /* For convenience, we pull up the mblk. */ 684204642Srdivacky if (mp->b_cont != NULL) { 685218893Sdim if (pullupmsg(mp, -1) == 0) { 686204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 687204642Srdivacky ip_drop_input("ipIfStatsInDiscards - pullupmsg", 688204642Srdivacky mp, ill); 689204642Srdivacky freemsg(mp); 690204642Srdivacky return; 691204642Srdivacky } 692204642Srdivacky ipha = (ipha_t *)mp->b_rptr; 693204642Srdivacky } 694218893Sdim 695204642Srdivacky /* 696204642Srdivacky * We just drop all fragments going to any VIP, at 697218893Sdim * least for now.... 698204642Srdivacky */ 699204642Srdivacky if (ntohs(ipha->ipha_fragment_offset_and_flags) & 700218893Sdim (IPH_MF | IPH_OFFSET)) { 701218893Sdim if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) { 702204642Srdivacky goto after_ilb; 703204642Srdivacky } 704218893Sdim 705218893Sdim ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1); 706218893Sdim ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1); 707204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 708204642Srdivacky ip_drop_input("ILB fragment", mp, ill); 709218893Sdim freemsg(mp); 710204642Srdivacky return; 711204642Srdivacky } 712204642Srdivacky lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol, 713204642Srdivacky (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst); 714218893Sdim 715204642Srdivacky if (lb_ret == ILB_DROPPED) { 716204642Srdivacky /* Is this the right counter to increase? */ 717204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 718204642Srdivacky ip_drop_input("ILB_DROPPED", mp, ill); 719204642Srdivacky freemsg(mp); 720204642Srdivacky return; 721204642Srdivacky } 722204642Srdivacky if (lb_ret == ILB_BALANCED) { 723218893Sdim /* Set the dst to that of the chosen server */ 724204642Srdivacky nexthop = lb_dst; 725218893Sdim DB_CKSUMFLAGS(mp) = 0; 726204642Srdivacky } 727218893Sdim } 728204642Srdivacky 729204642Srdivackyafter_ilb: 730204642Srdivacky opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; 731204642Srdivacky ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 732204642Srdivacky if (opt_len != 0) { 733212904Sdim int error = 0; 734218893Sdim 735204642Srdivacky ira->ira_ip_hdr_length += (opt_len << 2); 736204642Srdivacky ira->ira_flags |= IRAF_IPV4_OPTIONS; 737204642Srdivacky 738204642Srdivacky /* IP Options present! Validate the length. */ 739218893Sdim mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira); 740206083Srdivacky if (mp == NULL) 741218893Sdim return; 742204642Srdivacky 743204642Srdivacky /* Might have changed */ 744206083Srdivacky ipha = (ipha_t *)mp->b_rptr; 745206083Srdivacky 746206083Srdivacky /* Verify IP header checksum before parsing the options */ 747218893Sdim if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && 748206083Srdivacky ip_csum_hdr(ipha)) { 749206083Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 750206083Srdivacky ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 751206083Srdivacky freemsg(mp); 752218893Sdim return; 753204642Srdivacky } 754204642Srdivacky ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; 755218893Sdim 756218893Sdim /* 757218893Sdim * Go off to ip_input_options which returns the next hop 758204642Srdivacky * destination address, which may have been affected 759204642Srdivacky * by source routing. 760204642Srdivacky */ 761204642Srdivacky IP_STAT(ipst, ip_opt); 762204642Srdivacky 763204642Srdivacky nexthop = ip_input_options(ipha, nexthop, mp, ira, &error); 764205407Srdivacky if (error != 0) { 765218893Sdim /* 766218893Sdim * An ICMP error has been sent and the packet has 767218893Sdim * been dropped. 768204642Srdivacky */ 769218893Sdim return; 770218893Sdim } 771218893Sdim } 772204642Srdivacky /* Can not use route cache with TX since the labels can differ */ 773204642Srdivacky if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 774206083Srdivacky if (CLASSD(nexthop)) { 775206083Srdivacky ire = ire_multicast(ill); 776218893Sdim } else { 777204642Srdivacky /* Match destination and label */ 778204642Srdivacky ire = ire_route_recursive_v4(nexthop, 0, NULL, 779204642Srdivacky ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR, 780204642Srdivacky (ill->ill_flags & ILLF_ROUTER), 781204642Srdivacky ira->ira_xmit_hint, ipst, NULL, NULL, NULL); 782206083Srdivacky } 783205407Srdivacky /* Update the route cache so we do the ire_refrele */ 784205407Srdivacky ASSERT(ire != NULL); 785205407Srdivacky if (rtc->rtc_ire != NULL) 786206083Srdivacky ire_refrele(rtc->rtc_ire); 787205407Srdivacky rtc->rtc_ire = ire; 788218893Sdim rtc->rtc_ipaddr = nexthop; 789205407Srdivacky } else if (nexthop == rtc->rtc_ipaddr) { 790205407Srdivacky /* Use the route cache */ 791205407Srdivacky ASSERT(rtc->rtc_ire != NULL); 792205407Srdivacky ire = rtc->rtc_ire; 793205407Srdivacky } else { 794204642Srdivacky /* Update the route cache */ 795204642Srdivacky if (CLASSD(nexthop)) { 796205407Srdivacky ire = ire_multicast(ill); 797205407Srdivacky } else { 798205407Srdivacky /* Just match the destination */ 799204642Srdivacky ire = ire_route_recursive_dstonly_v4(nexthop, 800205407Srdivacky (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, 801205407Srdivacky ipst); 802204642Srdivacky } 803218893Sdim ASSERT(ire != NULL); 804223017Sdim if (rtc->rtc_ire != NULL) 805223017Sdim ire_refrele(rtc->rtc_ire); 806223017Sdim rtc->rtc_ire = ire; 807223017Sdim rtc->rtc_ipaddr = nexthop; 808204642Srdivacky } 809223017Sdim 810223017Sdim ire->ire_ib_pkt_count++; 811223017Sdim 812223017Sdim /* 813204642Srdivacky * Based on ire_type and ire_flags call one of: 814223017Sdim * ire_recv_local_v4 - for IRE_LOCAL 815223017Sdim * ire_recv_loopback_v4 - for IRE_LOOPBACK 816223017Sdim * ire_recv_multirt_v4 - if RTF_MULTIRT 817223017Sdim * ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE 818223017Sdim * ire_recv_multicast_v4 - for IRE_MULTICAST 819223017Sdim * ire_recv_broadcast_v4 - for IRE_BROADCAST 820223017Sdim * ire_recv_noaccept_v4 - for ire_noaccept ones 821223017Sdim * ire_recv_forward_v4 - for the rest. 822223017Sdim */ 823223017Sdim (*ire->ire_recvfn)(ire, mp, ipha, ira); 824223017Sdim} 825218893Sdim#undef rptr 826206083Srdivacky 827218893Sdim/* 828204642Srdivacky * ire_recvfn for IREs that need forwarding 829204642Srdivacky */ 830204642Srdivackyvoid 831218893Sdimire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 832204642Srdivacky{ 833204642Srdivacky ipha_t *ipha = (ipha_t *)iph_arg; 834218893Sdim ill_t *ill = ira->ira_ill; 835218893Sdim ip_stack_t *ipst = ill->ill_ipst; 836204642Srdivacky ill_t *dst_ill; 837218893Sdim nce_t *nce; 838204642Srdivacky ipaddr_t src = ipha->ipha_src; 839204642Srdivacky uint32_t added_tx_len; 840204642Srdivacky uint32_t mtu, iremtu; 841204642Srdivacky 842204642Srdivacky if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { 843204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 844204642Srdivacky ip_drop_input("l2 multicast not forwarded", mp, ill); 845204642Srdivacky freemsg(mp); 846204642Srdivacky return; 847204642Srdivacky } 848204642Srdivacky 849218893Sdim if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { 850204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 851204642Srdivacky ip_drop_input("ipIfStatsForwProhibits", mp, ill); 852204642Srdivacky freemsg(mp); 853204642Srdivacky return; 854204642Srdivacky } 855204642Srdivacky 856204642Srdivacky /* 857204642Srdivacky * Either ire_nce_capable or ire_dep_parent would be set for the IRE 858204642Srdivacky * when it is found by ire_route_recursive, but that some other thread 859204642Srdivacky * could have changed the routes with the effect of clearing 860204642Srdivacky * ire_dep_parent. In that case we'd end up dropping the packet, or 861204642Srdivacky * finding a new nce below. 862204642Srdivacky * Get, allocate, or update the nce. 863204642Srdivacky * We get a refhold on ire_nce_cache as a result of this to avoid races 864204642Srdivacky * where ire_nce_cache is deleted. 865204642Srdivacky * 866204642Srdivacky * This ensures that we don't forward if the interface is down since 867204642Srdivacky * ipif_down removes all the nces. 868204642Srdivacky */ 869204642Srdivacky mutex_enter(&ire->ire_lock); 870204642Srdivacky nce = ire->ire_nce_cache; 871204642Srdivacky if (nce == NULL) { 872204642Srdivacky /* Not yet set up - try to set one up */ 873204642Srdivacky mutex_exit(&ire->ire_lock); 874204642Srdivacky (void) ire_revalidate_nce(ire); 875204642Srdivacky mutex_enter(&ire->ire_lock); 876204642Srdivacky nce = ire->ire_nce_cache; 877204642Srdivacky if (nce == NULL) { 878204642Srdivacky mutex_exit(&ire->ire_lock); 879204642Srdivacky /* The ire_dep_parent chain went bad, or no memory */ 880204642Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 881204642Srdivacky ip_drop_input("No ire_dep_parent", mp, ill); 882204642Srdivacky freemsg(mp); 883204642Srdivacky return; 884204642Srdivacky } 885204642Srdivacky } 886204642Srdivacky nce_refhold(nce); 887204642Srdivacky mutex_exit(&ire->ire_lock); 888218893Sdim 889204642Srdivacky if (nce->nce_is_condemned) { 890204642Srdivacky nce_t *nce1; 891204642Srdivacky 892204642Srdivacky nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE); 893204642Srdivacky nce_refrele(nce); 894204642Srdivacky if (nce1 == NULL) { 895206083Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 896218893Sdim ip_drop_input("No nce", mp, ill); 897206083Srdivacky freemsg(mp); 898204642Srdivacky return; 899206083Srdivacky } 900218893Sdim nce = nce1; 901206083Srdivacky } 902206083Srdivacky dst_ill = nce->nce_ill; 903206083Srdivacky 904206083Srdivacky /* 905206083Srdivacky * Unless we are forwarding, drop the packet. 906206083Srdivacky * We have to let source routed packets through if they go out 907206083Srdivacky * the same interface i.e., they are 'ping -l' packets. 908206083Srdivacky */ 909206083Srdivacky if (!(dst_ill->ill_flags & ILLF_ROUTER) && 910206083Srdivacky !(ip_source_routed(ipha, ipst) && dst_ill == ill)) { 911206083Srdivacky if (ip_source_routed(ipha, ipst)) { 912206083Srdivacky ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); 913206083Srdivacky icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); 914218893Sdim nce_refrele(nce); 915206083Srdivacky return; 916206083Srdivacky } 917206083Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 918206083Srdivacky ip_drop_input("ipIfStatsForwProhibits", mp, ill); 919206083Srdivacky freemsg(mp); 920218893Sdim nce_refrele(nce); 921218893Sdim return; 922204642Srdivacky } 923204642Srdivacky 924204642Srdivacky if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) { 925218893Sdim ipaddr_t dst = ipha->ipha_dst; 926204642Srdivacky 927218893Sdim ire->ire_ib_pkt_count--; 928218893Sdim /* 929218893Sdim * Should only use IREs that are visible from the 930218893Sdim * global zone for forwarding. 931204642Srdivacky * Take a source route into account the same way as ip_input 932204642Srdivacky * did. 933204642Srdivacky */ 934204642Srdivacky if (ira->ira_flags & IRAF_IPV4_OPTIONS) { 935204642Srdivacky int error = 0; 936204642Srdivacky 937204642Srdivacky dst = ip_input_options(ipha, dst, mp, ira, &error); 938204642Srdivacky ASSERT(error == 0); /* ip_input checked */ 939204642Srdivacky } 940203954Srdivacky ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID, 941203954Srdivacky ira->ira_tsl, MATCH_IRE_SECATTR, 942203954Srdivacky (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst, 943204642Srdivacky NULL, NULL, NULL); 944204642Srdivacky ire->ire_ib_pkt_count++; 945218893Sdim (*ire->ire_recvfn)(ire, mp, ipha, ira); 946204642Srdivacky ire_refrele(ire); 947204642Srdivacky nce_refrele(nce); 948204642Srdivacky return; 949204642Srdivacky } 950218893Sdim 951203954Srdivacky /* 952204642Srdivacky * ipIfStatsHCInForwDatagrams should only be increment if there 953203954Srdivacky * will be an attempt to forward the packet, which is why we 954203954Srdivacky * increment after the above condition has been checked. 955204642Srdivacky */ 956203954Srdivacky BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 957 958 /* Initiate Read side IPPF processing */ 959 if (IPP_ENABLED(IPP_FWD_IN, ipst)) { 960 /* ip_process translates an IS_UNDER_IPMP */ 961 mp = ip_process(IPP_FWD_IN, mp, ill, ill); 962 if (mp == NULL) { 963 /* ip_drop_packet and MIB done */ 964 ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred " 965 "during IPPF processing\n")); 966 nce_refrele(nce); 967 return; 968 } 969 } 970 971 DTRACE_PROBE4(ip4__forwarding__start, 972 ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp); 973 974 if (HOOKS4_INTERESTED_FORWARDING(ipst)) { 975 int error; 976 977 FW_HOOKS(ipst->ips_ip4_forwarding_event, 978 ipst->ips_ipv4firewall_forwarding, 979 ill, dst_ill, ipha, mp, mp, 0, ipst, error); 980 981 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); 982 983 if (mp == NULL) { 984 nce_refrele(nce); 985 return; 986 } 987 /* 988 * Even if the destination was changed by the filter we use the 989 * forwarding decision that was made based on the address 990 * in ip_input. 991 */ 992 993 /* Might have changed */ 994 ipha = (ipha_t *)mp->b_rptr; 995 ira->ira_pktlen = ntohs(ipha->ipha_length); 996 } 997 998 /* Packet is being forwarded. Turning off hwcksum flag. */ 999 DB_CKSUMFLAGS(mp) = 0; 1000 1001 /* 1002 * Martian Address Filtering [RFC 1812, Section 5.3.7] 1003 * The loopback address check for both src and dst has already 1004 * been checked in ip_input 1005 * In the future one can envision adding RPF checks using number 3. 1006 * If we already checked the same source address we can skip this. 1007 */ 1008 if (!(ira->ira_flags & IRAF_VERIFIED_SRC) || 1009 src != ira->ira_verified_src) { 1010 switch (ipst->ips_src_check) { 1011 case 0: 1012 break; 1013 case 2: 1014 if (ip_type_v4(src, ipst) == IRE_BROADCAST) { 1015 BUMP_MIB(ill->ill_ip_mib, 1016 ipIfStatsForwProhibits); 1017 BUMP_MIB(ill->ill_ip_mib, 1018 ipIfStatsInAddrErrors); 1019 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 1020 freemsg(mp); 1021 nce_refrele(nce); 1022 return; 1023 } 1024 /* FALLTHRU */ 1025 1026 case 1: 1027 if (CLASSD(src)) { 1028 BUMP_MIB(ill->ill_ip_mib, 1029 ipIfStatsForwProhibits); 1030 BUMP_MIB(ill->ill_ip_mib, 1031 ipIfStatsInAddrErrors); 1032 ip_drop_input("ipIfStatsInAddrErrors", mp, ill); 1033 freemsg(mp); 1034 nce_refrele(nce); 1035 return; 1036 } 1037 break; 1038 } 1039 /* Remember for next packet */ 1040 ira->ira_flags |= IRAF_VERIFIED_SRC; 1041 ira->ira_verified_src = src; 1042 } 1043 1044 /* 1045 * Check if packet is going out the same link on which it arrived. 1046 * Means we might need to send a redirect. 1047 */ 1048 if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) { 1049 ip_send_potential_redirect_v4(mp, ipha, ire, ira); 1050 } 1051 1052 added_tx_len = 0; 1053 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 1054 mblk_t *mp1; 1055 uint32_t old_pkt_len = ira->ira_pktlen; 1056 1057 /* 1058 * Check if it can be forwarded and add/remove 1059 * CIPSO options as needed. 1060 */ 1061 if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) { 1062 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1063 ip_drop_input("tsol_ip_forward", mp, ill); 1064 freemsg(mp); 1065 nce_refrele(nce); 1066 return; 1067 } 1068 /* 1069 * Size may have changed. Remember amount added in case 1070 * IP needs to send an ICMP too big. 1071 */ 1072 mp = mp1; 1073 ipha = (ipha_t *)mp->b_rptr; 1074 ira->ira_pktlen = ntohs(ipha->ipha_length); 1075 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); 1076 if (ira->ira_pktlen > old_pkt_len) 1077 added_tx_len = ira->ira_pktlen - old_pkt_len; 1078 1079 /* Options can have been added or removed */ 1080 if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH) 1081 ira->ira_flags |= IRAF_IPV4_OPTIONS; 1082 else 1083 ira->ira_flags &= ~IRAF_IPV4_OPTIONS; 1084 } 1085 1086 mtu = dst_ill->ill_mtu; 1087 if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu) 1088 mtu = iremtu; 1089 ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len); 1090 nce_refrele(nce); 1091} 1092 1093/* 1094 * Used for sending out unicast and multicast packets that are 1095 * forwarded. 1096 */ 1097void 1098ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha, 1099 ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len) 1100{ 1101 ill_t *dst_ill = nce->nce_ill; 1102 uint32_t pkt_len; 1103 uint32_t sum; 1104 iaflags_t iraflags = ira->ira_flags; 1105 ip_stack_t *ipst = ill->ill_ipst; 1106 iaflags_t ixaflags; 1107 1108 if (ipha->ipha_ttl <= 1) { 1109 /* Perhaps the checksum was bad */ 1110 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1111 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1112 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1113 freemsg(mp); 1114 return; 1115 } 1116 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1117 ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill); 1118 icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira); 1119 return; 1120 } 1121 ipha->ipha_ttl--; 1122 /* Adjust the checksum to reflect the ttl decrement. */ 1123 sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; 1124 ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); 1125 1126 /* Check if there are options to update */ 1127 if (iraflags & IRAF_IPV4_OPTIONS) { 1128 ASSERT(ipha->ipha_version_and_hdr_length != 1129 IP_SIMPLE_HDR_VERSION); 1130 ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM)); 1131 1132 if (!ip_forward_options(mp, ipha, dst_ill, ira)) { 1133 /* ipIfStatsForwProhibits and ip_drop_input done */ 1134 return; 1135 } 1136 1137 ipha->ipha_hdr_checksum = 0; 1138 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1139 } 1140 1141 /* Initiate Write side IPPF processing before any fragmentation */ 1142 if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { 1143 /* ip_process translates an IS_UNDER_IPMP */ 1144 mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill); 1145 if (mp == NULL) { 1146 /* ip_drop_packet and MIB done */ 1147 ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \ 1148 " during IPPF processing\n")); 1149 return; 1150 } 1151 } 1152 1153 pkt_len = ira->ira_pktlen; 1154 1155 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); 1156 1157 ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL; 1158 1159 if (pkt_len > mtu) { 1160 /* 1161 * It needs fragging on its way out. If we haven't 1162 * verified the header checksum yet we do it now since 1163 * are going to put a surely good checksum in the 1164 * outgoing header, we have to make sure that it 1165 * was good coming in. 1166 */ 1167 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1168 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1169 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1170 freemsg(mp); 1171 return; 1172 } 1173 if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) { 1174 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails); 1175 ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill); 1176 if (iraflags & IRAF_SYSTEM_LABELED) { 1177 /* 1178 * Remove any CIPSO option added by 1179 * tsol_ip_forward, and make sure we report 1180 * a path MTU so that there 1181 * is room to add such a CIPSO option for future 1182 * packets. 1183 */ 1184 mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len, 1185 AF_INET); 1186 } 1187 1188 icmp_frag_needed(mp, mtu, ira); 1189 return; 1190 } 1191 1192 (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu, 1193 ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL); 1194 return; 1195 } 1196 1197 ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length)); 1198 if (iraflags & IRAF_LOOPBACK_COPY) { 1199 /* 1200 * IXAF_NO_LOOP_ZONEID is not set hence 7th arg 1201 * is don't care 1202 */ 1203 (void) ip_postfrag_loopcheck(mp, nce, 1204 ixaflags | IXAF_LOOPBACK_COPY, 1205 pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL); 1206 } else { 1207 (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint, 1208 GLOBAL_ZONEID, 0, NULL); 1209 } 1210} 1211 1212/* 1213 * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE, 1214 * which is what ire_route_recursive returns when there is no matching ire. 1215 * Send ICMP unreachable unless blackhole. 1216 */ 1217void 1218ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1219{ 1220 ipha_t *ipha = (ipha_t *)iph_arg; 1221 ill_t *ill = ira->ira_ill; 1222 ip_stack_t *ipst = ill->ill_ipst; 1223 1224 /* Would we have forwarded this packet if we had a route? */ 1225 if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { 1226 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1227 ip_drop_input("l2 multicast not forwarded", mp, ill); 1228 freemsg(mp); 1229 return; 1230 } 1231 1232 if (!(ill->ill_flags & ILLF_ROUTER)) { 1233 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1234 ip_drop_input("ipIfStatsForwProhibits", mp, ill); 1235 freemsg(mp); 1236 return; 1237 } 1238 /* 1239 * If we had a route this could have been forwarded. Count as such. 1240 * 1241 * ipIfStatsHCInForwDatagrams should only be increment if there 1242 * will be an attempt to forward the packet, which is why we 1243 * increment after the above condition has been checked. 1244 */ 1245 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); 1246 1247 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); 1248 1249 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST, 1250 ipst); 1251 1252 if (ire->ire_flags & RTF_BLACKHOLE) { 1253 ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill); 1254 freemsg(mp); 1255 } else { 1256 ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill); 1257 1258 if (ip_source_routed(ipha, ipst)) { 1259 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); 1260 } else { 1261 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira); 1262 } 1263 } 1264} 1265 1266/* 1267 * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for 1268 * VRRP when in noaccept mode. 1269 * We silently drop the packet. ARP handles packets even if noaccept is set. 1270 */ 1271/* ARGSUSED */ 1272void 1273ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1274 ip_recv_attr_t *ira) 1275{ 1276 ill_t *ill = ira->ira_ill; 1277 1278 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1279 ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill); 1280 freemsg(mp); 1281} 1282 1283/* 1284 * ire_recvfn for IRE_BROADCAST. 1285 */ 1286void 1287ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1288 ip_recv_attr_t *ira) 1289{ 1290 ipha_t *ipha = (ipha_t *)iph_arg; 1291 ill_t *ill = ira->ira_ill; 1292 ill_t *dst_ill = ire->ire_ill; 1293 ip_stack_t *ipst = ill->ill_ipst; 1294 ire_t *alt_ire; 1295 nce_t *nce; 1296 ipaddr_t ipha_dst; 1297 1298 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); 1299 1300 /* Tag for higher-level protocols */ 1301 ira->ira_flags |= IRAF_BROADCAST; 1302 1303 /* 1304 * Whether local or directed broadcast forwarding: don't allow 1305 * for TCP. 1306 */ 1307 if (ipha->ipha_protocol == IPPROTO_TCP) { 1308 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1309 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1310 freemsg(mp); 1311 return; 1312 } 1313 1314 /* 1315 * So that we don't end up with dups, only one ill an IPMP group is 1316 * nominated to receive broadcast traffic. 1317 * If we have no cast_ill we are liberal and accept everything. 1318 */ 1319 if (IS_UNDER_IPMP(ill)) { 1320 /* For an under ill_grp can change under lock */ 1321 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1322 if (!ill->ill_nom_cast && ill->ill_grp != NULL && 1323 ill->ill_grp->ig_cast_ill != NULL) { 1324 rw_exit(&ipst->ips_ill_g_lock); 1325 /* No MIB since this is normal operation */ 1326 ip_drop_input("not nom_cast", mp, ill); 1327 freemsg(mp); 1328 return; 1329 } 1330 rw_exit(&ipst->ips_ill_g_lock); 1331 1332 ira->ira_ruifindex = ill_get_upper_ifindex(ill); 1333 } 1334 1335 /* 1336 * After reassembly and IPsec we will need to duplicate the 1337 * broadcast packet for all matching zones on the ill. 1338 */ 1339 ira->ira_zoneid = ALL_ZONES; 1340 1341 /* 1342 * Check for directed broadcast i.e. ire->ire_ill is different than 1343 * the incoming ill. 1344 * The same broadcast address can be assigned to multiple interfaces 1345 * so have to check explicitly for that case by looking up the alt_ire 1346 */ 1347 if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) { 1348 /* Reassemble on the ill on which the packet arrived */ 1349 ip_input_local_v4(ire, mp, ipha, ira); 1350 /* Restore */ 1351 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1352 return; 1353 } 1354 1355 /* Is there an IRE_BROADCAST on the incoming ill? */ 1356 ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST : 1357 ipha->ipha_dst); 1358 alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill, 1359 ALL_ZONES, ira->ira_tsl, 1360 MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL); 1361 if (alt_ire != NULL) { 1362 /* Not a directed broadcast */ 1363 /* 1364 * In the special case of multirouted broadcast 1365 * packets, we unconditionally need to "gateway" 1366 * them to the appropriate interface here so that reassembly 1367 * works. We know that the IRE_BROADCAST on cgtp0 doesn't 1368 * have RTF_MULTIRT set so we look for such an IRE in the 1369 * bucket. 1370 */ 1371 if (alt_ire->ire_flags & RTF_MULTIRT) { 1372 irb_t *irb; 1373 ire_t *ire1; 1374 1375 irb = ire->ire_bucket; 1376 irb_refhold(irb); 1377 for (ire1 = irb->irb_ire; ire1 != NULL; 1378 ire1 = ire1->ire_next) { 1379 if (IRE_IS_CONDEMNED(ire1)) 1380 continue; 1381 if (!(ire1->ire_type & IRE_BROADCAST) || 1382 (ire1->ire_flags & RTF_MULTIRT)) 1383 continue; 1384 ill = ire1->ire_ill; 1385 ill_refhold(ill); 1386 break; 1387 } 1388 irb_refrele(irb); 1389 if (ire1 != NULL) { 1390 ill_t *orig_ill = ira->ira_ill; 1391 1392 ire_refrele(alt_ire); 1393 /* Reassemble on the new ill */ 1394 ira->ira_ill = ill; 1395 ip_input_local_v4(ire, mp, ipha, ira); 1396 ill_refrele(ill); 1397 /* Restore */ 1398 ira->ira_ill = orig_ill; 1399 ira->ira_ruifindex = 1400 orig_ill->ill_phyint->phyint_ifindex; 1401 return; 1402 } 1403 } 1404 ire_refrele(alt_ire); 1405 /* Reassemble on the ill on which the packet arrived */ 1406 ip_input_local_v4(ire, mp, ipha, ira); 1407 goto done; 1408 } 1409 1410 /* 1411 * This is a directed broadcast 1412 * 1413 * If directed broadcast is allowed, then forward the packet out 1414 * the destination interface with IXAF_LOOPBACK_COPY set. That will 1415 * result in ip_input() receiving a copy of the packet on the 1416 * appropriate ill. (We could optimize this to avoid the extra trip 1417 * via ip_input(), but since directed broadcasts are normally disabled 1418 * it doesn't make sense to optimize it.) 1419 */ 1420 if (!ipst->ips_ip_g_forward_directed_bcast || 1421 (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) { 1422 ip_drop_input("directed broadcast not allowed", mp, ill); 1423 freemsg(mp); 1424 goto done; 1425 } 1426 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1427 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1428 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1429 freemsg(mp); 1430 goto done; 1431 } 1432 1433 /* 1434 * Clear the indication that this may have hardware 1435 * checksum as we are not using it for forwarding. 1436 */ 1437 DB_CKSUMFLAGS(mp) = 0; 1438 1439 /* 1440 * Adjust ttl to 2 (1+1 - the forward engine will decrement it by one. 1441 */ 1442 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; 1443 ipha->ipha_hdr_checksum = 0; 1444 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); 1445 1446 /* 1447 * We use ip_forward_xmit to do any fragmentation. 1448 * and loopback copy on the outbound interface. 1449 * 1450 * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side. 1451 */ 1452 ira->ira_flags |= IRAF_LOOPBACK_COPY; 1453 1454 nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST); 1455 if (nce == NULL) { 1456 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); 1457 ip_drop_output("No nce", mp, dst_ill); 1458 freemsg(mp); 1459 goto done; 1460 } 1461 1462 ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mtu, 0); 1463 nce_refrele(nce); 1464done: 1465 /* Restore */ 1466 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1467} 1468 1469/* 1470 * ire_recvfn for IRE_MULTICAST. 1471 */ 1472void 1473ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, 1474 ip_recv_attr_t *ira) 1475{ 1476 ipha_t *ipha = (ipha_t *)iph_arg; 1477 ill_t *ill = ira->ira_ill; 1478 ip_stack_t *ipst = ill->ill_ipst; 1479 1480 ASSERT(ire->ire_ill == ira->ira_ill); 1481 1482 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); 1483 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen); 1484 1485 /* RSVP hook */ 1486 if (ira->ira_flags & IRAF_RSVP) 1487 goto forus; 1488 1489 /* Tag for higher-level protocols */ 1490 ira->ira_flags |= IRAF_MULTICAST; 1491 1492 /* 1493 * So that we don't end up with dups, only one ill an IPMP group is 1494 * nominated to receive multicast traffic. 1495 * If we have no cast_ill we are liberal and accept everything. 1496 */ 1497 if (IS_UNDER_IPMP(ill)) { 1498 ip_stack_t *ipst = ill->ill_ipst; 1499 1500 /* For an under ill_grp can change under lock */ 1501 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1502 if (!ill->ill_nom_cast && ill->ill_grp != NULL && 1503 ill->ill_grp->ig_cast_ill != NULL) { 1504 rw_exit(&ipst->ips_ill_g_lock); 1505 ip_drop_input("not on cast ill", mp, ill); 1506 freemsg(mp); 1507 return; 1508 } 1509 rw_exit(&ipst->ips_ill_g_lock); 1510 /* 1511 * We switch to the upper ill so that mrouter and hasmembers 1512 * can operate on upper here and in ip_input_multicast. 1513 */ 1514 ill = ipmp_ill_hold_ipmp_ill(ill); 1515 if (ill != NULL) { 1516 ASSERT(ill != ira->ira_ill); 1517 ASSERT(ire->ire_ill == ira->ira_ill); 1518 ira->ira_ill = ill; 1519 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1520 } else { 1521 ill = ira->ira_ill; 1522 } 1523 } 1524 1525 /* 1526 * Check if we are a multicast router - send ip_mforward a copy of 1527 * the packet. 1528 * Due to mroute_decap tunnels we consider forwarding packets even if 1529 * mrouted has not joined the allmulti group on this interface. 1530 */ 1531 if (ipst->ips_ip_g_mrouter) { 1532 int retval; 1533 1534 /* 1535 * Clear the indication that this may have hardware 1536 * checksum as we are not using it for forwarding. 1537 */ 1538 DB_CKSUMFLAGS(mp) = 0; 1539 1540 /* 1541 * ip_mforward helps us make these distinctions: If received 1542 * on tunnel and not IGMP, then drop. 1543 * If IGMP packet, then don't check membership 1544 * If received on a phyint and IGMP or PIM, then 1545 * don't check membership 1546 */ 1547 retval = ip_mforward(mp, ira); 1548 /* ip_mforward updates mib variables if needed */ 1549 1550 switch (retval) { 1551 case 0: 1552 /* 1553 * pkt is okay and arrived on phyint. 1554 * 1555 * If we are running as a multicast router 1556 * we need to see all IGMP and/or PIM packets. 1557 */ 1558 if ((ipha->ipha_protocol == IPPROTO_IGMP) || 1559 (ipha->ipha_protocol == IPPROTO_PIM)) { 1560 goto forus; 1561 } 1562 break; 1563 case -1: 1564 /* pkt is mal-formed, toss it */ 1565 freemsg(mp); 1566 goto done; 1567 case 1: 1568 /* 1569 * pkt is okay and arrived on a tunnel 1570 * 1571 * If we are running a multicast router 1572 * we need to see all igmp packets. 1573 */ 1574 if (ipha->ipha_protocol == IPPROTO_IGMP) { 1575 goto forus; 1576 } 1577 ip_drop_input("Multicast on tunnel ignored", mp, ill); 1578 freemsg(mp); 1579 goto done; 1580 } 1581 } 1582 1583 /* 1584 * Check if we have members on this ill. This is not necessary for 1585 * correctness because even if the NIC/GLD had a leaky filter, we 1586 * filter before passing to each conn_t. 1587 */ 1588 if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) { 1589 /* 1590 * Nobody interested 1591 * 1592 * This might just be caused by the fact that 1593 * multiple IP Multicast addresses map to the same 1594 * link layer multicast - no need to increment counter! 1595 */ 1596 ip_drop_input("Multicast with no members", mp, ill); 1597 freemsg(mp); 1598 goto done; 1599 } 1600forus: 1601 ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n", 1602 ntohl(ipha->ipha_dst))); 1603 1604 /* 1605 * After reassembly and IPsec we will need to duplicate the 1606 * multicast packet for all matching zones on the ill. 1607 */ 1608 ira->ira_zoneid = ALL_ZONES; 1609 1610 /* Reassemble on the ill on which the packet arrived */ 1611 ip_input_local_v4(ire, mp, ipha, ira); 1612done: 1613 if (ill != ire->ire_ill) { 1614 ill_refrele(ill); 1615 ira->ira_ill = ire->ire_ill; 1616 ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex; 1617 } 1618} 1619 1620/* 1621 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT. 1622 * Drop packets since we don't forward out multirt routes. 1623 */ 1624/* ARGSUSED */ 1625void 1626ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1627{ 1628 ill_t *ill = ira->ira_ill; 1629 1630 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); 1631 ip_drop_input("Not forwarding out MULTIRT", mp, ill); 1632 freemsg(mp); 1633} 1634 1635/* 1636 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK 1637 * has rewritten the packet to have a loopback destination address (We 1638 * filter out packet with a loopback destination from arriving over the wire). 1639 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID. 1640 */ 1641void 1642ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1643{ 1644 ipha_t *ipha = (ipha_t *)iph_arg; 1645 ill_t *ill = ira->ira_ill; 1646 ill_t *ire_ill = ire->ire_ill; 1647 1648 ira->ira_zoneid = GLOBAL_ZONEID; 1649 1650 /* Switch to the lo0 ill for further processing */ 1651 if (ire_ill != ill) { 1652 /* 1653 * Update ira_ill to be the ILL on which the IP address 1654 * is hosted. 1655 * No need to hold the ill since we have a hold on the ire 1656 */ 1657 ASSERT(ira->ira_ill == ira->ira_rill); 1658 ira->ira_ill = ire_ill; 1659 1660 ip_input_local_v4(ire, mp, ipha, ira); 1661 1662 /* Restore */ 1663 ASSERT(ira->ira_ill == ire_ill); 1664 ira->ira_ill = ill; 1665 return; 1666 1667 } 1668 ip_input_local_v4(ire, mp, ipha, ira); 1669} 1670 1671/* 1672 * ire_recvfn for IRE_LOCAL. 1673 */ 1674void 1675ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) 1676{ 1677 ipha_t *ipha = (ipha_t *)iph_arg; 1678 ill_t *ill = ira->ira_ill; 1679 ill_t *ire_ill = ire->ire_ill; 1680 1681 /* Make a note for DAD that this address is in use */ 1682 ire->ire_last_used_time = lbolt; 1683 1684 /* Only target the IRE_LOCAL with the right zoneid. */ 1685 ira->ira_zoneid = ire->ire_zoneid; 1686 1687 /* 1688 * If the packet arrived on the wrong ill, we check that 1689 * this is ok. 1690 * If it is, then we ensure that we do the reassembly on 1691 * the ill on which the address is hosted. We keep ira_rill as 1692 * the one on which the packet arrived, so that IP_PKTINFO and 1693 * friends can report this. 1694 */ 1695 if (ire_ill != ill) { 1696 ire_t *new_ire; 1697 1698 new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); 1699 if (new_ire == NULL) { 1700 /* Drop packet */ 1701 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); 1702 ip_drop_input("ipIfStatsInForwProhibits", mp, ill); 1703 freemsg(mp); 1704 return; 1705 } 1706 /* 1707 * Update ira_ill to be the ILL on which the IP address 1708 * is hosted. No need to hold the ill since we have a 1709 * hold on the ire. Note that we do the switch even if 1710 * new_ire == ire (for IPMP, ire would be the one corresponding 1711 * to the IPMP ill). 1712 */ 1713 ASSERT(ira->ira_ill == ira->ira_rill); 1714 ira->ira_ill = new_ire->ire_ill; 1715 1716 /* ira_ruifindex tracks the upper for ira_rill */ 1717 if (IS_UNDER_IPMP(ill)) 1718 ira->ira_ruifindex = ill_get_upper_ifindex(ill); 1719 1720 ip_input_local_v4(new_ire, mp, ipha, ira); 1721 1722 /* Restore */ 1723 ASSERT(ira->ira_ill == new_ire->ire_ill); 1724 ira->ira_ill = ill; 1725 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; 1726 1727 if (new_ire != ire) 1728 ire_refrele(new_ire); 1729 return; 1730 } 1731 1732 ip_input_local_v4(ire, mp, ipha, ira); 1733} 1734 1735/* 1736 * Common function for packets arriving for the host. Handles 1737 * checksum verification, reassembly checks, etc. 1738 */ 1739static void 1740ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 1741{ 1742 ill_t *ill = ira->ira_ill; 1743 iaflags_t iraflags = ira->ira_flags; 1744 1745 /* 1746 * Verify IP header checksum. If the packet was AH or ESP then 1747 * this flag has already been cleared. Likewise if the packet 1748 * had a hardware checksum. 1749 */ 1750 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { 1751 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); 1752 ip_drop_input("ipIfStatsInCksumErrs", mp, ill); 1753 freemsg(mp); 1754 return; 1755 } 1756 1757 if (iraflags & IRAF_IPV4_OPTIONS) { 1758 if (!ip_input_local_options(mp, ipha, ira)) { 1759 /* Error has been sent and mp consumed */ 1760 return; 1761 } 1762 } 1763 1764 /* 1765 * Is packet part of fragmented IP packet? 1766 * We compare against defined values in network byte order 1767 */ 1768 if (ipha->ipha_fragment_offset_and_flags & 1769 (IPH_MF_HTONS | IPH_OFFSET_HTONS)) { 1770 /* 1771 * Make sure we have ira_l2src before we loose the original 1772 * mblk 1773 */ 1774 if (!(ira->ira_flags & IRAF_L2SRC_SET)) 1775 ip_setl2src(mp, ira, ira->ira_rill); 1776 1777 mp = ip_input_fragment(mp, ipha, ira); 1778 if (mp == NULL) 1779 return; 1780 /* Completed reassembly */ 1781 ipha = (ipha_t *)mp->b_rptr; 1782 } 1783 1784 /* 1785 * For broadcast and multicast we need some extra work before 1786 * we call ip_fanout_v4(), since in the case of shared-IP zones 1787 * we need to pretend that a packet arrived for each zoneid. 1788 */ 1789 if (iraflags & IRAF_MULTIBROADCAST) { 1790 if (iraflags & IRAF_BROADCAST) 1791 ip_input_broadcast_v4(ire, mp, ipha, ira); 1792 else 1793 ip_input_multicast_v4(ire, mp, ipha, ira); 1794 return; 1795 } 1796 ip_fanout_v4(mp, ipha, ira); 1797} 1798 1799 1800/* 1801 * Handle multiple zones which match the same broadcast address 1802 * and ill by delivering a packet to each of them. 1803 * Walk the bucket and look for different ire_zoneid but otherwise 1804 * the same IRE (same ill/addr/mask/type). 1805 * Note that ire_add() tracks IREs that are identical in all 1806 * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by 1807 * increasing ire_identical_cnt. Thus we don't need to be concerned 1808 * about those. 1809 */ 1810static void 1811ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 1812{ 1813 ill_t *ill = ira->ira_ill; 1814 ip_stack_t *ipst = ill->ill_ipst; 1815 netstack_t *ns = ipst->ips_netstack; 1816 irb_t *irb; 1817 ire_t *ire1; 1818 mblk_t *mp1; 1819 ipha_t *ipha1; 1820 1821 irb = ire->ire_bucket; 1822 1823 /* 1824 * If we don't have more than one shared-IP zone, or if 1825 * there can't be more than one IRE_BROADCAST for this 1826 * IP address, then just set the zoneid and proceed. 1827 */ 1828 if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) { 1829 ira->ira_zoneid = ire->ire_zoneid; 1830 1831 ip_fanout_v4(mp, ipha, ira); 1832 return; 1833 } 1834 irb_refhold(irb); 1835 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 1836 /* We do the main IRE after the end of the loop */ 1837 if (ire1 == ire) 1838 continue; 1839 1840 /* 1841 * Only IREs for the same IP address should be in the same 1842 * bucket. 1843 * But could have IRE_HOSTs in the case of CGTP. 1844 */ 1845 ASSERT(ire1->ire_addr == ire->ire_addr); 1846 if (!(ire1->ire_type & IRE_BROADCAST)) 1847 continue; 1848 1849 if (IRE_IS_CONDEMNED(ire1)) 1850 continue; 1851 1852 mp1 = copymsg(mp); 1853 if (mp1 == NULL) { 1854 /* Failed to deliver to one zone */ 1855 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1856 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1857 continue; 1858 } 1859 ira->ira_zoneid = ire1->ire_zoneid; 1860 ipha1 = (ipha_t *)mp1->b_rptr; 1861 ip_fanout_v4(mp1, ipha1, ira); 1862 } 1863 irb_refrele(irb); 1864 /* Do the main ire */ 1865 ira->ira_zoneid = ire->ire_zoneid; 1866 ip_fanout_v4(mp, ipha, ira); 1867} 1868 1869/* 1870 * Handle multiple zones which want to receive the same multicast packets 1871 * on this ill by delivering a packet to each of them. 1872 * 1873 * Note that for packets delivered to transports we could instead do this 1874 * as part of the fanout code, but since we need to handle icmp_inbound 1875 * it is simpler to have multicast work the same as broadcast. 1876 * 1877 * The ip_fanout matching for multicast matches based on ilm independent of 1878 * zoneid since the zoneid restriction is applied when joining a multicast 1879 * group. 1880 */ 1881/* ARGSUSED */ 1882static void 1883ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 1884{ 1885 ill_t *ill = ira->ira_ill; 1886 iaflags_t iraflags = ira->ira_flags; 1887 ip_stack_t *ipst = ill->ill_ipst; 1888 netstack_t *ns = ipst->ips_netstack; 1889 zoneid_t zoneid; 1890 mblk_t *mp1; 1891 ipha_t *ipha1; 1892 1893 /* ire_recv_multicast has switched to the upper ill for IPMP */ 1894 ASSERT(!IS_UNDER_IPMP(ill)); 1895 1896 /* 1897 * If we don't have more than one shared-IP zone, or if 1898 * there are no members in anything but the global zone, 1899 * then just set the zoneid and proceed. 1900 */ 1901 if (ns->netstack_numzones == 1 || 1902 !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, 1903 GLOBAL_ZONEID)) { 1904 ira->ira_zoneid = GLOBAL_ZONEID; 1905 1906 /* If sender didn't want this zone to receive it, drop */ 1907 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && 1908 ira->ira_no_loop_zoneid == ira->ira_zoneid) { 1909 ip_drop_input("Multicast but wrong zoneid", mp, ill); 1910 freemsg(mp); 1911 return; 1912 } 1913 ip_fanout_v4(mp, ipha, ira); 1914 return; 1915 } 1916 1917 /* 1918 * Here we loop over all zoneids that have members in the group 1919 * and deliver a packet to ip_fanout for each zoneid. 1920 * 1921 * First find any members in the lowest numeric zoneid by looking for 1922 * first zoneid larger than -1 (ALL_ZONES). 1923 * We terminate the loop when we receive -1 (ALL_ZONES). 1924 */ 1925 zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES); 1926 for (; zoneid != ALL_ZONES; 1927 zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) { 1928 /* 1929 * Avoid an extra copymsg/freemsg by skipping global zone here 1930 * and doing that at the end. 1931 */ 1932 if (zoneid == GLOBAL_ZONEID) 1933 continue; 1934 1935 ira->ira_zoneid = zoneid; 1936 1937 /* If sender didn't want this zone to receive it, skip */ 1938 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && 1939 ira->ira_no_loop_zoneid == ira->ira_zoneid) 1940 continue; 1941 1942 mp1 = copymsg(mp); 1943 if (mp1 == NULL) { 1944 /* Failed to deliver to one zone */ 1945 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1946 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1947 continue; 1948 } 1949 ipha1 = (ipha_t *)mp1->b_rptr; 1950 ip_fanout_v4(mp1, ipha1, ira); 1951 } 1952 1953 /* Do the main ire */ 1954 ira->ira_zoneid = GLOBAL_ZONEID; 1955 /* If sender didn't want this zone to receive it, drop */ 1956 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && 1957 ira->ira_no_loop_zoneid == ira->ira_zoneid) { 1958 ip_drop_input("Multicast but wrong zoneid", mp, ill); 1959 freemsg(mp); 1960 } else { 1961 ip_fanout_v4(mp, ipha, ira); 1962 } 1963} 1964 1965 1966/* 1967 * Determine the zoneid and IRAF_TX_* flags if trusted extensions 1968 * is in use. Updates ira_zoneid and ira_flags as a result. 1969 */ 1970static void 1971ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol, 1972 uint_t ip_hdr_length, ip_recv_attr_t *ira) 1973{ 1974 uint16_t *up; 1975 uint16_t lport; 1976 zoneid_t zoneid; 1977 1978 ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED); 1979 1980 /* 1981 * If the packet is unlabeled we might allow read-down 1982 * for MAC_EXEMPT. Below we clear this if it is a multi-level 1983 * port (MLP). 1984 * Note that ira_tsl can be NULL here. 1985 */ 1986 if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED) 1987 ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE; 1988 1989 if (ira->ira_zoneid != ALL_ZONES) 1990 return; 1991 1992 ira->ira_flags |= IRAF_TX_SHARED_ADDR; 1993 1994 up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); 1995 switch (protocol) { 1996 case IPPROTO_TCP: 1997 case IPPROTO_SCTP: 1998 case IPPROTO_UDP: 1999 /* Caller ensures this */ 2000 ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr); 2001 2002 /* 2003 * Only these transports support MLP. 2004 * We know their destination port numbers is in 2005 * the same place in the header. 2006 */ 2007 lport = up[1]; 2008 2009 /* 2010 * No need to handle exclusive-stack zones 2011 * since ALL_ZONES only applies to the shared IP instance. 2012 */ 2013 zoneid = tsol_mlp_findzone(protocol, lport); 2014 /* 2015 * If no shared MLP is found, tsol_mlp_findzone returns 2016 * ALL_ZONES. In that case, we assume it's SLP, and 2017 * search for the zone based on the packet label. 2018 * 2019 * If there is such a zone, we prefer to find a 2020 * connection in it. Otherwise, we look for a 2021 * MAC-exempt connection in any zone whose label 2022 * dominates the default label on the packet. 2023 */ 2024 if (zoneid == ALL_ZONES) 2025 zoneid = tsol_attr_to_zoneid(ira); 2026 else 2027 ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE; 2028 break; 2029 default: 2030 /* Handle shared address for other protocols */ 2031 zoneid = tsol_attr_to_zoneid(ira); 2032 break; 2033 } 2034 ira->ira_zoneid = zoneid; 2035} 2036 2037/* 2038 * Increment checksum failure statistics 2039 */ 2040static void 2041ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill) 2042{ 2043 ip_stack_t *ipst = ill->ill_ipst; 2044 2045 switch (protocol) { 2046 case IPPROTO_TCP: 2047 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); 2048 2049 if (hck_flags & HCK_FULLCKSUM) 2050 IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); 2051 else if (hck_flags & HCK_PARTIALCKSUM) 2052 IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); 2053 else 2054 IP_STAT(ipst, ip_tcp_in_sw_cksum_err); 2055 break; 2056 case IPPROTO_UDP: 2057 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); 2058 if (hck_flags & HCK_FULLCKSUM) 2059 IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); 2060 else if (hck_flags & HCK_PARTIALCKSUM) 2061 IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); 2062 else 2063 IP_STAT(ipst, ip_udp_in_sw_cksum_err); 2064 break; 2065 case IPPROTO_ICMP: 2066 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 2067 break; 2068 default: 2069 ASSERT(0); 2070 break; 2071 } 2072} 2073 2074/* Calculate the IPv4 pseudo-header checksum */ 2075uint32_t 2076ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira) 2077{ 2078 uint_t ulp_len; 2079 uint32_t cksum; 2080 uint8_t protocol = ira->ira_protocol; 2081 uint16_t ip_hdr_length = ira->ira_ip_hdr_length; 2082 2083#define iphs ((uint16_t *)ipha) 2084 2085 switch (protocol) { 2086 case IPPROTO_TCP: 2087 ulp_len = ira->ira_pktlen - ip_hdr_length; 2088 2089 /* Protocol and length */ 2090 cksum = htons(ulp_len) + IP_TCP_CSUM_COMP; 2091 /* IP addresses */ 2092 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 2093 break; 2094 2095 case IPPROTO_UDP: { 2096 udpha_t *udpha; 2097 2098 udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); 2099 2100 /* Protocol and length */ 2101 cksum = udpha->uha_length + IP_UDP_CSUM_COMP; 2102 /* IP addresses */ 2103 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; 2104 break; 2105 } 2106 2107 default: 2108 cksum = 0; 2109 break; 2110 } 2111#undef iphs 2112 return (cksum); 2113} 2114 2115 2116/* 2117 * Software verification of the ULP checksums. 2118 * Returns B_TRUE if ok. 2119 * Increments statistics of failed. 2120 */ 2121static boolean_t 2122ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 2123{ 2124 ip_stack_t *ipst = ira->ira_ill->ill_ipst; 2125 uint32_t cksum; 2126 uint8_t protocol = ira->ira_protocol; 2127 uint16_t ip_hdr_length = ira->ira_ip_hdr_length; 2128 2129 IP_STAT(ipst, ip_in_sw_cksum); 2130 2131 ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP); 2132 2133 cksum = ip_input_cksum_pseudo_v4(ipha, ira); 2134 cksum = IP_CSUM(mp, ip_hdr_length, cksum); 2135 if (cksum == 0) 2136 return (B_TRUE); 2137 2138 ip_input_cksum_err_v4(protocol, 0, ira->ira_ill); 2139 return (B_FALSE); 2140} 2141 2142/* There are drivers that can't do partial checksum with IP options */ 2143int eri_cksum_workaround = 1; 2144 2145/* 2146 * Verify the ULP checksums. 2147 * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum 2148 * algorithm. 2149 * Increments statistics if failed. 2150 */ 2151static boolean_t 2152ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, 2153 ip_recv_attr_t *ira) 2154{ 2155 ill_t *ill = ira->ira_rill; 2156 uint16_t hck_flags; 2157 uint32_t cksum; 2158 mblk_t *mp1; 2159 int32_t len; 2160 uint8_t protocol = ira->ira_protocol; 2161 uint16_t ip_hdr_length = ira->ira_ip_hdr_length; 2162 2163 2164 switch (protocol) { 2165 case IPPROTO_TCP: 2166 break; 2167 2168 case IPPROTO_UDP: { 2169 udpha_t *udpha; 2170 2171 udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); 2172 if (udpha->uha_checksum == 0) { 2173 /* Packet doesn't have a UDP checksum */ 2174 return (B_TRUE); 2175 } 2176 break; 2177 } 2178 case IPPROTO_SCTP: { 2179 sctp_hdr_t *sctph; 2180 uint32_t pktsum; 2181 2182 sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length); 2183#ifdef DEBUG 2184 if (skip_sctp_cksum) 2185 return (B_TRUE); 2186#endif 2187 pktsum = sctph->sh_chksum; 2188 sctph->sh_chksum = 0; 2189 cksum = sctp_cksum(mp, ip_hdr_length); 2190 sctph->sh_chksum = pktsum; 2191 if (cksum == pktsum) 2192 return (B_TRUE); 2193 2194 /* 2195 * Defer until later whether a bad checksum is ok 2196 * in order to allow RAW sockets to use Adler checksum 2197 * with SCTP. 2198 */ 2199 ira->ira_flags |= IRAF_SCTP_CSUM_ERR; 2200 return (B_TRUE); 2201 } 2202 2203 default: 2204 /* No ULP checksum to verify. */ 2205 return (B_TRUE); 2206 } 2207 /* 2208 * Revert to software checksum calculation if the interface 2209 * isn't capable of checksum offload. 2210 * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout. 2211 * Note: IRAF_NO_HW_CKSUM is not currently used. 2212 */ 2213 ASSERT(!IS_IPMP(ill)); 2214 if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || 2215 !dohwcksum) { 2216 return (ip_input_sw_cksum_v4(mp, ipha, ira)); 2217 } 2218 2219 /* 2220 * We apply this for all ULP protocols. Does the HW know to 2221 * not set the flags for SCTP and other protocols. 2222 */ 2223 2224 hck_flags = DB_CKSUMFLAGS(mp); 2225 2226 if (hck_flags & HCK_FULLCKSUM) { 2227 /* 2228 * Full checksum has been computed by the hardware 2229 * and has been attached. If the driver wants us to 2230 * verify the correctness of the attached value, in 2231 * order to protect against faulty hardware, compare 2232 * it against -0 (0xFFFF) to see if it's valid. 2233 */ 2234 if (hck_flags & HCK_FULLCKSUM_OK) 2235 return (B_TRUE); 2236 2237 cksum = DB_CKSUM16(mp); 2238 if (cksum == 0xFFFF) 2239 return (B_TRUE); 2240 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); 2241 return (B_FALSE); 2242 } 2243 2244 mp1 = mp->b_cont; 2245 if ((hck_flags & HCK_PARTIALCKSUM) && 2246 (mp1 == NULL || mp1->b_cont == NULL) && 2247 ip_hdr_length >= DB_CKSUMSTART(mp) && 2248 (!eri_cksum_workaround || ip_hdr_length == IP_SIMPLE_HDR_LENGTH) && 2249 ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) { 2250 uint32_t adj; 2251 uchar_t *cksum_start; 2252 2253 cksum = ip_input_cksum_pseudo_v4(ipha, ira); 2254 2255 cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp)); 2256 2257 /* 2258 * Partial checksum has been calculated by hardware 2259 * and attached to the packet; in addition, any 2260 * prepended extraneous data is even byte aligned, 2261 * and there are at most two mblks associated with 2262 * the packet. If any such data exists, we adjust 2263 * the checksum; also take care any postpended data. 2264 */ 2265 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj); 2266 /* 2267 * One's complement subtract extraneous checksum 2268 */ 2269 cksum += DB_CKSUM16(mp); 2270 if (adj >= cksum) 2271 cksum = ~(adj - cksum) & 0xFFFF; 2272 else 2273 cksum -= adj; 2274 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); 2275 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); 2276 if (!(~cksum & 0xFFFF)) 2277 return (B_TRUE); 2278 2279 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); 2280 return (B_FALSE); 2281 } 2282 return (ip_input_sw_cksum_v4(mp, ipha, ira)); 2283} 2284 2285 2286/* 2287 * Handle fanout of received packets. 2288 * Unicast packets that are looped back (from ire_send_local_v4) and packets 2289 * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM. 2290 * 2291 * IPQoS Notes 2292 * Before sending it to the client, invoke IPPF processing. Policy processing 2293 * takes place only if the callout_position, IPP_LOCAL_IN, is enabled. 2294 */ 2295void 2296ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) 2297{ 2298 ill_t *ill = ira->ira_ill; 2299 iaflags_t iraflags = ira->ira_flags; 2300 ip_stack_t *ipst = ill->ill_ipst; 2301 uint8_t protocol = ipha->ipha_protocol; 2302 conn_t *connp; 2303#define rptr ((uchar_t *)ipha) 2304 uint_t ip_hdr_length; 2305 uint_t min_ulp_header_length; 2306 int offset; 2307 ssize_t len; 2308 netstack_t *ns = ipst->ips_netstack; 2309 ipsec_stack_t *ipss = ns->netstack_ipsec; 2310 ill_t *rill = ira->ira_rill; 2311 2312 ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length)); 2313 2314 ip_hdr_length = ira->ira_ip_hdr_length; 2315 ira->ira_protocol = protocol; 2316 2317 /* 2318 * Time for IPP once we've done reassembly and IPsec. 2319 * We skip this for loopback packets since we don't do IPQoS 2320 * on loopback. 2321 */ 2322 if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && 2323 !(iraflags & IRAF_LOOPBACK) && 2324 (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) { 2325 /* 2326 * Use the interface on which the packet arrived - not where 2327 * the IP address is hosted. 2328 */ 2329 /* ip_process translates an IS_UNDER_IPMP */ 2330 mp = ip_process(IPP_LOCAL_IN, mp, rill, ill); 2331 if (mp == NULL) { 2332 /* ip_drop_packet and MIB done */ 2333 return; 2334 } 2335 } 2336 2337 /* Determine the minimum required size of the upper-layer header */ 2338 /* Need to do this for at least the set of ULPs that TX handles. */ 2339 switch (protocol) { 2340 case IPPROTO_TCP: 2341 min_ulp_header_length = TCP_MIN_HEADER_LENGTH; 2342 break; 2343 case IPPROTO_SCTP: 2344 min_ulp_header_length = SCTP_COMMON_HDR_LENGTH; 2345 break; 2346 case IPPROTO_UDP: 2347 min_ulp_header_length = UDPH_SIZE; 2348 break; 2349 case IPPROTO_ICMP: 2350 min_ulp_header_length = ICMPH_SIZE; 2351 break; 2352 default: 2353 min_ulp_header_length = 0; 2354 break; 2355 } 2356 /* Make sure we have the min ULP header length */ 2357 len = mp->b_wptr - rptr; 2358 if (len < ip_hdr_length + min_ulp_header_length) { 2359 if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) { 2360 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); 2361 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); 2362 freemsg(mp); 2363 return; 2364 } 2365 IP_STAT(ipst, ip_recv_pullup); 2366 ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length, 2367 ira); 2368 if (ipha == NULL) 2369 goto discard; 2370 len = mp->b_wptr - rptr; 2371 } 2372 2373 /* 2374 * If trusted extensions then determine the zoneid and TX specific 2375 * ira_flags. 2376 */ 2377 if (iraflags & IRAF_SYSTEM_LABELED) { 2378 /* This can update ira->ira_flags and ira->ira_zoneid */ 2379 ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira); 2380 iraflags = ira->ira_flags; 2381 } 2382 2383 2384 /* Verify ULP checksum. Handles TCP, UDP, and SCTP */ 2385 if (iraflags & IRAF_VERIFY_ULP_CKSUM) { 2386 if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) { 2387 /* Bad checksum. Stats are already incremented */ 2388 ip_drop_input("Bad ULP checksum", mp, ill); 2389 freemsg(mp); 2390 return; 2391 } 2392 /* IRAF_SCTP_CSUM_ERR could have been set */ 2393 iraflags = ira->ira_flags; 2394 } 2395 switch (protocol) { 2396 case IPPROTO_TCP: 2397 /* For TCP, discard broadcast and multicast packets. */ 2398 if (iraflags & IRAF_MULTIBROADCAST) 2399 goto discard; 2400 2401 /* First mblk contains IP+TCP headers per above check */ 2402 ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH); 2403 2404 /* TCP options present? */ 2405 offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4; 2406 if (offset != 5) { 2407 if (offset < 5) 2408 goto discard; 2409 2410 /* 2411 * There must be TCP options. 2412 * Make sure we can grab them. 2413 */ 2414 offset <<= 2; 2415 offset += ip_hdr_length; 2416 if (len < offset) { 2417 if (ira->ira_pktlen < offset) { 2418 BUMP_MIB(ill->ill_ip_mib, 2419 ipIfStatsInTruncatedPkts); 2420 ip_drop_input( 2421 "ipIfStatsInTruncatedPkts", 2422 mp, ill); 2423 freemsg(mp); 2424 return; 2425 } 2426 IP_STAT(ipst, ip_recv_pullup); 2427 ipha = ip_pullup(mp, offset, ira); 2428 if (ipha == NULL) 2429 goto discard; 2430 len = mp->b_wptr - rptr; 2431 } 2432 } 2433 2434 /* 2435 * Pass up a squeue hint to tcp. 2436 * If ira_sqp is already set (this is loopback) we leave it 2437 * alone. 2438 */ 2439 if (ira->ira_sqp == NULL) { 2440 ira->ira_sqp = ip_squeue_get(ira->ira_ring); 2441 } 2442 2443 /* Look for AF_INET or AF_INET6 that matches */ 2444 connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length, 2445 ira, ipst); 2446 if (connp == NULL) { 2447 /* Send the TH_RST */ 2448 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2449 tcp_xmit_listeners_reset(mp, ira, ipst, NULL); 2450 return; 2451 } 2452 if (connp->conn_incoming_ifindex != 0 && 2453 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2454 CONN_DEC_REF(connp); 2455 2456 /* Send the TH_RST */ 2457 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2458 tcp_xmit_listeners_reset(mp, ira, ipst, NULL); 2459 return; 2460 } 2461 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 2462 (iraflags & IRAF_IPSEC_SECURE)) { 2463 mp = ipsec_check_inbound_policy(mp, connp, 2464 ipha, NULL, ira); 2465 if (mp == NULL) { 2466 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2467 /* Note that mp is NULL */ 2468 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2469 CONN_DEC_REF(connp); 2470 return; 2471 } 2472 } 2473 /* Found a client; up it goes */ 2474 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2475 ira->ira_ill = ira->ira_rill = NULL; 2476 if (!IPCL_IS_TCP(connp)) { 2477 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 2478 (connp->conn_recv)(connp, mp, NULL, ira); 2479 CONN_DEC_REF(connp); 2480 ira->ira_ill = ill; 2481 ira->ira_rill = rill; 2482 return; 2483 } 2484 2485 /* 2486 * We do different processing whether called from 2487 * ip_accept_tcp and we match the target, don't match 2488 * the target, and when we are called by ip_input. 2489 */ 2490 if (iraflags & IRAF_TARGET_SQP) { 2491 if (ira->ira_target_sqp == connp->conn_sqp) { 2492 mblk_t *attrmp; 2493 2494 attrmp = ip_recv_attr_to_mblk(ira); 2495 if (attrmp == NULL) { 2496 BUMP_MIB(ill->ill_ip_mib, 2497 ipIfStatsInDiscards); 2498 ip_drop_input("ipIfStatsInDiscards", 2499 mp, ill); 2500 freemsg(mp); 2501 CONN_DEC_REF(connp); 2502 } else { 2503 SET_SQUEUE(attrmp, connp->conn_recv, 2504 connp); 2505 attrmp->b_cont = mp; 2506 ASSERT(ira->ira_target_sqp_mp == NULL); 2507 ira->ira_target_sqp_mp = attrmp; 2508 /* 2509 * Conn ref release when drained from 2510 * the squeue. 2511 */ 2512 } 2513 } else { 2514 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 2515 connp->conn_recv, connp, ira, SQ_FILL, 2516 SQTAG_IP_TCP_INPUT); 2517 } 2518 } else { 2519 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, 2520 connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT); 2521 } 2522 ira->ira_ill = ill; 2523 ira->ira_rill = rill; 2524 return; 2525 2526 case IPPROTO_SCTP: { 2527 sctp_hdr_t *sctph; 2528 in6_addr_t map_src, map_dst; 2529 uint32_t ports; /* Source and destination ports */ 2530 sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; 2531 2532 /* For SCTP, discard broadcast and multicast packets. */ 2533 if (iraflags & IRAF_MULTIBROADCAST) 2534 goto discard; 2535 2536 /* 2537 * Since there is no SCTP h/w cksum support yet, just 2538 * clear the flag. 2539 */ 2540 DB_CKSUMFLAGS(mp) = 0; 2541 2542 /* Length ensured above */ 2543 ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH); 2544 sctph = (sctp_hdr_t *)(rptr + ip_hdr_length); 2545 2546 /* get the ports */ 2547 ports = *(uint32_t *)&sctph->sh_sport; 2548 2549 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); 2550 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); 2551 if (iraflags & IRAF_SCTP_CSUM_ERR) { 2552 /* 2553 * No potential sctp checksum errors go to the Sun 2554 * sctp stack however they might be Adler-32 summed 2555 * packets a userland stack bound to a raw IP socket 2556 * could reasonably use. Note though that Adler-32 is 2557 * a long deprecated algorithm and customer sctp 2558 * networks should eventually migrate to CRC-32 at 2559 * which time this facility should be removed. 2560 */ 2561 ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); 2562 return; 2563 } 2564 connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp, sctps); 2565 if (connp == NULL) { 2566 /* Check for raw socket or OOTB handling */ 2567 ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); 2568 return; 2569 } 2570 if (connp->conn_incoming_ifindex != 0 && 2571 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2572 CONN_DEC_REF(connp); 2573 /* Check for raw socket or OOTB handling */ 2574 ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); 2575 return; 2576 } 2577 2578 /* Found a client; up it goes */ 2579 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2580 sctp_input(connp, ipha, NULL, mp, ira); 2581 /* sctp_input does a rele of the sctp_t */ 2582 return; 2583 } 2584 2585 case IPPROTO_UDP: 2586 /* First mblk contains IP+UDP headers as checked above */ 2587 ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE); 2588 2589 if (iraflags & IRAF_MULTIBROADCAST) { 2590 uint16_t *up; /* Pointer to ports in ULP header */ 2591 2592 up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); 2593 ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira); 2594 return; 2595 } 2596 2597 /* Look for AF_INET or AF_INET6 that matches */ 2598 connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length, 2599 ira, ipst); 2600 if (connp == NULL) { 2601 no_udp_match: 2602 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP]. 2603 connf_head != NULL) { 2604 ASSERT(ira->ira_protocol == IPPROTO_UDP); 2605 ip_fanout_proto_v4(mp, ipha, ira); 2606 } else { 2607 ip_fanout_send_icmp_v4(mp, 2608 ICMP_DEST_UNREACHABLE, 2609 ICMP_PORT_UNREACHABLE, ira); 2610 } 2611 return; 2612 2613 } 2614 if (connp->conn_incoming_ifindex != 0 && 2615 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 2616 CONN_DEC_REF(connp); 2617 goto no_udp_match; 2618 } 2619 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : 2620 !canputnext(connp->conn_rq)) { 2621 CONN_DEC_REF(connp); 2622 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); 2623 ip_drop_input("udpIfStatsInOverflows", mp, ill); 2624 freemsg(mp); 2625 return; 2626 } 2627 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || 2628 (iraflags & IRAF_IPSEC_SECURE)) { 2629 mp = ipsec_check_inbound_policy(mp, connp, 2630 ipha, NULL, ira); 2631 if (mp == NULL) { 2632 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2633 /* Note that mp is NULL */ 2634 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2635 CONN_DEC_REF(connp); 2636 return; 2637 } 2638 } 2639 /* 2640 * Remove 0-spi if it's 0, or move everything behind 2641 * the UDP header over it and forward to ESP via 2642 * ip_fanout_v4(). 2643 */ 2644 if (connp->conn_udp->udp_nat_t_endpoint) { 2645 if (iraflags & IRAF_IPSEC_SECURE) { 2646 ip_drop_packet(mp, B_TRUE, ira->ira_ill, 2647 DROPPER(ipss, ipds_esp_nat_t_ipsec), 2648 &ipss->ipsec_dropper); 2649 CONN_DEC_REF(connp); 2650 return; 2651 } 2652 2653 mp = zero_spi_check(mp, ira); 2654 if (mp == NULL) { 2655 /* 2656 * Packet was consumed - probably sent to 2657 * ip_fanout_v4. 2658 */ 2659 CONN_DEC_REF(connp); 2660 return; 2661 } 2662 /* Else continue like a normal UDP packet. */ 2663 ipha = (ipha_t *)mp->b_rptr; 2664 protocol = ipha->ipha_protocol; 2665 ira->ira_protocol = protocol; 2666 } 2667 /* Found a client; up it goes */ 2668 IP_STAT(ipst, ip_udp_fannorm); 2669 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2670 ira->ira_ill = ira->ira_rill = NULL; 2671 (connp->conn_recv)(connp, mp, NULL, ira); 2672 CONN_DEC_REF(connp); 2673 ira->ira_ill = ill; 2674 ira->ira_rill = rill; 2675 return; 2676 default: 2677 break; 2678 } 2679 2680 /* 2681 * Clear hardware checksumming flag as it is currently only 2682 * used by TCP and UDP. 2683 */ 2684 DB_CKSUMFLAGS(mp) = 0; 2685 2686 switch (protocol) { 2687 case IPPROTO_ICMP: 2688 /* 2689 * We need to accomodate icmp messages coming in clear 2690 * until we get everything secure from the wire. If 2691 * icmp_accept_clear_messages is zero we check with 2692 * the global policy and act accordingly. If it is 2693 * non-zero, we accept the message without any checks. 2694 * But *this does not mean* that this will be delivered 2695 * to RAW socket clients. By accepting we might send 2696 * replies back, change our MTU value etc., 2697 * but delivery to the ULP/clients depends on their 2698 * policy dispositions. 2699 */ 2700 if (ipst->ips_icmp_accept_clear_messages == 0) { 2701 mp = ipsec_check_global_policy(mp, NULL, 2702 ipha, NULL, ira, ns); 2703 if (mp == NULL) 2704 return; 2705 } 2706 2707 /* 2708 * On a labeled system, we have to check whether the zone 2709 * itself is permitted to receive raw traffic. 2710 */ 2711 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 2712 if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { 2713 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); 2714 ip_drop_input("tsol_can_accept_raw", mp, ill); 2715 freemsg(mp); 2716 return; 2717 } 2718 } 2719 2720 /* 2721 * ICMP header checksum, including checksum field, 2722 * should be zero. 2723 */ 2724 if (IP_CSUM(mp, ip_hdr_length, 0)) { 2725 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); 2726 ip_drop_input("icmpInCksumErrs", mp, ill); 2727 freemsg(mp); 2728 return; 2729 } 2730 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2731 mp = icmp_inbound_v4(mp, ira); 2732 if (mp == NULL) { 2733 /* No need to pass to RAW sockets */ 2734 return; 2735 } 2736 break; 2737 2738 case IPPROTO_IGMP: 2739 /* 2740 * If we are not willing to accept IGMP packets in clear, 2741 * then check with global policy. 2742 */ 2743 if (ipst->ips_igmp_accept_clear_messages == 0) { 2744 mp = ipsec_check_global_policy(mp, NULL, 2745 ipha, NULL, ira, ns); 2746 if (mp == NULL) 2747 return; 2748 } 2749 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && 2750 !tsol_can_accept_raw(mp, ira, B_TRUE)) { 2751 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2752 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2753 freemsg(mp); 2754 return; 2755 } 2756 /* 2757 * Validate checksum 2758 */ 2759 if (IP_CSUM(mp, ip_hdr_length, 0)) { 2760 ++ipst->ips_igmpstat.igps_rcv_badsum; 2761 ip_drop_input("igps_rcv_badsum", mp, ill); 2762 freemsg(mp); 2763 return; 2764 } 2765 2766 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2767 mp = igmp_input(mp, ira); 2768 if (mp == NULL) { 2769 /* Bad packet - discarded by igmp_input */ 2770 return; 2771 } 2772 break; 2773 case IPPROTO_PIM: 2774 /* 2775 * If we are not willing to accept PIM packets in clear, 2776 * then check with global policy. 2777 */ 2778 if (ipst->ips_pim_accept_clear_messages == 0) { 2779 mp = ipsec_check_global_policy(mp, NULL, 2780 ipha, NULL, ira, ns); 2781 if (mp == NULL) 2782 return; 2783 } 2784 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && 2785 !tsol_can_accept_raw(mp, ira, B_TRUE)) { 2786 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2787 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2788 freemsg(mp); 2789 return; 2790 } 2791 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2792 2793 /* Checksum is verified in pim_input */ 2794 mp = pim_input(mp, ira); 2795 if (mp == NULL) { 2796 /* Bad packet - discarded by pim_input */ 2797 return; 2798 } 2799 break; 2800 case IPPROTO_AH: 2801 case IPPROTO_ESP: { 2802 /* 2803 * Fast path for AH/ESP. 2804 */ 2805 netstack_t *ns = ipst->ips_netstack; 2806 ipsec_stack_t *ipss = ns->netstack_ipsec; 2807 2808 IP_STAT(ipst, ipsec_proto_ahesp); 2809 2810 if (!ipsec_loaded(ipss)) { 2811 ip_proto_not_sup(mp, ira); 2812 return; 2813 } 2814 2815 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 2816 /* select inbound SA and have IPsec process the pkt */ 2817 if (protocol == IPPROTO_ESP) { 2818 esph_t *esph; 2819 boolean_t esp_in_udp_sa; 2820 boolean_t esp_in_udp_packet; 2821 2822 mp = ipsec_inbound_esp_sa(mp, ira, &esph); 2823 if (mp == NULL) 2824 return; 2825 2826 ASSERT(esph != NULL); 2827 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); 2828 ASSERT(ira->ira_ipsec_esp_sa != NULL); 2829 ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL); 2830 2831 esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags & 2832 IPSA_F_NATT) != 0); 2833 esp_in_udp_packet = 2834 (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0; 2835 2836 /* 2837 * The following is a fancy, but quick, way of saying: 2838 * ESP-in-UDP SA and Raw ESP packet --> drop 2839 * OR 2840 * ESP SA and ESP-in-UDP packet --> drop 2841 */ 2842 if (esp_in_udp_sa != esp_in_udp_packet) { 2843 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2844 ip_drop_packet(mp, B_TRUE, ira->ira_ill, 2845 DROPPER(ipss, ipds_esp_no_sa), 2846 &ipss->ipsec_dropper); 2847 return; 2848 } 2849 mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, 2850 ira); 2851 } else { 2852 ah_t *ah; 2853 2854 mp = ipsec_inbound_ah_sa(mp, ira, &ah); 2855 if (mp == NULL) 2856 return; 2857 2858 ASSERT(ah != NULL); 2859 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); 2860 ASSERT(ira->ira_ipsec_ah_sa != NULL); 2861 ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); 2862 mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, 2863 ira); 2864 } 2865 2866 if (mp == NULL) { 2867 /* 2868 * Either it failed or is pending. In the former case 2869 * ipIfStatsInDiscards was increased. 2870 */ 2871 return; 2872 } 2873 /* we're done with IPsec processing, send it up */ 2874 ip_input_post_ipsec(mp, ira); 2875 return; 2876 } 2877 case IPPROTO_ENCAP: { 2878 ipha_t *inner_ipha; 2879 2880 /* 2881 * Handle self-encapsulated packets (IP-in-IP where 2882 * the inner addresses == the outer addresses). 2883 */ 2884 if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) > 2885 mp->b_wptr) { 2886 if (ira->ira_pktlen < 2887 ip_hdr_length + sizeof (ipha_t)) { 2888 BUMP_MIB(ill->ill_ip_mib, 2889 ipIfStatsInTruncatedPkts); 2890 ip_drop_input("ipIfStatsInTruncatedPkts", 2891 mp, ill); 2892 freemsg(mp); 2893 return; 2894 } 2895 ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length + 2896 sizeof (ipha_t) - mp->b_rptr, ira); 2897 if (ipha == NULL) { 2898 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2899 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2900 freemsg(mp); 2901 return; 2902 } 2903 } 2904 inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length); 2905 /* 2906 * Check the sanity of the inner IP header. 2907 */ 2908 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { 2909 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2910 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2911 freemsg(mp); 2912 return; 2913 } 2914 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { 2915 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2916 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2917 freemsg(mp); 2918 return; 2919 } 2920 if (inner_ipha->ipha_src != ipha->ipha_src || 2921 inner_ipha->ipha_dst != ipha->ipha_dst) { 2922 /* We fallthru to iptun fanout below */ 2923 goto iptun; 2924 } 2925 2926 /* 2927 * Self-encapsulated tunnel packet. Remove 2928 * the outer IP header and fanout again. 2929 * We also need to make sure that the inner 2930 * header is pulled up until options. 2931 */ 2932 mp->b_rptr = (uchar_t *)inner_ipha; 2933 ipha = inner_ipha; 2934 ip_hdr_length = IPH_HDR_LENGTH(ipha); 2935 if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) { 2936 if (ira->ira_pktlen < 2937 (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) { 2938 BUMP_MIB(ill->ill_ip_mib, 2939 ipIfStatsInTruncatedPkts); 2940 ip_drop_input("ipIfStatsInTruncatedPkts", 2941 mp, ill); 2942 freemsg(mp); 2943 return; 2944 } 2945 ipha = ip_pullup(mp, 2946 (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira); 2947 if (ipha == NULL) { 2948 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2949 ip_drop_input("ipIfStatsInDiscards", mp, ill); 2950 freemsg(mp); 2951 return; 2952 } 2953 } 2954 if (ip_hdr_length > sizeof (ipha_t)) { 2955 /* We got options on the inner packet. */ 2956 ipaddr_t dst = ipha->ipha_dst; 2957 int error = 0; 2958 2959 dst = ip_input_options(ipha, dst, mp, ira, &error); 2960 if (error != 0) { 2961 /* 2962 * An ICMP error has been sent and the packet 2963 * has been dropped. 2964 */ 2965 return; 2966 } 2967 if (dst != ipha->ipha_dst) { 2968 /* 2969 * Someone put a source-route in 2970 * the inside header of a self- 2971 * encapsulated packet. Drop it 2972 * with extreme prejudice and let 2973 * the sender know. 2974 */ 2975 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", 2976 mp, ill); 2977 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, 2978 ira); 2979 return; 2980 } 2981 } 2982 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { 2983 /* 2984 * This means that somebody is sending 2985 * Self-encapsualted packets without AH/ESP. 2986 * 2987 * Send this packet to find a tunnel endpoint. 2988 * if I can't find one, an ICMP 2989 * PROTOCOL_UNREACHABLE will get sent. 2990 */ 2991 protocol = ipha->ipha_protocol; 2992 ira->ira_protocol = protocol; 2993 goto iptun; 2994 } 2995 2996 /* Update based on removed IP header */ 2997 ira->ira_ip_hdr_length = ip_hdr_length; 2998 ira->ira_pktlen = ntohs(ipha->ipha_length); 2999 3000 if (ira->ira_flags & IRAF_IPSEC_DECAPS) { 3001 /* 3002 * This packet is self-encapsulated multiple 3003 * times. We don't want to recurse infinitely. 3004 * To keep it simple, drop the packet. 3005 */ 3006 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3007 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3008 freemsg(mp); 3009 return; 3010 } 3011 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); 3012 ira->ira_flags |= IRAF_IPSEC_DECAPS; 3013 3014 ip_input_post_ipsec(mp, ira); 3015 return; 3016 } 3017 3018 iptun: /* IPPROTO_ENCAPS that is not self-encapsulated */ 3019 case IPPROTO_IPV6: 3020 /* iptun will verify trusted label */ 3021 connp = ipcl_classify_v4(mp, protocol, ip_hdr_length, 3022 ira, ipst); 3023 if (connp != NULL) { 3024 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); 3025 ira->ira_ill = ira->ira_rill = NULL; 3026 (connp->conn_recv)(connp, mp, NULL, ira); 3027 CONN_DEC_REF(connp); 3028 ira->ira_ill = ill; 3029 ira->ira_rill = rill; 3030 return; 3031 } 3032 /* FALLTHRU */ 3033 default: 3034 /* 3035 * On a labeled system, we have to check whether the zone 3036 * itself is permitted to receive raw traffic. 3037 */ 3038 if (ira->ira_flags & IRAF_SYSTEM_LABELED) { 3039 if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { 3040 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3041 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3042 freemsg(mp); 3043 return; 3044 } 3045 } 3046 break; 3047 } 3048 3049 /* 3050 * The above input functions may have returned the pulled up message. 3051 * So ipha need to be reinitialized. 3052 */ 3053 ipha = (ipha_t *)mp->b_rptr; 3054 ira->ira_protocol = protocol = ipha->ipha_protocol; 3055 if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) { 3056 /* 3057 * No user-level listener for these packets packets. 3058 * Check for IPPROTO_ENCAP... 3059 */ 3060 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { 3061 /* 3062 * Check policy here, 3063 * THEN ship off to ip_mroute_decap(). 3064 * 3065 * BTW, If I match a configured IP-in-IP 3066 * tunnel above, this path will not be reached, and 3067 * ip_mroute_decap will never be called. 3068 */ 3069 mp = ipsec_check_global_policy(mp, connp, 3070 ipha, NULL, ira, ns); 3071 if (mp != NULL) { 3072 ip_mroute_decap(mp, ira); 3073 } /* Else we already freed everything! */ 3074 } else { 3075 ip_proto_not_sup(mp, ira); 3076 } 3077 return; 3078 } 3079 3080 /* 3081 * Handle fanout to raw sockets. There 3082 * can be more than one stream bound to a particular 3083 * protocol. When this is the case, each one gets a copy 3084 * of any incoming packets. 3085 */ 3086 ASSERT(ira->ira_protocol == ipha->ipha_protocol); 3087 ip_fanout_proto_v4(mp, ipha, ira); 3088 return; 3089 3090discard: 3091 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 3092 ip_drop_input("ipIfStatsInDiscards", mp, ill); 3093 freemsg(mp); 3094#undef rptr 3095} 3096