tcp.c revision 2535:b66cbb80977f
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26/* Copyright (c) 1990 Mentat Inc. */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29const char tcp_version[] = "%Z%%M%	%I%	%E% SMI";
30
31
32#include <sys/types.h>
33#include <sys/stream.h>
34#include <sys/strsun.h>
35#include <sys/strsubr.h>
36#include <sys/stropts.h>
37#include <sys/strlog.h>
38#include <sys/strsun.h>
39#define	_SUN_TPI_VERSION 2
40#include <sys/tihdr.h>
41#include <sys/timod.h>
42#include <sys/ddi.h>
43#include <sys/sunddi.h>
44#include <sys/suntpi.h>
45#include <sys/xti_inet.h>
46#include <sys/cmn_err.h>
47#include <sys/debug.h>
48#include <sys/vtrace.h>
49#include <sys/kmem.h>
50#include <sys/ethernet.h>
51#include <sys/cpuvar.h>
52#include <sys/dlpi.h>
53#include <sys/multidata.h>
54#include <sys/multidata_impl.h>
55#include <sys/pattr.h>
56#include <sys/policy.h>
57#include <sys/priv.h>
58#include <sys/zone.h>
59
60#include <sys/errno.h>
61#include <sys/signal.h>
62#include <sys/socket.h>
63#include <sys/sockio.h>
64#include <sys/isa_defs.h>
65#include <sys/md5.h>
66#include <sys/random.h>
67#include <netinet/in.h>
68#include <netinet/tcp.h>
69#include <netinet/ip6.h>
70#include <netinet/icmp6.h>
71#include <net/if.h>
72#include <net/route.h>
73#include <inet/ipsec_impl.h>
74
75#include <inet/common.h>
76#include <inet/ip.h>
77#include <inet/ip_impl.h>
78#include <inet/ip6.h>
79#include <inet/ip_ndp.h>
80#include <inet/mi.h>
81#include <inet/mib2.h>
82#include <inet/nd.h>
83#include <inet/optcom.h>
84#include <inet/snmpcom.h>
85#include <inet/kstatcom.h>
86#include <inet/tcp.h>
87#include <inet/tcp_impl.h>
88#include <net/pfkeyv2.h>
89#include <inet/ipsec_info.h>
90#include <inet/ipdrop.h>
91#include <inet/tcp_trace.h>
92
93#include <inet/ipclassifier.h>
94#include <inet/ip_ire.h>
95#include <inet/ip_ftable.h>
96#include <inet/ip_if.h>
97#include <inet/ipp_common.h>
98#include <sys/squeue.h>
99#include <inet/kssl/ksslapi.h>
100#include <sys/tsol/label.h>
101#include <sys/tsol/tnet.h>
102#include <sys/sdt.h>
103#include <rpc/pmap_prot.h>
104
105/*
106 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
107 *
108 * (Read the detailed design doc in PSARC case directory)
109 *
110 * The entire tcp state is contained in tcp_t and conn_t structure
111 * which are allocated in tandem using ipcl_conn_create() and passing
112 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
113 * the references on the tcp_t. The tcp_t structure is never compressed
114 * and packets always land on the correct TCP perimeter from the time
115 * eager is created till the time tcp_t dies (as such the old mentat
116 * TCP global queue is not used for detached state and no IPSEC checking
117 * is required). The global queue is still allocated to send out resets
118 * for connection which have no listeners and IP directly calls
119 * tcp_xmit_listeners_reset() which does any policy check.
120 *
121 * Protection and Synchronisation mechanism:
122 *
123 * The tcp data structure does not use any kind of lock for protecting
124 * its state but instead uses 'squeues' for mutual exclusion from various
125 * read and write side threads. To access a tcp member, the thread should
126 * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
127 * squeue_fill). Since the squeues allow a direct function call, caller
128 * can pass any tcp function having prototype of edesc_t as argument
129 * (different from traditional STREAMs model where packets come in only
130 * designated entry points). The list of functions that can be directly
131 * called via squeue are listed before the usual function prototype.
132 *
133 * Referencing:
134 *
135 * TCP is MT-Hot and we use a reference based scheme to make sure that the
136 * tcp structure doesn't disappear when its needed. When the application
137 * creates an outgoing connection or accepts an incoming connection, we
138 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
139 * The IP reference is just a symbolic reference since ip_tcpclose()
140 * looks at tcp structure after tcp_close_output() returns which could
141 * have dropped the last TCP reference. So as long as the connection is
142 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
143 * conn_t. The classifier puts its own reference when the connection is
144 * inserted in listen or connected hash. Anytime a thread needs to enter
145 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
146 * on write side or by doing a classify on read side and then puts a
147 * reference on the conn before doing squeue_enter/tryenter/fill. For
148 * read side, the classifier itself puts the reference under fanout lock
149 * to make sure that tcp can't disappear before it gets processed. The
150 * squeue will drop this reference automatically so the called function
151 * doesn't have to do a DEC_REF.
152 *
153 * Opening a new connection:
154 *
155 * The outgoing connection open is pretty simple. ip_tcpopen() does the
156 * work in creating the conn/tcp structure and initializing it. The
157 * squeue assignment is done based on the CPU the application
158 * is running on. So for outbound connections, processing is always done
159 * on application CPU which might be different from the incoming CPU
160 * being interrupted by the NIC. An optimal way would be to figure out
161 * the NIC <-> CPU binding at listen time, and assign the outgoing
162 * connection to the squeue attached to the CPU that will be interrupted
163 * for incoming packets (we know the NIC based on the bind IP address).
164 * This might seem like a problem if more data is going out but the
165 * fact is that in most cases the transmit is ACK driven transmit where
166 * the outgoing data normally sits on TCP's xmit queue waiting to be
167 * transmitted.
168 *
169 * Accepting a connection:
170 *
171 * This is a more interesting case because of various races involved in
172 * establishing a eager in its own perimeter. Read the meta comment on
173 * top of tcp_conn_request(). But briefly, the squeue is picked by
174 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
175 *
176 * Closing a connection:
177 *
178 * The close is fairly straight forward. tcp_close() calls tcp_close_output()
179 * via squeue to do the close and mark the tcp as detached if the connection
180 * was in state TCPS_ESTABLISHED or greater. In the later case, TCP keep its
181 * reference but tcp_close() drop IP's reference always. So if tcp was
182 * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
183 * and 1 because it is in classifier's connected hash. This is the condition
184 * we use to determine that its OK to clean up the tcp outside of squeue
185 * when time wait expires (check the ref under fanout and conn_lock and
186 * if it is 2, remove it from fanout hash and kill it).
187 *
188 * Although close just drops the necessary references and marks the
189 * tcp_detached state, tcp_close needs to know the tcp_detached has been
190 * set (under squeue) before letting the STREAM go away (because a
191 * inbound packet might attempt to go up the STREAM while the close
192 * has happened and tcp_detached is not set). So a special lock and
193 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
194 * and tcp_closecv) to signal tcp_close that tcp_close_out() has marked
195 * tcp_detached.
196 *
197 * Special provisions and fast paths:
198 *
199 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
200 * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
201 * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
202 * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
203 * check to send packets directly to tcp_rput_data via squeue. Everyone
204 * else comes through tcp_input() on the read side.
205 *
206 * We also make special provisions for sockfs by marking tcp_issocket
207 * whenever we have only sockfs on top of TCP. This allows us to skip
208 * putting the tcp in acceptor hash since a sockfs listener can never
209 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
210 * since eager has already been allocated and the accept now happens
211 * on acceptor STREAM. There is a big blob of comment on top of
212 * tcp_conn_request explaining the new accept. When socket is POP'd,
213 * sockfs sends us an ioctl to mark the fact and we go back to old
214 * behaviour. Once tcp_issocket is unset, its never set for the
215 * life of that connection.
216 *
217 * IPsec notes :
218 *
219 * Since a packet is always executed on the correct TCP perimeter
220 * all IPsec processing is defered to IP including checking new
221 * connections and setting IPSEC policies for new connection. The
222 * only exception is tcp_xmit_listeners_reset() which is called
223 * directly from IP and needs to policy check to see if TH_RST
224 * can be sent out.
225 */
226
227extern major_t TCP6_MAJ;
228
229/*
230 * Values for squeue switch:
231 * 1: squeue_enter_nodrain
232 * 2: squeue_enter
233 * 3: squeue_fill
234 */
235int tcp_squeue_close = 2;
236int tcp_squeue_wput = 2;
237
238squeue_func_t tcp_squeue_close_proc;
239squeue_func_t tcp_squeue_wput_proc;
240
241/*
242 * This controls how tiny a write must be before we try to copy it
243 * into the the mblk on the tail of the transmit queue.  Not much
244 * speedup is observed for values larger than sixteen.  Zero will
245 * disable the optimisation.
246 */
247int tcp_tx_pull_len = 16;
248
249/*
250 * TCP Statistics.
251 *
252 * How TCP statistics work.
253 *
254 * There are two types of statistics invoked by two macros.
255 *
256 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
257 * supposed to be used in non MT-hot paths of the code.
258 *
259 * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
260 * supposed to be used for DEBUG purposes and may be used on a hot path.
261 *
262 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
263 * (use "kstat tcp" to get them).
264 *
265 * There is also additional debugging facility that marks tcp_clean_death()
266 * instances and saves them in tcp_t structure. It is triggered by
267 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
268 * tcp_clean_death() calls that counts the number of times each tag was hit. It
269 * is triggered by TCP_CLD_COUNTERS define.
270 *
271 * How to add new counters.
272 *
273 * 1) Add a field in the tcp_stat structure describing your counter.
274 * 2) Add a line in tcp_statistics with the name of the counter.
275 *
276 *    IMPORTANT!! - make sure that both are in sync !!
277 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
278 *
279 * Please avoid using private counters which are not kstat-exported.
280 *
281 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
282 * in tcp_t structure.
283 *
284 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
285 */
286
287#ifndef TCP_DEBUG_COUNTER
288#ifdef DEBUG
289#define	TCP_DEBUG_COUNTER 1
290#else
291#define	TCP_DEBUG_COUNTER 0
292#endif
293#endif
294
295#define	TCP_CLD_COUNTERS 0
296
297#define	TCP_TAG_CLEAN_DEATH 1
298#define	TCP_MAX_CLEAN_DEATH_TAG 32
299
300#ifdef lint
301static int _lint_dummy_;
302#endif
303
304#if TCP_CLD_COUNTERS
305static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
306#define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
307#elif defined(lint)
308#define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
309#else
310#define	TCP_CLD_STAT(x)
311#endif
312
313#if TCP_DEBUG_COUNTER
314#define	TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1)
315#elif defined(lint)
316#define	TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
317#else
318#define	TCP_DBGSTAT(x)
319#endif
320
321tcp_stat_t tcp_statistics = {
322	{ "tcp_time_wait",		KSTAT_DATA_UINT64 },
323	{ "tcp_time_wait_syn",		KSTAT_DATA_UINT64 },
324	{ "tcp_time_wait_success",	KSTAT_DATA_UINT64 },
325	{ "tcp_time_wait_fail",		KSTAT_DATA_UINT64 },
326	{ "tcp_reinput_syn",		KSTAT_DATA_UINT64 },
327	{ "tcp_ip_output",		KSTAT_DATA_UINT64 },
328	{ "tcp_detach_non_time_wait",	KSTAT_DATA_UINT64 },
329	{ "tcp_detach_time_wait",	KSTAT_DATA_UINT64 },
330	{ "tcp_time_wait_reap",		KSTAT_DATA_UINT64 },
331	{ "tcp_clean_death_nondetached",	KSTAT_DATA_UINT64 },
332	{ "tcp_reinit_calls",		KSTAT_DATA_UINT64 },
333	{ "tcp_eager_err1",		KSTAT_DATA_UINT64 },
334	{ "tcp_eager_err2",		KSTAT_DATA_UINT64 },
335	{ "tcp_eager_blowoff_calls",	KSTAT_DATA_UINT64 },
336	{ "tcp_eager_blowoff_q",	KSTAT_DATA_UINT64 },
337	{ "tcp_eager_blowoff_q0",	KSTAT_DATA_UINT64 },
338	{ "tcp_not_hard_bound",		KSTAT_DATA_UINT64 },
339	{ "tcp_no_listener",		KSTAT_DATA_UINT64 },
340	{ "tcp_found_eager",		KSTAT_DATA_UINT64 },
341	{ "tcp_wrong_queue",		KSTAT_DATA_UINT64 },
342	{ "tcp_found_eager_binding1",	KSTAT_DATA_UINT64 },
343	{ "tcp_found_eager_bound1",	KSTAT_DATA_UINT64 },
344	{ "tcp_eager_has_listener1",	KSTAT_DATA_UINT64 },
345	{ "tcp_open_alloc",		KSTAT_DATA_UINT64 },
346	{ "tcp_open_detached_alloc",	KSTAT_DATA_UINT64 },
347	{ "tcp_rput_time_wait",		KSTAT_DATA_UINT64 },
348	{ "tcp_listendrop",		KSTAT_DATA_UINT64 },
349	{ "tcp_listendropq0",		KSTAT_DATA_UINT64 },
350	{ "tcp_wrong_rq",		KSTAT_DATA_UINT64 },
351	{ "tcp_rsrv_calls",		KSTAT_DATA_UINT64 },
352	{ "tcp_eagerfree2",		KSTAT_DATA_UINT64 },
353	{ "tcp_eagerfree3",		KSTAT_DATA_UINT64 },
354	{ "tcp_eagerfree4",		KSTAT_DATA_UINT64 },
355	{ "tcp_eagerfree5",		KSTAT_DATA_UINT64 },
356	{ "tcp_timewait_syn_fail",	KSTAT_DATA_UINT64 },
357	{ "tcp_listen_badflags",	KSTAT_DATA_UINT64 },
358	{ "tcp_timeout_calls",		KSTAT_DATA_UINT64 },
359	{ "tcp_timeout_cached_alloc",	KSTAT_DATA_UINT64 },
360	{ "tcp_timeout_cancel_reqs",	KSTAT_DATA_UINT64 },
361	{ "tcp_timeout_canceled",	KSTAT_DATA_UINT64 },
362	{ "tcp_timermp_alloced",	KSTAT_DATA_UINT64 },
363	{ "tcp_timermp_freed",		KSTAT_DATA_UINT64 },
364	{ "tcp_timermp_allocfail",	KSTAT_DATA_UINT64 },
365	{ "tcp_timermp_allocdblfail",	KSTAT_DATA_UINT64 },
366	{ "tcp_push_timer_cnt",		KSTAT_DATA_UINT64 },
367	{ "tcp_ack_timer_cnt",		KSTAT_DATA_UINT64 },
368	{ "tcp_ire_null1",		KSTAT_DATA_UINT64 },
369	{ "tcp_ire_null",		KSTAT_DATA_UINT64 },
370	{ "tcp_ip_send",		KSTAT_DATA_UINT64 },
371	{ "tcp_ip_ire_send",		KSTAT_DATA_UINT64 },
372	{ "tcp_wsrv_called",		KSTAT_DATA_UINT64 },
373	{ "tcp_flwctl_on",		KSTAT_DATA_UINT64 },
374	{ "tcp_timer_fire_early",	KSTAT_DATA_UINT64 },
375	{ "tcp_timer_fire_miss",	KSTAT_DATA_UINT64 },
376	{ "tcp_freelist_cleanup",	KSTAT_DATA_UINT64 },
377	{ "tcp_rput_v6_error",		KSTAT_DATA_UINT64 },
378	{ "tcp_out_sw_cksum",		KSTAT_DATA_UINT64 },
379	{ "tcp_out_sw_cksum_bytes",	KSTAT_DATA_UINT64 },
380	{ "tcp_zcopy_on",		KSTAT_DATA_UINT64 },
381	{ "tcp_zcopy_off",		KSTAT_DATA_UINT64 },
382	{ "tcp_zcopy_backoff",		KSTAT_DATA_UINT64 },
383	{ "tcp_zcopy_disable",		KSTAT_DATA_UINT64 },
384	{ "tcp_mdt_pkt_out",		KSTAT_DATA_UINT64 },
385	{ "tcp_mdt_pkt_out_v4",		KSTAT_DATA_UINT64 },
386	{ "tcp_mdt_pkt_out_v6",		KSTAT_DATA_UINT64 },
387	{ "tcp_mdt_discarded",		KSTAT_DATA_UINT64 },
388	{ "tcp_mdt_conn_halted1",	KSTAT_DATA_UINT64 },
389	{ "tcp_mdt_conn_halted2",	KSTAT_DATA_UINT64 },
390	{ "tcp_mdt_conn_halted3",	KSTAT_DATA_UINT64 },
391	{ "tcp_mdt_conn_resumed1",	KSTAT_DATA_UINT64 },
392	{ "tcp_mdt_conn_resumed2",	KSTAT_DATA_UINT64 },
393	{ "tcp_mdt_legacy_small",	KSTAT_DATA_UINT64 },
394	{ "tcp_mdt_legacy_all",		KSTAT_DATA_UINT64 },
395	{ "tcp_mdt_legacy_ret",		KSTAT_DATA_UINT64 },
396	{ "tcp_mdt_allocfail",		KSTAT_DATA_UINT64 },
397	{ "tcp_mdt_addpdescfail",	KSTAT_DATA_UINT64 },
398	{ "tcp_mdt_allocd",		KSTAT_DATA_UINT64 },
399	{ "tcp_mdt_linked",		KSTAT_DATA_UINT64 },
400	{ "tcp_fusion_flowctl",		KSTAT_DATA_UINT64 },
401	{ "tcp_fusion_backenabled",	KSTAT_DATA_UINT64 },
402	{ "tcp_fusion_urg",		KSTAT_DATA_UINT64 },
403	{ "tcp_fusion_putnext",		KSTAT_DATA_UINT64 },
404	{ "tcp_fusion_unfusable",	KSTAT_DATA_UINT64 },
405	{ "tcp_fusion_aborted",		KSTAT_DATA_UINT64 },
406	{ "tcp_fusion_unqualified",	KSTAT_DATA_UINT64 },
407	{ "tcp_fusion_rrw_busy",	KSTAT_DATA_UINT64 },
408	{ "tcp_fusion_rrw_msgcnt",	KSTAT_DATA_UINT64 },
409	{ "tcp_in_ack_unsent_drop",	KSTAT_DATA_UINT64 },
410	{ "tcp_sock_fallback",		KSTAT_DATA_UINT64 },
411};
412
413static kstat_t *tcp_kstat;
414
415/*
416 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
417 * tcp write side.
418 */
419#define	CALL_IP_WPUT(connp, q, mp) {					\
420	ASSERT(((q)->q_flag & QREADR) == 0);				\
421	TCP_DBGSTAT(tcp_ip_output);					\
422	connp->conn_send(connp, (mp), (q), IP_WPUT);			\
423}
424
425/* Macros for timestamp comparisons */
426#define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
427#define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
428
429/*
430 * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
431 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
432 * by adding three components: a time component which grows by 1 every 4096
433 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
434 * a per-connection component which grows by 125000 for every new connection;
435 * and an "extra" component that grows by a random amount centered
436 * approximately on 64000.  This causes the the ISS generator to cycle every
437 * 4.89 hours if no TCP connections are made, and faster if connections are
438 * made.
439 *
440 * When tcp_strong_iss is set to 0, ISS is calculated by adding two
441 * components: a time component which grows by 250000 every second; and
442 * a per-connection component which grows by 125000 for every new connections.
443 *
444 * A third method, when tcp_strong_iss is set to 2, for generating ISS is
445 * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
446 * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
447 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
448 * password.
449 */
450#define	ISS_INCR	250000
451#define	ISS_NSEC_SHT	12
452
453static uint32_t tcp_iss_incr_extra;	/* Incremented for each connection */
454static kmutex_t tcp_iss_key_lock;
455static MD5_CTX tcp_iss_key;
456static sin_t	sin_null;	/* Zero address for quick clears */
457static sin6_t	sin6_null;	/* Zero address for quick clears */
458
459/* Packet dropper for TCP IPsec policy drops. */
460static ipdropper_t tcp_dropper;
461
462/*
463 * This implementation follows the 4.3BSD interpretation of the urgent
464 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
465 * incompatible changes in protocols like telnet and rlogin.
466 */
467#define	TCP_OLD_URP_INTERPRETATION	1
468
469#define	TCP_IS_DETACHED_NONEAGER(tcp)	\
470	(TCP_IS_DETACHED(tcp) && \
471	    (!(tcp)->tcp_hard_binding))
472
473/*
474 * TCP reassembly macros.  We hide starting and ending sequence numbers in
475 * b_next and b_prev of messages on the reassembly queue.  The messages are
476 * chained using b_cont.  These macros are used in tcp_reass() so we don't
477 * have to see the ugly casts and assignments.
478 */
479#define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
480#define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
481					(mblk_t *)(uintptr_t)(u))
482#define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
483#define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
484					(mblk_t *)(uintptr_t)(u))
485
486/*
487 * Implementation of TCP Timers.
488 * =============================
489 *
490 * INTERFACE:
491 *
492 * There are two basic functions dealing with tcp timers:
493 *
494 *	timeout_id_t	tcp_timeout(connp, func, time)
495 * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
496 *	TCP_TIMER_RESTART(tcp, intvl)
497 *
498 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
499 * after 'time' ticks passed. The function called by timeout() must adhere to
500 * the same restrictions as a driver soft interrupt handler - it must not sleep
501 * or call other functions that might sleep. The value returned is the opaque
502 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
503 * cancel the request. The call to tcp_timeout() may fail in which case it
504 * returns zero. This is different from the timeout(9F) function which never
505 * fails.
506 *
507 * The call-back function 'func' always receives 'connp' as its single
508 * argument. It is always executed in the squeue corresponding to the tcp
509 * structure. The tcp structure is guaranteed to be present at the time the
510 * call-back is called.
511 *
512 * NOTE: The call-back function 'func' is never called if tcp is in
513 * 	the TCPS_CLOSED state.
514 *
515 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
516 * request. locks acquired by the call-back routine should not be held across
517 * the call to tcp_timeout_cancel() or a deadlock may result.
518 *
519 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
520 * Otherwise, it returns an integer value greater than or equal to 0. In
521 * particular, if the call-back function is already placed on the squeue, it can
522 * not be canceled.
523 *
524 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
525 * 	within squeue context corresponding to the tcp instance. Since the
526 *	call-back is also called via the same squeue, there are no race
527 *	conditions described in untimeout(9F) manual page since all calls are
528 *	strictly serialized.
529 *
530 *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
531 *	stored in tcp_timer_tid and starts a new one using
532 *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
533 *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
534 *	field.
535 *
536 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
537 *	call-back may still be called, so it is possible tcp_timer() will be
538 *	called several times. This should not be a problem since tcp_timer()
539 *	should always check the tcp instance state.
540 *
541 *
542 * IMPLEMENTATION:
543 *
544 * TCP timers are implemented using three-stage process. The call to
545 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
546 * when the timer expires. The tcp_timer_callback() arranges the call of the
547 * tcp_timer_handler() function via squeue corresponding to the tcp
548 * instance. The tcp_timer_handler() calls actual requested timeout call-back
549 * and passes tcp instance as an argument to it. Information is passed between
550 * stages using the tcp_timer_t structure which contains the connp pointer, the
551 * tcp call-back to call and the timeout id returned by the timeout(9F).
552 *
553 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
554 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
555 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
556 * returns the pointer to this mblk.
557 *
558 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
559 * looks like a normal mblk without actual dblk attached to it.
560 *
561 * To optimize performance each tcp instance holds a small cache of timer
562 * mblocks. In the current implementation it caches up to two timer mblocks per
563 * tcp instance. The cache is preserved over tcp frees and is only freed when
564 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
565 * timer processing happens on a corresponding squeue, the cache manipulation
566 * does not require any locks. Experiments show that majority of timer mblocks
567 * allocations are satisfied from the tcp cache and do not involve kmem calls.
568 *
569 * The tcp_timeout() places a refhold on the connp instance which guarantees
570 * that it will be present at the time the call-back function fires. The
571 * tcp_timer_handler() drops the reference after calling the call-back, so the
572 * call-back function does not need to manipulate the references explicitly.
573 */
574
575typedef struct tcp_timer_s {
576	conn_t	*connp;
577	void 	(*tcpt_proc)(void *);
578	timeout_id_t   tcpt_tid;
579} tcp_timer_t;
580
581static kmem_cache_t *tcp_timercache;
582kmem_cache_t	*tcp_sack_info_cache;
583kmem_cache_t	*tcp_iphc_cache;
584
585/*
586 * For scalability, we must not run a timer for every TCP connection
587 * in TIME_WAIT state.  To see why, consider (for time wait interval of
588 * 4 minutes):
589 *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
590 *
591 * This list is ordered by time, so you need only delete from the head
592 * until you get to entries which aren't old enough to delete yet.
593 * The list consists of only the detached TIME_WAIT connections.
594 *
595 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
596 * becomes detached TIME_WAIT (either by changing the state and already
597 * being detached or the other way around). This means that the TIME_WAIT
598 * state can be extended (up to doubled) if the connection doesn't become
599 * detached for a long time.
600 *
601 * The list manipulations (including tcp_time_wait_next/prev)
602 * are protected by the tcp_time_wait_lock. The content of the
603 * detached TIME_WAIT connections is protected by the normal perimeters.
604 */
605
606typedef struct tcp_squeue_priv_s {
607	kmutex_t	tcp_time_wait_lock;
608				/* Protects the next 3 globals */
609	timeout_id_t	tcp_time_wait_tid;
610	tcp_t		*tcp_time_wait_head;
611	tcp_t		*tcp_time_wait_tail;
612	tcp_t		*tcp_free_list;
613	uint_t		tcp_free_list_cnt;
614} tcp_squeue_priv_t;
615
616/*
617 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
618 * Running it every 5 seconds seems to give the best results.
619 */
620#define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
621
622/*
623 * To prevent memory hog, limit the number of entries in tcp_free_list
624 * to 1% of available memory / number of cpus
625 */
626uint_t tcp_free_list_max_cnt = 0;
627
628#define	TCP_XMIT_LOWATER	4096
629#define	TCP_XMIT_HIWATER	49152
630#define	TCP_RECV_LOWATER	2048
631#define	TCP_RECV_HIWATER	49152
632
633/*
634 *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
635 */
636#define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
637
638#define	TIDUSZ	4096	/* transport interface data unit size */
639
640/*
641 * Bind hash list size and has function.  It has to be a power of 2 for
642 * hashing.
643 */
644#define	TCP_BIND_FANOUT_SIZE	512
645#define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
646/*
647 * Size of listen and acceptor hash list.  It has to be a power of 2 for
648 * hashing.
649 */
650#define	TCP_FANOUT_SIZE		256
651
652#ifdef	_ILP32
653#define	TCP_ACCEPTOR_HASH(accid)					\
654		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
655#else
656#define	TCP_ACCEPTOR_HASH(accid)					\
657		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
658#endif	/* _ILP32 */
659
660#define	IP_ADDR_CACHE_SIZE	2048
661#define	IP_ADDR_CACHE_HASH(faddr)					\
662	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
663
664/* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */
665#define	TCP_HSP_HASH_SIZE 256
666
667#define	TCP_HSP_HASH(addr)					\
668	(((addr>>24) ^ (addr >>16) ^			\
669	    (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE)
670
671/*
672 * TCP options struct returned from tcp_parse_options.
673 */
674typedef struct tcp_opt_s {
675	uint32_t	tcp_opt_mss;
676	uint32_t	tcp_opt_wscale;
677	uint32_t	tcp_opt_ts_val;
678	uint32_t	tcp_opt_ts_ecr;
679	tcp_t		*tcp;
680} tcp_opt_t;
681
682/*
683 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
684 */
685
686#ifdef _BIG_ENDIAN
687#define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
688	(TCPOPT_TSTAMP << 8) | 10)
689#else
690#define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
691	(TCPOPT_NOP << 8) | TCPOPT_NOP)
692#endif
693
694/*
695 * Flags returned from tcp_parse_options.
696 */
697#define	TCP_OPT_MSS_PRESENT	1
698#define	TCP_OPT_WSCALE_PRESENT	2
699#define	TCP_OPT_TSTAMP_PRESENT	4
700#define	TCP_OPT_SACK_OK_PRESENT	8
701#define	TCP_OPT_SACK_PRESENT	16
702
703/* TCP option length */
704#define	TCPOPT_NOP_LEN		1
705#define	TCPOPT_MAXSEG_LEN	4
706#define	TCPOPT_WS_LEN		3
707#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
708#define	TCPOPT_TSTAMP_LEN	10
709#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
710#define	TCPOPT_SACK_OK_LEN	2
711#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
712#define	TCPOPT_REAL_SACK_LEN	4
713#define	TCPOPT_MAX_SACK_LEN	36
714#define	TCPOPT_HEADER_LEN	2
715
716/* TCP cwnd burst factor. */
717#define	TCP_CWND_INFINITE	65535
718#define	TCP_CWND_SS		3
719#define	TCP_CWND_NORMAL		5
720
721/* Maximum TCP initial cwin (start/restart). */
722#define	TCP_MAX_INIT_CWND	8
723
724/*
725 * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
726 * either tcp_slow_start_initial or tcp_slow_start_after idle
727 * depending on the caller.  If the upper layer has not used the
728 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
729 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
730 * If the upper layer has changed set the tcp_init_cwnd, just use
731 * it to calculate the tcp_cwnd.
732 */
733#define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
734{									\
735	if ((tcp)->tcp_init_cwnd == 0) {				\
736		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
737		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
738	} else {							\
739		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
740	}								\
741	tcp->tcp_cwnd_cnt = 0;						\
742}
743
744/* TCP Timer control structure */
745typedef struct tcpt_s {
746	pfv_t	tcpt_pfv;	/* The routine we are to call */
747	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
748} tcpt_t;
749
750/* Host Specific Parameter structure */
751typedef struct tcp_hsp {
752	struct tcp_hsp	*tcp_hsp_next;
753	in6_addr_t	tcp_hsp_addr_v6;
754	in6_addr_t	tcp_hsp_subnet_v6;
755	uint_t		tcp_hsp_vers;	/* IPV4_VERSION | IPV6_VERSION */
756	int32_t		tcp_hsp_sendspace;
757	int32_t		tcp_hsp_recvspace;
758	int32_t		tcp_hsp_tstamp;
759} tcp_hsp_t;
760#define	tcp_hsp_addr	V4_PART_OF_V6(tcp_hsp_addr_v6)
761#define	tcp_hsp_subnet	V4_PART_OF_V6(tcp_hsp_subnet_v6)
762
763/*
764 * Functions called directly via squeue having a prototype of edesc_t.
765 */
766void		tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
767static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
768void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
769static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
770static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
771void 		tcp_input(void *arg, mblk_t *mp, void *arg2);
772void		tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
773static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2);
774void		tcp_output(void *arg, mblk_t *mp, void *arg2);
775static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
776static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
777
778
779/* Prototype for TCP functions */
780static void	tcp_random_init(void);
781int		tcp_random(void);
782static void	tcp_accept(tcp_t *tcp, mblk_t *mp);
783static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
784		    tcp_t *eager);
785static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
786static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
787    int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
788    boolean_t user_specified);
789static void	tcp_closei_local(tcp_t *tcp);
790static void	tcp_close_detached(tcp_t *tcp);
791static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
792			mblk_t *idmp, mblk_t **defermp);
793static void	tcp_connect(tcp_t *tcp, mblk_t *mp);
794static void	tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
795		    in_port_t dstport, uint_t srcid);
796static void	tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
797		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
798		    uint32_t scope_id);
799static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
800static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
801static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
802static char	*tcp_display(tcp_t *tcp, char *, char);
803static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
804static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
805static void	tcp_eager_unlink(tcp_t *tcp);
806static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
807		    int unixerr);
808static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
809		    int tlierr, int unixerr);
810static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
811		    cred_t *cr);
812static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
813		    char *value, caddr_t cp, cred_t *cr);
814static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
815		    char *value, caddr_t cp, cred_t *cr);
816static int	tcp_tpistate(tcp_t *tcp);
817static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
818    int caller_holds_lock);
819static void	tcp_bind_hash_remove(tcp_t *tcp);
820static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id);
821void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
822static void	tcp_acceptor_hash_remove(tcp_t *tcp);
823static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
824static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
825static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
826static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
827static int	tcp_header_init_ipv4(tcp_t *tcp);
828static int	tcp_header_init_ipv6(tcp_t *tcp);
829int		tcp_init(tcp_t *tcp, queue_t *q);
830static int	tcp_init_values(tcp_t *tcp);
831static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
832static mblk_t	*tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
833		    t_scalar_t addr_length);
834static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
835static void	tcp_ip_notify(tcp_t *tcp);
836static mblk_t	*tcp_ire_mp(mblk_t *mp);
837static void	tcp_iss_init(tcp_t *tcp);
838static void	tcp_keepalive_killer(void *arg);
839static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
840static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
841static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
842		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
843static boolean_t tcp_allow_connopt_set(int level, int name);
844int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
845int		tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
846int		tcp_opt_set(queue_t *q, uint_t optset_context, int level,
847		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
848		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
849		    mblk_t *mblk);
850static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
851static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
852		    uchar_t *ptr, uint_t len);
853static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
854static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt);
855static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
856		    caddr_t cp, cred_t *cr);
857static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
858		    caddr_t cp, cred_t *cr);
859static void	tcp_iss_key_init(uint8_t *phrase, int len);
860static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
861		    caddr_t cp, cred_t *cr);
862static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
863static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
864static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
865static void	tcp_reinit(tcp_t *tcp);
866static void	tcp_reinit_values(tcp_t *tcp);
867static void	tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
868		    tcp_t *thisstream, cred_t *cr);
869
870static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
871static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
872static boolean_t tcp_send_rst_chk(void);
873static void	tcp_ss_rexmit(tcp_t *tcp);
874static mblk_t	*tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
875static void	tcp_process_options(tcp_t *, tcph_t *);
876static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
877static void	tcp_rsrv(queue_t *q);
878static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
879static int	tcp_snmp_state(tcp_t *tcp);
880static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
881		    cred_t *cr);
882static int	tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
883		    cred_t *cr);
884static int	tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
885		    cred_t *cr);
886static int	tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
887		    cred_t *cr);
888static int	tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
889		    cred_t *cr);
890static int	tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
891		    caddr_t cp, cred_t *cr);
892static int	tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
893		    caddr_t cp, cred_t *cr);
894static int	tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
895		    cred_t *cr);
896static void	tcp_timer(void *arg);
897static void	tcp_timer_callback(void *);
898static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
899    boolean_t random);
900static in_port_t tcp_get_next_priv_port(const tcp_t *);
901static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
902void		tcp_wput_accept(queue_t *q, mblk_t *mp);
903static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
904static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
905static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
906static int	tcp_send(queue_t *q, tcp_t *tcp, const int mss,
907		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
908		    const int num_sack_blk, int *usable, uint_t *snxt,
909		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
910		    const int mdt_thres);
911static int	tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
912		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
913		    const int num_sack_blk, int *usable, uint_t *snxt,
914		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
915		    const int mdt_thres);
916static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
917		    int num_sack_blk);
918static void	tcp_wsrv(queue_t *q);
919static int	tcp_xmit_end(tcp_t *tcp);
920static mblk_t	*tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send,
921		    int32_t *offset, mblk_t **end_mp, uint32_t seq,
922		    boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
923static void	tcp_ack_timer(void *arg);
924static mblk_t	*tcp_ack_mp(tcp_t *tcp);
925static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
926		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len);
927static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
928		    uint32_t ack, int ctl);
929static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr);
930static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr);
931static int	setmaxps(queue_t *q, int maxpsz);
932static void	tcp_set_rto(tcp_t *, time_t);
933static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
934		    boolean_t, boolean_t);
935static void	tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
936		    boolean_t ipsec_mctl);
937static mblk_t	*tcp_setsockopt_mp(int level, int cmd,
938		    char *opt, int optlen);
939static int	tcp_build_hdrs(queue_t *, tcp_t *);
940static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
941		    uint32_t seg_seq, uint32_t seg_ack, int seg_len,
942		    tcph_t *tcph);
943boolean_t	tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
944boolean_t	tcp_reserved_port_add(int, in_port_t *, in_port_t *);
945boolean_t	tcp_reserved_port_del(in_port_t, in_port_t);
946boolean_t	tcp_reserved_port_check(in_port_t);
947static tcp_t	*tcp_alloc_temp_tcp(in_port_t);
948static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
949static mblk_t	*tcp_mdt_info_mp(mblk_t *);
950static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
951static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
952		    const boolean_t, const uint32_t, const uint32_t,
953		    const uint32_t, const uint32_t);
954static void	tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
955		    const uint_t, const uint_t, boolean_t *);
956static void	tcp_send_data(tcp_t *, queue_t *, mblk_t *);
957extern mblk_t	*tcp_timermp_alloc(int);
958extern void	tcp_timermp_free(tcp_t *);
959static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
960static void	tcp_stop_lingering(tcp_t *tcp);
961static void	tcp_close_linger_timeout(void *arg);
962void		tcp_ddi_init(void);
963void		tcp_ddi_destroy(void);
964static void	tcp_kstat_init(void);
965static void	tcp_kstat_fini(void);
966static int	tcp_kstat_update(kstat_t *kp, int rw);
967void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
968static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
969			tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
970static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
971			tcph_t *tcph, mblk_t *idmp);
972static squeue_func_t tcp_squeue_switch(int);
973
974static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *);
975static int	tcp_close(queue_t *, int);
976static int	tcpclose_accept(queue_t *);
977static int	tcp_modclose(queue_t *);
978static void	tcp_wput_mod(queue_t *, mblk_t *);
979
980static void	tcp_squeue_add(squeue_t *);
981static boolean_t tcp_zcopy_check(tcp_t *);
982static void	tcp_zcopy_notify(tcp_t *);
983static mblk_t	*tcp_zcopy_disable(tcp_t *, mblk_t *);
984static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
985static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
986
987extern void	tcp_kssl_input(tcp_t *, mblk_t *);
988
989/*
990 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
991 *
992 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
993 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
994 * (defined in tcp.h) needs to be filled in and passed into the kernel
995 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
996 * structure contains the four-tuple of a TCP connection and a range of TCP
997 * states (specified by ac_start and ac_end). The use of wildcard addresses
998 * and ports is allowed. Connections with a matching four tuple and a state
999 * within the specified range will be aborted. The valid states for the
1000 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
1001 * inclusive.
1002 *
1003 * An application which has its connection aborted by this ioctl will receive
1004 * an error that is dependent on the connection state at the time of the abort.
1005 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
1006 * though a RST packet has been received.  If the connection state is equal to
1007 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
1008 * and all resources associated with the connection will be freed.
1009 */
1010static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
1011static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
1012static void	tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
1013static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *);
1014static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
1015static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
1016    boolean_t);
1017
1018static struct module_info tcp_rinfo =  {
1019	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
1020};
1021
1022static struct module_info tcp_winfo =  {
1023	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
1024};
1025
1026/*
1027 * Entry points for TCP as a module. It only allows SNMP requests
1028 * to pass through.
1029 */
1030struct qinit tcp_mod_rinit = {
1031	(pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo,
1032};
1033
1034struct qinit tcp_mod_winit = {
1035	(pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL,
1036	&tcp_rinfo
1037};
1038
1039/*
1040 * Entry points for TCP as a device. The normal case which supports
1041 * the TCP functionality.
1042 */
1043struct qinit tcp_rinit = {
1044	NULL, (pfi_t)tcp_rsrv, tcp_open, tcp_close, NULL, &tcp_rinfo
1045};
1046
1047struct qinit tcp_winit = {
1048	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1049};
1050
1051/* Initial entry point for TCP in socket mode. */
1052struct qinit tcp_sock_winit = {
1053	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1054};
1055
1056/*
1057 * Entry points for TCP as a acceptor STREAM opened by sockfs when doing
1058 * an accept. Avoid allocating data structures since eager has already
1059 * been created.
1060 */
1061struct qinit tcp_acceptor_rinit = {
1062	NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
1063};
1064
1065struct qinit tcp_acceptor_winit = {
1066	(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
1067};
1068
1069/*
1070 * Entry points for TCP loopback (read side only)
1071 */
1072struct qinit tcp_loopback_rinit = {
1073	(pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0,
1074	&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
1075};
1076
1077struct streamtab tcpinfo = {
1078	&tcp_rinit, &tcp_winit
1079};
1080
1081extern squeue_func_t tcp_squeue_wput_proc;
1082extern squeue_func_t tcp_squeue_timer_proc;
1083
1084/* Protected by tcp_g_q_lock */
1085static queue_t	*tcp_g_q;	/* Default queue used during detached closes */
1086kmutex_t tcp_g_q_lock;
1087
1088/* Protected by tcp_hsp_lock */
1089/*
1090 * XXX The host param mechanism should go away and instead we should use
1091 * the metrics associated with the routes to determine the default sndspace
1092 * and rcvspace.
1093 */
1094static tcp_hsp_t	**tcp_hsp_hash;	/* Hash table for HSPs */
1095krwlock_t tcp_hsp_lock;
1096
1097/*
1098 * Extra privileged ports. In host byte order.
1099 * Protected by tcp_epriv_port_lock.
1100 */
1101#define	TCP_NUM_EPRIV_PORTS	64
1102static int	tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS;
1103static uint16_t	tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 };
1104kmutex_t tcp_epriv_port_lock;
1105
1106/*
1107 * The smallest anonymous port in the privileged port range which TCP
1108 * looks for free port.  Use in the option TCP_ANONPRIVBIND.
1109 */
1110static in_port_t tcp_min_anonpriv_port = 512;
1111
1112/* Only modified during _init and _fini thus no locking is needed. */
1113static caddr_t	tcp_g_nd;	/* Head of 'named dispatch' variable list */
1114
1115/* Hint not protected by any lock */
1116static uint_t	tcp_next_port_to_try;
1117
1118
1119/* TCP bind hash list - all tcp_t with state >= BOUND. */
1120tf_t	tcp_bind_fanout[TCP_BIND_FANOUT_SIZE];
1121
1122/* TCP queue hash list - all tcp_t in case they will be an acceptor. */
1123static tf_t	tcp_acceptor_fanout[TCP_FANOUT_SIZE];
1124
1125/*
1126 * TCP has a private interface for other kernel modules to reserve a
1127 * port range for them to use.  Once reserved, TCP will not use any ports
1128 * in the range.  This interface relies on the TCP_EXCLBIND feature.  If
1129 * the semantics of TCP_EXCLBIND is changed, implementation of this interface
1130 * has to be verified.
1131 *
1132 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges.  Each port
1133 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports.  A port
1134 * range is [port a, port b] inclusive.  And each port range is between
1135 * TCP_LOWESET_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
1136 *
1137 * Note that the default anonymous port range starts from 32768.  There is
1138 * no port "collision" between that and the reserved port range.  If there
1139 * is port collision (because the default smallest anonymous port is lowered
1140 * or some apps specifically bind to ports in the reserved port range), the
1141 * system may not be able to reserve a port range even there are enough
1142 * unbound ports as a reserved port range contains consecutive ports .
1143 */
1144#define	TCP_RESERVED_PORTS_ARRAY_MAX_SIZE	5
1145#define	TCP_RESERVED_PORTS_RANGE_MAX		1000
1146#define	TCP_SMALLEST_RESERVED_PORT		10240
1147#define	TCP_LARGEST_RESERVED_PORT		20480
1148
1149/* Structure to represent those reserved port ranges. */
1150typedef struct tcp_rport_s {
1151	in_port_t	lo_port;
1152	in_port_t	hi_port;
1153	tcp_t		**temp_tcp_array;
1154} tcp_rport_t;
1155
1156/* The reserved port array. */
1157static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE];
1158
1159/* Locks to protect the tcp_reserved_ports array. */
1160static krwlock_t tcp_reserved_port_lock;
1161
1162/* The number of ranges in the array. */
1163uint32_t tcp_reserved_port_array_size = 0;
1164
1165/*
1166 * MIB-2 stuff for SNMP
1167 * Note: tcpInErrs {tcp 15} is accumulated in ip.c
1168 */
1169mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */
1170kstat_t		*tcp_mibkp;	/* kstat exporting tcp_mib data */
1171
1172boolean_t tcp_icmp_source_quench = B_FALSE;
1173/*
1174 * Following assumes TPI alignment requirements stay along 32 bit
1175 * boundaries
1176 */
1177#define	ROUNDUP32(x) \
1178	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
1179
1180/* Template for response to info request. */
1181static struct T_info_ack tcp_g_t_info_ack = {
1182	T_INFO_ACK,		/* PRIM_type */
1183	0,			/* TSDU_size */
1184	T_INFINITE,		/* ETSDU_size */
1185	T_INVALID,		/* CDATA_size */
1186	T_INVALID,		/* DDATA_size */
1187	sizeof (sin_t),		/* ADDR_size */
1188	0,			/* OPT_size - not initialized here */
1189	TIDUSZ,			/* TIDU_size */
1190	T_COTS_ORD,		/* SERV_type */
1191	TCPS_IDLE,		/* CURRENT_state */
1192	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
1193};
1194
1195static struct T_info_ack tcp_g_t_info_ack_v6 = {
1196	T_INFO_ACK,		/* PRIM_type */
1197	0,			/* TSDU_size */
1198	T_INFINITE,		/* ETSDU_size */
1199	T_INVALID,		/* CDATA_size */
1200	T_INVALID,		/* DDATA_size */
1201	sizeof (sin6_t),	/* ADDR_size */
1202	0,			/* OPT_size - not initialized here */
1203	TIDUSZ,		/* TIDU_size */
1204	T_COTS_ORD,		/* SERV_type */
1205	TCPS_IDLE,		/* CURRENT_state */
1206	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
1207};
1208
1209#define	MS	1L
1210#define	SECONDS	(1000 * MS)
1211#define	MINUTES	(60 * SECONDS)
1212#define	HOURS	(60 * MINUTES)
1213#define	DAYS	(24 * HOURS)
1214
1215#define	PARAM_MAX (~(uint32_t)0)
1216
1217/* Max size IP datagram is 64k - 1 */
1218#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
1219#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
1220/* Max of the above */
1221#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4
1222
1223/* Largest TCP port number */
1224#define	TCP_MAX_PORT	(64 * 1024 - 1)
1225
1226/*
1227 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
1228 * layer header.  It has to be a multiple of 4.
1229 */
1230static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
1231#define	tcp_wroff_xtra	tcp_wroff_xtra_param.tcp_param_val
1232
1233/*
1234 * All of these are alterable, within the min/max values given, at run time.
1235 * Note that the default value of "tcp_time_wait_interval" is four minutes,
1236 * per the TCP spec.
1237 */
1238/* BEGIN CSTYLED */
1239tcpparam_t	tcp_param_arr[] = {
1240 /*min		max		value		name */
1241 { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
1242 { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
1243 { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
1244 { 1,		1024,		1,		"tcp_conn_req_min" },
1245 { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
1246 { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
1247 { 0,		10,		0,		"tcp_debug" },
1248 { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
1249 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
1250 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
1251 { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
1252 { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
1253 { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
1254 { 1,		255,		64,		"tcp_ipv4_ttl"},
1255 { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
1256 { 0,		100,		10,		"tcp_maxpsz_multiplier" },
1257 { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
1258 { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
1259 { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
1260 { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
1261 { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
1262 { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
1263 { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
1264 { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
1265 { 0,		16,		0,		"tcp_snd_lowat_fraction" },
1266 { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
1267 { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
1268 { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
1269 { 0,		1,		0,		"tcp_ignore_path_mtu" },
1270 { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
1271 { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
1272 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
1273 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
1274 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
1275 { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
1276 { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
1277 { 0,		TCP_MSS_MAX,	64,		"tcp_co_min"},
1278 { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
1279/*
1280 * Question:  What default value should I set for tcp_strong_iss?
1281 */
1282 { 0,		2,		1,		"tcp_strong_iss"},
1283 { 0,		65536,		20,		"tcp_rtt_updates"},
1284 { 0,		1,		1,		"tcp_wscale_always"},
1285 { 0,		1,		0,		"tcp_tstamp_always"},
1286 { 0,		1,		1,		"tcp_tstamp_if_wscale"},
1287 { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
1288 { 0,		16,		2,		"tcp_deferred_acks_max"},
1289 { 1,		16384,		4,		"tcp_slow_start_after_idle"},
1290 { 1,		4,		4,		"tcp_slow_start_initial"},
1291 { 10*MS,	50*MS,		20*MS,		"tcp_co_timer_interval"},
1292 { 0,		2,		2,		"tcp_sack_permitted"},
1293 { 0,		1,		0,		"tcp_trace"},
1294 { 0,		1,		1,		"tcp_compression_enabled"},
1295 { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
1296 { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
1297 { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
1298 { 0,		1,		0,		"tcp_rev_src_routes"},
1299 { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
1300 { 100*MS,	60*SECONDS,	1*SECONDS,	"tcp_ndd_get_info_interval"},
1301 { 0,		16,		8,		"tcp_local_dacks_max"},
1302 { 0,		2,		1,		"tcp_ecn_permitted"},
1303 { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
1304 { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
1305 { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
1306 { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
1307 { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
1308};
1309/* END CSTYLED */
1310
1311/*
1312 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
1313 * each header fragment in the header buffer.  Each parameter value has
1314 * to be a multiple of 4 (32-bit aligned).
1315 */
1316static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" };
1317static tcpparam_t tcp_mdt_tail_param = { 0,  256, 32, "tcp_mdt_hdr_tail_min" };
1318#define	tcp_mdt_hdr_head_min	tcp_mdt_head_param.tcp_param_val
1319#define	tcp_mdt_hdr_tail_min	tcp_mdt_tail_param.tcp_param_val
1320
1321/*
1322 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
1323 * the maximum number of payload buffers associated per Multidata.
1324 */
1325static tcpparam_t tcp_mdt_max_pbufs_param =
1326	{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
1327#define	tcp_mdt_max_pbufs	tcp_mdt_max_pbufs_param.tcp_param_val
1328
1329/* Round up the value to the nearest mss. */
1330#define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
1331
1332/*
1333 * Set ECN capable transport (ECT) code point in IP header.
1334 *
1335 * Note that there are 2 ECT code points '01' and '10', which are called
1336 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
1337 * point ECT(0) for TCP as described in RFC 2481.
1338 */
1339#define	SET_ECT(tcp, iph) \
1340	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
1341		/* We need to clear the code point first. */ \
1342		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
1343		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
1344	} else { \
1345		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
1346		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
1347	}
1348
1349/*
1350 * The format argument to pass to tcp_display().
1351 * DISP_PORT_ONLY means that the returned string has only port info.
1352 * DISP_ADDR_AND_PORT means that the returned string also contains the
1353 * remote and local IP address.
1354 */
1355#define	DISP_PORT_ONLY		1
1356#define	DISP_ADDR_AND_PORT	2
1357
1358/*
1359 * This controls the rate some ndd info report functions can be used
1360 * by non-privileged users.  It stores the last time such info is
1361 * requested.  When those report functions are called again, this
1362 * is checked with the current time and compare with the ndd param
1363 * tcp_ndd_get_info_interval.
1364 */
1365static clock_t tcp_last_ndd_get_info_time = 0;
1366#define	NDD_TOO_QUICK_MSG \
1367	"ndd get info rate too high for non-privileged users, try again " \
1368	"later.\n"
1369#define	NDD_OUT_OF_BUF_MSG	"<< Out of buffer >>\n"
1370
1371#define	IS_VMLOANED_MBLK(mp) \
1372	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
1373
1374/*
1375 * These two variables control the rate for TCP to generate RSTs in
1376 * response to segments not belonging to any connections.  We limit
1377 * TCP to sent out tcp_rst_sent_rate (ndd param) number of RSTs in
1378 * each 1 second interval.  This is to protect TCP against DoS attack.
1379 */
1380static clock_t tcp_last_rst_intrvl;
1381static uint32_t tcp_rst_cnt;
1382
1383/* The number of RST not sent because of the rate limit. */
1384static uint32_t tcp_rst_unsent;
1385
1386/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
1387boolean_t tcp_mdt_chain = B_TRUE;
1388
1389/*
1390 * MDT threshold in the form of effective send MSS multiplier; we take
1391 * the MDT path if the amount of unsent data exceeds the threshold value
1392 * (default threshold is 1*SMSS).
1393 */
1394uint_t tcp_mdt_smss_threshold = 1;
1395
1396uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */
1397
1398/*
1399 * Forces all connections to obey the value of the tcp_maxpsz_multiplier
1400 * tunable settable via NDD.  Otherwise, the per-connection behavior is
1401 * determined dynamically during tcp_adapt_ire(), which is the default.
1402 */
1403boolean_t tcp_static_maxpsz = B_FALSE;
1404
1405/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
1406uint32_t tcp_random_anon_port = 1;
1407
1408/*
1409 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
1410 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
1411 * data, TCP will not respond with an ACK.  RFC 793 requires that
1412 * TCP responds with an ACK for such a bogus ACK.  By not following
1413 * the RFC, we prevent TCP from getting into an ACK storm if somehow
1414 * an attacker successfully spoofs an acceptable segment to our
1415 * peer; or when our peer is "confused."
1416 */
1417uint32_t tcp_drop_ack_unsent_cnt = 10;
1418
1419/*
1420 * Hook functions to enable cluster networking
1421 * On non-clustered systems these vectors must always be NULL.
1422 */
1423
1424void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family,
1425			    uint8_t *laddrp, in_port_t lport) = NULL;
1426void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family,
1427			    uint8_t *laddrp, in_port_t lport) = NULL;
1428void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family,
1429			    uint8_t *laddrp, in_port_t lport,
1430			    uint8_t *faddrp, in_port_t fport) = NULL;
1431void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family,
1432			    uint8_t *laddrp, in_port_t lport,
1433			    uint8_t *faddrp, in_port_t fport) = NULL;
1434
1435/*
1436 * The following are defined in ip.c
1437 */
1438extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family,
1439				uint8_t *laddrp);
1440extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
1441				uint8_t *laddrp, uint8_t *faddrp);
1442
1443#define	CL_INET_CONNECT(tcp)		{			\
1444	if (cl_inet_connect != NULL) {				\
1445		/*						\
1446		 * Running in cluster mode - register active connection	\
1447		 * information						\
1448		 */							\
1449		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
1450			if ((tcp)->tcp_ipha->ipha_src != 0) {		\
1451				(*cl_inet_connect)(IPPROTO_TCP, AF_INET,\
1452				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\
1453				    (in_port_t)(tcp)->tcp_lport,	\
1454				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
1455				    (in_port_t)(tcp)->tcp_fport);	\
1456			}						\
1457		} else {						\
1458			if (!IN6_IS_ADDR_UNSPECIFIED(			\
1459			    &(tcp)->tcp_ip6h->ip6_src)) {\
1460				(*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\
1461				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\
1462				    (in_port_t)(tcp)->tcp_lport,	\
1463				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
1464				    (in_port_t)(tcp)->tcp_fport);	\
1465			}						\
1466		}							\
1467	}								\
1468}
1469
1470#define	CL_INET_DISCONNECT(tcp)	{				\
1471	if (cl_inet_disconnect != NULL) {				\
1472		/*							\
1473		 * Running in cluster mode - deregister active		\
1474		 * connection information				\
1475		 */							\
1476		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
1477			if ((tcp)->tcp_ip_src != 0) {			\
1478				(*cl_inet_disconnect)(IPPROTO_TCP,	\
1479				    AF_INET,				\
1480				    (uint8_t *)(&((tcp)->tcp_ip_src)),\
1481				    (in_port_t)(tcp)->tcp_lport,	\
1482				    (uint8_t *)				\
1483				    (&((tcp)->tcp_ipha->ipha_dst)),\
1484				    (in_port_t)(tcp)->tcp_fport);	\
1485			}						\
1486		} else {						\
1487			if (!IN6_IS_ADDR_UNSPECIFIED(			\
1488			    &(tcp)->tcp_ip_src_v6)) {			\
1489				(*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\
1490				    (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\
1491				    (in_port_t)(tcp)->tcp_lport,	\
1492				    (uint8_t *)				\
1493				    (&((tcp)->tcp_ip6h->ip6_dst)),\
1494				    (in_port_t)(tcp)->tcp_fport);	\
1495			}						\
1496		}							\
1497	}								\
1498}
1499
1500/*
1501 * Cluster networking hook for traversing current connection list.
1502 * This routine is used to extract the current list of live connections
1503 * which must continue to to be dispatched to this node.
1504 */
1505int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
1506
1507/*
1508 * Figure out the value of window scale opton.  Note that the rwnd is
1509 * ASSUMED to be rounded up to the nearest MSS before the calculation.
1510 * We cannot find the scale value and then do a round up of tcp_rwnd
1511 * because the scale value may not be correct after that.
1512 *
1513 * Set the compiler flag to make this function inline.
1514 */
1515static void
1516tcp_set_ws_value(tcp_t *tcp)
1517{
1518	int i;
1519	uint32_t rwnd = tcp->tcp_rwnd;
1520
1521	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
1522	    i++, rwnd >>= 1)
1523		;
1524	tcp->tcp_rcv_ws = i;
1525}
1526
1527/*
1528 * Remove a connection from the list of detached TIME_WAIT connections.
1529 */
1530static void
1531tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
1532{
1533	boolean_t	locked = B_FALSE;
1534
1535	if (tcp_time_wait == NULL) {
1536		tcp_time_wait = *((tcp_squeue_priv_t **)
1537		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
1538		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1539		locked = B_TRUE;
1540	}
1541
1542	if (tcp->tcp_time_wait_expire == 0) {
1543		ASSERT(tcp->tcp_time_wait_next == NULL);
1544		ASSERT(tcp->tcp_time_wait_prev == NULL);
1545		if (locked)
1546			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1547		return;
1548	}
1549	ASSERT(TCP_IS_DETACHED(tcp));
1550	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1551
1552	if (tcp == tcp_time_wait->tcp_time_wait_head) {
1553		ASSERT(tcp->tcp_time_wait_prev == NULL);
1554		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
1555		if (tcp_time_wait->tcp_time_wait_head != NULL) {
1556			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
1557			    NULL;
1558		} else {
1559			tcp_time_wait->tcp_time_wait_tail = NULL;
1560		}
1561	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
1562		ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
1563		ASSERT(tcp->tcp_time_wait_next == NULL);
1564		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
1565		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1566		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
1567	} else {
1568		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
1569		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
1570		tcp->tcp_time_wait_prev->tcp_time_wait_next =
1571		    tcp->tcp_time_wait_next;
1572		tcp->tcp_time_wait_next->tcp_time_wait_prev =
1573		    tcp->tcp_time_wait_prev;
1574	}
1575	tcp->tcp_time_wait_next = NULL;
1576	tcp->tcp_time_wait_prev = NULL;
1577	tcp->tcp_time_wait_expire = 0;
1578
1579	if (locked)
1580		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1581}
1582
1583/*
1584 * Add a connection to the list of detached TIME_WAIT connections
1585 * and set its time to expire.
1586 */
1587static void
1588tcp_time_wait_append(tcp_t *tcp)
1589{
1590	tcp_squeue_priv_t *tcp_time_wait =
1591	    *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
1592		SQPRIVATE_TCP));
1593
1594	tcp_timers_stop(tcp);
1595
1596	/* Freed above */
1597	ASSERT(tcp->tcp_timer_tid == 0);
1598	ASSERT(tcp->tcp_ack_tid == 0);
1599
1600	/* must have happened at the time of detaching the tcp */
1601	ASSERT(tcp->tcp_ptpahn == NULL);
1602	ASSERT(tcp->tcp_flow_stopped == 0);
1603	ASSERT(tcp->tcp_time_wait_next == NULL);
1604	ASSERT(tcp->tcp_time_wait_prev == NULL);
1605	ASSERT(tcp->tcp_time_wait_expire == NULL);
1606	ASSERT(tcp->tcp_listener == NULL);
1607
1608	tcp->tcp_time_wait_expire = ddi_get_lbolt();
1609	/*
1610	 * The value computed below in tcp->tcp_time_wait_expire may
1611	 * appear negative or wrap around. That is ok since our
1612	 * interest is only in the difference between the current lbolt
1613	 * value and tcp->tcp_time_wait_expire. But the value should not
1614	 * be zero, since it means the tcp is not in the TIME_WAIT list.
1615	 * The corresponding comparison in tcp_time_wait_collector() uses
1616	 * modular arithmetic.
1617	 */
1618	tcp->tcp_time_wait_expire +=
1619	    drv_usectohz(tcp_time_wait_interval * 1000);
1620	if (tcp->tcp_time_wait_expire == 0)
1621		tcp->tcp_time_wait_expire = 1;
1622
1623	ASSERT(TCP_IS_DETACHED(tcp));
1624	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1625	ASSERT(tcp->tcp_time_wait_next == NULL);
1626	ASSERT(tcp->tcp_time_wait_prev == NULL);
1627	TCP_DBGSTAT(tcp_time_wait);
1628	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1629	if (tcp_time_wait->tcp_time_wait_head == NULL) {
1630		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
1631		tcp_time_wait->tcp_time_wait_head = tcp;
1632	} else {
1633		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1634		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
1635		    TCPS_TIME_WAIT);
1636		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
1637		tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
1638	}
1639	tcp_time_wait->tcp_time_wait_tail = tcp;
1640	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1641}
1642
1643/* ARGSUSED */
1644void
1645tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
1646{
1647	conn_t	*connp = (conn_t *)arg;
1648	tcp_t	*tcp = connp->conn_tcp;
1649
1650	ASSERT(tcp != NULL);
1651	if (tcp->tcp_state == TCPS_CLOSED) {
1652		return;
1653	}
1654
1655	ASSERT((tcp->tcp_family == AF_INET &&
1656	    tcp->tcp_ipversion == IPV4_VERSION) ||
1657	    (tcp->tcp_family == AF_INET6 &&
1658	    (tcp->tcp_ipversion == IPV4_VERSION ||
1659	    tcp->tcp_ipversion == IPV6_VERSION)));
1660	ASSERT(!tcp->tcp_listener);
1661
1662	TCP_STAT(tcp_time_wait_reap);
1663	ASSERT(TCP_IS_DETACHED(tcp));
1664
1665	/*
1666	 * Because they have no upstream client to rebind or tcp_close()
1667	 * them later, we axe the connection here and now.
1668	 */
1669	tcp_close_detached(tcp);
1670}
1671
1672void
1673tcp_cleanup(tcp_t *tcp)
1674{
1675	mblk_t		*mp;
1676	char		*tcp_iphc;
1677	int		tcp_iphc_len;
1678	int		tcp_hdr_grown;
1679	tcp_sack_info_t	*tcp_sack_info;
1680	conn_t		*connp = tcp->tcp_connp;
1681
1682	tcp_bind_hash_remove(tcp);
1683	tcp_free(tcp);
1684
1685	/* Release any SSL context */
1686	if (tcp->tcp_kssl_ent != NULL) {
1687		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
1688		tcp->tcp_kssl_ent = NULL;
1689	}
1690
1691	if (tcp->tcp_kssl_ctx != NULL) {
1692		kssl_release_ctx(tcp->tcp_kssl_ctx);
1693		tcp->tcp_kssl_ctx = NULL;
1694	}
1695	tcp->tcp_kssl_pending = B_FALSE;
1696
1697	conn_delete_ire(connp, NULL);
1698	if (connp->conn_flags & IPCL_TCPCONN) {
1699		if (connp->conn_latch != NULL)
1700			IPLATCH_REFRELE(connp->conn_latch);
1701		if (connp->conn_policy != NULL)
1702			IPPH_REFRELE(connp->conn_policy);
1703	}
1704
1705	/*
1706	 * Since we will bzero the entire structure, we need to
1707	 * remove it and reinsert it in global hash list. We
1708	 * know the walkers can't get to this conn because we
1709	 * had set CONDEMNED flag earlier and checked reference
1710	 * under conn_lock so walker won't pick it and when we
1711	 * go the ipcl_globalhash_remove() below, no walker
1712	 * can get to it.
1713	 */
1714	ipcl_globalhash_remove(connp);
1715
1716	/* Save some state */
1717	mp = tcp->tcp_timercache;
1718
1719	tcp_sack_info = tcp->tcp_sack_info;
1720	tcp_iphc = tcp->tcp_iphc;
1721	tcp_iphc_len = tcp->tcp_iphc_len;
1722	tcp_hdr_grown = tcp->tcp_hdr_grown;
1723
1724	if (connp->conn_cred != NULL)
1725		crfree(connp->conn_cred);
1726	if (connp->conn_peercred != NULL)
1727		crfree(connp->conn_peercred);
1728	bzero(connp, sizeof (conn_t));
1729	bzero(tcp, sizeof (tcp_t));
1730
1731	/* restore the state */
1732	tcp->tcp_timercache = mp;
1733
1734	tcp->tcp_sack_info = tcp_sack_info;
1735	tcp->tcp_iphc = tcp_iphc;
1736	tcp->tcp_iphc_len = tcp_iphc_len;
1737	tcp->tcp_hdr_grown = tcp_hdr_grown;
1738
1739
1740	tcp->tcp_connp = connp;
1741
1742	connp->conn_tcp = tcp;
1743	connp->conn_flags = IPCL_TCPCONN;
1744	connp->conn_state_flags = CONN_INCIPIENT;
1745	connp->conn_ulp = IPPROTO_TCP;
1746	connp->conn_ref = 1;
1747
1748	ipcl_globalhash_insert(connp);
1749}
1750
1751/*
1752 * Blows away all tcps whose TIME_WAIT has expired. List traversal
1753 * is done forwards from the head.
1754 */
1755/* ARGSUSED */
1756void
1757tcp_time_wait_collector(void *arg)
1758{
1759	tcp_t *tcp;
1760	clock_t now;
1761	mblk_t *mp;
1762	conn_t *connp;
1763	kmutex_t *lock;
1764
1765	squeue_t *sqp = (squeue_t *)arg;
1766	tcp_squeue_priv_t *tcp_time_wait =
1767	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1768
1769	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1770	tcp_time_wait->tcp_time_wait_tid = 0;
1771
1772	if (tcp_time_wait->tcp_free_list != NULL &&
1773	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
1774		TCP_STAT(tcp_freelist_cleanup);
1775		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
1776			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1777			CONN_DEC_REF(tcp->tcp_connp);
1778		}
1779		tcp_time_wait->tcp_free_list_cnt = 0;
1780	}
1781
1782	/*
1783	 * In order to reap time waits reliably, we should use a
1784	 * source of time that is not adjustable by the user -- hence
1785	 * the call to ddi_get_lbolt().
1786	 */
1787	now = ddi_get_lbolt();
1788	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
1789		/*
1790		 * Compare times using modular arithmetic, since
1791		 * lbolt can wrapover.
1792		 */
1793		if ((now - tcp->tcp_time_wait_expire) < 0) {
1794			break;
1795		}
1796
1797		tcp_time_wait_remove(tcp, tcp_time_wait);
1798
1799		connp = tcp->tcp_connp;
1800		ASSERT(connp->conn_fanout != NULL);
1801		lock = &connp->conn_fanout->connf_lock;
1802		/*
1803		 * This is essentially a TW reclaim fast path optimization for
1804		 * performance where the timewait collector checks under the
1805		 * fanout lock (so that no one else can get access to the
1806		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
1807		 * the classifier hash list. If ref count is indeed 2, we can
1808		 * just remove the conn under the fanout lock and avoid
1809		 * cleaning up the conn under the squeue, provided that
1810		 * clustering callbacks are not enabled. If clustering is
1811		 * enabled, we need to make the clustering callback before
1812		 * setting the CONDEMNED flag and after dropping all locks and
1813		 * so we forego this optimization and fall back to the slow
1814		 * path. Also please see the comments in tcp_closei_local
1815		 * regarding the refcnt logic.
1816		 *
1817		 * Since we are holding the tcp_time_wait_lock, its better
1818		 * not to block on the fanout_lock because other connections
1819		 * can't add themselves to time_wait list. So we do a
1820		 * tryenter instead of mutex_enter.
1821		 */
1822		if (mutex_tryenter(lock)) {
1823			mutex_enter(&connp->conn_lock);
1824			if ((connp->conn_ref == 2) &&
1825			    (cl_inet_disconnect == NULL)) {
1826				ipcl_hash_remove_locked(connp,
1827				    connp->conn_fanout);
1828				/*
1829				 * Set the CONDEMNED flag now itself so that
1830				 * the refcnt cannot increase due to any
1831				 * walker. But we have still not cleaned up
1832				 * conn_ire_cache. This is still ok since
1833				 * we are going to clean it up in tcp_cleanup
1834				 * immediately and any interface unplumb
1835				 * thread will wait till the ire is blown away
1836				 */
1837				connp->conn_state_flags |= CONN_CONDEMNED;
1838				mutex_exit(lock);
1839				mutex_exit(&connp->conn_lock);
1840				if (tcp_time_wait->tcp_free_list_cnt <
1841				    tcp_free_list_max_cnt) {
1842					/* Add to head of tcp_free_list */
1843					mutex_exit(
1844					    &tcp_time_wait->tcp_time_wait_lock);
1845					tcp_cleanup(tcp);
1846					mutex_enter(
1847					    &tcp_time_wait->tcp_time_wait_lock);
1848					tcp->tcp_time_wait_next =
1849					    tcp_time_wait->tcp_free_list;
1850					tcp_time_wait->tcp_free_list = tcp;
1851					tcp_time_wait->tcp_free_list_cnt++;
1852					continue;
1853				} else {
1854					/* Do not add to tcp_free_list */
1855					mutex_exit(
1856					    &tcp_time_wait->tcp_time_wait_lock);
1857					tcp_bind_hash_remove(tcp);
1858					conn_delete_ire(tcp->tcp_connp, NULL);
1859					CONN_DEC_REF(tcp->tcp_connp);
1860				}
1861			} else {
1862				CONN_INC_REF_LOCKED(connp);
1863				mutex_exit(lock);
1864				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1865				mutex_exit(&connp->conn_lock);
1866				/*
1867				 * We can reuse the closemp here since conn has
1868				 * detached (otherwise we wouldn't even be in
1869				 * time_wait list).
1870				 */
1871				mp = &tcp->tcp_closemp;
1872				squeue_fill(connp->conn_sqp, mp,
1873				    tcp_timewait_output, connp,
1874				    SQTAG_TCP_TIMEWAIT);
1875			}
1876		} else {
1877			mutex_enter(&connp->conn_lock);
1878			CONN_INC_REF_LOCKED(connp);
1879			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1880			mutex_exit(&connp->conn_lock);
1881			/*
1882			 * We can reuse the closemp here since conn has
1883			 * detached (otherwise we wouldn't even be in
1884			 * time_wait list).
1885			 */
1886			mp = &tcp->tcp_closemp;
1887			squeue_fill(connp->conn_sqp, mp,
1888			    tcp_timewait_output, connp, 0);
1889		}
1890		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1891	}
1892
1893	if (tcp_time_wait->tcp_free_list != NULL)
1894		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
1895
1896	tcp_time_wait->tcp_time_wait_tid =
1897	    timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY);
1898	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1899}
1900
1901/*
1902 * Reply to a clients T_CONN_RES TPI message. This function
1903 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1904 * on the acceptor STREAM and processed in tcp_wput_accept().
1905 * Read the block comment on top of tcp_conn_request().
1906 */
1907static void
1908tcp_accept(tcp_t *listener, mblk_t *mp)
1909{
1910	tcp_t	*acceptor;
1911	tcp_t	*eager;
1912	tcp_t   *tcp;
1913	struct T_conn_res	*tcr;
1914	t_uscalar_t	acceptor_id;
1915	t_scalar_t	seqnum;
1916	mblk_t	*opt_mp = NULL;	/* T_OPTMGMT_REQ messages */
1917	mblk_t	*ok_mp;
1918	mblk_t	*mp1;
1919
1920	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1921		tcp_err_ack(listener, mp, TPROTO, 0);
1922		return;
1923	}
1924	tcr = (struct T_conn_res *)mp->b_rptr;
1925
1926	/*
1927	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1928	 * read side queue of the streams device underneath us i.e. the
1929	 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1930	 * look it up in the queue_hash.  Under LP64 it sends down the
1931	 * minor_t of the accepting endpoint.
1932	 *
1933	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1934	 * fanout hash lock is held.
1935	 * This prevents any thread from entering the acceptor queue from
1936	 * below (since it has not been hard bound yet i.e. any inbound
1937	 * packets will arrive on the listener or default tcp queue and
1938	 * go through tcp_lookup).
1939	 * The CONN_INC_REF will prevent the acceptor from closing.
1940	 *
1941	 * XXX It is still possible for a tli application to send down data
1942	 * on the accepting stream while another thread calls t_accept.
1943	 * This should not be a problem for well-behaved applications since
1944	 * the T_OK_ACK is sent after the queue swapping is completed.
1945	 *
1946	 * If the accepting fd is the same as the listening fd, avoid
1947	 * queue hash lookup since that will return an eager listener in a
1948	 * already established state.
1949	 */
1950	acceptor_id = tcr->ACCEPTOR_id;
1951	mutex_enter(&listener->tcp_eager_lock);
1952	if (listener->tcp_acceptor_id == acceptor_id) {
1953		eager = listener->tcp_eager_next_q;
1954		/* only count how many T_CONN_INDs so don't count q0 */
1955		if ((listener->tcp_conn_req_cnt_q != 1) ||
1956		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1957			mutex_exit(&listener->tcp_eager_lock);
1958			tcp_err_ack(listener, mp, TBADF, 0);
1959			return;
1960		}
1961		if (listener->tcp_conn_req_cnt_q0 != 0) {
1962			/* Throw away all the eagers on q0. */
1963			tcp_eager_cleanup(listener, 1);
1964		}
1965		if (listener->tcp_syn_defense) {
1966			listener->tcp_syn_defense = B_FALSE;
1967			if (listener->tcp_ip_addr_cache != NULL) {
1968				kmem_free(listener->tcp_ip_addr_cache,
1969				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1970				listener->tcp_ip_addr_cache = NULL;
1971			}
1972		}
1973		/*
1974		 * Transfer tcp_conn_req_max to the eager so that when
1975		 * a disconnect occurs we can revert the endpoint to the
1976		 * listen state.
1977		 */
1978		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1979		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1980		/*
1981		 * Get a reference on the acceptor just like the
1982		 * tcp_acceptor_hash_lookup below.
1983		 */
1984		acceptor = listener;
1985		CONN_INC_REF(acceptor->tcp_connp);
1986	} else {
1987		acceptor = tcp_acceptor_hash_lookup(acceptor_id);
1988		if (acceptor == NULL) {
1989			if (listener->tcp_debug) {
1990				(void) strlog(TCP_MOD_ID, 0, 1,
1991				    SL_ERROR|SL_TRACE,
1992				    "tcp_accept: did not find acceptor 0x%x\n",
1993				    acceptor_id);
1994			}
1995			mutex_exit(&listener->tcp_eager_lock);
1996			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1997			return;
1998		}
1999		/*
2000		 * Verify acceptor state. The acceptable states for an acceptor
2001		 * include TCPS_IDLE and TCPS_BOUND.
2002		 */
2003		switch (acceptor->tcp_state) {
2004		case TCPS_IDLE:
2005			/* FALLTHRU */
2006		case TCPS_BOUND:
2007			break;
2008		default:
2009			CONN_DEC_REF(acceptor->tcp_connp);
2010			mutex_exit(&listener->tcp_eager_lock);
2011			tcp_err_ack(listener, mp, TOUTSTATE, 0);
2012			return;
2013		}
2014	}
2015
2016	/* The listener must be in TCPS_LISTEN */
2017	if (listener->tcp_state != TCPS_LISTEN) {
2018		CONN_DEC_REF(acceptor->tcp_connp);
2019		mutex_exit(&listener->tcp_eager_lock);
2020		tcp_err_ack(listener, mp, TOUTSTATE, 0);
2021		return;
2022	}
2023
2024	/*
2025	 * Rendezvous with an eager connection request packet hanging off
2026	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
2027	 * tcp structure when the connection packet arrived in
2028	 * tcp_conn_request().
2029	 */
2030	seqnum = tcr->SEQ_number;
2031	eager = listener;
2032	do {
2033		eager = eager->tcp_eager_next_q;
2034		if (eager == NULL) {
2035			CONN_DEC_REF(acceptor->tcp_connp);
2036			mutex_exit(&listener->tcp_eager_lock);
2037			tcp_err_ack(listener, mp, TBADSEQ, 0);
2038			return;
2039		}
2040	} while (eager->tcp_conn_req_seqnum != seqnum);
2041	mutex_exit(&listener->tcp_eager_lock);
2042
2043	/*
2044	 * At this point, both acceptor and listener have 2 ref
2045	 * that they begin with. Acceptor has one additional ref
2046	 * we placed in lookup while listener has 3 additional
2047	 * ref for being behind the squeue (tcp_accept() is
2048	 * done on listener's squeue); being in classifier hash;
2049	 * and eager's ref on listener.
2050	 */
2051	ASSERT(listener->tcp_connp->conn_ref >= 5);
2052	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
2053
2054	/*
2055	 * The eager at this point is set in its own squeue and
2056	 * could easily have been killed (tcp_accept_finish will
2057	 * deal with that) because of a TH_RST so we can only
2058	 * ASSERT for a single ref.
2059	 */
2060	ASSERT(eager->tcp_connp->conn_ref >= 1);
2061
2062	/* Pre allocate the stroptions mblk also */
2063	opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
2064	if (opt_mp == NULL) {
2065		CONN_DEC_REF(acceptor->tcp_connp);
2066		CONN_DEC_REF(eager->tcp_connp);
2067		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2068		return;
2069	}
2070	DB_TYPE(opt_mp) = M_SETOPTS;
2071	opt_mp->b_wptr += sizeof (struct stroptions);
2072
2073	/*
2074	 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
2075	 * from listener to acceptor. The message is chained on opt_mp
2076	 * which will be sent onto eager's squeue.
2077	 */
2078	if (listener->tcp_bound_if != 0) {
2079		/* allocate optmgmt req */
2080		mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2081		    IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
2082		    sizeof (int));
2083		if (mp1 != NULL)
2084			linkb(opt_mp, mp1);
2085	}
2086	if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
2087		uint_t on = 1;
2088
2089		/* allocate optmgmt req */
2090		mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2091		    IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
2092		if (mp1 != NULL)
2093			linkb(opt_mp, mp1);
2094	}
2095
2096	/* Re-use mp1 to hold a copy of mp, in case reallocb fails */
2097	if ((mp1 = copymsg(mp)) == NULL) {
2098		CONN_DEC_REF(acceptor->tcp_connp);
2099		CONN_DEC_REF(eager->tcp_connp);
2100		freemsg(opt_mp);
2101		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2102		return;
2103	}
2104
2105	tcr = (struct T_conn_res *)mp1->b_rptr;
2106
2107	/*
2108	 * This is an expanded version of mi_tpi_ok_ack_alloc()
2109	 * which allocates a larger mblk and appends the new
2110	 * local address to the ok_ack.  The address is copied by
2111	 * soaccept() for getsockname().
2112	 */
2113	{
2114		int extra;
2115
2116		extra = (eager->tcp_family == AF_INET) ?
2117		    sizeof (sin_t) : sizeof (sin6_t);
2118
2119		/*
2120		 * Try to re-use mp, if possible.  Otherwise, allocate
2121		 * an mblk and return it as ok_mp.  In any case, mp
2122		 * is no longer usable upon return.
2123		 */
2124		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
2125			CONN_DEC_REF(acceptor->tcp_connp);
2126			CONN_DEC_REF(eager->tcp_connp);
2127			freemsg(opt_mp);
2128			/* Original mp has been freed by now, so use mp1 */
2129			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
2130			return;
2131		}
2132
2133		mp = NULL;	/* We should never use mp after this point */
2134
2135		switch (extra) {
2136		case sizeof (sin_t): {
2137				sin_t *sin = (sin_t *)ok_mp->b_wptr;
2138
2139				ok_mp->b_wptr += extra;
2140				sin->sin_family = AF_INET;
2141				sin->sin_port = eager->tcp_lport;
2142				sin->sin_addr.s_addr =
2143				    eager->tcp_ipha->ipha_src;
2144				break;
2145			}
2146		case sizeof (sin6_t): {
2147				sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
2148
2149				ok_mp->b_wptr += extra;
2150				sin6->sin6_family = AF_INET6;
2151				sin6->sin6_port = eager->tcp_lport;
2152				if (eager->tcp_ipversion == IPV4_VERSION) {
2153					sin6->sin6_flowinfo = 0;
2154					IN6_IPADDR_TO_V4MAPPED(
2155					    eager->tcp_ipha->ipha_src,
2156					    &sin6->sin6_addr);
2157				} else {
2158					ASSERT(eager->tcp_ip6h != NULL);
2159					sin6->sin6_flowinfo =
2160					    eager->tcp_ip6h->ip6_vcf &
2161					    ~IPV6_VERS_AND_FLOW_MASK;
2162					sin6->sin6_addr =
2163					    eager->tcp_ip6h->ip6_src;
2164				}
2165				break;
2166			}
2167		default:
2168			break;
2169		}
2170		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
2171	}
2172
2173	/*
2174	 * If there are no options we know that the T_CONN_RES will
2175	 * succeed. However, we can't send the T_OK_ACK upstream until
2176	 * the tcp_accept_swap is done since it would be dangerous to
2177	 * let the application start using the new fd prior to the swap.
2178	 */
2179	tcp_accept_swap(listener, acceptor, eager);
2180
2181	/*
2182	 * tcp_accept_swap unlinks eager from listener but does not drop
2183	 * the eager's reference on the listener.
2184	 */
2185	ASSERT(eager->tcp_listener == NULL);
2186	ASSERT(listener->tcp_connp->conn_ref >= 5);
2187
2188	/*
2189	 * The eager is now associated with its own queue. Insert in
2190	 * the hash so that the connection can be reused for a future
2191	 * T_CONN_RES.
2192	 */
2193	tcp_acceptor_hash_insert(acceptor_id, eager);
2194
2195	/*
2196	 * We now do the processing of options with T_CONN_RES.
2197	 * We delay till now since we wanted to have queue to pass to
2198	 * option processing routines that points back to the right
2199	 * instance structure which does not happen until after
2200	 * tcp_accept_swap().
2201	 *
2202	 * Note:
2203	 * The sanity of the logic here assumes that whatever options
2204	 * are appropriate to inherit from listner=>eager are done
2205	 * before this point, and whatever were to be overridden (or not)
2206	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
2207	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
2208	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
2209	 * This may not be true at this point in time but can be fixed
2210	 * independently. This option processing code starts with
2211	 * the instantiated acceptor instance and the final queue at
2212	 * this point.
2213	 */
2214
2215	if (tcr->OPT_length != 0) {
2216		/* Options to process */
2217		int t_error = 0;
2218		int sys_error = 0;
2219		int do_disconnect = 0;
2220
2221		if (tcp_conprim_opt_process(eager, mp1,
2222		    &do_disconnect, &t_error, &sys_error) < 0) {
2223			eager->tcp_accept_error = 1;
2224			if (do_disconnect) {
2225				/*
2226				 * An option failed which does not allow
2227				 * connection to be accepted.
2228				 *
2229				 * We allow T_CONN_RES to succeed and
2230				 * put a T_DISCON_IND on the eager queue.
2231				 */
2232				ASSERT(t_error == 0 && sys_error == 0);
2233				eager->tcp_send_discon_ind = 1;
2234			} else {
2235				ASSERT(t_error != 0);
2236				freemsg(ok_mp);
2237				/*
2238				 * Original mp was either freed or set
2239				 * to ok_mp above, so use mp1 instead.
2240				 */
2241				tcp_err_ack(listener, mp1, t_error, sys_error);
2242				goto finish;
2243			}
2244		}
2245		/*
2246		 * Most likely success in setting options (except if
2247		 * eager->tcp_send_discon_ind set).
2248		 * mp1 option buffer represented by OPT_length/offset
2249		 * potentially modified and contains results of setting
2250		 * options at this point
2251		 */
2252	}
2253
2254	/* We no longer need mp1, since all options processing has passed */
2255	freemsg(mp1);
2256
2257	putnext(listener->tcp_rq, ok_mp);
2258
2259	mutex_enter(&listener->tcp_eager_lock);
2260	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
2261		tcp_t	*tail;
2262		mblk_t	*conn_ind;
2263
2264		/*
2265		 * This path should not be executed if listener and
2266		 * acceptor streams are the same.
2267		 */
2268		ASSERT(listener != acceptor);
2269
2270		tcp = listener->tcp_eager_prev_q0;
2271		/*
2272		 * listener->tcp_eager_prev_q0 points to the TAIL of the
2273		 * deferred T_conn_ind queue. We need to get to the head of
2274		 * the queue in order to send up T_conn_ind the same order as
2275		 * how the 3WHS is completed.
2276		 */
2277		while (tcp != listener) {
2278			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
2279				break;
2280			else
2281				tcp = tcp->tcp_eager_prev_q0;
2282		}
2283		ASSERT(tcp != listener);
2284		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
2285		ASSERT(conn_ind != NULL);
2286		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
2287
2288		/* Move from q0 to q */
2289		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
2290		listener->tcp_conn_req_cnt_q0--;
2291		listener->tcp_conn_req_cnt_q++;
2292		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2293		    tcp->tcp_eager_prev_q0;
2294		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2295		    tcp->tcp_eager_next_q0;
2296		tcp->tcp_eager_prev_q0 = NULL;
2297		tcp->tcp_eager_next_q0 = NULL;
2298		tcp->tcp_conn_def_q0 = B_FALSE;
2299
2300		/*
2301		 * Insert at end of the queue because sockfs sends
2302		 * down T_CONN_RES in chronological order. Leaving
2303		 * the older conn indications at front of the queue
2304		 * helps reducing search time.
2305		 */
2306		tail = listener->tcp_eager_last_q;
2307		if (tail != NULL)
2308			tail->tcp_eager_next_q = tcp;
2309		else
2310			listener->tcp_eager_next_q = tcp;
2311		listener->tcp_eager_last_q = tcp;
2312		tcp->tcp_eager_next_q = NULL;
2313		mutex_exit(&listener->tcp_eager_lock);
2314		putnext(tcp->tcp_rq, conn_ind);
2315	} else {
2316		mutex_exit(&listener->tcp_eager_lock);
2317	}
2318
2319	/*
2320	 * Done with the acceptor - free it
2321	 *
2322	 * Note: from this point on, no access to listener should be made
2323	 * as listener can be equal to acceptor.
2324	 */
2325finish:
2326	ASSERT(acceptor->tcp_detached);
2327	acceptor->tcp_rq = tcp_g_q;
2328	acceptor->tcp_wq = WR(tcp_g_q);
2329	(void) tcp_clean_death(acceptor, 0, 2);
2330	CONN_DEC_REF(acceptor->tcp_connp);
2331
2332	/*
2333	 * In case we already received a FIN we have to make tcp_rput send
2334	 * the ordrel_ind. This will also send up a window update if the window
2335	 * has opened up.
2336	 *
2337	 * In the normal case of a successful connection acceptance
2338	 * we give the O_T_BIND_REQ to the read side put procedure as an
2339	 * indication that this was just accepted. This tells tcp_rput to
2340	 * pass up any data queued in tcp_rcv_list.
2341	 *
2342	 * In the fringe case where options sent with T_CONN_RES failed and
2343	 * we required, we would be indicating a T_DISCON_IND to blow
2344	 * away this connection.
2345	 */
2346
2347	/*
2348	 * XXX: we currently have a problem if XTI application closes the
2349	 * acceptor stream in between. This problem exists in on10-gate also
2350	 * and is well know but nothing can be done short of major rewrite
2351	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
2352	 * eager same squeue as listener (we can distinguish non socket
2353	 * listeners at the time of handling a SYN in tcp_conn_request)
2354	 * and do most of the work that tcp_accept_finish does here itself
2355	 * and then get behind the acceptor squeue to access the acceptor
2356	 * queue.
2357	 */
2358	/*
2359	 * We already have a ref on tcp so no need to do one before squeue_fill
2360	 */
2361	squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
2362	    tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
2363}
2364
2365/*
2366 * Swap information between the eager and acceptor for a TLI/XTI client.
2367 * The sockfs accept is done on the acceptor stream and control goes
2368 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
2369 * called. In either case, both the eager and listener are in their own
2370 * perimeter (squeue) and the code has to deal with potential race.
2371 *
2372 * See the block comment on top of tcp_accept() and tcp_wput_accept().
2373 */
2374static void
2375tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
2376{
2377	conn_t	*econnp, *aconnp;
2378
2379	ASSERT(eager->tcp_rq == listener->tcp_rq);
2380	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
2381	ASSERT(!eager->tcp_hard_bound);
2382	ASSERT(!TCP_IS_SOCKET(acceptor));
2383	ASSERT(!TCP_IS_SOCKET(eager));
2384	ASSERT(!TCP_IS_SOCKET(listener));
2385
2386	acceptor->tcp_detached = B_TRUE;
2387	/*
2388	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
2389	 * the acceptor id.
2390	 */
2391	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
2392
2393	/* remove eager from listen list... */
2394	mutex_enter(&listener->tcp_eager_lock);
2395	tcp_eager_unlink(eager);
2396	ASSERT(eager->tcp_eager_next_q == NULL &&
2397	    eager->tcp_eager_last_q == NULL);
2398	ASSERT(eager->tcp_eager_next_q0 == NULL &&
2399	    eager->tcp_eager_prev_q0 == NULL);
2400	mutex_exit(&listener->tcp_eager_lock);
2401	eager->tcp_rq = acceptor->tcp_rq;
2402	eager->tcp_wq = acceptor->tcp_wq;
2403
2404	econnp = eager->tcp_connp;
2405	aconnp = acceptor->tcp_connp;
2406
2407	eager->tcp_rq->q_ptr = econnp;
2408	eager->tcp_wq->q_ptr = econnp;
2409
2410	/*
2411	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
2412	 * which might be a different squeue from our peer TCP instance.
2413	 * For TCP Fusion, the peer expects that whenever tcp_detached is
2414	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
2415	 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
2416	 * above reach global visibility prior to the clearing of tcp_detached.
2417	 */
2418	membar_producer();
2419	eager->tcp_detached = B_FALSE;
2420
2421	ASSERT(eager->tcp_ack_tid == 0);
2422
2423	econnp->conn_dev = aconnp->conn_dev;
2424	if (eager->tcp_cred != NULL)
2425		crfree(eager->tcp_cred);
2426	eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
2427	econnp->conn_zoneid = aconnp->conn_zoneid;
2428	aconnp->conn_cred = NULL;
2429
2430	econnp->conn_mac_exempt = aconnp->conn_mac_exempt;
2431	aconnp->conn_mac_exempt = B_FALSE;
2432
2433	ASSERT(aconnp->conn_peercred == NULL);
2434
2435	/* Do the IPC initialization */
2436	CONN_INC_REF(econnp);
2437
2438	econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
2439	econnp->conn_af_isv6 = aconnp->conn_af_isv6;
2440	econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;
2441	econnp->conn_ulp = aconnp->conn_ulp;
2442
2443	/* Done with old IPC. Drop its ref on its connp */
2444	CONN_DEC_REF(aconnp);
2445}
2446
2447
2448/*
2449 * Adapt to the information, such as rtt and rtt_sd, provided from the
2450 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
2451 *
2452 * Checks for multicast and broadcast destination address.
2453 * Returns zero on failure; non-zero if ok.
2454 *
2455 * Note that the MSS calculation here is based on the info given in
2456 * the IRE.  We do not do any calculation based on TCP options.  They
2457 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
2458 * knows which options to use.
2459 *
2460 * Note on how TCP gets its parameters for a connection.
2461 *
2462 * When a tcp_t structure is allocated, it gets all the default parameters.
2463 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
2464 * spipe, rpipe, ... from the route metrics.  Route metric overrides the
2465 * default.  But if there is an associated tcp_host_param, it will override
2466 * the metrics.
2467 *
2468 * An incoming SYN with a multicast or broadcast destination address, is dropped
2469 * in 1 of 2 places.
2470 *
2471 * 1. If the packet was received over the wire it is dropped in
2472 * ip_rput_process_broadcast()
2473 *
2474 * 2. If the packet was received through internal IP loopback, i.e. the packet
2475 * was generated and received on the same machine, it is dropped in
2476 * ip_wput_local()
2477 *
2478 * An incoming SYN with a multicast or broadcast source address is always
2479 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
2480 * reject an attempt to connect to a broadcast or multicast (destination)
2481 * address.
2482 */
2483static int
2484tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
2485{
2486	tcp_hsp_t	*hsp;
2487	ire_t		*ire;
2488	ire_t		*sire = NULL;
2489	iulp_t		*ire_uinfo = NULL;
2490	uint32_t	mss_max;
2491	uint32_t	mss;
2492	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
2493	conn_t		*connp = tcp->tcp_connp;
2494	boolean_t	ire_cacheable = B_FALSE;
2495	zoneid_t	zoneid = connp->conn_zoneid;
2496	int		match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
2497			    MATCH_IRE_SECATTR;
2498	ts_label_t	*tsl = crgetlabel(CONN_CRED(connp));
2499	ill_t		*ill = NULL;
2500	boolean_t	incoming = (ire_mp == NULL);
2501
2502	ASSERT(connp->conn_ire_cache == NULL);
2503
2504	if (tcp->tcp_ipversion == IPV4_VERSION) {
2505
2506		if (CLASSD(tcp->tcp_connp->conn_rem)) {
2507			BUMP_MIB(&ip_mib, ipInDiscards);
2508			return (0);
2509		}
2510		/*
2511		 * If IP_NEXTHOP is set, then look for an IRE_CACHE
2512		 * for the destination with the nexthop as gateway.
2513		 * ire_ctable_lookup() is used because this particular
2514		 * ire, if it exists, will be marked private.
2515		 * If that is not available, use the interface ire
2516		 * for the nexthop.
2517		 *
2518		 * TSol: tcp_update_label will detect label mismatches based
2519		 * only on the destination's label, but that would not
2520		 * detect label mismatches based on the security attributes
2521		 * of routes or next hop gateway. Hence we need to pass the
2522		 * label to ire_ftable_lookup below in order to locate the
2523		 * right prefix (and/or) ire cache. Similarly we also need
2524		 * pass the label to the ire_cache_lookup below to locate
2525		 * the right ire that also matches on the label.
2526		 */
2527		if (tcp->tcp_connp->conn_nexthop_set) {
2528			ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
2529			    tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
2530			    tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW);
2531			if (ire == NULL) {
2532				ire = ire_ftable_lookup(
2533				    tcp->tcp_connp->conn_nexthop_v4,
2534				    0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
2535				    tsl, match_flags);
2536				if (ire == NULL)
2537					return (0);
2538			} else {
2539				ire_uinfo = &ire->ire_uinfo;
2540			}
2541		} else {
2542			ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
2543			    zoneid, tsl);
2544			if (ire != NULL) {
2545				ire_cacheable = B_TRUE;
2546				ire_uinfo = (ire_mp != NULL) ?
2547				    &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2548				    &ire->ire_uinfo;
2549
2550			} else {
2551				if (ire_mp == NULL) {
2552					ire = ire_ftable_lookup(
2553					    tcp->tcp_connp->conn_rem,
2554					    0, 0, 0, NULL, &sire, zoneid, 0,
2555					    tsl, (MATCH_IRE_RECURSIVE |
2556					    MATCH_IRE_DEFAULT));
2557					if (ire == NULL)
2558						return (0);
2559					ire_uinfo = (sire != NULL) ?
2560					    &sire->ire_uinfo :
2561					    &ire->ire_uinfo;
2562				} else {
2563					ire = (ire_t *)ire_mp->b_rptr;
2564					ire_uinfo =
2565					    &((ire_t *)
2566					    ire_mp->b_rptr)->ire_uinfo;
2567				}
2568			}
2569		}
2570		ASSERT(ire != NULL);
2571
2572		if ((ire->ire_src_addr == INADDR_ANY) ||
2573		    (ire->ire_type & IRE_BROADCAST)) {
2574			/*
2575			 * ire->ire_mp is non null when ire_mp passed in is used
2576			 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2577			 */
2578			if (ire->ire_mp == NULL)
2579				ire_refrele(ire);
2580			if (sire != NULL)
2581				ire_refrele(sire);
2582			return (0);
2583		}
2584
2585		if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
2586			ipaddr_t src_addr;
2587
2588			/*
2589			 * ip_bind_connected() has stored the correct source
2590			 * address in conn_src.
2591			 */
2592			src_addr = tcp->tcp_connp->conn_src;
2593			tcp->tcp_ipha->ipha_src = src_addr;
2594			/*
2595			 * Copy of the src addr. in tcp_t is needed
2596			 * for the lookup funcs.
2597			 */
2598			IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
2599		}
2600		/*
2601		 * Set the fragment bit so that IP will tell us if the MTU
2602		 * should change. IP tells us the latest setting of
2603		 * ip_path_mtu_discovery through ire_frag_flag.
2604		 */
2605		if (ip_path_mtu_discovery) {
2606			tcp->tcp_ipha->ipha_fragment_offset_and_flags =
2607			    htons(IPH_DF);
2608		}
2609		/*
2610		 * If ire_uinfo is NULL, this is the IRE_INTERFACE case
2611		 * for IP_NEXTHOP. No cache ire has been found for the
2612		 * destination and we are working with the nexthop's
2613		 * interface ire. Since we need to forward all packets
2614		 * to the nexthop first, we "blindly" set tcp_localnet
2615		 * to false, eventhough the destination may also be
2616		 * onlink.
2617		 */
2618		if (ire_uinfo == NULL)
2619			tcp->tcp_localnet = 0;
2620		else
2621			tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
2622	} else {
2623		/*
2624		 * For incoming connection ire_mp = NULL
2625		 * For outgoing connection ire_mp != NULL
2626		 * Technically we should check conn_incoming_ill
2627		 * when ire_mp is NULL and conn_outgoing_ill when
2628		 * ire_mp is non-NULL. But this is performance
2629		 * critical path and for IPV*_BOUND_IF, outgoing
2630		 * and incoming ill are always set to the same value.
2631		 */
2632		ill_t	*dst_ill = NULL;
2633		ipif_t  *dst_ipif = NULL;
2634
2635		ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
2636
2637		if (connp->conn_outgoing_ill != NULL) {
2638			/* Outgoing or incoming path */
2639			int   err;
2640
2641			dst_ill = conn_get_held_ill(connp,
2642			    &connp->conn_outgoing_ill, &err);
2643			if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
2644				ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
2645				return (0);
2646			}
2647			match_flags |= MATCH_IRE_ILL;
2648			dst_ipif = dst_ill->ill_ipif;
2649		}
2650		ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
2651		    0, 0, dst_ipif, zoneid, tsl, match_flags);
2652
2653		if (ire != NULL) {
2654			ire_cacheable = B_TRUE;
2655			ire_uinfo = (ire_mp != NULL) ?
2656			    &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2657			    &ire->ire_uinfo;
2658		} else {
2659			if (ire_mp == NULL) {
2660				ire = ire_ftable_lookup_v6(
2661				    &tcp->tcp_connp->conn_remv6,
2662				    0, 0, 0, dst_ipif, &sire, zoneid,
2663				    0, tsl, match_flags);
2664				if (ire == NULL) {
2665					if (dst_ill != NULL)
2666						ill_refrele(dst_ill);
2667					return (0);
2668				}
2669				ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
2670				    &ire->ire_uinfo;
2671			} else {
2672				ire = (ire_t *)ire_mp->b_rptr;
2673				ire_uinfo =
2674				    &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
2675			}
2676		}
2677		if (dst_ill != NULL)
2678			ill_refrele(dst_ill);
2679
2680		ASSERT(ire != NULL);
2681		ASSERT(ire_uinfo != NULL);
2682
2683		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
2684		    IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
2685			/*
2686			 * ire->ire_mp is non null when ire_mp passed in is used
2687			 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2688			 */
2689			if (ire->ire_mp == NULL)
2690				ire_refrele(ire);
2691			if (sire != NULL)
2692				ire_refrele(sire);
2693			return (0);
2694		}
2695
2696		if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
2697			in6_addr_t	src_addr;
2698
2699			/*
2700			 * ip_bind_connected_v6() has stored the correct source
2701			 * address per IPv6 addr. selection policy in
2702			 * conn_src_v6.
2703			 */
2704			src_addr = tcp->tcp_connp->conn_srcv6;
2705
2706			tcp->tcp_ip6h->ip6_src = src_addr;
2707			/*
2708			 * Copy of the src addr. in tcp_t is needed
2709			 * for the lookup funcs.
2710			 */
2711			tcp->tcp_ip_src_v6 = src_addr;
2712			ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src,
2713			    &connp->conn_srcv6));
2714		}
2715		tcp->tcp_localnet =
2716		    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
2717	}
2718
2719	/*
2720	 * This allows applications to fail quickly when connections are made
2721	 * to dead hosts. Hosts can be labeled dead by adding a reject route
2722	 * with both the RTF_REJECT and RTF_PRIVATE flags set.
2723	 */
2724	if ((ire->ire_flags & RTF_REJECT) &&
2725	    (ire->ire_flags & RTF_PRIVATE))
2726		goto error;
2727
2728	/*
2729	 * Make use of the cached rtt and rtt_sd values to calculate the
2730	 * initial RTO.  Note that they are already initialized in
2731	 * tcp_init_values().
2732	 * If ire_uinfo is NULL, i.e., we do not have a cache ire for
2733	 * IP_NEXTHOP, but instead are using the interface ire for the
2734	 * nexthop, then we do not use the ire_uinfo from that ire to
2735	 * do any initializations.
2736	 */
2737	if (ire_uinfo != NULL) {
2738		if (ire_uinfo->iulp_rtt != 0) {
2739			clock_t	rto;
2740
2741			tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
2742			tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
2743			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2744			    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5);
2745
2746			if (rto > tcp_rexmit_interval_max) {
2747				tcp->tcp_rto = tcp_rexmit_interval_max;
2748			} else if (rto < tcp_rexmit_interval_min) {
2749				tcp->tcp_rto = tcp_rexmit_interval_min;
2750			} else {
2751				tcp->tcp_rto = rto;
2752			}
2753		}
2754		if (ire_uinfo->iulp_ssthresh != 0)
2755			tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
2756		else
2757			tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2758		if (ire_uinfo->iulp_spipe > 0) {
2759			tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
2760			    tcp_max_buf);
2761			if (tcp_snd_lowat_fraction != 0)
2762				tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2763				    tcp_snd_lowat_fraction;
2764			(void) tcp_maxpsz_set(tcp, B_TRUE);
2765		}
2766		/*
2767		 * Note that up till now, acceptor always inherits receive
2768		 * window from the listener.  But if there is a metrics
2769		 * associated with a host, we should use that instead of
2770		 * inheriting it from listener. Thus we need to pass this
2771		 * info back to the caller.
2772		 */
2773		if (ire_uinfo->iulp_rpipe > 0) {
2774			tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf);
2775		}
2776
2777		if (ire_uinfo->iulp_rtomax > 0) {
2778			tcp->tcp_second_timer_threshold =
2779			    ire_uinfo->iulp_rtomax;
2780		}
2781
2782		/*
2783		 * Use the metric option settings, iulp_tstamp_ok and
2784		 * iulp_wscale_ok, only for active open. What this means
2785		 * is that if the other side uses timestamp or window
2786		 * scale option, TCP will also use those options. That
2787		 * is for passive open.  If the application sets a
2788		 * large window, window scale is enabled regardless of
2789		 * the value in iulp_wscale_ok.  This is the behavior
2790		 * since 2.6.  So we keep it.
2791		 * The only case left in passive open processing is the
2792		 * check for SACK.
2793		 * For ECN, it should probably be like SACK.  But the
2794		 * current value is binary, so we treat it like the other
2795		 * cases.  The metric only controls active open.For passive
2796		 * open, the ndd param, tcp_ecn_permitted, controls the
2797		 * behavior.
2798		 */
2799		if (!tcp_detached) {
2800			/*
2801			 * The if check means that the following can only
2802			 * be turned on by the metrics only IRE, but not off.
2803			 */
2804			if (ire_uinfo->iulp_tstamp_ok)
2805				tcp->tcp_snd_ts_ok = B_TRUE;
2806			if (ire_uinfo->iulp_wscale_ok)
2807				tcp->tcp_snd_ws_ok = B_TRUE;
2808			if (ire_uinfo->iulp_sack == 2)
2809				tcp->tcp_snd_sack_ok = B_TRUE;
2810			if (ire_uinfo->iulp_ecn_ok)
2811				tcp->tcp_ecn_ok = B_TRUE;
2812		} else {
2813			/*
2814			 * Passive open.
2815			 *
2816			 * As above, the if check means that SACK can only be
2817			 * turned on by the metric only IRE.
2818			 */
2819			if (ire_uinfo->iulp_sack > 0) {
2820				tcp->tcp_snd_sack_ok = B_TRUE;
2821			}
2822		}
2823	}
2824
2825
2826	/*
2827	 * XXX: Note that currently, ire_max_frag can be as small as 68
2828	 * because of PMTUd.  So tcp_mss may go to negative if combined
2829	 * length of all those options exceeds 28 bytes.  But because
2830	 * of the tcp_mss_min check below, we may not have a problem if
2831	 * tcp_mss_min is of a reasonable value.  The default is 1 so
2832	 * the negative problem still exists.  And the check defeats PMTUd.
2833	 * In fact, if PMTUd finds that the MSS should be smaller than
2834	 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
2835	 * value.
2836	 *
2837	 * We do not deal with that now.  All those problems related to
2838	 * PMTUd will be fixed later.
2839	 */
2840	ASSERT(ire->ire_max_frag != 0);
2841	mss = tcp->tcp_if_mtu = ire->ire_max_frag;
2842	if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) {
2843		if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) {
2844			mss = MIN(mss, IPV6_MIN_MTU);
2845		}
2846	}
2847
2848	/* Sanity check for MSS value. */
2849	if (tcp->tcp_ipversion == IPV4_VERSION)
2850		mss_max = tcp_mss_max_ipv4;
2851	else
2852		mss_max = tcp_mss_max_ipv6;
2853
2854	if (tcp->tcp_ipversion == IPV6_VERSION &&
2855	    (ire->ire_frag_flag & IPH_FRAG_HDR)) {
2856		/*
2857		 * After receiving an ICMPv6 "packet too big" message with a
2858		 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
2859		 * will insert a 8-byte fragment header in every packet; we
2860		 * reduce the MSS by that amount here.
2861		 */
2862		mss -= sizeof (ip6_frag_t);
2863	}
2864
2865	if (tcp->tcp_ipsec_overhead == 0)
2866		tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
2867
2868	mss -= tcp->tcp_ipsec_overhead;
2869
2870	if (mss < tcp_mss_min)
2871		mss = tcp_mss_min;
2872	if (mss > mss_max)
2873		mss = mss_max;
2874
2875	/* Note that this is the maximum MSS, excluding all options. */
2876	tcp->tcp_mss = mss;
2877
2878	/*
2879	 * Initialize the ISS here now that we have the full connection ID.
2880	 * The RFC 1948 method of initial sequence number generation requires
2881	 * knowledge of the full connection ID before setting the ISS.
2882	 */
2883
2884	tcp_iss_init(tcp);
2885
2886	if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL))
2887		tcp->tcp_loopback = B_TRUE;
2888
2889	if (tcp->tcp_ipversion == IPV4_VERSION) {
2890		hsp = tcp_hsp_lookup(tcp->tcp_remote);
2891	} else {
2892		hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6);
2893	}
2894
2895	if (hsp != NULL) {
2896		/* Only modify if we're going to make them bigger */
2897		if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) {
2898			tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace;
2899			if (tcp_snd_lowat_fraction != 0)
2900				tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2901					tcp_snd_lowat_fraction;
2902		}
2903
2904		if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) {
2905			tcp->tcp_rwnd = hsp->tcp_hsp_recvspace;
2906		}
2907
2908		/* Copy timestamp flag only for active open */
2909		if (!tcp_detached)
2910			tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp;
2911	}
2912
2913	if (sire != NULL)
2914		IRE_REFRELE(sire);
2915
2916	/*
2917	 * If we got an IRE_CACHE and an ILL, go through their properties;
2918	 * otherwise, this is deferred until later when we have an IRE_CACHE.
2919	 */
2920	if (tcp->tcp_loopback ||
2921	    (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) {
2922		/*
2923		 * For incoming, see if this tcp may be MDT-capable.  For
2924		 * outgoing, this process has been taken care of through
2925		 * tcp_rput_other.
2926		 */
2927		tcp_ire_ill_check(tcp, ire, ill, incoming);
2928		tcp->tcp_ire_ill_check_done = B_TRUE;
2929	}
2930
2931	mutex_enter(&connp->conn_lock);
2932	/*
2933	 * Make sure that conn is not marked incipient
2934	 * for incoming connections. A blind
2935	 * removal of incipient flag is cheaper than
2936	 * check and removal.
2937	 */
2938	connp->conn_state_flags &= ~CONN_INCIPIENT;
2939
2940	/* Must not cache forwarding table routes. */
2941	if (ire_cacheable) {
2942		rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
2943		if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
2944			connp->conn_ire_cache = ire;
2945			IRE_UNTRACE_REF(ire);
2946			rw_exit(&ire->ire_bucket->irb_lock);
2947			mutex_exit(&connp->conn_lock);
2948			return (1);
2949		}
2950		rw_exit(&ire->ire_bucket->irb_lock);
2951	}
2952	mutex_exit(&connp->conn_lock);
2953
2954	if (ire->ire_mp == NULL)
2955		ire_refrele(ire);
2956	return (1);
2957
2958error:
2959	if (ire->ire_mp == NULL)
2960		ire_refrele(ire);
2961	if (sire != NULL)
2962		ire_refrele(sire);
2963	return (0);
2964}
2965
2966/*
2967 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a
2968 * O_T_BIND_REQ/T_BIND_REQ message.
2969 */
2970static void
2971tcp_bind(tcp_t *tcp, mblk_t *mp)
2972{
2973	sin_t	*sin;
2974	sin6_t	*sin6;
2975	mblk_t	*mp1;
2976	in_port_t requested_port;
2977	in_port_t allocated_port;
2978	struct T_bind_req *tbr;
2979	boolean_t	bind_to_req_port_only;
2980	boolean_t	backlog_update = B_FALSE;
2981	boolean_t	user_specified;
2982	in6_addr_t	v6addr;
2983	ipaddr_t	v4addr;
2984	uint_t	origipversion;
2985	int	err;
2986	queue_t *q = tcp->tcp_wq;
2987	conn_t	*connp;
2988	mlp_type_t addrtype, mlptype;
2989	zone_t	*zone;
2990	cred_t	*cr;
2991	in_port_t mlp_port;
2992
2993	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
2994	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
2995		if (tcp->tcp_debug) {
2996			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
2997			    "tcp_bind: bad req, len %u",
2998			    (uint_t)(mp->b_wptr - mp->b_rptr));
2999		}
3000		tcp_err_ack(tcp, mp, TPROTO, 0);
3001		return;
3002	}
3003	/* Make sure the largest address fits */
3004	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
3005	if (mp1 == NULL) {
3006		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3007		return;
3008	}
3009	mp = mp1;
3010	tbr = (struct T_bind_req *)mp->b_rptr;
3011	if (tcp->tcp_state >= TCPS_BOUND) {
3012		if ((tcp->tcp_state == TCPS_BOUND ||
3013		    tcp->tcp_state == TCPS_LISTEN) &&
3014		    tcp->tcp_conn_req_max != tbr->CONIND_number &&
3015		    tbr->CONIND_number > 0) {
3016			/*
3017			 * Handle listen() increasing CONIND_number.
3018			 * This is more "liberal" then what the TPI spec
3019			 * requires but is needed to avoid a t_unbind
3020			 * when handling listen() since the port number
3021			 * might be "stolen" between the unbind and bind.
3022			 */
3023			backlog_update = B_TRUE;
3024			goto do_bind;
3025		}
3026		if (tcp->tcp_debug) {
3027			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3028			    "tcp_bind: bad state, %d", tcp->tcp_state);
3029		}
3030		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
3031		return;
3032	}
3033	origipversion = tcp->tcp_ipversion;
3034
3035	switch (tbr->ADDR_length) {
3036	case 0:			/* request for a generic port */
3037		tbr->ADDR_offset = sizeof (struct T_bind_req);
3038		if (tcp->tcp_family == AF_INET) {
3039			tbr->ADDR_length = sizeof (sin_t);
3040			sin = (sin_t *)&tbr[1];
3041			*sin = sin_null;
3042			sin->sin_family = AF_INET;
3043			mp->b_wptr = (uchar_t *)&sin[1];
3044			tcp->tcp_ipversion = IPV4_VERSION;
3045			IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr);
3046		} else {
3047			ASSERT(tcp->tcp_family == AF_INET6);
3048			tbr->ADDR_length = sizeof (sin6_t);
3049			sin6 = (sin6_t *)&tbr[1];
3050			*sin6 = sin6_null;
3051			sin6->sin6_family = AF_INET6;
3052			mp->b_wptr = (uchar_t *)&sin6[1];
3053			tcp->tcp_ipversion = IPV6_VERSION;
3054			V6_SET_ZERO(v6addr);
3055		}
3056		requested_port = 0;
3057		break;
3058
3059	case sizeof (sin_t):	/* Complete IPv4 address */
3060		sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
3061		    sizeof (sin_t));
3062		if (sin == NULL || !OK_32PTR((char *)sin)) {
3063			if (tcp->tcp_debug) {
3064				(void) strlog(TCP_MOD_ID, 0, 1,
3065				    SL_ERROR|SL_TRACE,
3066				    "tcp_bind: bad address parameter, "
3067				    "offset %d, len %d",
3068				    tbr->ADDR_offset, tbr->ADDR_length);
3069			}
3070			tcp_err_ack(tcp, mp, TPROTO, 0);
3071			return;
3072		}
3073		/*
3074		 * With sockets sockfs will accept bogus sin_family in
3075		 * bind() and replace it with the family used in the socket
3076		 * call.
3077		 */
3078		if (sin->sin_family != AF_INET ||
3079		    tcp->tcp_family != AF_INET) {
3080			tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3081			return;
3082		}
3083		requested_port = ntohs(sin->sin_port);
3084		tcp->tcp_ipversion = IPV4_VERSION;
3085		v4addr = sin->sin_addr.s_addr;
3086		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
3087		break;
3088
3089	case sizeof (sin6_t): /* Complete IPv6 address */
3090		sin6 = (sin6_t *)mi_offset_param(mp,
3091		    tbr->ADDR_offset, sizeof (sin6_t));
3092		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
3093			if (tcp->tcp_debug) {
3094				(void) strlog(TCP_MOD_ID, 0, 1,
3095				    SL_ERROR|SL_TRACE,
3096				    "tcp_bind: bad IPv6 address parameter, "
3097				    "offset %d, len %d", tbr->ADDR_offset,
3098				    tbr->ADDR_length);
3099			}
3100			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
3101			return;
3102		}
3103		if (sin6->sin6_family != AF_INET6 ||
3104		    tcp->tcp_family != AF_INET6) {
3105			tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3106			return;
3107		}
3108		requested_port = ntohs(sin6->sin6_port);
3109		tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
3110		    IPV4_VERSION : IPV6_VERSION;
3111		v6addr = sin6->sin6_addr;
3112		break;
3113
3114	default:
3115		if (tcp->tcp_debug) {
3116			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3117			    "tcp_bind: bad address length, %d",
3118			    tbr->ADDR_length);
3119		}
3120		tcp_err_ack(tcp, mp, TBADADDR, 0);
3121		return;
3122	}
3123	tcp->tcp_bound_source_v6 = v6addr;
3124
3125	/* Check for change in ipversion */
3126	if (origipversion != tcp->tcp_ipversion) {
3127		ASSERT(tcp->tcp_family == AF_INET6);
3128		err = tcp->tcp_ipversion == IPV6_VERSION ?
3129		    tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
3130		if (err) {
3131			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3132			return;
3133		}
3134	}
3135
3136	/*
3137	 * Initialize family specific fields. Copy of the src addr.
3138	 * in tcp_t is needed for the lookup funcs.
3139	 */
3140	if (tcp->tcp_ipversion == IPV6_VERSION) {
3141		tcp->tcp_ip6h->ip6_src = v6addr;
3142	} else {
3143		IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
3144	}
3145	tcp->tcp_ip_src_v6 = v6addr;
3146
3147	/*
3148	 * For O_T_BIND_REQ:
3149	 * Verify that the target port/addr is available, or choose
3150	 * another.
3151	 * For  T_BIND_REQ:
3152	 * Verify that the target port/addr is available or fail.
3153	 * In both cases when it succeeds the tcp is inserted in the
3154	 * bind hash table. This ensures that the operation is atomic
3155	 * under the lock on the hash bucket.
3156	 */
3157	bind_to_req_port_only = requested_port != 0 &&
3158	    tbr->PRIM_type != O_T_BIND_REQ;
3159	/*
3160	 * Get a valid port (within the anonymous range and should not
3161	 * be a privileged one) to use if the user has not given a port.
3162	 * If multiple threads are here, they may all start with
3163	 * with the same initial port. But, it should be fine as long as
3164	 * tcp_bindi will ensure that no two threads will be assigned
3165	 * the same port.
3166	 *
3167	 * NOTE: XXX If a privileged process asks for an anonymous port, we
3168	 * still check for ports only in the range > tcp_smallest_non_priv_port,
3169	 * unless TCP_ANONPRIVBIND option is set.
3170	 */
3171	mlptype = mlptSingle;
3172	mlp_port = requested_port;
3173	if (requested_port == 0) {
3174		requested_port = tcp->tcp_anon_priv_bind ?
3175		    tcp_get_next_priv_port(tcp) :
3176		    tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE);
3177		if (requested_port == 0) {
3178			tcp_err_ack(tcp, mp, TNOADDR, 0);
3179			return;
3180		}
3181		user_specified = B_FALSE;
3182
3183		/*
3184		 * If the user went through one of the RPC interfaces to create
3185		 * this socket and RPC is MLP in this zone, then give him an
3186		 * anonymous MLP.
3187		 */
3188		cr = DB_CREDDEF(mp, tcp->tcp_cred);
3189		connp = tcp->tcp_connp;
3190		if (connp->conn_anon_mlp && is_system_labeled()) {
3191			zone = crgetzone(cr);
3192			addrtype = tsol_mlp_addr_type(zone->zone_id,
3193			    IPV6_VERSION, &v6addr);
3194			if (addrtype == mlptSingle) {
3195				tcp_err_ack(tcp, mp, TNOADDR, 0);
3196				return;
3197			}
3198			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3199			    PMAPPORT, addrtype);
3200			mlp_port = PMAPPORT;
3201		}
3202	} else {
3203		int i;
3204		boolean_t priv = B_FALSE;
3205
3206		/*
3207		 * If the requested_port is in the well-known privileged range,
3208		 * verify that the stream was opened by a privileged user.
3209		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
3210		 * but instead the code relies on:
3211		 * - the fact that the address of the array and its size never
3212		 *   changes
3213		 * - the atomic assignment of the elements of the array
3214		 */
3215		cr = DB_CREDDEF(mp, tcp->tcp_cred);
3216		if (requested_port < tcp_smallest_nonpriv_port) {
3217			priv = B_TRUE;
3218		} else {
3219			for (i = 0; i < tcp_g_num_epriv_ports; i++) {
3220				if (requested_port ==
3221				    tcp_g_epriv_ports[i]) {
3222					priv = B_TRUE;
3223					break;
3224				}
3225			}
3226		}
3227		if (priv) {
3228			if (secpolicy_net_privaddr(cr, requested_port) != 0) {
3229				if (tcp->tcp_debug) {
3230					(void) strlog(TCP_MOD_ID, 0, 1,
3231					    SL_ERROR|SL_TRACE,
3232					    "tcp_bind: no priv for port %d",
3233					    requested_port);
3234				}
3235				tcp_err_ack(tcp, mp, TACCES, 0);
3236				return;
3237			}
3238		}
3239		user_specified = B_TRUE;
3240
3241		connp = tcp->tcp_connp;
3242		if (is_system_labeled()) {
3243			zone = crgetzone(cr);
3244			addrtype = tsol_mlp_addr_type(zone->zone_id,
3245			    IPV6_VERSION, &v6addr);
3246			if (addrtype == mlptSingle) {
3247				tcp_err_ack(tcp, mp, TNOADDR, 0);
3248				return;
3249			}
3250			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3251			    requested_port, addrtype);
3252		}
3253	}
3254
3255	if (mlptype != mlptSingle) {
3256		if (secpolicy_net_bindmlp(cr) != 0) {
3257			if (tcp->tcp_debug) {
3258				(void) strlog(TCP_MOD_ID, 0, 1,
3259				    SL_ERROR|SL_TRACE,
3260				    "tcp_bind: no priv for multilevel port %d",
3261				    requested_port);
3262			}
3263			tcp_err_ack(tcp, mp, TACCES, 0);
3264			return;
3265		}
3266
3267		/*
3268		 * If we're specifically binding a shared IP address and the
3269		 * port is MLP on shared addresses, then check to see if this
3270		 * zone actually owns the MLP.  Reject if not.
3271		 */
3272		if (mlptype == mlptShared && addrtype == mlptShared) {
3273			zoneid_t mlpzone;
3274
3275			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
3276			    htons(mlp_port));
3277			if (connp->conn_zoneid != mlpzone) {
3278				if (tcp->tcp_debug) {
3279					(void) strlog(TCP_MOD_ID, 0, 1,
3280					    SL_ERROR|SL_TRACE,
3281					    "tcp_bind: attempt to bind port "
3282					    "%d on shared addr in zone %d "
3283					    "(should be %d)",
3284					    mlp_port, connp->conn_zoneid,
3285					    mlpzone);
3286				}
3287				tcp_err_ack(tcp, mp, TACCES, 0);
3288				return;
3289			}
3290		}
3291
3292		if (!user_specified) {
3293			err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3294			    requested_port, B_TRUE);
3295			if (err != 0) {
3296				if (tcp->tcp_debug) {
3297					(void) strlog(TCP_MOD_ID, 0, 1,
3298					    SL_ERROR|SL_TRACE,
3299					    "tcp_bind: cannot establish anon "
3300					    "MLP for port %d",
3301					    requested_port);
3302				}
3303				tcp_err_ack(tcp, mp, TSYSERR, err);
3304				return;
3305			}
3306			connp->conn_anon_port = B_TRUE;
3307		}
3308		connp->conn_mlp_type = mlptype;
3309	}
3310
3311	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
3312	    tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
3313
3314	if (allocated_port == 0) {
3315		connp->conn_mlp_type = mlptSingle;
3316		if (connp->conn_anon_port) {
3317			connp->conn_anon_port = B_FALSE;
3318			(void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3319			    requested_port, B_FALSE);
3320		}
3321		if (bind_to_req_port_only) {
3322			if (tcp->tcp_debug) {
3323				(void) strlog(TCP_MOD_ID, 0, 1,
3324				    SL_ERROR|SL_TRACE,
3325				    "tcp_bind: requested addr busy");
3326			}
3327			tcp_err_ack(tcp, mp, TADDRBUSY, 0);
3328		} else {
3329			/* If we are out of ports, fail the bind. */
3330			if (tcp->tcp_debug) {
3331				(void) strlog(TCP_MOD_ID, 0, 1,
3332				    SL_ERROR|SL_TRACE,
3333				    "tcp_bind: out of ports?");
3334			}
3335			tcp_err_ack(tcp, mp, TNOADDR, 0);
3336		}
3337		return;
3338	}
3339	ASSERT(tcp->tcp_state == TCPS_BOUND);
3340do_bind:
3341	if (!backlog_update) {
3342		if (tcp->tcp_family == AF_INET)
3343			sin->sin_port = htons(allocated_port);
3344		else
3345			sin6->sin6_port = htons(allocated_port);
3346	}
3347	if (tcp->tcp_family == AF_INET) {
3348		if (tbr->CONIND_number != 0) {
3349			mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3350			    sizeof (sin_t));
3351		} else {
3352			/* Just verify the local IP address */
3353			mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN);
3354		}
3355	} else {
3356		if (tbr->CONIND_number != 0) {
3357			mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3358			    sizeof (sin6_t));
3359		} else {
3360			/* Just verify the local IP address */
3361			mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3362			    IPV6_ADDR_LEN);
3363		}
3364	}
3365	if (mp1 == NULL) {
3366		if (connp->conn_anon_port) {
3367			connp->conn_anon_port = B_FALSE;
3368			(void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3369			    requested_port, B_FALSE);
3370		}
3371		connp->conn_mlp_type = mlptSingle;
3372		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3373		return;
3374	}
3375
3376	tbr->PRIM_type = T_BIND_ACK;
3377	mp->b_datap->db_type = M_PCPROTO;
3378
3379	/* Chain in the reply mp for tcp_rput() */
3380	mp1->b_cont = mp;
3381	mp = mp1;
3382
3383	tcp->tcp_conn_req_max = tbr->CONIND_number;
3384	if (tcp->tcp_conn_req_max) {
3385		if (tcp->tcp_conn_req_max < tcp_conn_req_min)
3386			tcp->tcp_conn_req_max = tcp_conn_req_min;
3387		if (tcp->tcp_conn_req_max > tcp_conn_req_max_q)
3388			tcp->tcp_conn_req_max = tcp_conn_req_max_q;
3389		/*
3390		 * If this is a listener, do not reset the eager list
3391		 * and other stuffs.  Note that we don't check if the
3392		 * existing eager list meets the new tcp_conn_req_max
3393		 * requirement.
3394		 */
3395		if (tcp->tcp_state != TCPS_LISTEN) {
3396			tcp->tcp_state = TCPS_LISTEN;
3397			/* Initialize the chain. Don't need the eager_lock */
3398			tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3399			tcp->tcp_second_ctimer_threshold =
3400			    tcp_ip_abort_linterval;
3401		}
3402	}
3403
3404	/*
3405	 * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3406	 * processing continues in tcp_rput_other().
3407	 */
3408	if (tcp->tcp_family == AF_INET6) {
3409		ASSERT(tcp->tcp_connp->conn_af_isv6);
3410		mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp);
3411	} else {
3412		ASSERT(!tcp->tcp_connp->conn_af_isv6);
3413		mp = ip_bind_v4(q, mp, tcp->tcp_connp);
3414	}
3415	/*
3416	 * If the bind cannot complete immediately
3417	 * IP will arrange to call tcp_rput_other
3418	 * when the bind completes.
3419	 */
3420	if (mp != NULL) {
3421		tcp_rput_other(tcp, mp);
3422	} else {
3423		/*
3424		 * Bind will be resumed later. Need to ensure
3425		 * that conn doesn't disappear when that happens.
3426		 * This will be decremented in ip_resume_tcp_bind().
3427		 */
3428		CONN_INC_REF(tcp->tcp_connp);
3429	}
3430}
3431
3432
3433/*
3434 * If the "bind_to_req_port_only" parameter is set, if the requested port
3435 * number is available, return it, If not return 0
3436 *
3437 * If "bind_to_req_port_only" parameter is not set and
3438 * If the requested port number is available, return it.  If not, return
3439 * the first anonymous port we happen across.  If no anonymous ports are
3440 * available, return 0. addr is the requested local address, if any.
3441 *
3442 * In either case, when succeeding update the tcp_t to record the port number
3443 * and insert it in the bind hash table.
3444 *
3445 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
3446 * without setting SO_REUSEADDR. This is needed so that they
3447 * can be viewed as two independent transport protocols.
3448 */
3449static in_port_t
3450tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
3451    int reuseaddr, boolean_t quick_connect,
3452    boolean_t bind_to_req_port_only, boolean_t user_specified)
3453{
3454	/* number of times we have run around the loop */
3455	int count = 0;
3456	/* maximum number of times to run around the loop */
3457	int loopmax;
3458	conn_t *connp = tcp->tcp_connp;
3459	zoneid_t zoneid = connp->conn_zoneid;
3460
3461	/*
3462	 * Lookup for free addresses is done in a loop and "loopmax"
3463	 * influences how long we spin in the loop
3464	 */
3465	if (bind_to_req_port_only) {
3466		/*
3467		 * If the requested port is busy, don't bother to look
3468		 * for a new one. Setting loop maximum count to 1 has
3469		 * that effect.
3470		 */
3471		loopmax = 1;
3472	} else {
3473		/*
3474		 * If the requested port is busy, look for a free one
3475		 * in the anonymous port range.
3476		 * Set loopmax appropriately so that one does not look
3477		 * forever in the case all of the anonymous ports are in use.
3478		 */
3479		if (tcp->tcp_anon_priv_bind) {
3480			/*
3481			 * loopmax =
3482			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
3483			 */
3484			loopmax = IPPORT_RESERVED - tcp_min_anonpriv_port;
3485		} else {
3486			loopmax = (tcp_largest_anon_port -
3487			    tcp_smallest_anon_port + 1);
3488		}
3489	}
3490	do {
3491		uint16_t	lport;
3492		tf_t		*tbf;
3493		tcp_t		*ltcp;
3494		conn_t		*lconnp;
3495
3496		lport = htons(port);
3497
3498		/*
3499		 * Ensure that the tcp_t is not currently in the bind hash.
3500		 * Hold the lock on the hash bucket to ensure that
3501		 * the duplicate check plus the insertion is an atomic
3502		 * operation.
3503		 *
3504		 * This function does an inline lookup on the bind hash list
3505		 * Make sure that we access only members of tcp_t
3506		 * and that we don't look at tcp_tcp, since we are not
3507		 * doing a CONN_INC_REF.
3508		 */
3509		tcp_bind_hash_remove(tcp);
3510		tbf = &tcp_bind_fanout[TCP_BIND_HASH(lport)];
3511		mutex_enter(&tbf->tf_lock);
3512		for (ltcp = tbf->tf_tcp; ltcp != NULL;
3513		    ltcp = ltcp->tcp_bind_hash) {
3514			boolean_t not_socket;
3515			boolean_t exclbind;
3516
3517			if (lport != ltcp->tcp_lport)
3518				continue;
3519
3520			lconnp = ltcp->tcp_connp;
3521
3522			/*
3523			 * On a labeled system, we must treat bindings to ports
3524			 * on shared IP addresses by sockets with MAC exemption
3525			 * privilege as being in all zones, as there's
3526			 * otherwise no way to identify the right receiver.
3527			 */
3528			if (!IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) &&
3529			    !lconnp->conn_mac_exempt &&
3530			    !connp->conn_mac_exempt)
3531				continue;
3532
3533			/*
3534			 * If TCP_EXCLBIND is set for either the bound or
3535			 * binding endpoint, the semantics of bind
3536			 * is changed according to the following.
3537			 *
3538			 * spec = specified address (v4 or v6)
3539			 * unspec = unspecified address (v4 or v6)
3540			 * A = specified addresses are different for endpoints
3541			 *
3542			 * bound	bind to		allowed
3543			 * -------------------------------------
3544			 * unspec	unspec		no
3545			 * unspec	spec		no
3546			 * spec		unspec		no
3547			 * spec		spec		yes if A
3548			 *
3549			 * For labeled systems, SO_MAC_EXEMPT behaves the same
3550			 * as TCP_EXCLBIND, except that zoneid is ignored.
3551			 *
3552			 * Note:
3553			 *
3554			 * 1. Because of TLI semantics, an endpoint can go
3555			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
3556			 * TCPS_BOUND, depending on whether it is originally
3557			 * a listener or not.  That is why we need to check
3558			 * for states greater than or equal to TCPS_BOUND
3559			 * here.
3560			 *
3561			 * 2. Ideally, we should only check for state equals
3562			 * to TCPS_LISTEN. And the following check should be
3563			 * added.
3564			 *
3565			 * if (ltcp->tcp_state == TCPS_LISTEN ||
3566			 *	!reuseaddr || !ltcp->tcp_reuseaddr) {
3567			 *		...
3568			 * }
3569			 *
3570			 * The semantics will be changed to this.  If the
3571			 * endpoint on the list is in state not equal to
3572			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
3573			 * set, let the bind succeed.
3574			 *
3575			 * Because of (1), we cannot do that for TLI
3576			 * endpoints.  But we can do that for socket endpoints.
3577			 * If in future, we can change this going back
3578			 * semantics, we can use the above check for TLI also.
3579			 */
3580			not_socket = !(TCP_IS_SOCKET(ltcp) &&
3581			    TCP_IS_SOCKET(tcp));
3582			exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind;
3583
3584			if (lconnp->conn_mac_exempt || connp->conn_mac_exempt ||
3585			    (exclbind && (not_socket ||
3586			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
3587				if (V6_OR_V4_INADDR_ANY(
3588				    ltcp->tcp_bound_source_v6) ||
3589				    V6_OR_V4_INADDR_ANY(*laddr) ||
3590				    IN6_ARE_ADDR_EQUAL(laddr,
3591				    &ltcp->tcp_bound_source_v6)) {
3592					break;
3593				}
3594				continue;
3595			}
3596
3597			/*
3598			 * Check ipversion to allow IPv4 and IPv6 sockets to
3599			 * have disjoint port number spaces, if *_EXCLBIND
3600			 * is not set and only if the application binds to a
3601			 * specific port. We use the same autoassigned port
3602			 * number space for IPv4 and IPv6 sockets.
3603			 */
3604			if (tcp->tcp_ipversion != ltcp->tcp_ipversion &&
3605			    bind_to_req_port_only)
3606				continue;
3607
3608			/*
3609			 * Ideally, we should make sure that the source
3610			 * address, remote address, and remote port in the
3611			 * four tuple for this tcp-connection is unique.
3612			 * However, trying to find out the local source
3613			 * address would require too much code duplication
3614			 * with IP, since IP needs needs to have that code
3615			 * to support userland TCP implementations.
3616			 */
3617			if (quick_connect &&
3618			    (ltcp->tcp_state > TCPS_LISTEN) &&
3619			    ((tcp->tcp_fport != ltcp->tcp_fport) ||
3620				!IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
3621				    &ltcp->tcp_remote_v6)))
3622				continue;
3623
3624			if (!reuseaddr) {
3625				/*
3626				 * No socket option SO_REUSEADDR.
3627				 * If existing port is bound to
3628				 * a non-wildcard IP address
3629				 * and the requesting stream is
3630				 * bound to a distinct
3631				 * different IP addresses
3632				 * (non-wildcard, also), keep
3633				 * going.
3634				 */
3635				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
3636				    !V6_OR_V4_INADDR_ANY(
3637				    ltcp->tcp_bound_source_v6) &&
3638				    !IN6_ARE_ADDR_EQUAL(laddr,
3639					&ltcp->tcp_bound_source_v6))
3640					continue;
3641				if (ltcp->tcp_state >= TCPS_BOUND) {
3642					/*
3643					 * This port is being used and
3644					 * its state is >= TCPS_BOUND,
3645					 * so we can't bind to it.
3646					 */
3647					break;
3648				}
3649			} else {
3650				/*
3651				 * socket option SO_REUSEADDR is set on the
3652				 * binding tcp_t.
3653				 *
3654				 * If two streams are bound to
3655				 * same IP address or both addr
3656				 * and bound source are wildcards
3657				 * (INADDR_ANY), we want to stop
3658				 * searching.
3659				 * We have found a match of IP source
3660				 * address and source port, which is
3661				 * refused regardless of the
3662				 * SO_REUSEADDR setting, so we break.
3663				 */
3664				if (IN6_ARE_ADDR_EQUAL(laddr,
3665				    &ltcp->tcp_bound_source_v6) &&
3666				    (ltcp->tcp_state == TCPS_LISTEN ||
3667					ltcp->tcp_state == TCPS_BOUND))
3668					break;
3669			}
3670		}
3671		if (ltcp != NULL) {
3672			/* The port number is busy */
3673			mutex_exit(&tbf->tf_lock);
3674		} else {
3675			/*
3676			 * This port is ours. Insert in fanout and mark as
3677			 * bound to prevent others from getting the port
3678			 * number.
3679			 */
3680			tcp->tcp_state = TCPS_BOUND;
3681			tcp->tcp_lport = htons(port);
3682			*(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3683
3684			ASSERT(&tcp_bind_fanout[TCP_BIND_HASH(
3685			    tcp->tcp_lport)] == tbf);
3686			tcp_bind_hash_insert(tbf, tcp, 1);
3687
3688			mutex_exit(&tbf->tf_lock);
3689
3690			/*
3691			 * We don't want tcp_next_port_to_try to "inherit"
3692			 * a port number supplied by the user in a bind.
3693			 */
3694			if (user_specified)
3695				return (port);
3696
3697			/*
3698			 * This is the only place where tcp_next_port_to_try
3699			 * is updated. After the update, it may or may not
3700			 * be in the valid range.
3701			 */
3702			if (!tcp->tcp_anon_priv_bind)
3703				tcp_next_port_to_try = port + 1;
3704			return (port);
3705		}
3706
3707		if (tcp->tcp_anon_priv_bind) {
3708			port = tcp_get_next_priv_port(tcp);
3709		} else {
3710			if (count == 0 && user_specified) {
3711				/*
3712				 * We may have to return an anonymous port. So
3713				 * get one to start with.
3714				 */
3715				port =
3716				    tcp_update_next_port(tcp_next_port_to_try,
3717					tcp, B_TRUE);
3718				user_specified = B_FALSE;
3719			} else {
3720				port = tcp_update_next_port(port + 1, tcp,
3721				    B_FALSE);
3722			}
3723		}
3724		if (port == 0)
3725			break;
3726
3727		/*
3728		 * Don't let this loop run forever in the case where
3729		 * all of the anonymous ports are in use.
3730		 */
3731	} while (++count < loopmax);
3732	return (0);
3733}
3734
3735/*
3736 * We are dying for some reason.  Try to do it gracefully.  (May be called
3737 * as writer.)
3738 *
3739 * Return -1 if the structure was not cleaned up (if the cleanup had to be
3740 * done by a service procedure).
3741 * TBD - Should the return value distinguish between the tcp_t being
3742 * freed and it being reinitialized?
3743 */
3744static int
3745tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
3746{
3747	mblk_t	*mp;
3748	queue_t	*q;
3749
3750	TCP_CLD_STAT(tag);
3751
3752#if TCP_TAG_CLEAN_DEATH
3753	tcp->tcp_cleandeathtag = tag;
3754#endif
3755
3756	if (tcp->tcp_fused)
3757		tcp_unfuse(tcp);
3758
3759	if (tcp->tcp_linger_tid != 0 &&
3760	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
3761		tcp_stop_lingering(tcp);
3762	}
3763
3764	ASSERT(tcp != NULL);
3765	ASSERT((tcp->tcp_family == AF_INET &&
3766	    tcp->tcp_ipversion == IPV4_VERSION) ||
3767	    (tcp->tcp_family == AF_INET6 &&
3768	    (tcp->tcp_ipversion == IPV4_VERSION ||
3769	    tcp->tcp_ipversion == IPV6_VERSION)));
3770
3771	if (TCP_IS_DETACHED(tcp)) {
3772		if (tcp->tcp_hard_binding) {
3773			/*
3774			 * Its an eager that we are dealing with. We close the
3775			 * eager but in case a conn_ind has already gone to the
3776			 * listener, let tcp_accept_finish() send a discon_ind
3777			 * to the listener and drop the last reference. If the
3778			 * listener doesn't even know about the eager i.e. the
3779			 * conn_ind hasn't gone up, blow away the eager and drop
3780			 * the last reference as well. If the conn_ind has gone
3781			 * up, state should be BOUND. tcp_accept_finish
3782			 * will figure out that the connection has received a
3783			 * RST and will send a DISCON_IND to the application.
3784			 */
3785			tcp_closei_local(tcp);
3786			if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) {
3787				CONN_DEC_REF(tcp->tcp_connp);
3788			} else {
3789				tcp->tcp_state = TCPS_BOUND;
3790			}
3791		} else {
3792			tcp_close_detached(tcp);
3793		}
3794		return (0);
3795	}
3796
3797	TCP_STAT(tcp_clean_death_nondetached);
3798
3799	/*
3800	 * If T_ORDREL_IND has not been sent yet (done when service routine
3801	 * is run) postpone cleaning up the endpoint until service routine
3802	 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3803	 * client_errno since tcp_close uses the client_errno field.
3804	 */
3805	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3806		if (err != 0)
3807			tcp->tcp_client_errno = err;
3808
3809		tcp->tcp_deferred_clean_death = B_TRUE;
3810		return (-1);
3811	}
3812
3813	q = tcp->tcp_rq;
3814
3815	/* Trash all inbound data */
3816	flushq(q, FLUSHALL);
3817
3818	/*
3819	 * If we are at least part way open and there is error
3820	 * (err==0 implies no error)
3821	 * notify our client by a T_DISCON_IND.
3822	 */
3823	if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
3824		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
3825		    !TCP_IS_SOCKET(tcp)) {
3826			/*
3827			 * Send M_FLUSH according to TPI. Because sockets will
3828			 * (and must) ignore FLUSHR we do that only for TPI
3829			 * endpoints and sockets in STREAMS mode.
3830			 */
3831			(void) putnextctl1(q, M_FLUSH, FLUSHR);
3832		}
3833		if (tcp->tcp_debug) {
3834			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
3835			    "tcp_clean_death: discon err %d", err);
3836		}
3837		mp = mi_tpi_discon_ind(NULL, err, 0);
3838		if (mp != NULL) {
3839			putnext(q, mp);
3840		} else {
3841			if (tcp->tcp_debug) {
3842				(void) strlog(TCP_MOD_ID, 0, 1,
3843				    SL_ERROR|SL_TRACE,
3844				    "tcp_clean_death, sending M_ERROR");
3845			}
3846			(void) putnextctl1(q, M_ERROR, EPROTO);
3847		}
3848		if (tcp->tcp_state <= TCPS_SYN_RCVD) {
3849			/* SYN_SENT or SYN_RCVD */
3850			BUMP_MIB(&tcp_mib, tcpAttemptFails);
3851		} else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
3852			/* ESTABLISHED or CLOSE_WAIT */
3853			BUMP_MIB(&tcp_mib, tcpEstabResets);
3854		}
3855	}
3856
3857	tcp_reinit(tcp);
3858	return (-1);
3859}
3860
3861/*
3862 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
3863 * to expire, stop the wait and finish the close.
3864 */
3865static void
3866tcp_stop_lingering(tcp_t *tcp)
3867{
3868	clock_t	delta = 0;
3869
3870	tcp->tcp_linger_tid = 0;
3871	if (tcp->tcp_state > TCPS_LISTEN) {
3872		tcp_acceptor_hash_remove(tcp);
3873		if (tcp->tcp_flow_stopped) {
3874			tcp_clrqfull(tcp);
3875		}
3876
3877		if (tcp->tcp_timer_tid != 0) {
3878			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
3879			tcp->tcp_timer_tid = 0;
3880		}
3881		/*
3882		 * Need to cancel those timers which will not be used when
3883		 * TCP is detached.  This has to be done before the tcp_wq
3884		 * is set to the global queue.
3885		 */
3886		tcp_timers_stop(tcp);
3887
3888
3889		tcp->tcp_detached = B_TRUE;
3890		tcp->tcp_rq = tcp_g_q;
3891		tcp->tcp_wq = WR(tcp_g_q);
3892
3893		if (tcp->tcp_state == TCPS_TIME_WAIT) {
3894			tcp_time_wait_append(tcp);
3895			TCP_DBGSTAT(tcp_detach_time_wait);
3896			goto finish;
3897		}
3898
3899		/*
3900		 * If delta is zero the timer event wasn't executed and was
3901		 * successfully canceled. In this case we need to restart it
3902		 * with the minimal delta possible.
3903		 */
3904		if (delta >= 0) {
3905			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
3906			    delta ? delta : 1);
3907		}
3908	} else {
3909		tcp_closei_local(tcp);
3910		CONN_DEC_REF(tcp->tcp_connp);
3911	}
3912finish:
3913	/* Signal closing thread that it can complete close */
3914	mutex_enter(&tcp->tcp_closelock);
3915	tcp->tcp_detached = B_TRUE;
3916	tcp->tcp_rq = tcp_g_q;
3917	tcp->tcp_wq = WR(tcp_g_q);
3918	tcp->tcp_closed = 1;
3919	cv_signal(&tcp->tcp_closecv);
3920	mutex_exit(&tcp->tcp_closelock);
3921}
3922
3923/*
3924 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
3925 * expires.
3926 */
3927static void
3928tcp_close_linger_timeout(void *arg)
3929{
3930	conn_t	*connp = (conn_t *)arg;
3931	tcp_t 	*tcp = connp->conn_tcp;
3932
3933	tcp->tcp_client_errno = ETIMEDOUT;
3934	tcp_stop_lingering(tcp);
3935}
3936
3937static int
3938tcp_close(queue_t *q, int flags)
3939{
3940	conn_t		*connp = Q_TO_CONN(q);
3941	tcp_t		*tcp = connp->conn_tcp;
3942	mblk_t 		*mp = &tcp->tcp_closemp;
3943	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
3944
3945	ASSERT(WR(q)->q_next == NULL);
3946	ASSERT(connp->conn_ref >= 2);
3947	ASSERT((connp->conn_flags & IPCL_TCPMOD) == 0);
3948
3949	/*
3950	 * We are being closed as /dev/tcp or /dev/tcp6.
3951	 *
3952	 * Mark the conn as closing. ill_pending_mp_add will not
3953	 * add any mp to the pending mp list, after this conn has
3954	 * started closing. Same for sq_pending_mp_add
3955	 */
3956	mutex_enter(&connp->conn_lock);
3957	connp->conn_state_flags |= CONN_CLOSING;
3958	if (connp->conn_oper_pending_ill != NULL)
3959		conn_ioctl_cleanup_reqd = B_TRUE;
3960	CONN_INC_REF_LOCKED(connp);
3961	mutex_exit(&connp->conn_lock);
3962	tcp->tcp_closeflags = (uint8_t)flags;
3963	ASSERT(connp->conn_ref >= 3);
3964
3965	(*tcp_squeue_close_proc)(connp->conn_sqp, mp,
3966	    tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
3967
3968	mutex_enter(&tcp->tcp_closelock);
3969
3970	while (!tcp->tcp_closed)
3971		cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
3972	mutex_exit(&tcp->tcp_closelock);
3973	/*
3974	 * In the case of listener streams that have eagers in the q or q0
3975	 * we wait for the eagers to drop their reference to us. tcp_rq and
3976	 * tcp_wq of the eagers point to our queues. By waiting for the
3977	 * refcnt to drop to 1, we are sure that the eagers have cleaned
3978	 * up their queue pointers and also dropped their references to us.
3979	 */
3980	if (tcp->tcp_wait_for_eagers) {
3981		mutex_enter(&connp->conn_lock);
3982		while (connp->conn_ref != 1) {
3983			cv_wait(&connp->conn_cv, &connp->conn_lock);
3984		}
3985		mutex_exit(&connp->conn_lock);
3986	}
3987	/*
3988	 * ioctl cleanup. The mp is queued in the
3989	 * ill_pending_mp or in the sq_pending_mp.
3990	 */
3991	if (conn_ioctl_cleanup_reqd)
3992		conn_ioctl_cleanup(connp);
3993
3994	qprocsoff(q);
3995	inet_minor_free(ip_minor_arena, connp->conn_dev);
3996
3997	tcp->tcp_cpid = -1;
3998
3999	/*
4000	 * Drop IP's reference on the conn. This is the last reference
4001	 * on the connp if the state was less than established. If the
4002	 * connection has gone into timewait state, then we will have
4003	 * one ref for the TCP and one more ref (total of two) for the
4004	 * classifier connected hash list (a timewait connections stays
4005	 * in connected hash till closed).
4006	 *
4007	 * We can't assert the references because there might be other
4008	 * transient reference places because of some walkers or queued
4009	 * packets in squeue for the timewait state.
4010	 */
4011	CONN_DEC_REF(connp);
4012	q->q_ptr = WR(q)->q_ptr = NULL;
4013	return (0);
4014}
4015
4016static int
4017tcpclose_accept(queue_t *q)
4018{
4019	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
4020
4021	/*
4022	 * We had opened an acceptor STREAM for sockfs which is
4023	 * now being closed due to some error.
4024	 */
4025	qprocsoff(q);
4026	inet_minor_free(ip_minor_arena, (dev_t)q->q_ptr);
4027	q->q_ptr = WR(q)->q_ptr = NULL;
4028	return (0);
4029}
4030
4031
4032/*
4033 * Called by streams close routine via squeues when our client blows off her
4034 * descriptor, we take this to mean: "close the stream state NOW, close the tcp
4035 * connection politely" When SO_LINGER is set (with a non-zero linger time and
4036 * it is not a nonblocking socket) then this routine sleeps until the FIN is
4037 * acked.
4038 *
4039 * NOTE: tcp_close potentially returns error when lingering.
4040 * However, the stream head currently does not pass these errors
4041 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
4042 * errors to the application (from tsleep()) and not errors
4043 * like ECONNRESET caused by receiving a reset packet.
4044 */
4045
4046/* ARGSUSED */
4047static void
4048tcp_close_output(void *arg, mblk_t *mp, void *arg2)
4049{
4050	char	*msg;
4051	conn_t	*connp = (conn_t *)arg;
4052	tcp_t	*tcp = connp->conn_tcp;
4053	clock_t	delta = 0;
4054
4055	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
4056	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
4057
4058	/* Cancel any pending timeout */
4059	if (tcp->tcp_ordrelid != 0) {
4060		if (tcp->tcp_timeout) {
4061			(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid);
4062		}
4063		tcp->tcp_ordrelid = 0;
4064		tcp->tcp_timeout = B_FALSE;
4065	}
4066
4067	mutex_enter(&tcp->tcp_eager_lock);
4068	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
4069		/* Cleanup for listener */
4070		tcp_eager_cleanup(tcp, 0);
4071		tcp->tcp_wait_for_eagers = 1;
4072	}
4073	mutex_exit(&tcp->tcp_eager_lock);
4074
4075	connp->conn_mdt_ok = B_FALSE;
4076	tcp->tcp_mdt = B_FALSE;
4077
4078	msg = NULL;
4079	switch (tcp->tcp_state) {
4080	case TCPS_CLOSED:
4081	case TCPS_IDLE:
4082	case TCPS_BOUND:
4083	case TCPS_LISTEN:
4084		break;
4085	case TCPS_SYN_SENT:
4086		msg = "tcp_close, during connect";
4087		break;
4088	case TCPS_SYN_RCVD:
4089		/*
4090		 * Close during the connect 3-way handshake
4091		 * but here there may or may not be pending data
4092		 * already on queue. Process almost same as in
4093		 * the ESTABLISHED state.
4094		 */
4095		/* FALLTHRU */
4096	default:
4097		if (tcp->tcp_fused)
4098			tcp_unfuse(tcp);
4099
4100		/*
4101		 * If SO_LINGER has set a zero linger time, abort the
4102		 * connection with a reset.
4103		 */
4104		if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
4105			msg = "tcp_close, zero lingertime";
4106			break;
4107		}
4108
4109		ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding);
4110		/*
4111		 * Abort connection if there is unread data queued.
4112		 */
4113		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
4114			msg = "tcp_close, unread data";
4115			break;
4116		}
4117		/*
4118		 * tcp_hard_bound is now cleared thus all packets go through
4119		 * tcp_lookup. This fact is used by tcp_detach below.
4120		 *
4121		 * We have done a qwait() above which could have possibly
4122		 * drained more messages in turn causing transition to a
4123		 * different state. Check whether we have to do the rest
4124		 * of the processing or not.
4125		 */
4126		if (tcp->tcp_state <= TCPS_LISTEN)
4127			break;
4128
4129		/*
4130		 * Transmit the FIN before detaching the tcp_t.
4131		 * After tcp_detach returns this queue/perimeter
4132		 * no longer owns the tcp_t thus others can modify it.
4133		 */
4134		(void) tcp_xmit_end(tcp);
4135
4136		/*
4137		 * If lingering on close then wait until the fin is acked,
4138		 * the SO_LINGER time passes, or a reset is sent/received.
4139		 */
4140		if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
4141		    !(tcp->tcp_fin_acked) &&
4142		    tcp->tcp_state >= TCPS_ESTABLISHED) {
4143			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
4144				tcp->tcp_client_errno = EWOULDBLOCK;
4145			} else if (tcp->tcp_client_errno == 0) {
4146
4147				ASSERT(tcp->tcp_linger_tid == 0);
4148
4149				tcp->tcp_linger_tid = TCP_TIMER(tcp,
4150				    tcp_close_linger_timeout,
4151				    tcp->tcp_lingertime * hz);
4152
4153				/* tcp_close_linger_timeout will finish close */
4154				if (tcp->tcp_linger_tid == 0)
4155					tcp->tcp_client_errno = ENOSR;
4156				else
4157					return;
4158			}
4159
4160			/*
4161			 * Check if we need to detach or just close
4162			 * the instance.
4163			 */
4164			if (tcp->tcp_state <= TCPS_LISTEN)
4165				break;
4166		}
4167
4168		/*
4169		 * Make sure that no other thread will access the tcp_rq of
4170		 * this instance (through lookups etc.) as tcp_rq will go
4171		 * away shortly.
4172		 */
4173		tcp_acceptor_hash_remove(tcp);
4174
4175		if (tcp->tcp_flow_stopped) {
4176			tcp_clrqfull(tcp);
4177		}
4178
4179		if (tcp->tcp_timer_tid != 0) {
4180			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4181			tcp->tcp_timer_tid = 0;
4182		}
4183		/*
4184		 * Need to cancel those timers which will not be used when
4185		 * TCP is detached.  This has to be done before the tcp_wq
4186		 * is set to the global queue.
4187		 */
4188		tcp_timers_stop(tcp);
4189
4190		tcp->tcp_detached = B_TRUE;
4191		if (tcp->tcp_state == TCPS_TIME_WAIT) {
4192			tcp_time_wait_append(tcp);
4193			TCP_DBGSTAT(tcp_detach_time_wait);
4194			ASSERT(connp->conn_ref >= 3);
4195			goto finish;
4196		}
4197
4198		/*
4199		 * If delta is zero the timer event wasn't executed and was
4200		 * successfully canceled. In this case we need to restart it
4201		 * with the minimal delta possible.
4202		 */
4203		if (delta >= 0)
4204			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4205			    delta ? delta : 1);
4206
4207		ASSERT(connp->conn_ref >= 3);
4208		goto finish;
4209	}
4210
4211	/* Detach did not complete. Still need to remove q from stream. */
4212	if (msg) {
4213		if (tcp->tcp_state == TCPS_ESTABLISHED ||
4214		    tcp->tcp_state == TCPS_CLOSE_WAIT)
4215			BUMP_MIB(&tcp_mib, tcpEstabResets);
4216		if (tcp->tcp_state == TCPS_SYN_SENT ||
4217		    tcp->tcp_state == TCPS_SYN_RCVD)
4218			BUMP_MIB(&tcp_mib, tcpAttemptFails);
4219		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
4220	}
4221
4222	tcp_closei_local(tcp);
4223	CONN_DEC_REF(connp);
4224	ASSERT(connp->conn_ref >= 2);
4225
4226finish:
4227	/*
4228	 * Although packets are always processed on the correct
4229	 * tcp's perimeter and access is serialized via squeue's,
4230	 * IP still needs a queue when sending packets in time_wait
4231	 * state so use WR(tcp_g_q) till ip_output() can be
4232	 * changed to deal with just connp. For read side, we
4233	 * could have set tcp_rq to NULL but there are some cases
4234	 * in tcp_rput_data() from early days of this code which
4235	 * do a putnext without checking if tcp is closed. Those
4236	 * need to be identified before both tcp_rq and tcp_wq
4237	 * can be set to NULL and tcp_q_q can disappear forever.
4238	 */
4239	mutex_enter(&tcp->tcp_closelock);
4240	/*
4241	 * Don't change the queues in the case of a listener that has
4242	 * eagers in its q or q0. It could surprise the eagers.
4243	 * Instead wait for the eagers outside the squeue.
4244	 */
4245	if (!tcp->tcp_wait_for_eagers) {
4246		tcp->tcp_detached = B_TRUE;
4247		tcp->tcp_rq = tcp_g_q;
4248		tcp->tcp_wq = WR(tcp_g_q);
4249	}
4250
4251	/* Signal tcp_close() to finish closing. */
4252	tcp->tcp_closed = 1;
4253	cv_signal(&tcp->tcp_closecv);
4254	mutex_exit(&tcp->tcp_closelock);
4255}
4256
4257
4258/*
4259 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp.
4260 * Some stream heads get upset if they see these later on as anything but NULL.
4261 */
4262static void
4263tcp_close_mpp(mblk_t **mpp)
4264{
4265	mblk_t	*mp;
4266
4267	if ((mp = *mpp) != NULL) {
4268		do {
4269			mp->b_next = NULL;
4270			mp->b_prev = NULL;
4271		} while ((mp = mp->b_cont) != NULL);
4272
4273		mp = *mpp;
4274		*mpp = NULL;
4275		freemsg(mp);
4276	}
4277}
4278
4279/* Do detached close. */
4280static void
4281tcp_close_detached(tcp_t *tcp)
4282{
4283	if (tcp->tcp_fused)
4284		tcp_unfuse(tcp);
4285
4286	/*
4287	 * Clustering code serializes TCP disconnect callbacks and
4288	 * cluster tcp list walks by blocking a TCP disconnect callback
4289	 * if a cluster tcp list walk is in progress. This ensures
4290	 * accurate accounting of TCPs in the cluster code even though
4291	 * the TCP list walk itself is not atomic.
4292	 */
4293	tcp_closei_local(tcp);
4294	CONN_DEC_REF(tcp->tcp_connp);
4295}
4296
4297/*
4298 * Stop all TCP timers, and free the timer mblks if requested.
4299 */
4300void
4301tcp_timers_stop(tcp_t *tcp)
4302{
4303	if (tcp->tcp_timer_tid != 0) {
4304		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4305		tcp->tcp_timer_tid = 0;
4306	}
4307	if (tcp->tcp_ka_tid != 0) {
4308		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
4309		tcp->tcp_ka_tid = 0;
4310	}
4311	if (tcp->tcp_ack_tid != 0) {
4312		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4313		tcp->tcp_ack_tid = 0;
4314	}
4315	if (tcp->tcp_push_tid != 0) {
4316		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
4317		tcp->tcp_push_tid = 0;
4318	}
4319}
4320
4321/*
4322 * The tcp_t is going away. Remove it from all lists and set it
4323 * to TCPS_CLOSED. The freeing up of memory is deferred until
4324 * tcp_inactive. This is needed since a thread in tcp_rput might have
4325 * done a CONN_INC_REF on this structure before it was removed from the
4326 * hashes.
4327 */
4328static void
4329tcp_closei_local(tcp_t *tcp)
4330{
4331	ire_t 	*ire;
4332	conn_t	*connp = tcp->tcp_connp;
4333
4334	if (!TCP_IS_SOCKET(tcp))
4335		tcp_acceptor_hash_remove(tcp);
4336
4337	UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs);
4338	tcp->tcp_ibsegs = 0;
4339	UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs);
4340	tcp->tcp_obsegs = 0;
4341
4342	/*
4343	 * If we are an eager connection hanging off a listener that
4344	 * hasn't formally accepted the connection yet, get off his
4345	 * list and blow off any data that we have accumulated.
4346	 */
4347	if (tcp->tcp_listener != NULL) {
4348		tcp_t	*listener = tcp->tcp_listener;
4349		mutex_enter(&listener->tcp_eager_lock);
4350		/*
4351		 * tcp_eager_conn_ind == NULL means that the
4352		 * conn_ind has already gone to listener. At
4353		 * this point, eager will be closed but we
4354		 * leave it in listeners eager list so that
4355		 * if listener decides to close without doing
4356		 * accept, we can clean this up. In tcp_wput_accept
4357		 * we take case of the case of accept on closed
4358		 * eager.
4359		 */
4360		if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) {
4361			tcp_eager_unlink(tcp);
4362			mutex_exit(&listener->tcp_eager_lock);
4363			/*
4364			 * We don't want to have any pointers to the
4365			 * listener queue, after we have released our
4366			 * reference on the listener
4367			 */
4368			tcp->tcp_rq = tcp_g_q;
4369			tcp->tcp_wq = WR(tcp_g_q);
4370			CONN_DEC_REF(listener->tcp_connp);
4371		} else {
4372			mutex_exit(&listener->tcp_eager_lock);
4373		}
4374	}
4375
4376	/* Stop all the timers */
4377	tcp_timers_stop(tcp);
4378
4379	if (tcp->tcp_state == TCPS_LISTEN) {
4380		if (tcp->tcp_ip_addr_cache) {
4381			kmem_free((void *)tcp->tcp_ip_addr_cache,
4382			    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
4383			tcp->tcp_ip_addr_cache = NULL;
4384		}
4385	}
4386	if (tcp->tcp_flow_stopped)
4387		tcp_clrqfull(tcp);
4388
4389	tcp_bind_hash_remove(tcp);
4390	/*
4391	 * If the tcp_time_wait_collector (which runs outside the squeue)
4392	 * is trying to remove this tcp from the time wait list, we will
4393	 * block in tcp_time_wait_remove while trying to acquire the
4394	 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also
4395	 * requires the ipcl_hash_remove to be ordered after the
4396	 * tcp_time_wait_remove for the refcnt checks to work correctly.
4397	 */
4398	if (tcp->tcp_state == TCPS_TIME_WAIT)
4399		tcp_time_wait_remove(tcp, NULL);
4400	CL_INET_DISCONNECT(tcp);
4401	ipcl_hash_remove(connp);
4402
4403	/*
4404	 * Delete the cached ire in conn_ire_cache and also mark
4405	 * the conn as CONDEMNED
4406	 */
4407	mutex_enter(&connp->conn_lock);
4408	connp->conn_state_flags |= CONN_CONDEMNED;
4409	ire = connp->conn_ire_cache;
4410	connp->conn_ire_cache = NULL;
4411	mutex_exit(&connp->conn_lock);
4412	if (ire != NULL)
4413		IRE_REFRELE_NOTR(ire);
4414
4415	/* Need to cleanup any pending ioctls */
4416	ASSERT(tcp->tcp_time_wait_next == NULL);
4417	ASSERT(tcp->tcp_time_wait_prev == NULL);
4418	ASSERT(tcp->tcp_time_wait_expire == 0);
4419	tcp->tcp_state = TCPS_CLOSED;
4420
4421	/* Release any SSL context */
4422	if (tcp->tcp_kssl_ent != NULL) {
4423		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4424		tcp->tcp_kssl_ent = NULL;
4425	}
4426	if (tcp->tcp_kssl_ctx != NULL) {
4427		kssl_release_ctx(tcp->tcp_kssl_ctx);
4428		tcp->tcp_kssl_ctx = NULL;
4429	}
4430	tcp->tcp_kssl_pending = B_FALSE;
4431}
4432
4433/*
4434 * tcp is dying (called from ipcl_conn_destroy and error cases).
4435 * Free the tcp_t in either case.
4436 */
4437void
4438tcp_free(tcp_t *tcp)
4439{
4440	mblk_t	*mp;
4441	ip6_pkt_t	*ipp;
4442
4443	ASSERT(tcp != NULL);
4444	ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
4445
4446	tcp->tcp_rq = NULL;
4447	tcp->tcp_wq = NULL;
4448
4449	tcp_close_mpp(&tcp->tcp_xmit_head);
4450	tcp_close_mpp(&tcp->tcp_reass_head);
4451	if (tcp->tcp_rcv_list != NULL) {
4452		/* Free b_next chain */
4453		tcp_close_mpp(&tcp->tcp_rcv_list);
4454	}
4455	if ((mp = tcp->tcp_urp_mp) != NULL) {
4456		freemsg(mp);
4457	}
4458	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
4459		freemsg(mp);
4460	}
4461
4462	if (tcp->tcp_fused_sigurg_mp != NULL) {
4463		freeb(tcp->tcp_fused_sigurg_mp);
4464		tcp->tcp_fused_sigurg_mp = NULL;
4465	}
4466
4467	if (tcp->tcp_sack_info != NULL) {
4468		if (tcp->tcp_notsack_list != NULL) {
4469			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
4470		}
4471		bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
4472	}
4473
4474	if (tcp->tcp_hopopts != NULL) {
4475		mi_free(tcp->tcp_hopopts);
4476		tcp->tcp_hopopts = NULL;
4477		tcp->tcp_hopoptslen = 0;
4478	}
4479	ASSERT(tcp->tcp_hopoptslen == 0);
4480	if (tcp->tcp_dstopts != NULL) {
4481		mi_free(tcp->tcp_dstopts);
4482		tcp->tcp_dstopts = NULL;
4483		tcp->tcp_dstoptslen = 0;
4484	}
4485	ASSERT(tcp->tcp_dstoptslen == 0);
4486	if (tcp->tcp_rtdstopts != NULL) {
4487		mi_free(tcp->tcp_rtdstopts);
4488		tcp->tcp_rtdstopts = NULL;
4489		tcp->tcp_rtdstoptslen = 0;
4490	}
4491	ASSERT(tcp->tcp_rtdstoptslen == 0);
4492	if (tcp->tcp_rthdr != NULL) {
4493		mi_free(tcp->tcp_rthdr);
4494		tcp->tcp_rthdr = NULL;
4495		tcp->tcp_rthdrlen = 0;
4496	}
4497	ASSERT(tcp->tcp_rthdrlen == 0);
4498
4499	ipp = &tcp->tcp_sticky_ipp;
4500	if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
4501	    IPPF_RTHDR))
4502		ip6_pkt_free(ipp);
4503
4504	/*
4505	 * Free memory associated with the tcp/ip header template.
4506	 */
4507
4508	if (tcp->tcp_iphc != NULL)
4509		bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
4510
4511	/*
4512	 * Following is really a blowing away a union.
4513	 * It happens to have exactly two members of identical size
4514	 * the following code is enough.
4515	 */
4516	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
4517
4518	if (tcp->tcp_tracebuf != NULL) {
4519		kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t));
4520		tcp->tcp_tracebuf = NULL;
4521	}
4522}
4523
4524
4525/*
4526 * Put a connection confirmation message upstream built from the
4527 * address information within 'iph' and 'tcph'.  Report our success or failure.
4528 */
4529static boolean_t
4530tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
4531    mblk_t **defermp)
4532{
4533	sin_t	sin;
4534	sin6_t	sin6;
4535	mblk_t	*mp;
4536	char	*optp = NULL;
4537	int	optlen = 0;
4538	cred_t	*cr;
4539
4540	if (defermp != NULL)
4541		*defermp = NULL;
4542
4543	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
4544		/*
4545		 * Return in T_CONN_CON results of option negotiation through
4546		 * the T_CONN_REQ. Note: If there is an real end-to-end option
4547		 * negotiation, then what is received from remote end needs
4548		 * to be taken into account but there is no such thing (yet?)
4549		 * in our TCP/IP.
4550		 * Note: We do not use mi_offset_param() here as
4551		 * tcp_opts_conn_req contents do not directly come from
4552		 * an application and are either generated in kernel or
4553		 * from user input that was already verified.
4554		 */
4555		mp = tcp->tcp_conn.tcp_opts_conn_req;
4556		optp = (char *)(mp->b_rptr +
4557		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
4558		optlen = (int)
4559		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
4560	}
4561
4562	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
4563		ipha_t *ipha = (ipha_t *)iphdr;
4564
4565		/* packet is IPv4 */
4566		if (tcp->tcp_family == AF_INET) {
4567			sin = sin_null;
4568			sin.sin_addr.s_addr = ipha->ipha_src;
4569			sin.sin_port = *(uint16_t *)tcph->th_lport;
4570			sin.sin_family = AF_INET;
4571			mp = mi_tpi_conn_con(NULL, (char *)&sin,
4572			    (int)sizeof (sin_t), optp, optlen);
4573		} else {
4574			sin6 = sin6_null;
4575			IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4576			sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4577			sin6.sin6_family = AF_INET6;
4578			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4579			    (int)sizeof (sin6_t), optp, optlen);
4580
4581		}
4582	} else {
4583		ip6_t	*ip6h = (ip6_t *)iphdr;
4584
4585		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
4586		ASSERT(tcp->tcp_family == AF_INET6);
4587		sin6 = sin6_null;
4588		sin6.sin6_addr = ip6h->ip6_src;
4589		sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4590		sin6.sin6_family = AF_INET6;
4591		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4592		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4593		    (int)sizeof (sin6_t), optp, optlen);
4594	}
4595
4596	if (!mp)
4597		return (B_FALSE);
4598
4599	if ((cr = DB_CRED(idmp)) != NULL) {
4600		mblk_setcred(mp, cr);
4601		DB_CPID(mp) = DB_CPID(idmp);
4602	}
4603
4604	if (defermp == NULL)
4605		putnext(tcp->tcp_rq, mp);
4606	else
4607		*defermp = mp;
4608
4609	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
4610		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
4611	return (B_TRUE);
4612}
4613
4614/*
4615 * Defense for the SYN attack -
4616 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest
4617 *    one that doesn't have the dontdrop bit set.
4618 * 2. Don't drop a SYN request before its first timeout. This gives every
4619 *    request at least til the first timeout to complete its 3-way handshake.
4620 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
4621 *    requests currently on the queue that has timed out. This will be used
4622 *    as an indicator of whether an attack is under way, so that appropriate
4623 *    actions can be taken. (It's incremented in tcp_timer() and decremented
4624 *    either when eager goes into ESTABLISHED, or gets freed up.)
4625 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
4626 *    # of timeout drops back to <= q0len/32 => SYN alert off
4627 */
4628static boolean_t
4629tcp_drop_q0(tcp_t *tcp)
4630{
4631	tcp_t	*eager;
4632
4633	ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
4634	ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
4635	/*
4636	 * New one is added after next_q0 so prev_q0 points to the oldest
4637	 * Also do not drop any established connections that are deferred on
4638	 * q0 due to q being full
4639	 */
4640
4641	eager = tcp->tcp_eager_prev_q0;
4642	while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) {
4643		eager = eager->tcp_eager_prev_q0;
4644		if (eager == tcp) {
4645			eager = tcp->tcp_eager_prev_q0;
4646			break;
4647		}
4648	}
4649	if (eager->tcp_syn_rcvd_timeout == 0)
4650		return (B_FALSE);
4651
4652	if (tcp->tcp_debug) {
4653		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
4654		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
4655		    " (%d pending) on %s, drop one", tcp_conn_req_max_q0,
4656		    tcp->tcp_conn_req_cnt_q0,
4657		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
4658	}
4659
4660	BUMP_MIB(&tcp_mib, tcpHalfOpenDrop);
4661
4662	/*
4663	 * need to do refhold here because the selected eager could
4664	 * be removed by someone else if we release the eager lock.
4665	 */
4666	CONN_INC_REF(eager->tcp_connp);
4667	mutex_exit(&tcp->tcp_eager_lock);
4668
4669	/* Mark the IRE created for this SYN request temporary */
4670	tcp_ip_ire_mark_advice(eager);
4671	(void) tcp_clean_death(eager, ETIMEDOUT, 5);
4672	CONN_DEC_REF(eager->tcp_connp);
4673
4674	mutex_enter(&tcp->tcp_eager_lock);
4675	return (B_TRUE);
4676}
4677
4678int
4679tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
4680    tcph_t *tcph, uint_t ipvers, mblk_t *idmp)
4681{
4682	tcp_t 		*ltcp = lconnp->conn_tcp;
4683	tcp_t		*tcp = connp->conn_tcp;
4684	mblk_t		*tpi_mp;
4685	ipha_t		*ipha;
4686	ip6_t		*ip6h;
4687	sin6_t 		sin6;
4688	in6_addr_t 	v6dst;
4689	int		err;
4690	int		ifindex = 0;
4691	cred_t		*cr;
4692
4693	if (ipvers == IPV4_VERSION) {
4694		ipha = (ipha_t *)mp->b_rptr;
4695
4696		connp->conn_send = ip_output;
4697		connp->conn_recv = tcp_input;
4698
4699		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
4700		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
4701
4702		sin6 = sin6_null;
4703		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4704		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
4705		sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4706		sin6.sin6_family = AF_INET6;
4707		sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst,
4708		    lconnp->conn_zoneid);
4709		if (tcp->tcp_recvdstaddr) {
4710			sin6_t	sin6d;
4711
4712			sin6d = sin6_null;
4713			IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst,
4714			    &sin6d.sin6_addr);
4715			sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4716			sin6d.sin6_family = AF_INET;
4717			tpi_mp = mi_tpi_extconn_ind(NULL,
4718			    (char *)&sin6d, sizeof (sin6_t),
4719			    (char *)&tcp,
4720			    (t_scalar_t)sizeof (intptr_t),
4721			    (char *)&sin6d, sizeof (sin6_t),
4722			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4723		} else {
4724			tpi_mp = mi_tpi_conn_ind(NULL,
4725			    (char *)&sin6, sizeof (sin6_t),
4726			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4727			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4728		}
4729	} else {
4730		ip6h = (ip6_t *)mp->b_rptr;
4731
4732		connp->conn_send = ip_output_v6;
4733		connp->conn_recv = tcp_input;
4734
4735		connp->conn_srcv6 = ip6h->ip6_dst;
4736		connp->conn_remv6 = ip6h->ip6_src;
4737
4738		/* db_cksumstuff is set at ip_fanout_tcp_v6 */
4739		ifindex = (int)DB_CKSUMSTUFF(mp);
4740		DB_CKSUMSTUFF(mp) = 0;
4741
4742		sin6 = sin6_null;
4743		sin6.sin6_addr = ip6h->ip6_src;
4744		sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4745		sin6.sin6_family = AF_INET6;
4746		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4747		sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
4748		    lconnp->conn_zoneid);
4749
4750		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
4751			/* Pass up the scope_id of remote addr */
4752			sin6.sin6_scope_id = ifindex;
4753		} else {
4754			sin6.sin6_scope_id = 0;
4755		}
4756		if (tcp->tcp_recvdstaddr) {
4757			sin6_t	sin6d;
4758
4759			sin6d = sin6_null;
4760			sin6.sin6_addr = ip6h->ip6_dst;
4761			sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4762			sin6d.sin6_family = AF_INET;
4763			tpi_mp = mi_tpi_extconn_ind(NULL,
4764			    (char *)&sin6d, sizeof (sin6_t),
4765			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4766			    (char *)&sin6d, sizeof (sin6_t),
4767			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4768		} else {
4769			tpi_mp = mi_tpi_conn_ind(NULL,
4770			    (char *)&sin6, sizeof (sin6_t),
4771			    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4772			    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4773		}
4774	}
4775
4776	if (tpi_mp == NULL)
4777		return (ENOMEM);
4778
4779	connp->conn_fport = *(uint16_t *)tcph->th_lport;
4780	connp->conn_lport = *(uint16_t *)tcph->th_fport;
4781	connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER);
4782	connp->conn_fully_bound = B_FALSE;
4783
4784	if (tcp_trace)
4785		tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
4786
4787	/* Inherit information from the "parent" */
4788	tcp->tcp_ipversion = ltcp->tcp_ipversion;
4789	tcp->tcp_family = ltcp->tcp_family;
4790	tcp->tcp_wq = ltcp->tcp_wq;
4791	tcp->tcp_rq = ltcp->tcp_rq;
4792	tcp->tcp_mss = tcp_mss_def_ipv6;
4793	tcp->tcp_detached = B_TRUE;
4794	if ((err = tcp_init_values(tcp)) != 0) {
4795		freemsg(tpi_mp);
4796		return (err);
4797	}
4798
4799	if (ipvers == IPV4_VERSION) {
4800		if ((err = tcp_header_init_ipv4(tcp)) != 0) {
4801			freemsg(tpi_mp);
4802			return (err);
4803		}
4804		ASSERT(tcp->tcp_ipha != NULL);
4805	} else {
4806		/* ifindex must be already set */
4807		ASSERT(ifindex != 0);
4808
4809		if (ltcp->tcp_bound_if != 0) {
4810			/*
4811			 * Set newtcp's bound_if equal to
4812			 * listener's value. If ifindex is
4813			 * not the same as ltcp->tcp_bound_if,
4814			 * it must be a packet for the ipmp group
4815			 * of interfaces
4816			 */
4817			tcp->tcp_bound_if = ltcp->tcp_bound_if;
4818		} else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
4819			tcp->tcp_bound_if = ifindex;
4820		}
4821
4822		tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
4823		tcp->tcp_recvifindex = 0;
4824		tcp->tcp_recvhops = 0xffffffffU;
4825		ASSERT(tcp->tcp_ip6h != NULL);
4826	}
4827
4828	tcp->tcp_lport = ltcp->tcp_lport;
4829
4830	if (ltcp->tcp_ipversion == tcp->tcp_ipversion) {
4831		if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) {
4832			/*
4833			 * Listener had options of some sort; eager inherits.
4834			 * Free up the eager template and allocate one
4835			 * of the right size.
4836			 */
4837			if (tcp->tcp_hdr_grown) {
4838				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
4839			} else {
4840				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
4841				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
4842			}
4843			tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len,
4844			    KM_NOSLEEP);
4845			if (tcp->tcp_iphc == NULL) {
4846				tcp->tcp_iphc_len = 0;
4847				freemsg(tpi_mp);
4848				return (ENOMEM);
4849			}
4850			tcp->tcp_iphc_len = ltcp->tcp_iphc_len;
4851			tcp->tcp_hdr_grown = B_TRUE;
4852		}
4853		tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
4854		tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
4855		tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
4856		tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops;
4857		tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf;
4858
4859		/*
4860		 * Copy the IP+TCP header template from listener to eager
4861		 */
4862		bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
4863		if (tcp->tcp_ipversion == IPV6_VERSION) {
4864			if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt ==
4865			    IPPROTO_RAW) {
4866				tcp->tcp_ip6h =
4867				    (ip6_t *)(tcp->tcp_iphc +
4868					sizeof (ip6i_t));
4869			} else {
4870				tcp->tcp_ip6h =
4871				    (ip6_t *)(tcp->tcp_iphc);
4872			}
4873			tcp->tcp_ipha = NULL;
4874		} else {
4875			tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
4876			tcp->tcp_ip6h = NULL;
4877		}
4878		tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
4879		    tcp->tcp_ip_hdr_len);
4880	} else {
4881		/*
4882		 * only valid case when ipversion of listener and
4883		 * eager differ is when listener is IPv6 and
4884		 * eager is IPv4.
4885		 * Eager header template has been initialized to the
4886		 * maximum v4 header sizes, which includes space for
4887		 * TCP and IP options.
4888		 */
4889		ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) &&
4890		    (tcp->tcp_ipversion == IPV4_VERSION));
4891		ASSERT(tcp->tcp_iphc_len >=
4892		    TCP_MAX_COMBINED_HEADER_LENGTH);
4893		tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
4894		/* copy IP header fields individually */
4895		tcp->tcp_ipha->ipha_ttl =
4896		    ltcp->tcp_ip6h->ip6_hops;
4897		bcopy(ltcp->tcp_tcph->th_lport,
4898		    tcp->tcp_tcph->th_lport, sizeof (ushort_t));
4899	}
4900
4901	bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
4902	bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport,
4903	    sizeof (in_port_t));
4904
4905	if (ltcp->tcp_lport == 0) {
4906		tcp->tcp_lport = *(in_port_t *)tcph->th_fport;
4907		bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport,
4908		    sizeof (in_port_t));
4909	}
4910
4911	if (tcp->tcp_ipversion == IPV4_VERSION) {
4912		ASSERT(ipha != NULL);
4913		tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
4914		tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
4915
4916		/* Source routing option copyover (reverse it) */
4917		if (tcp_rev_src_routes)
4918			tcp_opt_reverse(tcp, ipha);
4919	} else {
4920		ASSERT(ip6h != NULL);
4921		tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src;
4922		tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst;
4923	}
4924
4925	ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
4926	/*
4927	 * If the SYN contains a credential, it's a loopback packet; attach
4928	 * the credential to the TPI message.
4929	 */
4930	if ((cr = DB_CRED(idmp)) != NULL) {
4931		mblk_setcred(tpi_mp, cr);
4932		DB_CPID(tpi_mp) = DB_CPID(idmp);
4933	}
4934	tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
4935
4936	/* Inherit the listener's SSL protection state */
4937
4938	if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
4939		kssl_hold_ent(tcp->tcp_kssl_ent);
4940		tcp->tcp_kssl_pending = B_TRUE;
4941	}
4942
4943	return (0);
4944}
4945
4946
4947int
4948tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
4949    tcph_t *tcph, mblk_t *idmp)
4950{
4951	tcp_t 		*ltcp = lconnp->conn_tcp;
4952	tcp_t		*tcp = connp->conn_tcp;
4953	sin_t		sin;
4954	mblk_t		*tpi_mp = NULL;
4955	int		err;
4956	cred_t		*cr;
4957
4958	sin = sin_null;
4959	sin.sin_addr.s_addr = ipha->ipha_src;
4960	sin.sin_port = *(uint16_t *)tcph->th_lport;
4961	sin.sin_family = AF_INET;
4962	if (ltcp->tcp_recvdstaddr) {
4963		sin_t	sind;
4964
4965		sind = sin_null;
4966		sind.sin_addr.s_addr = ipha->ipha_dst;
4967		sind.sin_port = *(uint16_t *)tcph->th_fport;
4968		sind.sin_family = AF_INET;
4969		tpi_mp = mi_tpi_extconn_ind(NULL,
4970		    (char *)&sind, sizeof (sin_t), (char *)&tcp,
4971		    (t_scalar_t)sizeof (intptr_t), (char *)&sind,
4972		    sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4973	} else {
4974		tpi_mp = mi_tpi_conn_ind(NULL,
4975		    (char *)&sin, sizeof (sin_t),
4976		    (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4977		    (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4978	}
4979
4980	if (tpi_mp == NULL) {
4981		return (ENOMEM);
4982	}
4983
4984	connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER);
4985	connp->conn_send = ip_output;
4986	connp->conn_recv = tcp_input;
4987	connp->conn_fully_bound = B_FALSE;
4988
4989	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
4990	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
4991	connp->conn_fport = *(uint16_t *)tcph->th_lport;
4992	connp->conn_lport = *(uint16_t *)tcph->th_fport;
4993
4994	if (tcp_trace) {
4995		tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
4996	}
4997
4998	/* Inherit information from the "parent" */
4999	tcp->tcp_ipversion = ltcp->tcp_ipversion;
5000	tcp->tcp_family = ltcp->tcp_family;
5001	tcp->tcp_wq = ltcp->tcp_wq;
5002	tcp->tcp_rq = ltcp->tcp_rq;
5003	tcp->tcp_mss = tcp_mss_def_ipv4;
5004	tcp->tcp_detached = B_TRUE;
5005	if ((err = tcp_init_values(tcp)) != 0) {
5006		freemsg(tpi_mp);
5007		return (err);
5008	}
5009
5010	/*
5011	 * Let's make sure that eager tcp template has enough space to
5012	 * copy IPv4 listener's tcp template. Since the conn_t structure is
5013	 * preserved and tcp_iphc_len is also preserved, an eager conn_t may
5014	 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or
5015	 * more (in case of re-allocation of conn_t with tcp-IPv6 template with
5016	 * extension headers or with ip6i_t struct). Note that bcopy() below
5017	 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_
5018	 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener.
5019	 */
5020	ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
5021	ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH);
5022
5023	tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5024	tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5025	tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5026	tcp->tcp_ttl = ltcp->tcp_ttl;
5027	tcp->tcp_tos = ltcp->tcp_tos;
5028
5029	/* Copy the IP+TCP header template from listener to eager */
5030	bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5031	tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5032	tcp->tcp_ip6h = NULL;
5033	tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5034	    tcp->tcp_ip_hdr_len);
5035
5036	/* Initialize the IP addresses and Ports */
5037	tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5038	tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5039	bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5040	bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t));
5041
5042	/* Source routing option copyover (reverse it) */
5043	if (tcp_rev_src_routes)
5044		tcp_opt_reverse(tcp, ipha);
5045
5046	ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5047
5048	/*
5049	 * If the SYN contains a credential, it's a loopback packet; attach
5050	 * the credential to the TPI message.
5051	 */
5052	if ((cr = DB_CRED(idmp)) != NULL) {
5053		mblk_setcred(tpi_mp, cr);
5054		DB_CPID(tpi_mp) = DB_CPID(idmp);
5055	}
5056	tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5057
5058	/* Inherit the listener's SSL protection state */
5059	if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5060		kssl_hold_ent(tcp->tcp_kssl_ent);
5061		tcp->tcp_kssl_pending = B_TRUE;
5062	}
5063
5064	return (0);
5065}
5066
5067/*
5068 * sets up conn for ipsec.
5069 * if the first mblk is M_CTL it is consumed and mpp is updated.
5070 * in case of error mpp is freed.
5071 */
5072conn_t *
5073tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
5074{
5075	conn_t 		*connp = tcp->tcp_connp;
5076	conn_t 		*econnp;
5077	squeue_t 	*new_sqp;
5078	mblk_t 		*first_mp = *mpp;
5079	mblk_t		*mp = *mpp;
5080	boolean_t	mctl_present = B_FALSE;
5081	uint_t		ipvers;
5082
5083	econnp = tcp_get_conn(sqp);
5084	if (econnp == NULL) {
5085		freemsg(first_mp);
5086		return (NULL);
5087	}
5088	if (DB_TYPE(mp) == M_CTL) {
5089		if (mp->b_cont == NULL ||
5090		    mp->b_cont->b_datap->db_type != M_DATA) {
5091			freemsg(first_mp);
5092			return (NULL);
5093		}
5094		mp = mp->b_cont;
5095		if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) {
5096			freemsg(first_mp);
5097			return (NULL);
5098		}
5099
5100		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
5101		first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5102		mctl_present = B_TRUE;
5103	} else {
5104		ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY);
5105		mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5106	}
5107
5108	new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5109	DB_CKSUMSTART(mp) = 0;
5110
5111	ASSERT(OK_32PTR(mp->b_rptr));
5112	ipvers = IPH_HDR_VERSION(mp->b_rptr);
5113	if (ipvers == IPV4_VERSION) {
5114		uint16_t  	*up;
5115		uint32_t	ports;
5116		ipha_t		*ipha;
5117
5118		ipha = (ipha_t *)mp->b_rptr;
5119		up = (uint16_t *)((uchar_t *)ipha +
5120		    IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET);
5121		ports = *(uint32_t *)up;
5122		IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP,
5123		    ipha->ipha_dst, ipha->ipha_src, ports);
5124	} else {
5125		uint16_t  	*up;
5126		uint32_t	ports;
5127		uint16_t	ip_hdr_len;
5128		uint8_t		*nexthdrp;
5129		ip6_t 		*ip6h;
5130		tcph_t		*tcph;
5131
5132		ip6h = (ip6_t *)mp->b_rptr;
5133		if (ip6h->ip6_nxt == IPPROTO_TCP) {
5134			ip_hdr_len = IPV6_HDR_LEN;
5135		} else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len,
5136		    &nexthdrp) || *nexthdrp != IPPROTO_TCP) {
5137			CONN_DEC_REF(econnp);
5138			freemsg(first_mp);
5139			return (NULL);
5140		}
5141		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5142		up = (uint16_t *)tcph->th_lport;
5143		ports = *(uint32_t *)up;
5144		IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP,
5145		    ip6h->ip6_dst, ip6h->ip6_src, ports);
5146	}
5147
5148	/*
5149	 * The caller already ensured that there is a sqp present.
5150	 */
5151	econnp->conn_sqp = new_sqp;
5152
5153	if (connp->conn_policy != NULL) {
5154		ipsec_in_t *ii;
5155		ii = (ipsec_in_t *)(first_mp->b_rptr);
5156		ASSERT(ii->ipsec_in_policy == NULL);
5157		IPPH_REFHOLD(connp->conn_policy);
5158		ii->ipsec_in_policy = connp->conn_policy;
5159
5160		first_mp->b_datap->db_type = IPSEC_POLICY_SET;
5161		if (!ip_bind_ipsec_policy_set(econnp, first_mp)) {
5162			CONN_DEC_REF(econnp);
5163			freemsg(first_mp);
5164			return (NULL);
5165		}
5166	}
5167
5168	if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) {
5169		CONN_DEC_REF(econnp);
5170		freemsg(first_mp);
5171		return (NULL);
5172	}
5173
5174	/*
5175	 * If we know we have some policy, pass the "IPSEC"
5176	 * options size TCP uses this adjust the MSS.
5177	 */
5178	econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp);
5179	if (mctl_present) {
5180		freeb(first_mp);
5181		*mpp = mp;
5182	}
5183
5184	return (econnp);
5185}
5186
5187/*
5188 * tcp_get_conn/tcp_free_conn
5189 *
5190 * tcp_get_conn is used to get a clean tcp connection structure.
5191 * It tries to reuse the connections put on the freelist by the
5192 * time_wait_collector failing which it goes to kmem_cache. This
5193 * way has two benefits compared to just allocating from and
5194 * freeing to kmem_cache.
5195 * 1) The time_wait_collector can free (which includes the cleanup)
5196 * outside the squeue. So when the interrupt comes, we have a clean
5197 * connection sitting in the freelist. Obviously, this buys us
5198 * performance.
5199 *
5200 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request
5201 * has multiple disadvantages - tying up the squeue during alloc, and the
5202 * fact that IPSec policy initialization has to happen here which
5203 * requires us sending a M_CTL and checking for it i.e. real ugliness.
5204 * But allocating the conn/tcp in IP land is also not the best since
5205 * we can't check the 'q' and 'q0' which are protected by squeue and
5206 * blindly allocate memory which might have to be freed here if we are
5207 * not allowed to accept the connection. By using the freelist and
5208 * putting the conn/tcp back in freelist, we don't pay a penalty for
5209 * allocating memory without checking 'q/q0' and freeing it if we can't
5210 * accept the connection.
5211 *
5212 * Care should be taken to put the conn back in the same squeue's freelist
5213 * from which it was allocated. Best results are obtained if conn is
5214 * allocated from listener's squeue and freed to the same. Time wait
5215 * collector will free up the freelist is the connection ends up sitting
5216 * there for too long.
5217 */
5218void *
5219tcp_get_conn(void *arg)
5220{
5221	tcp_t			*tcp = NULL;
5222	conn_t			*connp = NULL;
5223	squeue_t		*sqp = (squeue_t *)arg;
5224	tcp_squeue_priv_t 	*tcp_time_wait;
5225
5226	tcp_time_wait =
5227	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
5228
5229	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
5230	tcp = tcp_time_wait->tcp_free_list;
5231	ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
5232	if (tcp != NULL) {
5233		tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
5234		tcp_time_wait->tcp_free_list_cnt--;
5235		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5236		tcp->tcp_time_wait_next = NULL;
5237		connp = tcp->tcp_connp;
5238		connp->conn_flags |= IPCL_REUSED;
5239		return ((void *)connp);
5240	}
5241	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5242	if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL)
5243		return (NULL);
5244	return ((void *)connp);
5245}
5246
5247/*
5248 * Update the cached label for the given tcp_t.  This should be called once per
5249 * connection, and before any packets are sent or tcp_process_options is
5250 * invoked.  Returns B_FALSE if the correct label could not be constructed.
5251 */
5252static boolean_t
5253tcp_update_label(tcp_t *tcp, const cred_t *cr)
5254{
5255	conn_t *connp = tcp->tcp_connp;
5256
5257	if (tcp->tcp_ipversion == IPV4_VERSION) {
5258		uchar_t optbuf[IP_MAX_OPT_LENGTH];
5259		int added;
5260
5261		if (tsol_compute_label(cr, tcp->tcp_remote, optbuf,
5262		    connp->conn_mac_exempt) != 0)
5263			return (B_FALSE);
5264
5265		added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len);
5266		if (added == -1)
5267			return (B_FALSE);
5268		tcp->tcp_hdr_len += added;
5269		tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added);
5270		tcp->tcp_ip_hdr_len += added;
5271		if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) {
5272			tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3;
5273			added = tsol_prepend_option(optbuf, tcp->tcp_ipha,
5274			    tcp->tcp_hdr_len);
5275			if (added == -1)
5276				return (B_FALSE);
5277			tcp->tcp_hdr_len += added;
5278			tcp->tcp_tcph = (tcph_t *)
5279			    ((uchar_t *)tcp->tcp_tcph + added);
5280			tcp->tcp_ip_hdr_len += added;
5281		}
5282	} else {
5283		uchar_t optbuf[TSOL_MAX_IPV6_OPTION];
5284
5285		if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf,
5286		    connp->conn_mac_exempt) != 0)
5287			return (B_FALSE);
5288		if (tsol_update_sticky(&tcp->tcp_sticky_ipp,
5289		    &tcp->tcp_label_len, optbuf) != 0)
5290			return (B_FALSE);
5291		if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0)
5292			return (B_FALSE);
5293	}
5294
5295	connp->conn_ulp_labeled = 1;
5296
5297	return (B_TRUE);
5298}
5299
5300/* BEGIN CSTYLED */
5301/*
5302 *
5303 * The sockfs ACCEPT path:
5304 * =======================
5305 *
5306 * The eager is now established in its own perimeter as soon as SYN is
5307 * received in tcp_conn_request(). When sockfs receives conn_ind, it
5308 * completes the accept processing on the acceptor STREAM. The sending
5309 * of conn_ind part is common for both sockfs listener and a TLI/XTI
5310 * listener but a TLI/XTI listener completes the accept processing
5311 * on the listener perimeter.
5312 *
5313 * Common control flow for 3 way handshake:
5314 * ----------------------------------------
5315 *
5316 * incoming SYN (listener perimeter) 	-> tcp_rput_data()
5317 *					-> tcp_conn_request()
5318 *
5319 * incoming SYN-ACK-ACK (eager perim) 	-> tcp_rput_data()
5320 * send T_CONN_IND (listener perim)	-> tcp_send_conn_ind()
5321 *
5322 * Sockfs ACCEPT Path:
5323 * -------------------
5324 *
5325 * open acceptor stream (ip_tcpopen allocates tcp_wput_accept()
5326 * as STREAM entry point)
5327 *
5328 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept()
5329 *
5330 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager
5331 * association (we are not behind eager's squeue but sockfs is protecting us
5332 * and no one knows about this stream yet. The STREAMS entry point q->q_info
5333 * is changed to point at tcp_wput().
5334 *
5335 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to
5336 * listener (done on listener's perimeter).
5337 *
5338 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish
5339 * accept.
5340 *
5341 * TLI/XTI client ACCEPT path:
5342 * ---------------------------
5343 *
5344 * soaccept() sends T_CONN_RES on the listener STREAM.
5345 *
5346 * tcp_accept() -> tcp_accept_swap() complete the processing and send
5347 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()).
5348 *
5349 * Locks:
5350 * ======
5351 *
5352 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and
5353 * and listeners->tcp_eager_next_q.
5354 *
5355 * Referencing:
5356 * ============
5357 *
5358 * 1) We start out in tcp_conn_request by eager placing a ref on
5359 * listener and listener adding eager to listeners->tcp_eager_next_q0.
5360 *
5361 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
5362 * doing so we place a ref on the eager. This ref is finally dropped at the
5363 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
5364 * reference is dropped by the squeue framework.
5365 *
5366 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
5367 *
5368 * The reference must be released by the same entity that added the reference
5369 * In the above scheme, the eager is the entity that adds and releases the
5370 * references. Note that tcp_accept_finish executes in the squeue of the eager
5371 * (albeit after it is attached to the acceptor stream). Though 1. executes
5372 * in the listener's squeue, the eager is nascent at this point and the
5373 * reference can be considered to have been added on behalf of the eager.
5374 *
5375 * Eager getting a Reset or listener closing:
5376 * ==========================================
5377 *
5378 * Once the listener and eager are linked, the listener never does the unlink.
5379 * If the listener needs to close, tcp_eager_cleanup() is called which queues
5380 * a message on all eager perimeter. The eager then does the unlink, clears
5381 * any pointers to the listener's queue and drops the reference to the
5382 * listener. The listener waits in tcp_close outside the squeue until its
5383 * refcount has dropped to 1. This ensures that the listener has waited for
5384 * all eagers to clear their association with the listener.
5385 *
5386 * Similarly, if eager decides to go away, it can unlink itself and close.
5387 * When the T_CONN_RES comes down, we check if eager has closed. Note that
5388 * the reference to eager is still valid because of the extra ref we put
5389 * in tcp_send_conn_ind.
5390 *
5391 * Listener can always locate the eager under the protection
5392 * of the listener->tcp_eager_lock, and then do a refhold
5393 * on the eager during the accept processing.
5394 *
5395 * The acceptor stream accesses the eager in the accept processing
5396 * based on the ref placed on eager before sending T_conn_ind.
5397 * The only entity that can negate this refhold is a listener close
5398 * which is mutually exclusive with an active acceptor stream.
5399 *
5400 * Eager's reference on the listener
5401 * ===================================
5402 *
5403 * If the accept happens (even on a closed eager) the eager drops its
5404 * reference on the listener at the start of tcp_accept_finish. If the
5405 * eager is killed due to an incoming RST before the T_conn_ind is sent up,
5406 * the reference is dropped in tcp_closei_local. If the listener closes,
5407 * the reference is dropped in tcp_eager_kill. In all cases the reference
5408 * is dropped while executing in the eager's context (squeue).
5409 */
5410/* END CSTYLED */
5411
5412/* Process the SYN packet, mp, directed at the listener 'tcp' */
5413
5414/*
5415 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
5416 * tcp_rput_data will not see any SYN packets.
5417 */
5418/* ARGSUSED */
5419void
5420tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
5421{
5422	tcph_t		*tcph;
5423	uint32_t	seg_seq;
5424	tcp_t		*eager;
5425	uint_t		ipvers;
5426	ipha_t		*ipha;
5427	ip6_t		*ip6h;
5428	int		err;
5429	conn_t		*econnp = NULL;
5430	squeue_t	*new_sqp;
5431	mblk_t		*mp1;
5432	uint_t 		ip_hdr_len;
5433	conn_t		*connp = (conn_t *)arg;
5434	tcp_t		*tcp = connp->conn_tcp;
5435	ire_t		*ire;
5436	cred_t		*credp;
5437
5438	if (tcp->tcp_state != TCPS_LISTEN)
5439		goto error2;
5440
5441	ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0);
5442
5443	mutex_enter(&tcp->tcp_eager_lock);
5444	if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
5445		mutex_exit(&tcp->tcp_eager_lock);
5446		TCP_STAT(tcp_listendrop);
5447		BUMP_MIB(&tcp_mib, tcpListenDrop);
5448		if (tcp->tcp_debug) {
5449			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
5450			    "tcp_conn_request: listen backlog (max=%d) "
5451			    "overflow (%d pending) on %s",
5452			    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
5453			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
5454		}
5455		goto error2;
5456	}
5457
5458	if (tcp->tcp_conn_req_cnt_q0 >=
5459	    tcp->tcp_conn_req_max + tcp_conn_req_max_q0) {
5460		/*
5461		 * Q0 is full. Drop a pending half-open req from the queue
5462		 * to make room for the new SYN req. Also mark the time we
5463		 * drop a SYN.
5464		 *
5465		 * A more aggressive defense against SYN attack will
5466		 * be to set the "tcp_syn_defense" flag now.
5467		 */
5468		TCP_STAT(tcp_listendropq0);
5469		tcp->tcp_last_rcv_lbolt = lbolt64;
5470		if (!tcp_drop_q0(tcp)) {
5471			mutex_exit(&tcp->tcp_eager_lock);
5472			BUMP_MIB(&tcp_mib, tcpListenDropQ0);
5473			if (tcp->tcp_debug) {
5474				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
5475				    "tcp_conn_request: listen half-open queue "
5476				    "(max=%d) full (%d pending) on %s",
5477				    tcp_conn_req_max_q0,
5478				    tcp->tcp_conn_req_cnt_q0,
5479				    tcp_display(tcp, NULL,
5480				    DISP_PORT_ONLY));
5481			}
5482			goto error2;
5483		}
5484	}
5485	mutex_exit(&tcp->tcp_eager_lock);
5486
5487	/*
5488	 * IP adds STRUIO_EAGER and ensures that the received packet is
5489	 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6
5490	 * link local address.  If IPSec is enabled, db_struioflag has
5491	 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER);
5492	 * otherwise an error case if neither of them is set.
5493	 */
5494	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
5495		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5496		DB_CKSUMSTART(mp) = 0;
5497		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
5498		econnp = (conn_t *)tcp_get_conn(arg2);
5499		if (econnp == NULL)
5500			goto error2;
5501		econnp->conn_sqp = new_sqp;
5502	} else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) {
5503		/*
5504		 * mp is updated in tcp_get_ipsec_conn().
5505		 */
5506		econnp = tcp_get_ipsec_conn(tcp, arg2, &mp);
5507		if (econnp == NULL) {
5508			/*
5509			 * mp freed by tcp_get_ipsec_conn.
5510			 */
5511			return;
5512		}
5513	} else {
5514		goto error2;
5515	}
5516
5517	ASSERT(DB_TYPE(mp) == M_DATA);
5518
5519	ipvers = IPH_HDR_VERSION(mp->b_rptr);
5520	ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION);
5521	ASSERT(OK_32PTR(mp->b_rptr));
5522	if (ipvers == IPV4_VERSION) {
5523		ipha = (ipha_t *)mp->b_rptr;
5524		ip_hdr_len = IPH_HDR_LENGTH(ipha);
5525		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5526	} else {
5527		ip6h = (ip6_t *)mp->b_rptr;
5528		ip_hdr_len = ip_hdr_length_v6(mp, ip6h);
5529		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5530	}
5531
5532	if (tcp->tcp_family == AF_INET) {
5533		ASSERT(ipvers == IPV4_VERSION);
5534		err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp);
5535	} else {
5536		err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp);
5537	}
5538
5539	if (err)
5540		goto error3;
5541
5542	eager = econnp->conn_tcp;
5543
5544	/* Inherit various TCP parameters from the listener */
5545	eager->tcp_naglim = tcp->tcp_naglim;
5546	eager->tcp_first_timer_threshold =
5547	    tcp->tcp_first_timer_threshold;
5548	eager->tcp_second_timer_threshold =
5549	    tcp->tcp_second_timer_threshold;
5550
5551	eager->tcp_first_ctimer_threshold =
5552	    tcp->tcp_first_ctimer_threshold;
5553	eager->tcp_second_ctimer_threshold =
5554	    tcp->tcp_second_ctimer_threshold;
5555
5556	/*
5557	 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics.
5558	 * If it does not, the eager's receive window will be set to the
5559	 * listener's receive window later in this function.
5560	 */
5561	eager->tcp_rwnd = 0;
5562
5563	/*
5564	 * Inherit listener's tcp_init_cwnd.  Need to do this before
5565	 * calling tcp_process_options() where tcp_mss_set() is called
5566	 * to set the initial cwnd.
5567	 */
5568	eager->tcp_init_cwnd = tcp->tcp_init_cwnd;
5569
5570	/*
5571	 * Zones: tcp_adapt_ire() and tcp_send_data() both need the
5572	 * zone id before the accept is completed in tcp_wput_accept().
5573	 */
5574	econnp->conn_zoneid = connp->conn_zoneid;
5575
5576	/* Copy nexthop information from listener to eager */
5577	if (connp->conn_nexthop_set) {
5578		econnp->conn_nexthop_set = connp->conn_nexthop_set;
5579		econnp->conn_nexthop_v4 = connp->conn_nexthop_v4;
5580	}
5581
5582	/*
5583	 * TSOL: tsol_input_proc() needs the eager's cred before the
5584	 * eager is accepted
5585	 */
5586	econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred;
5587	crhold(credp);
5588
5589	/*
5590	 * If the caller has the process-wide flag set, then default to MAC
5591	 * exempt mode.  This allows read-down to unlabeled hosts.
5592	 */
5593	if (getpflags(NET_MAC_AWARE, credp) != 0)
5594		econnp->conn_mac_exempt = B_TRUE;
5595
5596	if (is_system_labeled()) {
5597		cred_t *cr;
5598
5599		if (connp->conn_mlp_type != mlptSingle) {
5600			cr = econnp->conn_peercred = DB_CRED(mp);
5601			if (cr != NULL)
5602				crhold(cr);
5603			else
5604				cr = econnp->conn_cred;
5605			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
5606			    econnp, cred_t *, cr)
5607		} else {
5608			cr = econnp->conn_cred;
5609			DTRACE_PROBE2(syn_accept, conn_t *,
5610			    econnp, cred_t *, cr)
5611		}
5612
5613		if (!tcp_update_label(eager, cr)) {
5614			DTRACE_PROBE3(
5615			    tx__ip__log__error__connrequest__tcp,
5616			    char *, "eager connp(1) label on SYN mp(2) failed",
5617			    conn_t *, econnp, mblk_t *, mp);
5618			goto error3;
5619		}
5620	}
5621
5622	eager->tcp_hard_binding = B_TRUE;
5623
5624	tcp_bind_hash_insert(&tcp_bind_fanout[
5625	    TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
5626
5627	CL_INET_CONNECT(eager);
5628
5629	/*
5630	 * No need to check for multicast destination since ip will only pass
5631	 * up multicasts to those that have expressed interest
5632	 * TODO: what about rejecting broadcasts?
5633	 * Also check that source is not a multicast or broadcast address.
5634	 */
5635	eager->tcp_state = TCPS_SYN_RCVD;
5636
5637
5638	/*
5639	 * There should be no ire in the mp as we are being called after
5640	 * receiving the SYN.
5641	 */
5642	ASSERT(tcp_ire_mp(mp) == NULL);
5643
5644	/*
5645	 * Adapt our mss, ttl, ... according to information provided in IRE.
5646	 */
5647
5648	if (tcp_adapt_ire(eager, NULL) == 0) {
5649		/* Undo the bind_hash_insert */
5650		tcp_bind_hash_remove(eager);
5651		goto error3;
5652	}
5653
5654	/* Process all TCP options. */
5655	tcp_process_options(eager, tcph);
5656
5657	/* Is the other end ECN capable? */
5658	if (tcp_ecn_permitted >= 1 &&
5659	    (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
5660		eager->tcp_ecn_ok = B_TRUE;
5661	}
5662
5663	/*
5664	 * listener->tcp_rq->q_hiwat should be the default window size or a
5665	 * window size changed via SO_RCVBUF option.  First round up the
5666	 * eager's tcp_rwnd to the nearest MSS.  Then find out the window
5667	 * scale option value if needed.  Call tcp_rwnd_set() to finish the
5668	 * setting.
5669	 *
5670	 * Note if there is a rpipe metric associated with the remote host,
5671	 * we should not inherit receive window size from listener.
5672	 */
5673	eager->tcp_rwnd = MSS_ROUNDUP(
5674	    (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat :
5675	    eager->tcp_rwnd), eager->tcp_mss);
5676	if (eager->tcp_snd_ws_ok)
5677		tcp_set_ws_value(eager);
5678	/*
5679	 * Note that this is the only place tcp_rwnd_set() is called for
5680	 * accepting a connection.  We need to call it here instead of
5681	 * after the 3-way handshake because we need to tell the other
5682	 * side our rwnd in the SYN-ACK segment.
5683	 */
5684	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);
5685
5686	/*
5687	 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ
5688	 * via soaccept()->soinheritoptions() which essentially applies
5689	 * all the listener options to the new STREAM. The options that we
5690	 * need to take care of are:
5691	 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST,
5692	 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER,
5693	 * SO_SNDBUF, SO_RCVBUF.
5694	 *
5695	 * SO_RCVBUF:	tcp_rwnd_set() above takes care of it.
5696	 * SO_SNDBUF:	Set the tcp_xmit_hiwater for the eager. When
5697	 *		tcp_maxpsz_set() gets called later from
5698	 *		tcp_accept_finish(), the option takes effect.
5699	 *
5700	 */
5701	/* Set the TCP options */
5702	eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater;
5703	eager->tcp_dgram_errind = tcp->tcp_dgram_errind;
5704	eager->tcp_oobinline = tcp->tcp_oobinline;
5705	eager->tcp_reuseaddr = tcp->tcp_reuseaddr;
5706	eager->tcp_broadcast = tcp->tcp_broadcast;
5707	eager->tcp_useloopback = tcp->tcp_useloopback;
5708	eager->tcp_dontroute = tcp->tcp_dontroute;
5709	eager->tcp_linger = tcp->tcp_linger;
5710	eager->tcp_lingertime = tcp->tcp_lingertime;
5711	if (tcp->tcp_ka_enabled)
5712		eager->tcp_ka_enabled = 1;
5713
5714	/* Set the IP options */
5715	econnp->conn_broadcast = connp->conn_broadcast;
5716	econnp->conn_loopback = connp->conn_loopback;
5717	econnp->conn_dontroute = connp->conn_dontroute;
5718	econnp->conn_reuseaddr = connp->conn_reuseaddr;
5719
5720	/* Put a ref on the listener for the eager. */
5721	CONN_INC_REF(connp);
5722	mutex_enter(&tcp->tcp_eager_lock);
5723	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
5724	eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
5725	tcp->tcp_eager_next_q0 = eager;
5726	eager->tcp_eager_prev_q0 = tcp;
5727
5728	/* Set tcp_listener before adding it to tcp_conn_fanout */
5729	eager->tcp_listener = tcp;
5730	eager->tcp_saved_listener = tcp;
5731
5732	/*
5733	 * Tag this detached tcp vector for later retrieval
5734	 * by our listener client in tcp_accept().
5735	 */
5736	eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum;
5737	tcp->tcp_conn_req_cnt_q0++;
5738	if (++tcp->tcp_conn_req_seqnum == -1) {
5739		/*
5740		 * -1 is "special" and defined in TPI as something
5741		 * that should never be used in T_CONN_IND
5742		 */
5743		++tcp->tcp_conn_req_seqnum;
5744	}
5745	mutex_exit(&tcp->tcp_eager_lock);
5746
5747	if (tcp->tcp_syn_defense) {
5748		/* Don't drop the SYN that comes from a good IP source */
5749		ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache);
5750		if (addr_cache != NULL && eager->tcp_remote ==
5751		    addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
5752			eager->tcp_dontdrop = B_TRUE;
5753		}
5754	}
5755
5756	/*
5757	 * We need to insert the eager in its own perimeter but as soon
5758	 * as we do that, we expose the eager to the classifier and
5759	 * should not touch any field outside the eager's perimeter.
5760	 * So do all the work necessary before inserting the eager
5761	 * in its own perimeter. Be optimistic that ipcl_conn_insert()
5762	 * will succeed but undo everything if it fails.
5763	 */
5764	seg_seq = ABE32_TO_U32(tcph->th_seq);
5765	eager->tcp_irs = seg_seq;
5766	eager->tcp_rack = seg_seq;
5767	eager->tcp_rnxt = seg_seq + 1;
5768	U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
5769	BUMP_MIB(&tcp_mib, tcpPassiveOpens);
5770	eager->tcp_state = TCPS_SYN_RCVD;
5771	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
5772	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
5773	if (mp1 == NULL)
5774		goto error1;
5775	DB_CPID(mp1) = tcp->tcp_cpid;
5776
5777	/*
5778	 * We need to start the rto timer. In normal case, we start
5779	 * the timer after sending the packet on the wire (or at
5780	 * least believing that packet was sent by waiting for
5781	 * CALL_IP_WPUT() to return). Since this is the first packet
5782	 * being sent on the wire for the eager, our initial tcp_rto
5783	 * is at least tcp_rexmit_interval_min which is a fairly
5784	 * large value to allow the algorithm to adjust slowly to large
5785	 * fluctuations of RTT during first few transmissions.
5786	 *
5787	 * Starting the timer first and then sending the packet in this
5788	 * case shouldn't make much difference since tcp_rexmit_interval_min
5789	 * is of the order of several 100ms and starting the timer
5790	 * first and then sending the packet will result in difference
5791	 * of few micro seconds.
5792	 *
5793	 * Without this optimization, we are forced to hold the fanout
5794	 * lock across the ipcl_bind_insert() and sending the packet
5795	 * so that we don't race against an incoming packet (maybe RST)
5796	 * for this eager.
5797	 */
5798
5799	TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT);
5800	TCP_TIMER_RESTART(eager, eager->tcp_rto);
5801
5802
5803	/*
5804	 * Insert the eager in its own perimeter now. We are ready to deal
5805	 * with any packets on eager.
5806	 */
5807	if (eager->tcp_ipversion == IPV4_VERSION) {
5808		if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) {
5809			goto error;
5810		}
5811	} else {
5812		if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) {
5813			goto error;
5814		}
5815	}
5816
5817	/* mark conn as fully-bound */
5818	econnp->conn_fully_bound = B_TRUE;
5819
5820	/* Send the SYN-ACK */
5821	tcp_send_data(eager, eager->tcp_wq, mp1);
5822	freemsg(mp);
5823
5824	return;
5825error:
5826	(void) TCP_TIMER_CANCEL(eager, eager->tcp_timer_tid);
5827	freemsg(mp1);
5828error1:
5829	/* Undo what we did above */
5830	mutex_enter(&tcp->tcp_eager_lock);
5831	tcp_eager_unlink(eager);
5832	mutex_exit(&tcp->tcp_eager_lock);
5833	/* Drop eager's reference on the listener */
5834	CONN_DEC_REF(connp);
5835
5836	/*
5837	 * Delete the cached ire in conn_ire_cache and also mark
5838	 * the conn as CONDEMNED
5839	 */
5840	mutex_enter(&econnp->conn_lock);
5841	econnp->conn_state_flags |= CONN_CONDEMNED;
5842	ire = econnp->conn_ire_cache;
5843	econnp->conn_ire_cache = NULL;
5844	mutex_exit(&econnp->conn_lock);
5845	if (ire != NULL)
5846		IRE_REFRELE_NOTR(ire);
5847
5848	/*
5849	 * tcp_accept_comm inserts the eager to the bind_hash
5850	 * we need to remove it from the hash if ipcl_conn_insert
5851	 * fails.
5852	 */
5853	tcp_bind_hash_remove(eager);
5854	/* Drop the eager ref placed in tcp_open_detached */
5855	CONN_DEC_REF(econnp);
5856
5857	/*
5858	 * If a connection already exists, send the mp to that connections so
5859	 * that it can be appropriately dealt with.
5860	 */
5861	if ((econnp = ipcl_classify(mp, connp->conn_zoneid)) != NULL) {
5862		if (!IPCL_IS_CONNECTED(econnp)) {
5863			/*
5864			 * Something bad happened. ipcl_conn_insert()
5865			 * failed because a connection already existed
5866			 * in connected hash but we can't find it
5867			 * anymore (someone blew it away). Just
5868			 * free this message and hopefully remote
5869			 * will retransmit at which time the SYN can be
5870			 * treated as a new connection or dealth with
5871			 * a TH_RST if a connection already exists.
5872			 */
5873			freemsg(mp);
5874		} else {
5875			squeue_fill(econnp->conn_sqp, mp, tcp_input,
5876			    econnp, SQTAG_TCP_CONN_REQ);
5877		}
5878	} else {
5879		/* Nobody wants this packet */
5880		freemsg(mp);
5881	}
5882	return;
5883error2:
5884	freemsg(mp);
5885	return;
5886error3:
5887	CONN_DEC_REF(econnp);
5888	freemsg(mp);
5889}
5890
5891/*
5892 * In an ideal case of vertical partition in NUMA architecture, its
5893 * beneficial to have the listener and all the incoming connections
5894 * tied to the same squeue. The other constraint is that incoming
5895 * connections should be tied to the squeue attached to interrupted
5896 * CPU for obvious locality reason so this leaves the listener to
5897 * be tied to the same squeue. Our only problem is that when listener
5898 * is binding, the CPU that will get interrupted by the NIC whose
5899 * IP address the listener is binding to is not even known. So
5900 * the code below allows us to change that binding at the time the
5901 * CPU is interrupted by virtue of incoming connection's squeue.
5902 *
5903 * This is usefull only in case of a listener bound to a specific IP
5904 * address. For other kind of listeners, they get bound the
5905 * very first time and there is no attempt to rebind them.
5906 */
5907void
5908tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
5909{
5910	conn_t		*connp = (conn_t *)arg;
5911	squeue_t	*sqp = (squeue_t *)arg2;
5912	squeue_t	*new_sqp;
5913	uint32_t	conn_flags;
5914
5915	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
5916		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5917	} else {
5918		goto done;
5919	}
5920
5921	if (connp->conn_fanout == NULL)
5922		goto done;
5923
5924	if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
5925		mutex_enter(&connp->conn_fanout->connf_lock);
5926		mutex_enter(&connp->conn_lock);
5927		/*
5928		 * No one from read or write side can access us now
5929		 * except for already queued packets on this squeue.
5930		 * But since we haven't changed the squeue yet, they
5931		 * can't execute. If they are processed after we have
5932		 * changed the squeue, they are sent back to the
5933		 * correct squeue down below.
5934		 */
5935		if (connp->conn_sqp != new_sqp) {
5936			while (connp->conn_sqp != new_sqp)
5937				(void) casptr(&connp->conn_sqp, sqp, new_sqp);
5938		}
5939
5940		do {
5941			conn_flags = connp->conn_flags;
5942			conn_flags |= IPCL_FULLY_BOUND;
5943			(void) cas32(&connp->conn_flags, connp->conn_flags,
5944			    conn_flags);
5945		} while (!(connp->conn_flags & IPCL_FULLY_BOUND));
5946
5947		mutex_exit(&connp->conn_fanout->connf_lock);
5948		mutex_exit(&connp->conn_lock);
5949	}
5950
5951done:
5952	if (connp->conn_sqp != sqp) {
5953		CONN_INC_REF(connp);
5954		squeue_fill(connp->conn_sqp, mp,
5955		    connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND);
5956	} else {
5957		tcp_conn_request(connp, mp, sqp);
5958	}
5959}
5960
5961/*
5962 * Successful connect request processing begins when our client passes
5963 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes
5964 * our T_OK_ACK reply message upstream.  The control flow looks like this:
5965 *   upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP
5966 *   upstream <- tcp_rput()                <- IP
5967 * After various error checks are completed, tcp_connect() lays
5968 * the target address and port into the composite header template,
5969 * preallocates the T_OK_ACK reply message, construct a full 12 byte bind
5970 * request followed by an IRE request, and passes the three mblk message
5971 * down to IP looking like this:
5972 *   O_T_BIND_REQ for IP  --> IRE req --> T_OK_ACK for our client
5973 * Processing continues in tcp_rput() when we receive the following message:
5974 *   T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client
5975 * After consuming the first two mblks, tcp_rput() calls tcp_timer(),
5976 * to fire off the connection request, and then passes the T_OK_ACK mblk
5977 * upstream that we filled in below.  There are, of course, numerous
5978 * error conditions along the way which truncate the processing described
5979 * above.
5980 */
5981static void
5982tcp_connect(tcp_t *tcp, mblk_t *mp)
5983{
5984	sin_t		*sin;
5985	sin6_t		*sin6;
5986	queue_t		*q = tcp->tcp_wq;
5987	struct T_conn_req	*tcr;
5988	ipaddr_t	*dstaddrp;
5989	in_port_t	dstport;
5990	uint_t		srcid;
5991
5992	tcr = (struct T_conn_req *)mp->b_rptr;
5993
5994	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
5995	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
5996		tcp_err_ack(tcp, mp, TPROTO, 0);
5997		return;
5998	}
5999
6000	/*
6001	 * Determine packet type based on type of address passed in
6002	 * the request should contain an IPv4 or IPv6 address.
6003	 * Make sure that address family matches the type of
6004	 * family of the the address passed down
6005	 */
6006	switch (tcr->DEST_length) {
6007	default:
6008		tcp_err_ack(tcp, mp, TBADADDR, 0);
6009		return;
6010
6011	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
6012		/*
6013		 * XXX: The check for valid DEST_length was not there
6014		 * in earlier releases and some buggy
6015		 * TLI apps (e.g Sybase) got away with not feeding
6016		 * in sin_zero part of address.
6017		 * We allow that bug to keep those buggy apps humming.
6018		 * Test suites require the check on DEST_length.
6019		 * We construct a new mblk with valid DEST_length
6020		 * free the original so the rest of the code does
6021		 * not have to keep track of this special shorter
6022		 * length address case.
6023		 */
6024		mblk_t *nmp;
6025		struct T_conn_req *ntcr;
6026		sin_t *nsin;
6027
6028		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
6029		    tcr->OPT_length, BPRI_HI);
6030		if (nmp == NULL) {
6031			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
6032			return;
6033		}
6034		ntcr = (struct T_conn_req *)nmp->b_rptr;
6035		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
6036		ntcr->PRIM_type = T_CONN_REQ;
6037		ntcr->DEST_length = sizeof (sin_t);
6038		ntcr->DEST_offset = sizeof (struct T_conn_req);
6039
6040		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
6041		*nsin = sin_null;
6042		/* Get pointer to shorter address to copy from original mp */
6043		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6044		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
6045		if (sin == NULL || !OK_32PTR((char *)sin)) {
6046			freemsg(nmp);
6047			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6048			return;
6049		}
6050		nsin->sin_family = sin->sin_family;
6051		nsin->sin_port = sin->sin_port;
6052		nsin->sin_addr = sin->sin_addr;
6053		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
6054		nmp->b_wptr = (uchar_t *)&nsin[1];
6055		if (tcr->OPT_length != 0) {
6056			ntcr->OPT_length = tcr->OPT_length;
6057			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
6058			bcopy((uchar_t *)tcr + tcr->OPT_offset,
6059			    (uchar_t *)ntcr + ntcr->OPT_offset,
6060			    tcr->OPT_length);
6061			nmp->b_wptr += tcr->OPT_length;
6062		}
6063		freemsg(mp);	/* original mp freed */
6064		mp = nmp;	/* re-initialize original variables */
6065		tcr = ntcr;
6066	}
6067	/* FALLTHRU */
6068
6069	case sizeof (sin_t):
6070		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6071		    sizeof (sin_t));
6072		if (sin == NULL || !OK_32PTR((char *)sin)) {
6073			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6074			return;
6075		}
6076		if (tcp->tcp_family != AF_INET ||
6077		    sin->sin_family != AF_INET) {
6078			tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6079			return;
6080		}
6081		if (sin->sin_port == 0) {
6082			tcp_err_ack(tcp, mp, TBADADDR, 0);
6083			return;
6084		}
6085		if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
6086			tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6087			return;
6088		}
6089
6090		break;
6091
6092	case sizeof (sin6_t):
6093		sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
6094		    sizeof (sin6_t));
6095		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
6096			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6097			return;
6098		}
6099		if (tcp->tcp_family != AF_INET6 ||
6100		    sin6->sin6_family != AF_INET6) {
6101			tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6102			return;
6103		}
6104		if (sin6->sin6_port == 0) {
6105			tcp_err_ack(tcp, mp, TBADADDR, 0);
6106			return;
6107		}
6108		break;
6109	}
6110	/*
6111	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
6112	 * should key on their sequence number and cut them loose.
6113	 */
6114
6115	/*
6116	 * If options passed in, feed it for verification and handling
6117	 */
6118	if (tcr->OPT_length != 0) {
6119		mblk_t	*ok_mp;
6120		mblk_t	*discon_mp;
6121		mblk_t  *conn_opts_mp;
6122		int t_error, sys_error, do_disconnect;
6123
6124		conn_opts_mp = NULL;
6125
6126		if (tcp_conprim_opt_process(tcp, mp,
6127			&do_disconnect, &t_error, &sys_error) < 0) {
6128			if (do_disconnect) {
6129				ASSERT(t_error == 0 && sys_error == 0);
6130				discon_mp = mi_tpi_discon_ind(NULL,
6131				    ECONNREFUSED, 0);
6132				if (!discon_mp) {
6133					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6134					    TSYSERR, ENOMEM);
6135					return;
6136				}
6137				ok_mp = mi_tpi_ok_ack_alloc(mp);
6138				if (!ok_mp) {
6139					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6140					    TSYSERR, ENOMEM);
6141					return;
6142				}
6143				qreply(q, ok_mp);
6144				qreply(q, discon_mp); /* no flush! */
6145			} else {
6146				ASSERT(t_error != 0);
6147				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
6148				    sys_error);
6149			}
6150			return;
6151		}
6152		/*
6153		 * Success in setting options, the mp option buffer represented
6154		 * by OPT_length/offset has been potentially modified and
6155		 * contains results of option processing. We copy it in
6156		 * another mp to save it for potentially influencing returning
6157		 * it in T_CONN_CONN.
6158		 */
6159		if (tcr->OPT_length != 0) { /* there are resulting options */
6160			conn_opts_mp = copyb(mp);
6161			if (!conn_opts_mp) {
6162				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6163				    TSYSERR, ENOMEM);
6164				return;
6165			}
6166			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
6167			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
6168			/*
6169			 * Note:
6170			 * These resulting option negotiation can include any
6171			 * end-to-end negotiation options but there no such
6172			 * thing (yet?) in our TCP/IP.
6173			 */
6174		}
6175	}
6176
6177	/*
6178	 * If we're connecting to an IPv4-mapped IPv6 address, we need to
6179	 * make sure that the template IP header in the tcp structure is an
6180	 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION.  We
6181	 * need to this before we call tcp_bindi() so that the port lookup
6182	 * code will look for ports in the correct port space (IPv4 and
6183	 * IPv6 have separate port spaces).
6184	 */
6185	if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
6186	    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6187		int err = 0;
6188
6189		err = tcp_header_init_ipv4(tcp);
6190		if (err != 0) {
6191			mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6192			goto connect_failed;
6193		}
6194		if (tcp->tcp_lport != 0)
6195			*(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
6196	}
6197
6198	switch (tcp->tcp_state) {
6199	case TCPS_IDLE:
6200		/*
6201		 * We support quick connect, refer to comments in
6202		 * tcp_connect_*()
6203		 */
6204		/* FALLTHRU */
6205	case TCPS_BOUND:
6206	case TCPS_LISTEN:
6207		if (tcp->tcp_family == AF_INET6) {
6208			if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6209				tcp_connect_ipv6(tcp, mp,
6210				    &sin6->sin6_addr,
6211				    sin6->sin6_port, sin6->sin6_flowinfo,
6212				    sin6->__sin6_src_id, sin6->sin6_scope_id);
6213				return;
6214			}
6215			/*
6216			 * Destination adress is mapped IPv6 address.
6217			 * Source bound address should be unspecified or
6218			 * IPv6 mapped address as well.
6219			 */
6220			if (!IN6_IS_ADDR_UNSPECIFIED(
6221			    &tcp->tcp_bound_source_v6) &&
6222			    !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
6223				mp = mi_tpi_err_ack_alloc(mp, TSYSERR,
6224				    EADDRNOTAVAIL);
6225				break;
6226			}
6227			dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
6228			dstport = sin6->sin6_port;
6229			srcid = sin6->__sin6_src_id;
6230		} else {
6231			dstaddrp = &sin->sin_addr.s_addr;
6232			dstport = sin->sin_port;
6233			srcid = 0;
6234		}
6235
6236		tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid);
6237		return;
6238	default:
6239		mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0);
6240		break;
6241	}
6242	/*
6243	 * Note: Code below is the "failure" case
6244	 */
6245	/* return error ack and blow away saved option results if any */
6246connect_failed:
6247	if (mp != NULL)
6248		putnext(tcp->tcp_rq, mp);
6249	else {
6250		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6251		    TSYSERR, ENOMEM);
6252	}
6253	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6254		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6255}
6256
6257/*
6258 * Handle connect to IPv4 destinations, including connections for AF_INET6
6259 * sockets connecting to IPv4 mapped IPv6 destinations.
6260 */
6261static void
6262tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
6263    uint_t srcid)
6264{
6265	tcph_t	*tcph;
6266	mblk_t	*mp1;
6267	ipaddr_t dstaddr = *dstaddrp;
6268	int32_t	oldstate;
6269	uint16_t lport;
6270
6271	ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
6272
6273	/* Check for attempt to connect to INADDR_ANY */
6274	if (dstaddr == INADDR_ANY)  {
6275		/*
6276		 * SunOS 4.x and 4.3 BSD allow an application
6277		 * to connect a TCP socket to INADDR_ANY.
6278		 * When they do this, the kernel picks the
6279		 * address of one interface and uses it
6280		 * instead.  The kernel usually ends up
6281		 * picking the address of the loopback
6282		 * interface.  This is an undocumented feature.
6283		 * However, we provide the same thing here
6284		 * in order to have source and binary
6285		 * compatibility with SunOS 4.x.
6286		 * Update the T_CONN_REQ (sin/sin6) since it is used to
6287		 * generate the T_CONN_CON.
6288		 */
6289		dstaddr = htonl(INADDR_LOOPBACK);
6290		*dstaddrp = dstaddr;
6291	}
6292
6293	/* Handle __sin6_src_id if socket not bound to an IP address */
6294	if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) {
6295		ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6,
6296		    tcp->tcp_connp->conn_zoneid);
6297		IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6,
6298		    tcp->tcp_ipha->ipha_src);
6299	}
6300
6301	/*
6302	 * Don't let an endpoint connect to itself.  Note that
6303	 * the test here does not catch the case where the
6304	 * source IP addr was left unspecified by the user. In
6305	 * this case, the source addr is set in tcp_adapt_ire()
6306	 * using the reply to the T_BIND message that we send
6307	 * down to IP here and the check is repeated in tcp_rput_other.
6308	 */
6309	if (dstaddr == tcp->tcp_ipha->ipha_src &&
6310	    dstport == tcp->tcp_lport) {
6311		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6312		goto failed;
6313	}
6314
6315	tcp->tcp_ipha->ipha_dst = dstaddr;
6316	IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6);
6317
6318	/*
6319	 * Massage a source route if any putting the first hop
6320	 * in iph_dst. Compute a starting value for the checksum which
6321	 * takes into account that the original iph_dst should be
6322	 * included in the checksum but that ip will include the
6323	 * first hop in the source route in the tcp checksum.
6324	 */
6325	tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha);
6326	tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6327	tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
6328	    (tcp->tcp_ipha->ipha_dst & 0xffff));
6329	if ((int)tcp->tcp_sum < 0)
6330		tcp->tcp_sum--;
6331	tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6332	tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6333	    (tcp->tcp_sum >> 16));
6334	tcph = tcp->tcp_tcph;
6335	*(uint16_t *)tcph->th_fport = dstport;
6336	tcp->tcp_fport = dstport;
6337
6338	oldstate = tcp->tcp_state;
6339	/*
6340	 * At this point the remote destination address and remote port fields
6341	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6342	 * have to see which state tcp was in so we can take apropriate action.
6343	 */
6344	if (oldstate == TCPS_IDLE) {
6345		/*
6346		 * We support a quick connect capability here, allowing
6347		 * clients to transition directly from IDLE to SYN_SENT
6348		 * tcp_bindi will pick an unused port, insert the connection
6349		 * in the bind hash and transition to BOUND state.
6350		 */
6351		lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE);
6352		lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6353		    B_FALSE, B_FALSE);
6354		if (lport == 0) {
6355			mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6356			goto failed;
6357		}
6358	}
6359	tcp->tcp_state = TCPS_SYN_SENT;
6360
6361	/*
6362	 * TODO: allow data with connect requests
6363	 * by unlinking M_DATA trailers here and
6364	 * linking them in behind the T_OK_ACK mblk.
6365	 * The tcp_rput() bind ack handler would then
6366	 * feed them to tcp_wput_data() rather than call
6367	 * tcp_timer().
6368	 */
6369	mp = mi_tpi_ok_ack_alloc(mp);
6370	if (!mp) {
6371		tcp->tcp_state = oldstate;
6372		goto failed;
6373	}
6374	if (tcp->tcp_family == AF_INET) {
6375		mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6376		    sizeof (ipa_conn_t));
6377	} else {
6378		mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6379		    sizeof (ipa6_conn_t));
6380	}
6381	if (mp1) {
6382		/* Hang onto the T_OK_ACK for later. */
6383		linkb(mp1, mp);
6384		mblk_setcred(mp1, tcp->tcp_cred);
6385		if (tcp->tcp_family == AF_INET)
6386			mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6387		else {
6388			mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6389			    &tcp->tcp_sticky_ipp);
6390		}
6391		BUMP_MIB(&tcp_mib, tcpActiveOpens);
6392		tcp->tcp_active_open = 1;
6393		/*
6394		 * If the bind cannot complete immediately
6395		 * IP will arrange to call tcp_rput_other
6396		 * when the bind completes.
6397		 */
6398		if (mp1 != NULL)
6399			tcp_rput_other(tcp, mp1);
6400		return;
6401	}
6402	/* Error case */
6403	tcp->tcp_state = oldstate;
6404	mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6405
6406failed:
6407	/* return error ack and blow away saved option results if any */
6408	if (mp != NULL)
6409		putnext(tcp->tcp_rq, mp);
6410	else {
6411		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6412		    TSYSERR, ENOMEM);
6413	}
6414	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6415		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6416
6417}
6418
6419/*
6420 * Handle connect to IPv6 destinations.
6421 */
6422static void
6423tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
6424    in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
6425{
6426	tcph_t	*tcph;
6427	mblk_t	*mp1;
6428	ip6_rthdr_t *rth;
6429	int32_t  oldstate;
6430	uint16_t lport;
6431
6432	ASSERT(tcp->tcp_family == AF_INET6);
6433
6434	/*
6435	 * If we're here, it means that the destination address is a native
6436	 * IPv6 address.  Return an error if tcp_ipversion is not IPv6.  A
6437	 * reason why it might not be IPv6 is if the socket was bound to an
6438	 * IPv4-mapped IPv6 address.
6439	 */
6440	if (tcp->tcp_ipversion != IPV6_VERSION) {
6441		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6442		goto failed;
6443	}
6444
6445	/*
6446	 * Interpret a zero destination to mean loopback.
6447	 * Update the T_CONN_REQ (sin/sin6) since it is used to
6448	 * generate the T_CONN_CON.
6449	 */
6450	if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) {
6451		*dstaddrp = ipv6_loopback;
6452	}
6453
6454	/* Handle __sin6_src_id if socket not bound to an IP address */
6455	if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
6456		ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src,
6457		    tcp->tcp_connp->conn_zoneid);
6458		tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src;
6459	}
6460
6461	/*
6462	 * Take care of the scope_id now and add ip6i_t
6463	 * if ip6i_t is not already allocated through TCP
6464	 * sticky options. At this point tcp_ip6h does not
6465	 * have dst info, thus use dstaddrp.
6466	 */
6467	if (scope_id != 0 &&
6468	    IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
6469		ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
6470		ip6i_t  *ip6i;
6471
6472		ipp->ipp_ifindex = scope_id;
6473		ip6i = (ip6i_t *)tcp->tcp_iphc;
6474
6475		if ((ipp->ipp_fields & IPPF_HAS_IP6I) &&
6476		    ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) {
6477			/* Already allocated */
6478			ip6i->ip6i_flags |= IP6I_IFINDEX;
6479			ip6i->ip6i_ifindex = ipp->ipp_ifindex;
6480			ipp->ipp_fields |= IPPF_SCOPE_ID;
6481		} else {
6482			int reterr;
6483
6484			ipp->ipp_fields |= IPPF_SCOPE_ID;
6485			if (ipp->ipp_fields & IPPF_HAS_IP6I)
6486				ip2dbg(("tcp_connect_v6: SCOPE_ID set\n"));
6487			reterr = tcp_build_hdrs(tcp->tcp_rq, tcp);
6488			if (reterr != 0)
6489				goto failed;
6490			ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n"));
6491		}
6492	}
6493
6494	/*
6495	 * Don't let an endpoint connect to itself.  Note that
6496	 * the test here does not catch the case where the
6497	 * source IP addr was left unspecified by the user. In
6498	 * this case, the source addr is set in tcp_adapt_ire()
6499	 * using the reply to the T_BIND message that we send
6500	 * down to IP here and the check is repeated in tcp_rput_other.
6501	 */
6502	if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) &&
6503	    (dstport == tcp->tcp_lport)) {
6504		mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6505		goto failed;
6506	}
6507
6508	tcp->tcp_ip6h->ip6_dst = *dstaddrp;
6509	tcp->tcp_remote_v6 = *dstaddrp;
6510	tcp->tcp_ip6h->ip6_vcf =
6511	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
6512	    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
6513
6514
6515	/*
6516	 * Massage a routing header (if present) putting the first hop
6517	 * in ip6_dst. Compute a starting value for the checksum which
6518	 * takes into account that the original ip6_dst should be
6519	 * included in the checksum but that ip will include the
6520	 * first hop in the source route in the tcp checksum.
6521	 */
6522	rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph);
6523	if (rth != NULL) {
6524
6525		tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth);
6526		tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6527		    (tcp->tcp_sum >> 16));
6528	} else {
6529		tcp->tcp_sum = 0;
6530	}
6531
6532	tcph = tcp->tcp_tcph;
6533	*(uint16_t *)tcph->th_fport = dstport;
6534	tcp->tcp_fport = dstport;
6535
6536	oldstate = tcp->tcp_state;
6537	/*
6538	 * At this point the remote destination address and remote port fields
6539	 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6540	 * have to see which state tcp was in so we can take apropriate action.
6541	 */
6542	if (oldstate == TCPS_IDLE) {
6543		/*
6544		 * We support a quick connect capability here, allowing
6545		 * clients to transition directly from IDLE to SYN_SENT
6546		 * tcp_bindi will pick an unused port, insert the connection
6547		 * in the bind hash and transition to BOUND state.
6548		 */
6549		lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE);
6550		lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6551		    B_FALSE, B_FALSE);
6552		if (lport == 0) {
6553			mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6554			goto failed;
6555		}
6556	}
6557	tcp->tcp_state = TCPS_SYN_SENT;
6558	/*
6559	 * TODO: allow data with connect requests
6560	 * by unlinking M_DATA trailers here and
6561	 * linking them in behind the T_OK_ACK mblk.
6562	 * The tcp_rput() bind ack handler would then
6563	 * feed them to tcp_wput_data() rather than call
6564	 * tcp_timer().
6565	 */
6566	mp = mi_tpi_ok_ack_alloc(mp);
6567	if (!mp) {
6568		tcp->tcp_state = oldstate;
6569		goto failed;
6570	}
6571	mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6572	if (mp1) {
6573		/* Hang onto the T_OK_ACK for later. */
6574		linkb(mp1, mp);
6575		mblk_setcred(mp1, tcp->tcp_cred);
6576		mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6577		    &tcp->tcp_sticky_ipp);
6578		BUMP_MIB(&tcp_mib, tcpActiveOpens);
6579		tcp->tcp_active_open = 1;
6580		/* ip_bind_v6() may return ACK or ERROR */
6581		if (mp1 != NULL)
6582			tcp_rput_other(tcp, mp1);
6583		return;
6584	}
6585	/* Error case */
6586	tcp->tcp_state = oldstate;
6587	mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6588
6589failed:
6590	/* return error ack and blow away saved option results if any */
6591	if (mp != NULL)
6592		putnext(tcp->tcp_rq, mp);
6593	else {
6594		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6595		    TSYSERR, ENOMEM);
6596	}
6597	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6598		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6599}
6600
6601/*
6602 * We need a stream q for detached closing tcp connections
6603 * to use.  Our client hereby indicates that this q is the
6604 * one to use.
6605 */
6606static void
6607tcp_def_q_set(tcp_t *tcp, mblk_t *mp)
6608{
6609	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
6610	queue_t	*q = tcp->tcp_wq;
6611
6612	mp->b_datap->db_type = M_IOCACK;
6613	iocp->ioc_count = 0;
6614	mutex_enter(&tcp_g_q_lock);
6615	if (tcp_g_q != NULL) {
6616		mutex_exit(&tcp_g_q_lock);
6617		iocp->ioc_error = EALREADY;
6618	} else {
6619		mblk_t *mp1;
6620
6621		mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0);
6622		if (mp1 == NULL) {
6623			mutex_exit(&tcp_g_q_lock);
6624			iocp->ioc_error = ENOMEM;
6625		} else {
6626			tcp_g_q = tcp->tcp_rq;
6627			mutex_exit(&tcp_g_q_lock);
6628			iocp->ioc_error = 0;
6629			iocp->ioc_rval = 0;
6630			/*
6631			 * We are passing tcp_sticky_ipp as NULL
6632			 * as it is not useful for tcp_default queue
6633			 */
6634			mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL);
6635			if (mp1 != NULL)
6636				tcp_rput_other(tcp, mp1);
6637		}
6638	}
6639	qreply(q, mp);
6640}
6641
6642/*
6643 * Our client hereby directs us to reject the connection request
6644 * that tcp_conn_request() marked with 'seqnum'.  Rejection consists
6645 * of sending the appropriate RST, not an ICMP error.
6646 */
6647static void
6648tcp_disconnect(tcp_t *tcp, mblk_t *mp)
6649{
6650	tcp_t	*ltcp = NULL;
6651	t_scalar_t seqnum;
6652	conn_t	*connp;
6653
6654	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
6655	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
6656		tcp_err_ack(tcp, mp, TPROTO, 0);
6657		return;
6658	}
6659
6660	/*
6661	 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
6662	 * when the stream is in BOUND state. Do not send a reset,
6663	 * since the destination IP address is not valid, and it can
6664	 * be the initialized value of all zeros (broadcast address).
6665	 *
6666	 * If TCP has sent down a bind request to IP and has not
6667	 * received the reply, reject the request.  Otherwise, TCP
6668	 * will be confused.
6669	 */
6670	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
6671		if (tcp->tcp_debug) {
6672			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
6673			    "tcp_disconnect: bad state, %d", tcp->tcp_state);
6674		}
6675		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
6676		return;
6677	}
6678
6679	seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
6680
6681	if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
6682
6683		/*
6684		 * According to TPI, for non-listeners, ignore seqnum
6685		 * and disconnect.
6686		 * Following interpretation of -1 seqnum is historical
6687		 * and implied TPI ? (TPI only states that for T_CONN_IND,
6688		 * a valid seqnum should not be -1).
6689		 *
6690		 *	-1 means disconnect everything
6691		 *	regardless even on a listener.
6692		 */
6693
6694		int old_state = tcp->tcp_state;
6695
6696		/*
6697		 * The connection can't be on the tcp_time_wait_head list
6698		 * since it is not detached.
6699		 */
6700		ASSERT(tcp->tcp_time_wait_next == NULL);
6701		ASSERT(tcp->tcp_time_wait_prev == NULL);
6702		ASSERT(tcp->tcp_time_wait_expire == 0);
6703		ltcp = NULL;
6704		/*
6705		 * If it used to be a listener, check to make sure no one else
6706		 * has taken the port before switching back to LISTEN state.
6707		 */
6708		if (tcp->tcp_ipversion == IPV4_VERSION) {
6709			connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
6710			    tcp->tcp_ipha->ipha_src,
6711			    tcp->tcp_connp->conn_zoneid);
6712			if (connp != NULL)
6713				ltcp = connp->conn_tcp;
6714		} else {
6715			/* Allow tcp_bound_if listeners? */
6716			connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
6717			    &tcp->tcp_ip6h->ip6_src, 0,
6718			    tcp->tcp_connp->conn_zoneid);
6719			if (connp != NULL)
6720				ltcp = connp->conn_tcp;
6721		}
6722		if (tcp->tcp_conn_req_max && ltcp == NULL) {
6723			tcp->tcp_state = TCPS_LISTEN;
6724		} else if (old_state > TCPS_BOUND) {
6725			tcp->tcp_conn_req_max = 0;
6726			tcp->tcp_state = TCPS_BOUND;
6727		}
6728		if (ltcp != NULL)
6729			CONN_DEC_REF(ltcp->tcp_connp);
6730		if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
6731			BUMP_MIB(&tcp_mib, tcpAttemptFails);
6732		} else if (old_state == TCPS_ESTABLISHED ||
6733		    old_state == TCPS_CLOSE_WAIT) {
6734			BUMP_MIB(&tcp_mib, tcpEstabResets);
6735		}
6736
6737		if (tcp->tcp_fused)
6738			tcp_unfuse(tcp);
6739
6740		mutex_enter(&tcp->tcp_eager_lock);
6741		if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
6742		    (tcp->tcp_conn_req_cnt_q != 0)) {
6743			tcp_eager_cleanup(tcp, 0);
6744		}
6745		mutex_exit(&tcp->tcp_eager_lock);
6746
6747		tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
6748		    tcp->tcp_rnxt, TH_RST | TH_ACK);
6749
6750		tcp_reinit(tcp);
6751
6752		if (old_state >= TCPS_ESTABLISHED) {
6753			/* Send M_FLUSH according to TPI */
6754			(void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
6755		}
6756		mp = mi_tpi_ok_ack_alloc(mp);
6757		if (mp)
6758			putnext(tcp->tcp_rq, mp);
6759		return;
6760	} else if (!tcp_eager_blowoff(tcp, seqnum)) {
6761		tcp_err_ack(tcp, mp, TBADSEQ, 0);
6762		return;
6763	}
6764	if (tcp->tcp_state >= TCPS_ESTABLISHED) {
6765		/* Send M_FLUSH according to TPI */
6766		(void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
6767	}
6768	mp = mi_tpi_ok_ack_alloc(mp);
6769	if (mp)
6770		putnext(tcp->tcp_rq, mp);
6771}
6772
6773/*
6774 * Diagnostic routine used to return a string associated with the tcp state.
6775 * Note that if the caller does not supply a buffer, it will use an internal
6776 * static string.  This means that if multiple threads call this function at
6777 * the same time, output can be corrupted...  Note also that this function
6778 * does not check the size of the supplied buffer.  The caller has to make
6779 * sure that it is big enough.
6780 */
6781static char *
6782tcp_display(tcp_t *tcp, char *sup_buf, char format)
6783{
6784	char		buf1[30];
6785	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
6786	char		*buf;
6787	char		*cp;
6788	in6_addr_t	local, remote;
6789	char		local_addrbuf[INET6_ADDRSTRLEN];
6790	char		remote_addrbuf[INET6_ADDRSTRLEN];
6791
6792	if (sup_buf != NULL)
6793		buf = sup_buf;
6794	else
6795		buf = priv_buf;
6796
6797	if (tcp == NULL)
6798		return ("NULL_TCP");
6799	switch (tcp->tcp_state) {
6800	case TCPS_CLOSED:
6801		cp = "TCP_CLOSED";
6802		break;
6803	case TCPS_IDLE:
6804		cp = "TCP_IDLE";
6805		break;
6806	case TCPS_BOUND:
6807		cp = "TCP_BOUND";
6808		break;
6809	case TCPS_LISTEN:
6810		cp = "TCP_LISTEN";
6811		break;
6812	case TCPS_SYN_SENT:
6813		cp = "TCP_SYN_SENT";
6814		break;
6815	case TCPS_SYN_RCVD:
6816		cp = "TCP_SYN_RCVD";
6817		break;
6818	case TCPS_ESTABLISHED:
6819		cp = "TCP_ESTABLISHED";
6820		break;
6821	case TCPS_CLOSE_WAIT:
6822		cp = "TCP_CLOSE_WAIT";
6823		break;
6824	case TCPS_FIN_WAIT_1:
6825		cp = "TCP_FIN_WAIT_1";
6826		break;
6827	case TCPS_CLOSING:
6828		cp = "TCP_CLOSING";
6829		break;
6830	case TCPS_LAST_ACK:
6831		cp = "TCP_LAST_ACK";
6832		break;
6833	case TCPS_FIN_WAIT_2:
6834		cp = "TCP_FIN_WAIT_2";
6835		break;
6836	case TCPS_TIME_WAIT:
6837		cp = "TCP_TIME_WAIT";
6838		break;
6839	default:
6840		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
6841		cp = buf1;
6842		break;
6843	}
6844	switch (format) {
6845	case DISP_ADDR_AND_PORT:
6846		if (tcp->tcp_ipversion == IPV4_VERSION) {
6847			/*
6848			 * Note that we use the remote address in the tcp_b
6849			 * structure.  This means that it will print out
6850			 * the real destination address, not the next hop's
6851			 * address if source routing is used.
6852			 */
6853			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local);
6854			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote);
6855
6856		} else {
6857			local = tcp->tcp_ip_src_v6;
6858			remote = tcp->tcp_remote_v6;
6859		}
6860		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
6861		    sizeof (local_addrbuf));
6862		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
6863		    sizeof (remote_addrbuf));
6864		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
6865		    local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
6866		    ntohs(tcp->tcp_fport), cp);
6867		break;
6868	case DISP_PORT_ONLY:
6869	default:
6870		(void) mi_sprintf(buf, "[%u, %u] %s",
6871		    ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
6872		break;
6873	}
6874
6875	return (buf);
6876}
6877
6878/*
6879 * Called via squeue to get on to eager's perimeter to send a
6880 * TH_RST. The listener wants the eager to disappear either
6881 * by means of tcp_eager_blowoff() or tcp_eager_cleanup()
6882 * being called.
6883 */
6884/* ARGSUSED */
6885void
6886tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
6887{
6888	conn_t	*econnp = (conn_t *)arg;
6889	tcp_t	*eager = econnp->conn_tcp;
6890	tcp_t	*listener = eager->tcp_listener;
6891
6892	/*
6893	 * We could be called because listener is closing. Since
6894	 * the eager is using listener's queue's, its not safe.
6895	 * Better use the default queue just to send the TH_RST
6896	 * out.
6897	 */
6898	eager->tcp_rq = tcp_g_q;
6899	eager->tcp_wq = WR(tcp_g_q);
6900
6901	if (eager->tcp_state > TCPS_LISTEN) {
6902		tcp_xmit_ctl("tcp_eager_kill, can't wait",
6903		    eager, eager->tcp_snxt, 0, TH_RST);
6904	}
6905
6906	/* We are here because listener wants this eager gone */
6907	if (listener != NULL) {
6908		mutex_enter(&listener->tcp_eager_lock);
6909		tcp_eager_unlink(eager);
6910		if (eager->tcp_conn.tcp_eager_conn_ind == NULL) {
6911			/*
6912			 * The eager has sent a conn_ind up to the
6913			 * listener but listener decides to close
6914			 * instead. We need to drop the extra ref
6915			 * placed on eager in tcp_rput_data() before
6916			 * sending the conn_ind to listener.
6917			 */
6918			CONN_DEC_REF(econnp);
6919		}
6920		mutex_exit(&listener->tcp_eager_lock);
6921		CONN_DEC_REF(listener->tcp_connp);
6922	}
6923
6924	if (eager->tcp_state > TCPS_BOUND)
6925		tcp_close_detached(eager);
6926}
6927
6928/*
6929 * Reset any eager connection hanging off this listener marked
6930 * with 'seqnum' and then reclaim it's resources.
6931 */
6932static boolean_t
6933tcp_eager_blowoff(tcp_t	*listener, t_scalar_t seqnum)
6934{
6935	tcp_t	*eager;
6936	mblk_t 	*mp;
6937
6938	TCP_STAT(tcp_eager_blowoff_calls);
6939	eager = listener;
6940	mutex_enter(&listener->tcp_eager_lock);
6941	do {
6942		eager = eager->tcp_eager_next_q;
6943		if (eager == NULL) {
6944			mutex_exit(&listener->tcp_eager_lock);
6945			return (B_FALSE);
6946		}
6947	} while (eager->tcp_conn_req_seqnum != seqnum);
6948	CONN_INC_REF(eager->tcp_connp);
6949	mutex_exit(&listener->tcp_eager_lock);
6950	mp = &eager->tcp_closemp;
6951	squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
6952	    eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF);
6953	return (B_TRUE);
6954}
6955
6956/*
6957 * Reset any eager connection hanging off this listener
6958 * and then reclaim it's resources.
6959 */
6960static void
6961tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
6962{
6963	tcp_t	*eager;
6964	mblk_t	*mp;
6965
6966	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
6967
6968	if (!q0_only) {
6969		/* First cleanup q */
6970		TCP_STAT(tcp_eager_blowoff_q);
6971		eager = listener->tcp_eager_next_q;
6972		while (eager != NULL) {
6973			CONN_INC_REF(eager->tcp_connp);
6974			mp = &eager->tcp_closemp;
6975			squeue_fill(eager->tcp_connp->conn_sqp, mp,
6976			    tcp_eager_kill, eager->tcp_connp,
6977			    SQTAG_TCP_EAGER_CLEANUP);
6978			eager = eager->tcp_eager_next_q;
6979		}
6980	}
6981	/* Then cleanup q0 */
6982	TCP_STAT(tcp_eager_blowoff_q0);
6983	eager = listener->tcp_eager_next_q0;
6984	while (eager != listener) {
6985		CONN_INC_REF(eager->tcp_connp);
6986		mp = &eager->tcp_closemp;
6987		squeue_fill(eager->tcp_connp->conn_sqp, mp,
6988		    tcp_eager_kill, eager->tcp_connp,
6989		    SQTAG_TCP_EAGER_CLEANUP_Q0);
6990		eager = eager->tcp_eager_next_q0;
6991	}
6992}
6993
6994/*
6995 * If we are an eager connection hanging off a listener that hasn't
6996 * formally accepted the connection yet, get off his list and blow off
6997 * any data that we have accumulated.
6998 */
6999static void
7000tcp_eager_unlink(tcp_t *tcp)
7001{
7002	tcp_t	*listener = tcp->tcp_listener;
7003
7004	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7005	ASSERT(listener != NULL);
7006	if (tcp->tcp_eager_next_q0 != NULL) {
7007		ASSERT(tcp->tcp_eager_prev_q0 != NULL);
7008
7009		/* Remove the eager tcp from q0 */
7010		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
7011		    tcp->tcp_eager_prev_q0;
7012		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
7013		    tcp->tcp_eager_next_q0;
7014		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
7015		listener->tcp_conn_req_cnt_q0--;
7016
7017		tcp->tcp_eager_next_q0 = NULL;
7018		tcp->tcp_eager_prev_q0 = NULL;
7019
7020		if (tcp->tcp_syn_rcvd_timeout != 0) {
7021			/* we have timed out before */
7022			ASSERT(listener->tcp_syn_rcvd_timeout > 0);
7023			listener->tcp_syn_rcvd_timeout--;
7024		}
7025	} else {
7026		tcp_t   **tcpp = &listener->tcp_eager_next_q;
7027		tcp_t	*prev = NULL;
7028
7029		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
7030			if (tcpp[0] == tcp) {
7031				if (listener->tcp_eager_last_q == tcp) {
7032					/*
7033					 * If we are unlinking the last
7034					 * element on the list, adjust
7035					 * tail pointer. Set tail pointer
7036					 * to nil when list is empty.
7037					 */
7038					ASSERT(tcp->tcp_eager_next_q == NULL);
7039					if (listener->tcp_eager_last_q ==
7040					    listener->tcp_eager_next_q) {
7041						listener->tcp_eager_last_q =
7042						NULL;
7043					} else {
7044						/*
7045						 * We won't get here if there
7046						 * is only one eager in the
7047						 * list.
7048						 */
7049						ASSERT(prev != NULL);
7050						listener->tcp_eager_last_q =
7051						    prev;
7052					}
7053				}
7054				tcpp[0] = tcp->tcp_eager_next_q;
7055				tcp->tcp_eager_next_q = NULL;
7056				tcp->tcp_eager_last_q = NULL;
7057				ASSERT(listener->tcp_conn_req_cnt_q > 0);
7058				listener->tcp_conn_req_cnt_q--;
7059				break;
7060			}
7061			prev = tcpp[0];
7062		}
7063	}
7064	tcp->tcp_listener = NULL;
7065}
7066
7067/* Shorthand to generate and send TPI error acks to our client */
7068static void
7069tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
7070{
7071	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
7072		putnext(tcp->tcp_rq, mp);
7073}
7074
7075/* Shorthand to generate and send TPI error acks to our client */
7076static void
7077tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
7078    int t_error, int sys_error)
7079{
7080	struct T_error_ack	*teackp;
7081
7082	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
7083	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
7084		teackp = (struct T_error_ack *)mp->b_rptr;
7085		teackp->ERROR_prim = primitive;
7086		teackp->TLI_error = t_error;
7087		teackp->UNIX_error = sys_error;
7088		putnext(tcp->tcp_rq, mp);
7089	}
7090}
7091
7092/*
7093 * Note: No locks are held when inspecting tcp_g_*epriv_ports
7094 * but instead the code relies on:
7095 * - the fact that the address of the array and its size never changes
7096 * - the atomic assignment of the elements of the array
7097 */
7098/* ARGSUSED */
7099static int
7100tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
7101{
7102	int i;
7103
7104	for (i = 0; i < tcp_g_num_epriv_ports; i++) {
7105		if (tcp_g_epriv_ports[i] != 0)
7106			(void) mi_mpprintf(mp, "%d ", tcp_g_epriv_ports[i]);
7107	}
7108	return (0);
7109}
7110
7111/*
7112 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7113 * threads from changing it at the same time.
7114 */
7115/* ARGSUSED */
7116static int
7117tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7118    cred_t *cr)
7119{
7120	long	new_value;
7121	int	i;
7122
7123	/*
7124	 * Fail the request if the new value does not lie within the
7125	 * port number limits.
7126	 */
7127	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
7128	    new_value <= 0 || new_value >= 65536) {
7129		return (EINVAL);
7130	}
7131
7132	mutex_enter(&tcp_epriv_port_lock);
7133	/* Check if the value is already in the list */
7134	for (i = 0; i < tcp_g_num_epriv_ports; i++) {
7135		if (new_value == tcp_g_epriv_ports[i]) {
7136			mutex_exit(&tcp_epriv_port_lock);
7137			return (EEXIST);
7138		}
7139	}
7140	/* Find an empty slot */
7141	for (i = 0; i < tcp_g_num_epriv_ports; i++) {
7142		if (tcp_g_epriv_ports[i] == 0)
7143			break;
7144	}
7145	if (i == tcp_g_num_epriv_ports) {
7146		mutex_exit(&tcp_epriv_port_lock);
7147		return (EOVERFLOW);
7148	}
7149	/* Set the new value */
7150	tcp_g_epriv_ports[i] = (uint16_t)new_value;
7151	mutex_exit(&tcp_epriv_port_lock);
7152	return (0);
7153}
7154
7155/*
7156 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7157 * threads from changing it at the same time.
7158 */
7159/* ARGSUSED */
7160static int
7161tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7162    cred_t *cr)
7163{
7164	long	new_value;
7165	int	i;
7166
7167	/*
7168	 * Fail the request if the new value does not lie within the
7169	 * port number limits.
7170	 */
7171	if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
7172	    new_value >= 65536) {
7173		return (EINVAL);
7174	}
7175
7176	mutex_enter(&tcp_epriv_port_lock);
7177	/* Check that the value is already in the list */
7178	for (i = 0; i < tcp_g_num_epriv_ports; i++) {
7179		if (tcp_g_epriv_ports[i] == new_value)
7180			break;
7181	}
7182	if (i == tcp_g_num_epriv_ports) {
7183		mutex_exit(&tcp_epriv_port_lock);
7184		return (ESRCH);
7185	}
7186	/* Clear the value */
7187	tcp_g_epriv_ports[i] = 0;
7188	mutex_exit(&tcp_epriv_port_lock);
7189	return (0);
7190}
7191
7192/* Return the TPI/TLI equivalent of our current tcp_state */
7193static int
7194tcp_tpistate(tcp_t *tcp)
7195{
7196	switch (tcp->tcp_state) {
7197	case TCPS_IDLE:
7198		return (TS_UNBND);
7199	case TCPS_LISTEN:
7200		/*
7201		 * Return whether there are outstanding T_CONN_IND waiting
7202		 * for the matching T_CONN_RES. Therefore don't count q0.
7203		 */
7204		if (tcp->tcp_conn_req_cnt_q > 0)
7205			return (TS_WRES_CIND);
7206		else
7207			return (TS_IDLE);
7208	case TCPS_BOUND:
7209		return (TS_IDLE);
7210	case TCPS_SYN_SENT:
7211		return (TS_WCON_CREQ);
7212	case TCPS_SYN_RCVD:
7213		/*
7214		 * Note: assumption: this has to the active open SYN_RCVD.
7215		 * The passive instance is detached in SYN_RCVD stage of
7216		 * incoming connection processing so we cannot get request
7217		 * for T_info_ack on it.
7218		 */
7219		return (TS_WACK_CRES);
7220	case TCPS_ESTABLISHED:
7221		return (TS_DATA_XFER);
7222	case TCPS_CLOSE_WAIT:
7223		return (TS_WREQ_ORDREL);
7224	case TCPS_FIN_WAIT_1:
7225		return (TS_WIND_ORDREL);
7226	case TCPS_FIN_WAIT_2:
7227		return (TS_WIND_ORDREL);
7228
7229	case TCPS_CLOSING:
7230	case TCPS_LAST_ACK:
7231	case TCPS_TIME_WAIT:
7232	case TCPS_CLOSED:
7233		/*
7234		 * Following TS_WACK_DREQ7 is a rendition of "not
7235		 * yet TS_IDLE" TPI state. There is no best match to any
7236		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
7237		 * choose a value chosen that will map to TLI/XTI level
7238		 * state of TSTATECHNG (state is process of changing) which
7239		 * captures what this dummy state represents.
7240		 */
7241		return (TS_WACK_DREQ7);
7242	default:
7243		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
7244		    tcp->tcp_state, tcp_display(tcp, NULL,
7245		    DISP_PORT_ONLY));
7246		return (TS_UNBND);
7247	}
7248}
7249
7250static void
7251tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
7252{
7253	if (tcp->tcp_family == AF_INET6)
7254		*tia = tcp_g_t_info_ack_v6;
7255	else
7256		*tia = tcp_g_t_info_ack;
7257	tia->CURRENT_state = tcp_tpistate(tcp);
7258	tia->OPT_size = tcp_max_optsize;
7259	if (tcp->tcp_mss == 0) {
7260		/* Not yet set - tcp_open does not set mss */
7261		if (tcp->tcp_ipversion == IPV4_VERSION)
7262			tia->TIDU_size = tcp_mss_def_ipv4;
7263		else
7264			tia->TIDU_size = tcp_mss_def_ipv6;
7265	} else {
7266		tia->TIDU_size = tcp->tcp_mss;
7267	}
7268	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
7269}
7270
7271/*
7272 * This routine responds to T_CAPABILITY_REQ messages.  It is called by
7273 * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
7274 * tcp_g_t_info_ack.  The current state of the stream is copied from
7275 * tcp_state.
7276 */
7277static void
7278tcp_capability_req(tcp_t *tcp, mblk_t *mp)
7279{
7280	t_uscalar_t		cap_bits1;
7281	struct T_capability_ack	*tcap;
7282
7283	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
7284		freemsg(mp);
7285		return;
7286	}
7287
7288	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
7289
7290	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
7291	    mp->b_datap->db_type, T_CAPABILITY_ACK);
7292	if (mp == NULL)
7293		return;
7294
7295	tcap = (struct T_capability_ack *)mp->b_rptr;
7296	tcap->CAP_bits1 = 0;
7297
7298	if (cap_bits1 & TC1_INFO) {
7299		tcp_copy_info(&tcap->INFO_ack, tcp);
7300		tcap->CAP_bits1 |= TC1_INFO;
7301	}
7302
7303	if (cap_bits1 & TC1_ACCEPTOR_ID) {
7304		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
7305		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
7306	}
7307
7308	putnext(tcp->tcp_rq, mp);
7309}
7310
7311/*
7312 * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
7313 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
7314 * The current state of the stream is copied from tcp_state.
7315 */
7316static void
7317tcp_info_req(tcp_t *tcp, mblk_t *mp)
7318{
7319	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
7320	    T_INFO_ACK);
7321	if (!mp) {
7322		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7323		return;
7324	}
7325	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
7326	putnext(tcp->tcp_rq, mp);
7327}
7328
7329/* Respond to the TPI addr request */
7330static void
7331tcp_addr_req(tcp_t *tcp, mblk_t *mp)
7332{
7333	sin_t	*sin;
7334	mblk_t	*ackmp;
7335	struct T_addr_ack *taa;
7336
7337	/* Make it large enough for worst case */
7338	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
7339	    2 * sizeof (sin6_t), 1);
7340	if (ackmp == NULL) {
7341		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7342		return;
7343	}
7344
7345	if (tcp->tcp_ipversion == IPV6_VERSION) {
7346		tcp_addr_req_ipv6(tcp, ackmp);
7347		return;
7348	}
7349	taa = (struct T_addr_ack *)ackmp->b_rptr;
7350
7351	bzero(taa, sizeof (struct T_addr_ack));
7352	ackmp->b_wptr = (uchar_t *)&taa[1];
7353
7354	taa->PRIM_type = T_ADDR_ACK;
7355	ackmp->b_datap->db_type = M_PCPROTO;
7356
7357	/*
7358	 * Note: Following code assumes 32 bit alignment of basic
7359	 * data structures like sin_t and struct T_addr_ack.
7360	 */
7361	if (tcp->tcp_state >= TCPS_BOUND) {
7362		/*
7363		 * Fill in local address
7364		 */
7365		taa->LOCADDR_length = sizeof (sin_t);
7366		taa->LOCADDR_offset = sizeof (*taa);
7367
7368		sin = (sin_t *)&taa[1];
7369
7370		/* Fill zeroes and then intialize non-zero fields */
7371		*sin = sin_null;
7372
7373		sin->sin_family = AF_INET;
7374
7375		sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
7376		sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport;
7377
7378		ackmp->b_wptr = (uchar_t *)&sin[1];
7379
7380		if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7381			/*
7382			 * Fill in Remote address
7383			 */
7384			taa->REMADDR_length = sizeof (sin_t);
7385			taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7386						taa->LOCADDR_length);
7387
7388			sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7389			*sin = sin_null;
7390			sin->sin_family = AF_INET;
7391			sin->sin_addr.s_addr = tcp->tcp_remote;
7392			sin->sin_port = tcp->tcp_fport;
7393
7394			ackmp->b_wptr = (uchar_t *)&sin[1];
7395		}
7396	}
7397	putnext(tcp->tcp_rq, ackmp);
7398}
7399
7400/* Assumes that tcp_addr_req gets enough space and alignment */
7401static void
7402tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp)
7403{
7404	sin6_t	*sin6;
7405	struct T_addr_ack *taa;
7406
7407	ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
7408	ASSERT(OK_32PTR(ackmp->b_rptr));
7409	ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) +
7410	    2 * sizeof (sin6_t));
7411
7412	taa = (struct T_addr_ack *)ackmp->b_rptr;
7413
7414	bzero(taa, sizeof (struct T_addr_ack));
7415	ackmp->b_wptr = (uchar_t *)&taa[1];
7416
7417	taa->PRIM_type = T_ADDR_ACK;
7418	ackmp->b_datap->db_type = M_PCPROTO;
7419
7420	/*
7421	 * Note: Following code assumes 32 bit alignment of basic
7422	 * data structures like sin6_t and struct T_addr_ack.
7423	 */
7424	if (tcp->tcp_state >= TCPS_BOUND) {
7425		/*
7426		 * Fill in local address
7427		 */
7428		taa->LOCADDR_length = sizeof (sin6_t);
7429		taa->LOCADDR_offset = sizeof (*taa);
7430
7431		sin6 = (sin6_t *)&taa[1];
7432		*sin6 = sin6_null;
7433
7434		sin6->sin6_family = AF_INET6;
7435		sin6->sin6_addr = tcp->tcp_ip6h->ip6_src;
7436		sin6->sin6_port = tcp->tcp_lport;
7437
7438		ackmp->b_wptr = (uchar_t *)&sin6[1];
7439
7440		if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7441			/*
7442			 * Fill in Remote address
7443			 */
7444			taa->REMADDR_length = sizeof (sin6_t);
7445			taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7446						taa->LOCADDR_length);
7447
7448			sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7449			*sin6 = sin6_null;
7450			sin6->sin6_family = AF_INET6;
7451			sin6->sin6_flowinfo =
7452			    tcp->tcp_ip6h->ip6_vcf &
7453			    ~IPV6_VERS_AND_FLOW_MASK;
7454			sin6->sin6_addr = tcp->tcp_remote_v6;
7455			sin6->sin6_port = tcp->tcp_fport;
7456
7457			ackmp->b_wptr = (uchar_t *)&sin6[1];
7458		}
7459	}
7460	putnext(tcp->tcp_rq, ackmp);
7461}
7462
7463/*
7464 * Handle reinitialization of a tcp structure.
7465 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
7466 */
7467static void
7468tcp_reinit(tcp_t *tcp)
7469{
7470	mblk_t	*mp;
7471	int 	err;
7472
7473	TCP_STAT(tcp_reinit_calls);
7474
7475	/* tcp_reinit should never be called for detached tcp_t's */
7476	ASSERT(tcp->tcp_listener == NULL);
7477	ASSERT((tcp->tcp_family == AF_INET &&
7478	    tcp->tcp_ipversion == IPV4_VERSION) ||
7479	    (tcp->tcp_family == AF_INET6 &&
7480	    (tcp->tcp_ipversion == IPV4_VERSION ||
7481	    tcp->tcp_ipversion == IPV6_VERSION)));
7482
7483	/* Cancel outstanding timers */
7484	tcp_timers_stop(tcp);
7485
7486	/*
7487	 * Reset everything in the state vector, after updating global
7488	 * MIB data from instance counters.
7489	 */
7490	UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs);
7491	tcp->tcp_ibsegs = 0;
7492	UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs);
7493	tcp->tcp_obsegs = 0;
7494
7495	tcp_close_mpp(&tcp->tcp_xmit_head);
7496	if (tcp->tcp_snd_zcopy_aware)
7497		tcp_zcopy_notify(tcp);
7498	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
7499	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
7500	if (tcp->tcp_flow_stopped &&
7501	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
7502		tcp_clrqfull(tcp);
7503	}
7504	tcp_close_mpp(&tcp->tcp_reass_head);
7505	tcp->tcp_reass_tail = NULL;
7506	if (tcp->tcp_rcv_list != NULL) {
7507		/* Free b_next chain */
7508		tcp_close_mpp(&tcp->tcp_rcv_list);
7509		tcp->tcp_rcv_last_head = NULL;
7510		tcp->tcp_rcv_last_tail = NULL;
7511		tcp->tcp_rcv_cnt = 0;
7512	}
7513	tcp->tcp_rcv_last_tail = NULL;
7514
7515	if ((mp = tcp->tcp_urp_mp) != NULL) {
7516		freemsg(mp);
7517		tcp->tcp_urp_mp = NULL;
7518	}
7519	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
7520		freemsg(mp);
7521		tcp->tcp_urp_mark_mp = NULL;
7522	}
7523	if (tcp->tcp_fused_sigurg_mp != NULL) {
7524		freeb(tcp->tcp_fused_sigurg_mp);
7525		tcp->tcp_fused_sigurg_mp = NULL;
7526	}
7527
7528	/*
7529	 * Following is a union with two members which are
7530	 * identical types and size so the following cleanup
7531	 * is enough.
7532	 */
7533	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
7534
7535	CL_INET_DISCONNECT(tcp);
7536
7537	/*
7538	 * The connection can't be on the tcp_time_wait_head list
7539	 * since it is not detached.
7540	 */
7541	ASSERT(tcp->tcp_time_wait_next == NULL);
7542	ASSERT(tcp->tcp_time_wait_prev == NULL);
7543	ASSERT(tcp->tcp_time_wait_expire == 0);
7544
7545	if (tcp->tcp_kssl_pending) {
7546		tcp->tcp_kssl_pending = B_FALSE;
7547
7548		/* Don't reset if the initialized by bind. */
7549		if (tcp->tcp_kssl_ent != NULL) {
7550			kssl_release_ent(tcp->tcp_kssl_ent, NULL,
7551			    KSSL_NO_PROXY);
7552		}
7553	}
7554	if (tcp->tcp_kssl_ctx != NULL) {
7555		kssl_release_ctx(tcp->tcp_kssl_ctx);
7556		tcp->tcp_kssl_ctx = NULL;
7557	}
7558
7559	/*
7560	 * Reset/preserve other values
7561	 */
7562	tcp_reinit_values(tcp);
7563	ipcl_hash_remove(tcp->tcp_connp);
7564	conn_delete_ire(tcp->tcp_connp, NULL);
7565
7566	if (tcp->tcp_conn_req_max != 0) {
7567		/*
7568		 * This is the case when a TLI program uses the same
7569		 * transport end point to accept a connection.  This
7570		 * makes the TCP both a listener and acceptor.  When
7571		 * this connection is closed, we need to set the state
7572		 * back to TCPS_LISTEN.  Make sure that the eager list
7573		 * is reinitialized.
7574		 *
7575		 * Note that this stream is still bound to the four
7576		 * tuples of the previous connection in IP.  If a new
7577		 * SYN with different foreign address comes in, IP will
7578		 * not find it and will send it to the global queue.  In
7579		 * the global queue, TCP will do a tcp_lookup_listener()
7580		 * to find this stream.  This works because this stream
7581		 * is only removed from connected hash.
7582		 *
7583		 */
7584		tcp->tcp_state = TCPS_LISTEN;
7585		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7586		tcp->tcp_connp->conn_recv = tcp_conn_request;
7587		if (tcp->tcp_family == AF_INET6) {
7588			ASSERT(tcp->tcp_connp->conn_af_isv6);
7589			(void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7590			    &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7591		} else {
7592			ASSERT(!tcp->tcp_connp->conn_af_isv6);
7593			(void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7594			    tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7595		}
7596	} else {
7597		tcp->tcp_state = TCPS_BOUND;
7598	}
7599
7600	/*
7601	 * Initialize to default values
7602	 * Can't fail since enough header template space already allocated
7603	 * at open().
7604	 */
7605	err = tcp_init_values(tcp);
7606	ASSERT(err == 0);
7607	/* Restore state in tcp_tcph */
7608	bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
7609	if (tcp->tcp_ipversion == IPV4_VERSION)
7610		tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
7611	else
7612		tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
7613	/*
7614	 * Copy of the src addr. in tcp_t is needed in tcp_t
7615	 * since the lookup funcs can only lookup on tcp_t
7616	 */
7617	tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;
7618
7619	ASSERT(tcp->tcp_ptpbhn != NULL);
7620	tcp->tcp_rq->q_hiwat = tcp_recv_hiwat;
7621	tcp->tcp_rwnd = tcp_recv_hiwat;
7622	tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ?
7623	    tcp_mss_def_ipv6 : tcp_mss_def_ipv4;
7624}
7625
7626/*
7627 * Force values to zero that need be zero.
7628 * Do not touch values asociated with the BOUND or LISTEN state
7629 * since the connection will end up in that state after the reinit.
7630 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
7631 * structure!
7632 */
7633static void
7634tcp_reinit_values(tcp)
7635	tcp_t *tcp;
7636{
7637#ifndef	lint
7638#define	DONTCARE(x)
7639#define	PRESERVE(x)
7640#else
7641#define	DONTCARE(x)	((x) = (x))
7642#define	PRESERVE(x)	((x) = (x))
7643#endif	/* lint */
7644
7645	PRESERVE(tcp->tcp_bind_hash);
7646	PRESERVE(tcp->tcp_ptpbhn);
7647	PRESERVE(tcp->tcp_acceptor_hash);
7648	PRESERVE(tcp->tcp_ptpahn);
7649
7650	/* Should be ASSERT NULL on these with new code! */
7651	ASSERT(tcp->tcp_time_wait_next == NULL);
7652	ASSERT(tcp->tcp_time_wait_prev == NULL);
7653	ASSERT(tcp->tcp_time_wait_expire == 0);
7654	PRESERVE(tcp->tcp_state);
7655	PRESERVE(tcp->tcp_rq);
7656	PRESERVE(tcp->tcp_wq);
7657
7658	ASSERT(tcp->tcp_xmit_head == NULL);
7659	ASSERT(tcp->tcp_xmit_last == NULL);
7660	ASSERT(tcp->tcp_unsent == 0);
7661	ASSERT(tcp->tcp_xmit_tail == NULL);
7662	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
7663
7664	tcp->tcp_snxt = 0;			/* Displayed in mib */
7665	tcp->tcp_suna = 0;			/* Displayed in mib */
7666	tcp->tcp_swnd = 0;
7667	DONTCARE(tcp->tcp_cwnd);		/* Init in tcp_mss_set */
7668
7669	ASSERT(tcp->tcp_ibsegs == 0);
7670	ASSERT(tcp->tcp_obsegs == 0);
7671
7672	if (tcp->tcp_iphc != NULL) {
7673		ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
7674		bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
7675	}
7676
7677	DONTCARE(tcp->tcp_naglim);		/* Init in tcp_init_values */
7678	DONTCARE(tcp->tcp_hdr_len);		/* Init in tcp_init_values */
7679	DONTCARE(tcp->tcp_ipha);
7680	DONTCARE(tcp->tcp_ip6h);
7681	DONTCARE(tcp->tcp_ip_hdr_len);
7682	DONTCARE(tcp->tcp_tcph);
7683	DONTCARE(tcp->tcp_tcp_hdr_len);		/* Init in tcp_init_values */
7684	tcp->tcp_valid_bits = 0;
7685
7686	DONTCARE(tcp->tcp_xmit_hiwater);	/* Init in tcp_init_values */
7687	DONTCARE(tcp->tcp_timer_backoff);	/* Init in tcp_init_values */
7688	DONTCARE(tcp->tcp_last_recv_time);	/* Init in tcp_init_values */
7689	tcp->tcp_last_rcv_lbolt = 0;
7690
7691	tcp->tcp_init_cwnd = 0;
7692
7693	tcp->tcp_urp_last_valid = 0;
7694	tcp->tcp_hard_binding = 0;
7695	tcp->tcp_hard_bound = 0;
7696	PRESERVE(tcp->tcp_cred);
7697	PRESERVE(tcp->tcp_cpid);
7698	PRESERVE(tcp->tcp_exclbind);
7699
7700	tcp->tcp_fin_acked = 0;
7701	tcp->tcp_fin_rcvd = 0;
7702	tcp->tcp_fin_sent = 0;
7703	tcp->tcp_ordrel_done = 0;
7704
7705	tcp->tcp_debug = 0;
7706	tcp->tcp_dontroute = 0;
7707	tcp->tcp_broadcast = 0;
7708
7709	tcp->tcp_useloopback = 0;
7710	tcp->tcp_reuseaddr = 0;
7711	tcp->tcp_oobinline = 0;
7712	tcp->tcp_dgram_errind = 0;
7713
7714	tcp->tcp_detached = 0;
7715	tcp->tcp_bind_pending = 0;
7716	tcp->tcp_unbind_pending = 0;
7717	tcp->tcp_deferred_clean_death = 0;
7718
7719	tcp->tcp_snd_ws_ok = B_FALSE;
7720	tcp->tcp_snd_ts_ok = B_FALSE;
7721	tcp->tcp_linger = 0;
7722	tcp->tcp_ka_enabled = 0;
7723	tcp->tcp_zero_win_probe = 0;
7724
7725	tcp->tcp_loopback = 0;
7726	tcp->tcp_localnet = 0;
7727	tcp->tcp_syn_defense = 0;
7728	tcp->tcp_set_timer = 0;
7729
7730	tcp->tcp_active_open = 0;
7731	ASSERT(tcp->tcp_timeout == B_FALSE);
7732	tcp->tcp_rexmit = B_FALSE;
7733	tcp->tcp_xmit_zc_clean = B_FALSE;
7734
7735	tcp->tcp_snd_sack_ok = B_FALSE;
7736	PRESERVE(tcp->tcp_recvdstaddr);
7737	tcp->tcp_hwcksum = B_FALSE;
7738
7739	tcp->tcp_ire_ill_check_done = B_FALSE;
7740	DONTCARE(tcp->tcp_maxpsz);		/* Init in tcp_init_values */
7741
7742	tcp->tcp_mdt = B_FALSE;
7743	tcp->tcp_mdt_hdr_head = 0;
7744	tcp->tcp_mdt_hdr_tail = 0;
7745
7746	tcp->tcp_conn_def_q0 = 0;
7747	tcp->tcp_ip_forward_progress = B_FALSE;
7748	tcp->tcp_anon_priv_bind = 0;
7749	tcp->tcp_ecn_ok = B_FALSE;
7750
7751	tcp->tcp_cwr = B_FALSE;
7752	tcp->tcp_ecn_echo_on = B_FALSE;
7753
7754	if (tcp->tcp_sack_info != NULL) {
7755		if (tcp->tcp_notsack_list != NULL) {
7756			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
7757		}
7758		kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
7759		tcp->tcp_sack_info = NULL;
7760	}
7761
7762	tcp->tcp_rcv_ws = 0;
7763	tcp->tcp_snd_ws = 0;
7764	tcp->tcp_ts_recent = 0;
7765	tcp->tcp_rnxt = 0;			/* Displayed in mib */
7766	DONTCARE(tcp->tcp_rwnd);		/* Set in tcp_reinit() */
7767	tcp->tcp_if_mtu = 0;
7768
7769	ASSERT(tcp->tcp_reass_head == NULL);
7770	ASSERT(tcp->tcp_reass_tail == NULL);
7771
7772	tcp->tcp_cwnd_cnt = 0;
7773
7774	ASSERT(tcp->tcp_rcv_list == NULL);
7775	ASSERT(tcp->tcp_rcv_last_head == NULL);
7776	ASSERT(tcp->tcp_rcv_last_tail == NULL);
7777	ASSERT(tcp->tcp_rcv_cnt == 0);
7778
7779	DONTCARE(tcp->tcp_cwnd_ssthresh);	/* Init in tcp_adapt_ire */
7780	DONTCARE(tcp->tcp_cwnd_max);		/* Init in tcp_init_values */
7781	tcp->tcp_csuna = 0;
7782
7783	tcp->tcp_rto = 0;			/* Displayed in MIB */
7784	DONTCARE(tcp->tcp_rtt_sa);		/* Init in tcp_init_values */
7785	DONTCARE(tcp->tcp_rtt_sd);		/* Init in tcp_init_values */
7786	tcp->tcp_rtt_update = 0;
7787
7788	DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
7789	DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
7790
7791	tcp->tcp_rack = 0;			/* Displayed in mib */
7792	tcp->tcp_rack_cnt = 0;
7793	tcp->tcp_rack_cur_max = 0;
7794	tcp->tcp_rack_abs_max = 0;
7795
7796	tcp->tcp_max_swnd = 0;
7797
7798	ASSERT(tcp->tcp_listener == NULL);
7799
7800	DONTCARE(tcp->tcp_xmit_lowater);	/* Init in tcp_init_values */
7801
7802	DONTCARE(tcp->tcp_irs);			/* tcp_valid_bits cleared */
7803	DONTCARE(tcp->tcp_iss);			/* tcp_valid_bits cleared */
7804	DONTCARE(tcp->tcp_fss);			/* tcp_valid_bits cleared */
7805	DONTCARE(tcp->tcp_urg);			/* tcp_valid_bits cleared */
7806
7807	ASSERT(tcp->tcp_conn_req_cnt_q == 0);
7808	ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
7809	PRESERVE(tcp->tcp_conn_req_max);
7810	PRESERVE(tcp->tcp_conn_req_seqnum);
7811
7812	DONTCARE(tcp->tcp_ip_hdr_len);		/* Init in tcp_init_values */
7813	DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
7814	DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
7815	DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
7816	DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
7817
7818	tcp->tcp_lingertime = 0;
7819
7820	DONTCARE(tcp->tcp_urp_last);	/* tcp_urp_last_valid is cleared */
7821	ASSERT(tcp->tcp_urp_mp == NULL);
7822	ASSERT(tcp->tcp_urp_mark_mp == NULL);
7823	ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
7824
7825	ASSERT(tcp->tcp_eager_next_q == NULL);
7826	ASSERT(tcp->tcp_eager_last_q == NULL);
7827	ASSERT((tcp->tcp_eager_next_q0 == NULL &&
7828	    tcp->tcp_eager_prev_q0 == NULL) ||
7829	    tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
7830	ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
7831
7832	tcp->tcp_client_errno = 0;
7833
7834	DONTCARE(tcp->tcp_sum);			/* Init in tcp_init_values */
7835
7836	tcp->tcp_remote_v6 = ipv6_all_zeros;	/* Displayed in MIB */
7837
7838	PRESERVE(tcp->tcp_bound_source_v6);
7839	tcp->tcp_last_sent_len = 0;
7840	tcp->tcp_dupack_cnt = 0;
7841
7842	tcp->tcp_fport = 0;			/* Displayed in MIB */
7843	PRESERVE(tcp->tcp_lport);
7844
7845	PRESERVE(tcp->tcp_acceptor_lockp);
7846
7847	ASSERT(tcp->tcp_ordrelid == 0);
7848	PRESERVE(tcp->tcp_acceptor_id);
7849	DONTCARE(tcp->tcp_ipsec_overhead);
7850
7851	/*
7852	 * If tcp_tracing flag is ON (i.e. We have a trace buffer
7853	 * in tcp structure and now tracing), Re-initialize all
7854	 * members of tcp_traceinfo.
7855	 */
7856	if (tcp->tcp_tracebuf != NULL) {
7857		bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t));
7858	}
7859
7860	PRESERVE(tcp->tcp_family);
7861	if (tcp->tcp_family == AF_INET6) {
7862		tcp->tcp_ipversion = IPV6_VERSION;
7863		tcp->tcp_mss = tcp_mss_def_ipv6;
7864	} else {
7865		tcp->tcp_ipversion = IPV4_VERSION;
7866		tcp->tcp_mss = tcp_mss_def_ipv4;
7867	}
7868
7869	tcp->tcp_bound_if = 0;
7870	tcp->tcp_ipv6_recvancillary = 0;
7871	tcp->tcp_recvifindex = 0;
7872	tcp->tcp_recvhops = 0;
7873	tcp->tcp_closed = 0;
7874	tcp->tcp_cleandeathtag = 0;
7875	if (tcp->tcp_hopopts != NULL) {
7876		mi_free(tcp->tcp_hopopts);
7877		tcp->tcp_hopopts = NULL;
7878		tcp->tcp_hopoptslen = 0;
7879	}
7880	ASSERT(tcp->tcp_hopoptslen == 0);
7881	if (tcp->tcp_dstopts != NULL) {
7882		mi_free(tcp->tcp_dstopts);
7883		tcp->tcp_dstopts = NULL;
7884		tcp->tcp_dstoptslen = 0;
7885	}
7886	ASSERT(tcp->tcp_dstoptslen == 0);
7887	if (tcp->tcp_rtdstopts != NULL) {
7888		mi_free(tcp->tcp_rtdstopts);
7889		tcp->tcp_rtdstopts = NULL;
7890		tcp->tcp_rtdstoptslen = 0;
7891	}
7892	ASSERT(tcp->tcp_rtdstoptslen == 0);
7893	if (tcp->tcp_rthdr != NULL) {
7894		mi_free(tcp->tcp_rthdr);
7895		tcp->tcp_rthdr = NULL;
7896		tcp->tcp_rthdrlen = 0;
7897	}
7898	ASSERT(tcp->tcp_rthdrlen == 0);
7899	PRESERVE(tcp->tcp_drop_opt_ack_cnt);
7900
7901	/* Reset fusion-related fields */
7902	tcp->tcp_fused = B_FALSE;
7903	tcp->tcp_unfusable = B_FALSE;
7904	tcp->tcp_fused_sigurg = B_FALSE;
7905	tcp->tcp_direct_sockfs = B_FALSE;
7906	tcp->tcp_fuse_syncstr_stopped = B_FALSE;
7907	tcp->tcp_loopback_peer = NULL;
7908	tcp->tcp_fuse_rcv_hiwater = 0;
7909	tcp->tcp_fuse_rcv_unread_hiwater = 0;
7910	tcp->tcp_fuse_rcv_unread_cnt = 0;
7911
7912	tcp->tcp_in_ack_unsent = 0;
7913	tcp->tcp_cork = B_FALSE;
7914
7915	PRESERVE(tcp->tcp_squeue_bytes);
7916
7917	ASSERT(tcp->tcp_kssl_ctx == NULL);
7918	ASSERT(!tcp->tcp_kssl_pending);
7919	PRESERVE(tcp->tcp_kssl_ent);
7920
7921#undef	DONTCARE
7922#undef	PRESERVE
7923}
7924
7925/*
7926 * Allocate necessary resources and initialize state vector.
7927 * Guaranteed not to fail so that when an error is returned,
7928 * the caller doesn't need to do any additional cleanup.
7929 */
7930int
7931tcp_init(tcp_t *tcp, queue_t *q)
7932{
7933	int	err;
7934
7935	tcp->tcp_rq = q;
7936	tcp->tcp_wq = WR(q);
7937	tcp->tcp_state = TCPS_IDLE;
7938	if ((err = tcp_init_values(tcp)) != 0)
7939		tcp_timers_stop(tcp);
7940	return (err);
7941}
7942
7943static int
7944tcp_init_values(tcp_t *tcp)
7945{
7946	int	err;
7947
7948	ASSERT((tcp->tcp_family == AF_INET &&
7949	    tcp->tcp_ipversion == IPV4_VERSION) ||
7950	    (tcp->tcp_family == AF_INET6 &&
7951	    (tcp->tcp_ipversion == IPV4_VERSION ||
7952	    tcp->tcp_ipversion == IPV6_VERSION)));
7953
7954	/*
7955	 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
7956	 * will be close to tcp_rexmit_interval_initial.  By doing this, we
7957	 * allow the algorithm to adjust slowly to large fluctuations of RTT
7958	 * during first few transmissions of a connection as seen in slow
7959	 * links.
7960	 */
7961	tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2;
7962	tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1;
7963	tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
7964	    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
7965	    tcp_conn_grace_period;
7966	if (tcp->tcp_rto < tcp_rexmit_interval_min)
7967		tcp->tcp_rto = tcp_rexmit_interval_min;
7968	tcp->tcp_timer_backoff = 0;
7969	tcp->tcp_ms_we_have_waited = 0;
7970	tcp->tcp_last_recv_time = lbolt;
7971	tcp->tcp_cwnd_max = tcp_cwnd_max_;
7972	tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
7973	tcp->tcp_snd_burst = TCP_CWND_INFINITE;
7974
7975	tcp->tcp_maxpsz = tcp_maxpsz_multiplier;
7976
7977	tcp->tcp_first_timer_threshold = tcp_ip_notify_interval;
7978	tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval;
7979	tcp->tcp_second_timer_threshold = tcp_ip_abort_interval;
7980	/*
7981	 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
7982	 * passive open.
7983	 */
7984	tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval;
7985
7986	tcp->tcp_naglim = tcp_naglim_def;
7987
7988	/* NOTE:  ISS is now set in tcp_adapt_ire(). */
7989
7990	tcp->tcp_mdt_hdr_head = 0;
7991	tcp->tcp_mdt_hdr_tail = 0;
7992
7993	/* Reset fusion-related fields */
7994	tcp->tcp_fused = B_FALSE;
7995	tcp->tcp_unfusable = B_FALSE;
7996	tcp->tcp_fused_sigurg = B_FALSE;
7997	tcp->tcp_direct_sockfs = B_FALSE;
7998	tcp->tcp_fuse_syncstr_stopped = B_FALSE;
7999	tcp->tcp_loopback_peer = NULL;
8000	tcp->tcp_fuse_rcv_hiwater = 0;
8001	tcp->tcp_fuse_rcv_unread_hiwater = 0;
8002	tcp->tcp_fuse_rcv_unread_cnt = 0;
8003
8004	/* Initialize the header template */
8005	if (tcp->tcp_ipversion == IPV4_VERSION) {
8006		err = tcp_header_init_ipv4(tcp);
8007	} else {
8008		err = tcp_header_init_ipv6(tcp);
8009	}
8010	if (err)
8011		return (err);
8012
8013	/*
8014	 * Init the window scale to the max so tcp_rwnd_set() won't pare
8015	 * down tcp_rwnd. tcp_adapt_ire() will set the right value later.
8016	 */
8017	tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
8018	tcp->tcp_xmit_lowater = tcp_xmit_lowat;
8019	tcp->tcp_xmit_hiwater = tcp_xmit_hiwat;
8020
8021	tcp->tcp_cork = B_FALSE;
8022	/*
8023	 * Init the tcp_debug option.  This value determines whether TCP
8024	 * calls strlog() to print out debug messages.  Doing this
8025	 * initialization here means that this value is not inherited thru
8026	 * tcp_reinit().
8027	 */
8028	tcp->tcp_debug = tcp_dbg;
8029
8030	tcp->tcp_ka_interval = tcp_keepalive_interval;
8031	tcp->tcp_ka_abort_thres = tcp_keepalive_abort_interval;
8032
8033	return (0);
8034}
8035
8036/*
8037 * Initialize the IPv4 header. Loses any record of any IP options.
8038 */
8039static int
8040tcp_header_init_ipv4(tcp_t *tcp)
8041{
8042	tcph_t		*tcph;
8043	uint32_t	sum;
8044	conn_t		*connp;
8045
8046	/*
8047	 * This is a simple initialization. If there's
8048	 * already a template, it should never be too small,
8049	 * so reuse it.  Otherwise, allocate space for the new one.
8050	 */
8051	if (tcp->tcp_iphc == NULL) {
8052		ASSERT(tcp->tcp_iphc_len == 0);
8053		tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8054		tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8055		if (tcp->tcp_iphc == NULL) {
8056			tcp->tcp_iphc_len = 0;
8057			return (ENOMEM);
8058		}
8059	}
8060
8061	/* options are gone; may need a new label */
8062	connp = tcp->tcp_connp;
8063	connp->conn_mlp_type = mlptSingle;
8064	connp->conn_ulp_labeled = !is_system_labeled();
8065	ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8066	tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
8067	tcp->tcp_ip6h = NULL;
8068	tcp->tcp_ipversion = IPV4_VERSION;
8069	tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t);
8070	tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8071	tcp->tcp_ip_hdr_len = sizeof (ipha_t);
8072	tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t));
8073	tcp->tcp_ipha->ipha_version_and_hdr_length
8074		= (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS;
8075	tcp->tcp_ipha->ipha_ident = 0;
8076
8077	tcp->tcp_ttl = (uchar_t)tcp_ipv4_ttl;
8078	tcp->tcp_tos = 0;
8079	tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
8080	tcp->tcp_ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl;
8081	tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP;
8082
8083	tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t));
8084	tcp->tcp_tcph = tcph;
8085	tcph->th_offset_and_rsrvd[0] = (5 << 4);
8086	/*
8087	 * IP wants our header length in the checksum field to
8088	 * allow it to perform a single pseudo-header+checksum
8089	 * calculation on behalf of TCP.
8090	 * Include the adjustment for a source route once IP_OPTIONS is set.
8091	 */
8092	sum = sizeof (tcph_t) + tcp->tcp_sum;
8093	sum = (sum >> 16) + (sum & 0xFFFF);
8094	U16_TO_ABE16(sum, tcph->th_sum);
8095	return (0);
8096}
8097
8098/*
8099 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers.
8100 */
8101static int
8102tcp_header_init_ipv6(tcp_t *tcp)
8103{
8104	tcph_t	*tcph;
8105	uint32_t	sum;
8106	conn_t	*connp;
8107
8108	/*
8109	 * This is a simple initialization. If there's
8110	 * already a template, it should never be too small,
8111	 * so reuse it. Otherwise, allocate space for the new one.
8112	 * Ensure that there is enough space to "downgrade" the tcp_t
8113	 * to an IPv4 tcp_t. This requires having space for a full load
8114	 * of IPv4 options, as well as a full load of TCP options
8115	 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space
8116	 * than a v6 header and a TCP header with a full load of TCP options
8117	 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes).
8118	 * We want to avoid reallocation in the "downgraded" case when
8119	 * processing outbound IPv4 options.
8120	 */
8121	if (tcp->tcp_iphc == NULL) {
8122		ASSERT(tcp->tcp_iphc_len == 0);
8123		tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8124		tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8125		if (tcp->tcp_iphc == NULL) {
8126			tcp->tcp_iphc_len = 0;
8127			return (ENOMEM);
8128		}
8129	}
8130
8131	/* options are gone; may need a new label */
8132	connp = tcp->tcp_connp;
8133	connp->conn_mlp_type = mlptSingle;
8134	connp->conn_ulp_labeled = !is_system_labeled();
8135
8136	ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8137	tcp->tcp_ipversion = IPV6_VERSION;
8138	tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t);
8139	tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8140	tcp->tcp_ip_hdr_len = IPV6_HDR_LEN;
8141	tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
8142	tcp->tcp_ipha = NULL;
8143
8144	/* Initialize the header template */
8145
8146	tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
8147	tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t));
8148	tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP;
8149	tcp->tcp_ip6h->ip6_hops = (uint8_t)tcp_ipv6_hoplimit;
8150
8151	tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN);
8152	tcp->tcp_tcph = tcph;
8153	tcph->th_offset_and_rsrvd[0] = (5 << 4);
8154	/*
8155	 * IP wants our header length in the checksum field to
8156	 * allow it to perform a single psuedo-header+checksum
8157	 * calculation on behalf of TCP.
8158	 * Include the adjustment for a source route when IPV6_RTHDR is set.
8159	 */
8160	sum = sizeof (tcph_t) + tcp->tcp_sum;
8161	sum = (sum >> 16) + (sum & 0xFFFF);
8162	U16_TO_ABE16(sum, tcph->th_sum);
8163	return (0);
8164}
8165
8166/* At minimum we need 4 bytes in the TCP header for the lookup */
8167#define	ICMP_MIN_TCP_HDR	12
8168
8169/*
8170 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages
8171 * passed up by IP. The message is always received on the correct tcp_t.
8172 * Assumes that IP has pulled up everything up to and including the ICMP header.
8173 */
8174void
8175tcp_icmp_error(tcp_t *tcp, mblk_t *mp)
8176{
8177	icmph_t *icmph;
8178	ipha_t	*ipha;
8179	int	iph_hdr_length;
8180	tcph_t	*tcph;
8181	boolean_t ipsec_mctl = B_FALSE;
8182	boolean_t secure;
8183	mblk_t *first_mp = mp;
8184	uint32_t new_mss;
8185	uint32_t ratio;
8186	size_t mp_size = MBLKL(mp);
8187	uint32_t seg_ack;
8188	uint32_t seg_seq;
8189
8190	/* Assume IP provides aligned packets - otherwise toss */
8191	if (!OK_32PTR(mp->b_rptr)) {
8192		freemsg(mp);
8193		return;
8194	}
8195
8196	/*
8197	 * Since ICMP errors are normal data marked with M_CTL when sent
8198	 * to TCP or UDP, we have to look for a IPSEC_IN value to identify
8199	 * packets starting with an ipsec_info_t, see ipsec_info.h.
8200	 */
8201	if ((mp_size == sizeof (ipsec_info_t)) &&
8202	    (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) {
8203		ASSERT(mp->b_cont != NULL);
8204		mp = mp->b_cont;
8205		/* IP should have done this */
8206		ASSERT(OK_32PTR(mp->b_rptr));
8207		mp_size = MBLKL(mp);
8208		ipsec_mctl = B_TRUE;
8209	}
8210
8211	/*
8212	 * Verify that we have a complete outer IP header. If not, drop it.
8213	 */
8214	if (mp_size < sizeof (ipha_t)) {
8215noticmpv4:
8216		freemsg(first_mp);
8217		return;
8218	}
8219
8220	ipha = (ipha_t *)mp->b_rptr;
8221	/*
8222	 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
8223	 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
8224	 */
8225	switch (IPH_HDR_VERSION(ipha)) {
8226	case IPV6_VERSION:
8227		tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl);
8228		return;
8229	case IPV4_VERSION:
8230		break;
8231	default:
8232		goto noticmpv4;
8233	}
8234
8235	/* Skip past the outer IP and ICMP headers */
8236	iph_hdr_length = IPH_HDR_LENGTH(ipha);
8237	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
8238	/*
8239	 * If we don't have the correct outer IP header length or if the ULP
8240	 * is not IPPROTO_ICMP or if we don't have a complete inner IP header
8241	 * send it upstream.
8242	 */
8243	if (iph_hdr_length < sizeof (ipha_t) ||
8244	    ipha->ipha_protocol != IPPROTO_ICMP ||
8245	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
8246		goto noticmpv4;
8247	}
8248	ipha = (ipha_t *)&icmph[1];
8249
8250	/* Skip past the inner IP and find the ULP header */
8251	iph_hdr_length = IPH_HDR_LENGTH(ipha);
8252	tcph = (tcph_t *)((char *)ipha + iph_hdr_length);
8253	/*
8254	 * If we don't have the correct inner IP header length or if the ULP
8255	 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
8256	 * bytes of TCP header, drop it.
8257	 */
8258	if (iph_hdr_length < sizeof (ipha_t) ||
8259	    ipha->ipha_protocol != IPPROTO_TCP ||
8260	    (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) {
8261		goto noticmpv4;
8262	}
8263
8264	if (TCP_IS_DETACHED_NONEAGER(tcp)) {
8265		if (ipsec_mctl) {
8266			secure = ipsec_in_is_secure(first_mp);
8267		} else {
8268			secure = B_FALSE;
8269		}
8270		if (secure) {
8271			/*
8272			 * If we are willing to accept this in clear
8273			 * we don't have to verify policy.
8274			 */
8275			if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) {
8276				if (!tcp_check_policy(tcp, first_mp,
8277				    ipha, NULL, secure, ipsec_mctl)) {
8278					/*
8279					 * tcp_check_policy called
8280					 * ip_drop_packet() on failure.
8281					 */
8282					return;
8283				}
8284			}
8285		}
8286	} else if (ipsec_mctl) {
8287		/*
8288		 * This is a hard_bound connection. IP has already
8289		 * verified policy. We don't have to do it again.
8290		 */
8291		freeb(first_mp);
8292		first_mp = mp;
8293		ipsec_mctl = B_FALSE;
8294	}
8295
8296	seg_ack = ABE32_TO_U32(tcph->th_ack);
8297	seg_seq = ABE32_TO_U32(tcph->th_seq);
8298	/*
8299	 * TCP SHOULD check that the TCP sequence number contained in
8300	 * payload of the ICMP error message is within the range
8301	 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT
8302	 */
8303	if (SEQ_LT(seg_seq, tcp->tcp_suna) ||
8304		SEQ_GEQ(seg_seq, tcp->tcp_snxt) ||
8305		SEQ_GT(seg_ack, tcp->tcp_rnxt)) {
8306		/*
8307		 * If the ICMP message is bogus, should we kill the
8308		 * connection, or should we just drop the bogus ICMP
8309		 * message? It would probably make more sense to just
8310		 * drop the message so that if this one managed to get
8311		 * in, the real connection should not suffer.
8312		 */
8313		goto noticmpv4;
8314	}
8315
8316	switch (icmph->icmph_type) {
8317	case ICMP_DEST_UNREACHABLE:
8318		switch (icmph->icmph_code) {
8319		case ICMP_FRAGMENTATION_NEEDED:
8320			/*
8321			 * Reduce the MSS based on the new MTU.  This will
8322			 * eliminate any fragmentation locally.
8323			 * N.B.  There may well be some funny side-effects on
8324			 * the local send policy and the remote receive policy.
8325			 * Pending further research, we provide
8326			 * tcp_ignore_path_mtu just in case this proves
8327			 * disastrous somewhere.
8328			 *
8329			 * After updating the MSS, retransmit part of the
8330			 * dropped segment using the new mss by calling
8331			 * tcp_wput_data().  Need to adjust all those
8332			 * params to make sure tcp_wput_data() work properly.
8333			 */
8334			if (tcp_ignore_path_mtu)
8335				break;
8336
8337			/*
8338			 * Decrease the MSS by time stamp options
8339			 * IP options and IPSEC options. tcp_hdr_len
8340			 * includes time stamp option and IP option
8341			 * length.
8342			 */
8343
8344			new_mss = ntohs(icmph->icmph_du_mtu) -
8345			    tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead;
8346
8347			/*
8348			 * Only update the MSS if the new one is
8349			 * smaller than the previous one.  This is
8350			 * to avoid problems when getting multiple
8351			 * ICMP errors for the same MTU.
8352			 */
8353			if (new_mss >= tcp->tcp_mss)
8354				break;
8355
8356			/*
8357			 * Stop doing PMTU if new_mss is less than 68
8358			 * or less than tcp_mss_min.
8359			 * The value 68 comes from rfc 1191.
8360			 */
8361			if (new_mss < MAX(68, tcp_mss_min))
8362				tcp->tcp_ipha->ipha_fragment_offset_and_flags =
8363				    0;
8364
8365			ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8366			ASSERT(ratio >= 1);
8367			tcp_mss_set(tcp, new_mss);
8368
8369			/*
8370			 * Make sure we have something to
8371			 * send.
8372			 */
8373			if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8374			    (tcp->tcp_xmit_head != NULL)) {
8375				/*
8376				 * Shrink tcp_cwnd in
8377				 * proportion to the old MSS/new MSS.
8378				 */
8379				tcp->tcp_cwnd = ratio * tcp->tcp_mss;
8380				if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
8381				    (tcp->tcp_unsent == 0)) {
8382					tcp->tcp_rexmit_max = tcp->tcp_fss;
8383				} else {
8384					tcp->tcp_rexmit_max = tcp->tcp_snxt;
8385				}
8386				tcp->tcp_rexmit_nxt = tcp->tcp_suna;
8387				tcp->tcp_rexmit = B_TRUE;
8388				tcp->tcp_dupack_cnt = 0;
8389				tcp->tcp_snd_burst = TCP_CWND_SS;
8390				tcp_ss_rexmit(tcp);
8391			}
8392			break;
8393		case ICMP_PORT_UNREACHABLE:
8394		case ICMP_PROTOCOL_UNREACHABLE:
8395			switch (tcp->tcp_state) {
8396			case TCPS_SYN_SENT:
8397			case TCPS_SYN_RCVD:
8398				/*
8399				 * ICMP can snipe away incipient
8400				 * TCP connections as long as
8401				 * seq number is same as initial
8402				 * send seq number.
8403				 */
8404				if (seg_seq == tcp->tcp_iss) {
8405					(void) tcp_clean_death(tcp,
8406					    ECONNREFUSED, 6);
8407				}
8408				break;
8409			}
8410			break;
8411		case ICMP_HOST_UNREACHABLE:
8412		case ICMP_NET_UNREACHABLE:
8413			/* Record the error in case we finally time out. */
8414			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
8415				tcp->tcp_client_errno = EHOSTUNREACH;
8416			else
8417				tcp->tcp_client_errno = ENETUNREACH;
8418			if (tcp->tcp_state == TCPS_SYN_RCVD) {
8419				if (tcp->tcp_listener != NULL &&
8420				    tcp->tcp_listener->tcp_syn_defense) {
8421					/*
8422					 * Ditch the half-open connection if we
8423					 * suspect a SYN attack is under way.
8424					 */
8425					tcp_ip_ire_mark_advice(tcp);
8426					(void) tcp_clean_death(tcp,
8427					    tcp->tcp_client_errno, 7);
8428				}
8429			}
8430			break;
8431		default:
8432			break;
8433		}
8434		break;
8435	case ICMP_SOURCE_QUENCH: {
8436		/*
8437		 * use a global boolean to control
8438		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
8439		 * The default is false.
8440		 */
8441		if (tcp_icmp_source_quench) {
8442			/*
8443			 * Reduce the sending rate as if we got a
8444			 * retransmit timeout
8445			 */
8446			uint32_t npkt;
8447
8448			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
8449			    tcp->tcp_mss;
8450			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
8451			tcp->tcp_cwnd = tcp->tcp_mss;
8452			tcp->tcp_cwnd_cnt = 0;
8453		}
8454		break;
8455	}
8456	}
8457	freemsg(first_mp);
8458}
8459
8460/*
8461 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6
8462 * error messages passed up by IP.
8463 * Assumes that IP has pulled up all the extension headers as well
8464 * as the ICMPv6 header.
8465 */
8466static void
8467tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl)
8468{
8469	icmp6_t *icmp6;
8470	ip6_t	*ip6h;
8471	uint16_t	iph_hdr_length;
8472	tcpha_t	*tcpha;
8473	uint8_t	*nexthdrp;
8474	uint32_t new_mss;
8475	uint32_t ratio;
8476	boolean_t secure;
8477	mblk_t *first_mp = mp;
8478	size_t mp_size;
8479	uint32_t seg_ack;
8480	uint32_t seg_seq;
8481
8482	/*
8483	 * The caller has determined if this is an IPSEC_IN packet and
8484	 * set ipsec_mctl appropriately (see tcp_icmp_error).
8485	 */
8486	if (ipsec_mctl)
8487		mp = mp->b_cont;
8488
8489	mp_size = MBLKL(mp);
8490
8491	/*
8492	 * Verify that we have a complete IP header. If not, send it upstream.
8493	 */
8494	if (mp_size < sizeof (ip6_t)) {
8495noticmpv6:
8496		freemsg(first_mp);
8497		return;
8498	}
8499
8500	/*
8501	 * Verify this is an ICMPV6 packet, else send it upstream.
8502	 */
8503	ip6h = (ip6_t *)mp->b_rptr;
8504	if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8505		iph_hdr_length = IPV6_HDR_LEN;
8506	} else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
8507	    &nexthdrp) ||
8508	    *nexthdrp != IPPROTO_ICMPV6) {
8509		goto noticmpv6;
8510	}
8511	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
8512	ip6h = (ip6_t *)&icmp6[1];
8513	/*
8514	 * Verify if we have a complete ICMP and inner IP header.
8515	 */
8516	if ((uchar_t *)&ip6h[1] > mp->b_wptr)
8517		goto noticmpv6;
8518
8519	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
8520		goto noticmpv6;
8521	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
8522	/*
8523	 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
8524	 * have at least ICMP_MIN_TCP_HDR bytes of  TCP header drop the
8525	 * packet.
8526	 */
8527	if ((*nexthdrp != IPPROTO_TCP) ||
8528	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
8529		goto noticmpv6;
8530	}
8531
8532	/*
8533	 * ICMP errors come on the right queue or come on
8534	 * listener/global queue for detached connections and
8535	 * get switched to the right queue. If it comes on the
8536	 * right queue, policy check has already been done by IP
8537	 * and thus free the first_mp without verifying the policy.
8538	 * If it has come for a non-hard bound connection, we need
8539	 * to verify policy as IP may not have done it.
8540	 */
8541	if (!tcp->tcp_hard_bound) {
8542		if (ipsec_mctl) {
8543			secure = ipsec_in_is_secure(first_mp);
8544		} else {
8545			secure = B_FALSE;
8546		}
8547		if (secure) {
8548			/*
8549			 * If we are willing to accept this in clear
8550			 * we don't have to verify policy.
8551			 */
8552			if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) {
8553				if (!tcp_check_policy(tcp, first_mp,
8554				    NULL, ip6h, secure, ipsec_mctl)) {
8555					/*
8556					 * tcp_check_policy called
8557					 * ip_drop_packet() on failure.
8558					 */
8559					return;
8560				}
8561			}
8562		}
8563	} else if (ipsec_mctl) {
8564		/*
8565		 * This is a hard_bound connection. IP has already
8566		 * verified policy. We don't have to do it again.
8567		 */
8568		freeb(first_mp);
8569		first_mp = mp;
8570		ipsec_mctl = B_FALSE;
8571	}
8572
8573	seg_ack = ntohl(tcpha->tha_ack);
8574	seg_seq = ntohl(tcpha->tha_seq);
8575	/*
8576	 * TCP SHOULD check that the TCP sequence number contained in
8577	 * payload of the ICMP error message is within the range
8578	 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT
8579	 */
8580	if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt) ||
8581	    SEQ_GT(seg_ack, tcp->tcp_rnxt)) {
8582		/*
8583		 * If the ICMP message is bogus, should we kill the
8584		 * connection, or should we just drop the bogus ICMP
8585		 * message? It would probably make more sense to just
8586		 * drop the message so that if this one managed to get
8587		 * in, the real connection should not suffer.
8588		 */
8589		goto noticmpv6;
8590	}
8591
8592	switch (icmp6->icmp6_type) {
8593	case ICMP6_PACKET_TOO_BIG:
8594		/*
8595		 * Reduce the MSS based on the new MTU.  This will
8596		 * eliminate any fragmentation locally.
8597		 * N.B.  There may well be some funny side-effects on
8598		 * the local send policy and the remote receive policy.
8599		 * Pending further research, we provide
8600		 * tcp_ignore_path_mtu just in case this proves
8601		 * disastrous somewhere.
8602		 *
8603		 * After updating the MSS, retransmit part of the
8604		 * dropped segment using the new mss by calling
8605		 * tcp_wput_data().  Need to adjust all those
8606		 * params to make sure tcp_wput_data() work properly.
8607		 */
8608		if (tcp_ignore_path_mtu)
8609			break;
8610
8611		/*
8612		 * Decrease the MSS by time stamp options
8613		 * IP options and IPSEC options. tcp_hdr_len
8614		 * includes time stamp option and IP option
8615		 * length.
8616		 */
8617		new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len -
8618			    tcp->tcp_ipsec_overhead;
8619
8620		/*
8621		 * Only update the MSS if the new one is
8622		 * smaller than the previous one.  This is
8623		 * to avoid problems when getting multiple
8624		 * ICMP errors for the same MTU.
8625		 */
8626		if (new_mss >= tcp->tcp_mss)
8627			break;
8628
8629		ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8630		ASSERT(ratio >= 1);
8631		tcp_mss_set(tcp, new_mss);
8632
8633		/*
8634		 * Make sure we have something to
8635		 * send.
8636		 */
8637		if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8638		    (tcp->tcp_xmit_head != NULL)) {
8639			/*
8640			 * Shrink tcp_cwnd in
8641			 * proportion to the old MSS/new MSS.
8642			 */
8643			tcp->tcp_cwnd = ratio * tcp->tcp_mss;
8644			if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
8645			    (tcp->tcp_unsent == 0)) {
8646				tcp->tcp_rexmit_max = tcp->tcp_fss;
8647			} else {
8648				tcp->tcp_rexmit_max = tcp->tcp_snxt;
8649			}
8650			tcp->tcp_rexmit_nxt = tcp->tcp_suna;
8651			tcp->tcp_rexmit = B_TRUE;
8652			tcp->tcp_dupack_cnt = 0;
8653			tcp->tcp_snd_burst = TCP_CWND_SS;
8654			tcp_ss_rexmit(tcp);
8655		}
8656		break;
8657
8658	case ICMP6_DST_UNREACH:
8659		switch (icmp6->icmp6_code) {
8660		case ICMP6_DST_UNREACH_NOPORT:
8661			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
8662			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
8663			    (seg_seq == tcp->tcp_iss)) {
8664				(void) tcp_clean_death(tcp,
8665				    ECONNREFUSED, 8);
8666			}
8667			break;
8668
8669		case ICMP6_DST_UNREACH_ADMIN:
8670		case ICMP6_DST_UNREACH_NOROUTE:
8671		case ICMP6_DST_UNREACH_BEYONDSCOPE:
8672		case ICMP6_DST_UNREACH_ADDR:
8673			/* Record the error in case we finally time out. */
8674			tcp->tcp_client_errno = EHOSTUNREACH;
8675			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
8676			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
8677			    (seg_seq == tcp->tcp_iss)) {
8678				if (tcp->tcp_listener != NULL &&
8679				    tcp->tcp_listener->tcp_syn_defense) {
8680					/*
8681					 * Ditch the half-open connection if we
8682					 * suspect a SYN attack is under way.
8683					 */
8684					tcp_ip_ire_mark_advice(tcp);
8685					(void) tcp_clean_death(tcp,
8686					    tcp->tcp_client_errno, 9);
8687				}
8688			}
8689
8690
8691			break;
8692		default:
8693			break;
8694		}
8695		break;
8696
8697	case ICMP6_PARAM_PROB:
8698		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
8699		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
8700		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
8701		    (uchar_t *)nexthdrp) {
8702			if (tcp->tcp_state == TCPS_SYN_SENT ||
8703			    tcp->tcp_state == TCPS_SYN_RCVD) {
8704				(void) tcp_clean_death(tcp,
8705				    ECONNREFUSED, 10);
8706			}
8707			break;
8708		}
8709		break;
8710
8711	case ICMP6_TIME_EXCEEDED:
8712	default:
8713		break;
8714	}
8715	freemsg(first_mp);
8716}
8717
8718/*
8719 * IP recognizes seven kinds of bind requests:
8720 *
8721 * - A zero-length address binds only to the protocol number.
8722 *
8723 * - A 4-byte address is treated as a request to
8724 * validate that the address is a valid local IPv4
8725 * address, appropriate for an application to bind to.
8726 * IP does the verification, but does not make any note
8727 * of the address at this time.
8728 *
8729 * - A 16-byte address contains is treated as a request
8730 * to validate a local IPv6 address, as the 4-byte
8731 * address case above.
8732 *
8733 * - A 16-byte sockaddr_in to validate the local IPv4 address and also
8734 * use it for the inbound fanout of packets.
8735 *
8736 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
8737 * use it for the inbound fanout of packets.
8738 *
8739 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
8740 * information consisting of local and remote addresses
8741 * and ports.  In this case, the addresses are both
8742 * validated as appropriate for this operation, and, if
8743 * so, the information is retained for use in the
8744 * inbound fanout.
8745 *
8746 * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
8747 * fanout information, like the 12-byte case above.
8748 *
8749 * IP will also fill in the IRE request mblk with information
8750 * regarding our peer.  In all cases, we notify IP of our protocol
8751 * type by appending a single protocol byte to the bind request.
8752 */
8753static mblk_t *
8754tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length)
8755{
8756	char	*cp;
8757	mblk_t	*mp;
8758	struct T_bind_req *tbr;
8759	ipa_conn_t	*ac;
8760	ipa6_conn_t	*ac6;
8761	sin_t		*sin;
8762	sin6_t		*sin6;
8763
8764	ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
8765	ASSERT((tcp->tcp_family == AF_INET &&
8766	    tcp->tcp_ipversion == IPV4_VERSION) ||
8767	    (tcp->tcp_family == AF_INET6 &&
8768	    (tcp->tcp_ipversion == IPV4_VERSION ||
8769	    tcp->tcp_ipversion == IPV6_VERSION)));
8770
8771	mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
8772	if (!mp)
8773		return (mp);
8774	mp->b_datap->db_type = M_PROTO;
8775	tbr = (struct T_bind_req *)mp->b_rptr;
8776	tbr->PRIM_type = bind_prim;
8777	tbr->ADDR_offset = sizeof (*tbr);
8778	tbr->CONIND_number = 0;
8779	tbr->ADDR_length = addr_length;
8780	cp = (char *)&tbr[1];
8781	switch (addr_length) {
8782	case sizeof (ipa_conn_t):
8783		ASSERT(tcp->tcp_family == AF_INET);
8784		ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
8785
8786		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
8787		if (mp->b_cont == NULL) {
8788			freemsg(mp);
8789			return (NULL);
8790		}
8791		mp->b_cont->b_wptr += sizeof (ire_t);
8792		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
8793
8794		/* cp known to be 32 bit aligned */
8795		ac = (ipa_conn_t *)cp;
8796		ac->ac_laddr = tcp->tcp_ipha->ipha_src;
8797		ac->ac_faddr = tcp->tcp_remote;
8798		ac->ac_fport = tcp->tcp_fport;
8799		ac->ac_lport = tcp->tcp_lport;
8800		tcp->tcp_hard_binding = 1;
8801		break;
8802
8803	case sizeof (ipa6_conn_t):
8804		ASSERT(tcp->tcp_family == AF_INET6);
8805
8806		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
8807		if (mp->b_cont == NULL) {
8808			freemsg(mp);
8809			return (NULL);
8810		}
8811		mp->b_cont->b_wptr += sizeof (ire_t);
8812		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
8813
8814		/* cp known to be 32 bit aligned */
8815		ac6 = (ipa6_conn_t *)cp;
8816		if (tcp->tcp_ipversion == IPV4_VERSION) {
8817			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
8818			    &ac6->ac6_laddr);
8819		} else {
8820			ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src;
8821		}
8822		ac6->ac6_faddr = tcp->tcp_remote_v6;
8823		ac6->ac6_fport = tcp->tcp_fport;
8824		ac6->ac6_lport = tcp->tcp_lport;
8825		tcp->tcp_hard_binding = 1;
8826		break;
8827
8828	case sizeof (sin_t):
8829		/*
8830		 * NOTE: IPV6_ADDR_LEN also has same size.
8831		 * Use family to discriminate.
8832		 */
8833		if (tcp->tcp_family == AF_INET) {
8834			sin = (sin_t *)cp;
8835
8836			*sin = sin_null;
8837			sin->sin_family = AF_INET;
8838			sin->sin_addr.s_addr = tcp->tcp_bound_source;
8839			sin->sin_port = tcp->tcp_lport;
8840			break;
8841		} else {
8842			*(in6_addr_t *)cp = tcp->tcp_bound_source_v6;
8843		}
8844		break;
8845
8846	case sizeof (sin6_t):
8847		ASSERT(tcp->tcp_family == AF_INET6);
8848		sin6 = (sin6_t *)cp;
8849
8850		*sin6 = sin6_null;
8851		sin6->sin6_family = AF_INET6;
8852		sin6->sin6_addr = tcp->tcp_bound_source_v6;
8853		sin6->sin6_port = tcp->tcp_lport;
8854		break;
8855
8856	case IP_ADDR_LEN:
8857		ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
8858		*(uint32_t *)cp = tcp->tcp_ipha->ipha_src;
8859		break;
8860
8861	}
8862	/* Add protocol number to end */
8863	cp[addr_length] = (char)IPPROTO_TCP;
8864	mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
8865	return (mp);
8866}
8867
8868/*
8869 * Notify IP that we are having trouble with this connection.  IP should
8870 * blow the IRE away and start over.
8871 */
8872static void
8873tcp_ip_notify(tcp_t *tcp)
8874{
8875	struct iocblk	*iocp;
8876	ipid_t	*ipid;
8877	mblk_t	*mp;
8878
8879	/* IPv6 has NUD thus notification to delete the IRE is not needed */
8880	if (tcp->tcp_ipversion == IPV6_VERSION)
8881		return;
8882
8883	mp = mkiocb(IP_IOCTL);
8884	if (mp == NULL)
8885		return;
8886
8887	iocp = (struct iocblk *)mp->b_rptr;
8888	iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst);
8889
8890	mp->b_cont = allocb(iocp->ioc_count, BPRI_HI);
8891	if (!mp->b_cont) {
8892		freeb(mp);
8893		return;
8894	}
8895
8896	ipid = (ipid_t *)mp->b_cont->b_rptr;
8897	mp->b_cont->b_wptr += iocp->ioc_count;
8898	bzero(ipid, sizeof (*ipid));
8899	ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY;
8900	ipid->ipid_ire_type = IRE_CACHE;
8901	ipid->ipid_addr_offset = sizeof (ipid_t);
8902	ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst);
8903	/*
8904	 * Note: in the case of source routing we want to blow away the
8905	 * route to the first source route hop.
8906	 */
8907	bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1],
8908	    sizeof (tcp->tcp_ipha->ipha_dst));
8909
8910	CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
8911}
8912
8913/* Unlink and return any mblk that looks like it contains an ire */
8914static mblk_t *
8915tcp_ire_mp(mblk_t *mp)
8916{
8917	mblk_t	*prev_mp;
8918
8919	for (;;) {
8920		prev_mp = mp;
8921		mp = mp->b_cont;
8922		if (mp == NULL)
8923			break;
8924		switch (DB_TYPE(mp)) {
8925		case IRE_DB_TYPE:
8926		case IRE_DB_REQ_TYPE:
8927			if (prev_mp != NULL)
8928				prev_mp->b_cont = mp->b_cont;
8929			mp->b_cont = NULL;
8930			return (mp);
8931		default:
8932			break;
8933		}
8934	}
8935	return (mp);
8936}
8937
8938/*
8939 * Timer callback routine for keepalive probe.  We do a fake resend of
8940 * last ACKed byte.  Then set a timer using RTO.  When the timer expires,
8941 * check to see if we have heard anything from the other end for the last
8942 * RTO period.  If we have, set the timer to expire for another
8943 * tcp_keepalive_intrvl and check again.  If we have not, set a timer using
8944 * RTO << 1 and check again when it expires.  Keep exponentially increasing
8945 * the timeout if we have not heard from the other side.  If for more than
8946 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
8947 * kill the connection unless the keepalive abort threshold is 0.  In
8948 * that case, we will probe "forever."
8949 */
8950static void
8951tcp_keepalive_killer(void *arg)
8952{
8953	mblk_t	*mp;
8954	conn_t	*connp = (conn_t *)arg;
8955	tcp_t  	*tcp = connp->conn_tcp;
8956	int32_t	firetime;
8957	int32_t	idletime;
8958	int32_t	ka_intrvl;
8959
8960	tcp->tcp_ka_tid = 0;
8961
8962	if (tcp->tcp_fused)
8963		return;
8964
8965	BUMP_MIB(&tcp_mib, tcpTimKeepalive);
8966	ka_intrvl = tcp->tcp_ka_interval;
8967
8968	/*
8969	 * Keepalive probe should only be sent if the application has not
8970	 * done a close on the connection.
8971	 */
8972	if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
8973		return;
8974	}
8975	/* Timer fired too early, restart it. */
8976	if (tcp->tcp_state < TCPS_ESTABLISHED) {
8977		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
8978		    MSEC_TO_TICK(ka_intrvl));
8979		return;
8980	}
8981
8982	idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time);
8983	/*
8984	 * If we have not heard from the other side for a long
8985	 * time, kill the connection unless the keepalive abort
8986	 * threshold is 0.  In that case, we will probe "forever."
8987	 */
8988	if (tcp->tcp_ka_abort_thres != 0 &&
8989	    idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
8990		BUMP_MIB(&tcp_mib, tcpTimKeepaliveDrop);
8991		(void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
8992		    tcp->tcp_client_errno : ETIMEDOUT, 11);
8993		return;
8994	}
8995
8996	if (tcp->tcp_snxt == tcp->tcp_suna &&
8997	    idletime >= ka_intrvl) {
8998		/* Fake resend of last ACKed byte. */
8999		mblk_t	*mp1 = allocb(1, BPRI_LO);
9000
9001		if (mp1 != NULL) {
9002			*mp1->b_wptr++ = '\0';
9003			mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
9004			    tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
9005			freeb(mp1);
9006			/*
9007			 * if allocation failed, fall through to start the
9008			 * timer back.
9009			 */
9010			if (mp != NULL) {
9011				TCP_RECORD_TRACE(tcp, mp,
9012				    TCP_TRACE_SEND_PKT);
9013				tcp_send_data(tcp, tcp->tcp_wq, mp);
9014				BUMP_MIB(&tcp_mib, tcpTimKeepaliveProbe);
9015				if (tcp->tcp_ka_last_intrvl != 0) {
9016					/*
9017					 * We should probe again at least
9018					 * in ka_intrvl, but not more than
9019					 * tcp_rexmit_interval_max.
9020					 */
9021					firetime = MIN(ka_intrvl - 1,
9022					    tcp->tcp_ka_last_intrvl << 1);
9023					if (firetime > tcp_rexmit_interval_max)
9024						firetime =
9025						    tcp_rexmit_interval_max;
9026				} else {
9027					firetime = tcp->tcp_rto;
9028				}
9029				tcp->tcp_ka_tid = TCP_TIMER(tcp,
9030				    tcp_keepalive_killer,
9031				    MSEC_TO_TICK(firetime));
9032				tcp->tcp_ka_last_intrvl = firetime;
9033				return;
9034			}
9035		}
9036	} else {
9037		tcp->tcp_ka_last_intrvl = 0;
9038	}
9039
9040	/* firetime can be negative if (mp1 == NULL || mp == NULL) */
9041	if ((firetime = ka_intrvl - idletime) < 0) {
9042		firetime = ka_intrvl;
9043	}
9044	tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9045	    MSEC_TO_TICK(firetime));
9046}
9047
9048int
9049tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
9050{
9051	queue_t	*q = tcp->tcp_rq;
9052	int32_t	mss = tcp->tcp_mss;
9053	int	maxpsz;
9054
9055	if (TCP_IS_DETACHED(tcp))
9056		return (mss);
9057
9058	if (tcp->tcp_fused) {
9059		maxpsz = tcp_fuse_maxpsz_set(tcp);
9060		mss = INFPSZ;
9061	} else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
9062		/*
9063		 * Set the sd_qn_maxpsz according to the socket send buffer
9064		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
9065		 * instruct the stream head to copyin user data into contiguous
9066		 * kernel-allocated buffers without breaking it up into smaller
9067		 * chunks.  We round up the buffer size to the nearest SMSS.
9068		 */
9069		maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss);
9070		if (tcp->tcp_kssl_ctx == NULL)
9071			mss = INFPSZ;
9072		else
9073			mss = SSL3_MAX_RECORD_LEN;
9074	} else {
9075		/*
9076		 * Set sd_qn_maxpsz to approx half the (receivers) buffer
9077		 * (and a multiple of the mss).  This instructs the stream
9078		 * head to break down larger than SMSS writes into SMSS-
9079		 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
9080		 */
9081		maxpsz = tcp->tcp_maxpsz * mss;
9082		if (maxpsz > tcp->tcp_xmit_hiwater/2) {
9083			maxpsz = tcp->tcp_xmit_hiwater/2;
9084			/* Round up to nearest mss */
9085			maxpsz = MSS_ROUNDUP(maxpsz, mss);
9086		}
9087	}
9088	(void) setmaxps(q, maxpsz);
9089	tcp->tcp_wq->q_maxpsz = maxpsz;
9090
9091	if (set_maxblk)
9092		(void) mi_set_sth_maxblk(q, mss);
9093
9094	return (mss);
9095}
9096
9097/*
9098 * Extract option values from a tcp header.  We put any found values into the
9099 * tcpopt struct and return a bitmask saying which options were found.
9100 */
9101static int
9102tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
9103{
9104	uchar_t		*endp;
9105	int		len;
9106	uint32_t	mss;
9107	uchar_t		*up = (uchar_t *)tcph;
9108	int		found = 0;
9109	int32_t		sack_len;
9110	tcp_seq		sack_begin, sack_end;
9111	tcp_t		*tcp;
9112
9113	endp = up + TCP_HDR_LENGTH(tcph);
9114	up += TCP_MIN_HEADER_LENGTH;
9115	while (up < endp) {
9116		len = endp - up;
9117		switch (*up) {
9118		case TCPOPT_EOL:
9119			break;
9120
9121		case TCPOPT_NOP:
9122			up++;
9123			continue;
9124
9125		case TCPOPT_MAXSEG:
9126			if (len < TCPOPT_MAXSEG_LEN ||
9127			    up[1] != TCPOPT_MAXSEG_LEN)
9128				break;
9129
9130			mss = BE16_TO_U16(up+2);
9131			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
9132			tcpopt->tcp_opt_mss = mss;
9133			found |= TCP_OPT_MSS_PRESENT;
9134
9135			up += TCPOPT_MAXSEG_LEN;
9136			continue;
9137
9138		case TCPOPT_WSCALE:
9139			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
9140				break;
9141
9142			if (up[2] > TCP_MAX_WINSHIFT)
9143				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
9144			else
9145				tcpopt->tcp_opt_wscale = up[2];
9146			found |= TCP_OPT_WSCALE_PRESENT;
9147
9148			up += TCPOPT_WS_LEN;
9149			continue;
9150
9151		case TCPOPT_SACK_PERMITTED:
9152			if (len < TCPOPT_SACK_OK_LEN ||
9153			    up[1] != TCPOPT_SACK_OK_LEN)
9154				break;
9155			found |= TCP_OPT_SACK_OK_PRESENT;
9156			up += TCPOPT_SACK_OK_LEN;
9157			continue;
9158
9159		case TCPOPT_SACK:
9160			if (len <= 2 || up[1] <= 2 || len < up[1])
9161				break;
9162
9163			/* If TCP is not interested in SACK blks... */
9164			if ((tcp = tcpopt->tcp) == NULL) {
9165				up += up[1];
9166				continue;
9167			}
9168			sack_len = up[1] - TCPOPT_HEADER_LEN;
9169			up += TCPOPT_HEADER_LEN;
9170
9171			/*
9172			 * If the list is empty, allocate one and assume
9173			 * nothing is sack'ed.
9174			 */
9175			ASSERT(tcp->tcp_sack_info != NULL);
9176			if (tcp->tcp_notsack_list == NULL) {
9177				tcp_notsack_update(&(tcp->tcp_notsack_list),
9178				    tcp->tcp_suna, tcp->tcp_snxt,
9179				    &(tcp->tcp_num_notsack_blk),
9180				    &(tcp->tcp_cnt_notsack_list));
9181
9182				/*
9183				 * Make sure tcp_notsack_list is not NULL.
9184				 * This happens when kmem_alloc(KM_NOSLEEP)
9185				 * returns NULL.
9186				 */
9187				if (tcp->tcp_notsack_list == NULL) {
9188					up += sack_len;
9189					continue;
9190				}
9191				tcp->tcp_fack = tcp->tcp_suna;
9192			}
9193
9194			while (sack_len > 0) {
9195				if (up + 8 > endp) {
9196					up = endp;
9197					break;
9198				}
9199				sack_begin = BE32_TO_U32(up);
9200				up += 4;
9201				sack_end = BE32_TO_U32(up);
9202				up += 4;
9203				sack_len -= 8;
9204				/*
9205				 * Bounds checking.  Make sure the SACK
9206				 * info is within tcp_suna and tcp_snxt.
9207				 * If this SACK blk is out of bound, ignore
9208				 * it but continue to parse the following
9209				 * blks.
9210				 */
9211				if (SEQ_LEQ(sack_end, sack_begin) ||
9212				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
9213				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
9214					continue;
9215				}
9216				tcp_notsack_insert(&(tcp->tcp_notsack_list),
9217				    sack_begin, sack_end,
9218				    &(tcp->tcp_num_notsack_blk),
9219				    &(tcp->tcp_cnt_notsack_list));
9220				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
9221					tcp->tcp_fack = sack_end;
9222				}
9223			}
9224			found |= TCP_OPT_SACK_PRESENT;
9225			continue;
9226
9227		case TCPOPT_TSTAMP:
9228			if (len < TCPOPT_TSTAMP_LEN ||
9229			    up[1] != TCPOPT_TSTAMP_LEN)
9230				break;
9231
9232			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
9233			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
9234
9235			found |= TCP_OPT_TSTAMP_PRESENT;
9236
9237			up += TCPOPT_TSTAMP_LEN;
9238			continue;
9239
9240		default:
9241			if (len <= 1 || len < (int)up[1] || up[1] == 0)
9242				break;
9243			up += up[1];
9244			continue;
9245		}
9246		break;
9247	}
9248	return (found);
9249}
9250
9251/*
9252 * Set the mss associated with a particular tcp based on its current value,
9253 * and a new one passed in. Observe minimums and maximums, and reset
9254 * other state variables that we want to view as multiples of mss.
9255 *
9256 * This function is called in various places mainly because
9257 * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the
9258 *    other side's SYN/SYN-ACK packet arrives.
9259 * 2) PMTUd may get us a new MSS.
9260 * 3) If the other side stops sending us timestamp option, we need to
9261 *    increase the MSS size to use the extra bytes available.
9262 */
9263static void
9264tcp_mss_set(tcp_t *tcp, uint32_t mss)
9265{
9266	uint32_t	mss_max;
9267
9268	if (tcp->tcp_ipversion == IPV4_VERSION)
9269		mss_max = tcp_mss_max_ipv4;
9270	else
9271		mss_max = tcp_mss_max_ipv6;
9272
9273	if (mss < tcp_mss_min)
9274		mss = tcp_mss_min;
9275	if (mss > mss_max)
9276		mss = mss_max;
9277	/*
9278	 * Unless naglim has been set by our client to
9279	 * a non-mss value, force naglim to track mss.
9280	 * This can help to aggregate small writes.
9281	 */
9282	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
9283		tcp->tcp_naglim = mss;
9284	/*
9285	 * TCP should be able to buffer at least 4 MSS data for obvious
9286	 * performance reason.
9287	 */
9288	if ((mss << 2) > tcp->tcp_xmit_hiwater)
9289		tcp->tcp_xmit_hiwater = mss << 2;
9290
9291	/*
9292	 * Check if we need to apply the tcp_init_cwnd here.  If
9293	 * it is set and the MSS gets bigger (should not happen
9294	 * normally), we need to adjust the resulting tcp_cwnd properly.
9295	 * The new tcp_cwnd should not get bigger.
9296	 */
9297	if (tcp->tcp_init_cwnd == 0) {
9298		tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss,
9299		    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
9300	} else {
9301		if (tcp->tcp_mss < mss) {
9302			tcp->tcp_cwnd = MAX(1,
9303			    (tcp->tcp_init_cwnd * tcp->tcp_mss / mss)) * mss;
9304		} else {
9305			tcp->tcp_cwnd = tcp->tcp_init_cwnd * mss;
9306		}
9307	}
9308	tcp->tcp_mss = mss;
9309	tcp->tcp_cwnd_cnt = 0;
9310	(void) tcp_maxpsz_set(tcp, B_TRUE);
9311}
9312
9313static int
9314tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
9315{
9316	tcp_t		*tcp = NULL;
9317	conn_t		*connp;
9318	int		err;
9319	dev_t		conn_dev;
9320	zoneid_t	zoneid = getzoneid();
9321
9322	/*
9323	 * Special case for install: miniroot needs to be able to access files
9324	 * via NFS as though it were always in the global zone.
9325	 */
9326	if (credp == kcred && nfs_global_client_only != 0)
9327		zoneid = GLOBAL_ZONEID;
9328
9329	if (q->q_ptr != NULL)
9330		return (0);
9331
9332	if (sflag == MODOPEN) {
9333		/*
9334		 * This is a special case. The purpose of a modopen
9335		 * is to allow just the T_SVR4_OPTMGMT_REQ to pass
9336		 * through for MIB browsers. Everything else is failed.
9337		 */
9338		connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt));
9339
9340		if (connp == NULL)
9341			return (ENOMEM);
9342
9343		connp->conn_flags |= IPCL_TCPMOD;
9344		connp->conn_cred = credp;
9345		connp->conn_zoneid = zoneid;
9346		q->q_ptr = WR(q)->q_ptr = connp;
9347		crhold(credp);
9348		q->q_qinfo = &tcp_mod_rinit;
9349		WR(q)->q_qinfo = &tcp_mod_winit;
9350		qprocson(q);
9351		return (0);
9352	}
9353
9354	if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0)
9355		return (EBUSY);
9356
9357	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
9358
9359	if (flag & SO_ACCEPTOR) {
9360		q->q_qinfo = &tcp_acceptor_rinit;
9361		q->q_ptr = (void *)conn_dev;
9362		WR(q)->q_qinfo = &tcp_acceptor_winit;
9363		WR(q)->q_ptr = (void *)conn_dev;
9364		qprocson(q);
9365		return (0);
9366	}
9367
9368	connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt));
9369	if (connp == NULL) {
9370		inet_minor_free(ip_minor_arena, conn_dev);
9371		q->q_ptr = NULL;
9372		return (ENOSR);
9373	}
9374	connp->conn_sqp = IP_SQUEUE_GET(lbolt);
9375	tcp = connp->conn_tcp;
9376
9377	q->q_ptr = WR(q)->q_ptr = connp;
9378	if (getmajor(*devp) == TCP6_MAJ) {
9379		connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6);
9380		connp->conn_send = ip_output_v6;
9381		connp->conn_af_isv6 = B_TRUE;
9382		connp->conn_pkt_isv6 = B_TRUE;
9383		connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
9384		tcp->tcp_ipversion = IPV6_VERSION;
9385		tcp->tcp_family = AF_INET6;
9386		tcp->tcp_mss = tcp_mss_def_ipv6;
9387	} else {
9388		connp->conn_flags |= IPCL_TCP4;
9389		connp->conn_send = ip_output;
9390		connp->conn_af_isv6 = B_FALSE;
9391		connp->conn_pkt_isv6 = B_FALSE;
9392		tcp->tcp_ipversion = IPV4_VERSION;
9393		tcp->tcp_family = AF_INET;
9394		tcp->tcp_mss = tcp_mss_def_ipv4;
9395	}
9396
9397	/*
9398	 * TCP keeps a copy of cred for cache locality reasons but
9399	 * we put a reference only once. If connp->conn_cred
9400	 * becomes invalid, tcp_cred should also be set to NULL.
9401	 */
9402	tcp->tcp_cred = connp->conn_cred = credp;
9403	crhold(connp->conn_cred);
9404	tcp->tcp_cpid = curproc->p_pid;
9405	connp->conn_zoneid = zoneid;
9406	connp->conn_mlp_type = mlptSingle;
9407	connp->conn_ulp_labeled = !is_system_labeled();
9408
9409	/*
9410	 * If the caller has the process-wide flag set, then default to MAC
9411	 * exempt mode.  This allows read-down to unlabeled hosts.
9412	 */
9413	if (getpflags(NET_MAC_AWARE, credp) != 0)
9414		connp->conn_mac_exempt = B_TRUE;
9415
9416	connp->conn_dev = conn_dev;
9417
9418	ASSERT(q->q_qinfo == &tcp_rinit);
9419	ASSERT(WR(q)->q_qinfo == &tcp_winit);
9420
9421	if (flag & SO_SOCKSTR) {
9422		/*
9423		 * No need to insert a socket in tcp acceptor hash.
9424		 * If it was a socket acceptor stream, we dealt with
9425		 * it above. A socket listener can never accept a
9426		 * connection and doesn't need acceptor_id.
9427		 */
9428		connp->conn_flags |= IPCL_SOCKET;
9429		tcp->tcp_issocket = 1;
9430		WR(q)->q_qinfo = &tcp_sock_winit;
9431	} else {
9432#ifdef	_ILP32
9433		tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
9434#else
9435		tcp->tcp_acceptor_id = conn_dev;
9436#endif	/* _ILP32 */
9437		tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
9438	}
9439
9440	if (tcp_trace)
9441		tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP);
9442
9443	err = tcp_init(tcp, q);
9444	if (err != 0) {
9445		inet_minor_free(ip_minor_arena, connp->conn_dev);
9446		tcp_acceptor_hash_remove(tcp);
9447		CONN_DEC_REF(connp);
9448		q->q_ptr = WR(q)->q_ptr = NULL;
9449		return (err);
9450	}
9451
9452	RD(q)->q_hiwat = tcp_recv_hiwat;
9453	tcp->tcp_rwnd = tcp_recv_hiwat;
9454
9455	/* Non-zero default values */
9456	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
9457	/*
9458	 * Put the ref for TCP. Ref for IP was already put
9459	 * by ipcl_conn_create. Also Make the conn_t globally
9460	 * visible to walkers
9461	 */
9462	mutex_enter(&connp->conn_lock);
9463	CONN_INC_REF_LOCKED(connp);
9464	ASSERT(connp->conn_ref == 2);
9465	connp->conn_state_flags &= ~CONN_INCIPIENT;
9466	mutex_exit(&connp->conn_lock);
9467
9468	qprocson(q);
9469	return (0);
9470}
9471
9472/*
9473 * Some TCP options can be "set" by requesting them in the option
9474 * buffer. This is needed for XTI feature test though we do not
9475 * allow it in general. We interpret that this mechanism is more
9476 * applicable to OSI protocols and need not be allowed in general.
9477 * This routine filters out options for which it is not allowed (most)
9478 * and lets through those (few) for which it is. [ The XTI interface
9479 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
9480 * ever implemented will have to be allowed here ].
9481 */
9482static boolean_t
9483tcp_allow_connopt_set(int level, int name)
9484{
9485
9486	switch (level) {
9487	case IPPROTO_TCP:
9488		switch (name) {
9489		case TCP_NODELAY:
9490			return (B_TRUE);
9491		default:
9492			return (B_FALSE);
9493		}
9494		/*NOTREACHED*/
9495	default:
9496		return (B_FALSE);
9497	}
9498	/*NOTREACHED*/
9499}
9500
9501/*
9502 * This routine gets default values of certain options whose default
9503 * values are maintained by protocol specific code
9504 */
9505/* ARGSUSED */
9506int
9507tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
9508{
9509	int32_t	*i1 = (int32_t *)ptr;
9510
9511	switch (level) {
9512	case IPPROTO_TCP:
9513		switch (name) {
9514		case TCP_NOTIFY_THRESHOLD:
9515			*i1 = tcp_ip_notify_interval;
9516			break;
9517		case TCP_ABORT_THRESHOLD:
9518			*i1 = tcp_ip_abort_interval;
9519			break;
9520		case TCP_CONN_NOTIFY_THRESHOLD:
9521			*i1 = tcp_ip_notify_cinterval;
9522			break;
9523		case TCP_CONN_ABORT_THRESHOLD:
9524			*i1 = tcp_ip_abort_cinterval;
9525			break;
9526		default:
9527			return (-1);
9528		}
9529		break;
9530	case IPPROTO_IP:
9531		switch (name) {
9532		case IP_TTL:
9533			*i1 = tcp_ipv4_ttl;
9534			break;
9535		default:
9536			return (-1);
9537		}
9538		break;
9539	case IPPROTO_IPV6:
9540		switch (name) {
9541		case IPV6_UNICAST_HOPS:
9542			*i1 = tcp_ipv6_hoplimit;
9543			break;
9544		default:
9545			return (-1);
9546		}
9547		break;
9548	default:
9549		return (-1);
9550	}
9551	return (sizeof (int));
9552}
9553
9554
9555/*
9556 * TCP routine to get the values of options.
9557 */
9558int
9559tcp_opt_get(queue_t *q, int level, int	name, uchar_t *ptr)
9560{
9561	int		*i1 = (int *)ptr;
9562	conn_t		*connp = Q_TO_CONN(q);
9563	tcp_t		*tcp = connp->conn_tcp;
9564	ip6_pkt_t	*ipp = &tcp->tcp_sticky_ipp;
9565
9566	switch (level) {
9567	case SOL_SOCKET:
9568		switch (name) {
9569		case SO_LINGER:	{
9570			struct linger *lgr = (struct linger *)ptr;
9571
9572			lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0;
9573			lgr->l_linger = tcp->tcp_lingertime;
9574			}
9575			return (sizeof (struct linger));
9576		case SO_DEBUG:
9577			*i1 = tcp->tcp_debug ? SO_DEBUG : 0;
9578			break;
9579		case SO_KEEPALIVE:
9580			*i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0;
9581			break;
9582		case SO_DONTROUTE:
9583			*i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0;
9584			break;
9585		case SO_USELOOPBACK:
9586			*i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0;
9587			break;
9588		case SO_BROADCAST:
9589			*i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0;
9590			break;
9591		case SO_REUSEADDR:
9592			*i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0;
9593			break;
9594		case SO_OOBINLINE:
9595			*i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0;
9596			break;
9597		case SO_DGRAM_ERRIND:
9598			*i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0;
9599			break;
9600		case SO_TYPE:
9601			*i1 = SOCK_STREAM;
9602			break;
9603		case SO_SNDBUF:
9604			*i1 = tcp->tcp_xmit_hiwater;
9605			break;
9606		case SO_RCVBUF:
9607			*i1 = RD(q)->q_hiwat;
9608			break;
9609		case SO_SND_COPYAVOID:
9610			*i1 = tcp->tcp_snd_zcopy_on ?
9611			    SO_SND_COPYAVOID : 0;
9612			break;
9613		case SO_ALLZONES:
9614			*i1 = connp->conn_allzones ? 1 : 0;
9615			break;
9616		case SO_ANON_MLP:
9617			*i1 = connp->conn_anon_mlp;
9618			break;
9619		case SO_MAC_EXEMPT:
9620			*i1 = connp->conn_mac_exempt;
9621			break;
9622		case SO_EXCLBIND:
9623			*i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0;
9624			break;
9625		default:
9626			return (-1);
9627		}
9628		break;
9629	case IPPROTO_TCP:
9630		switch (name) {
9631		case TCP_NODELAY:
9632			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
9633			break;
9634		case TCP_MAXSEG:
9635			*i1 = tcp->tcp_mss;
9636			break;
9637		case TCP_NOTIFY_THRESHOLD:
9638			*i1 = (int)tcp->tcp_first_timer_threshold;
9639			break;
9640		case TCP_ABORT_THRESHOLD:
9641			*i1 = tcp->tcp_second_timer_threshold;
9642			break;
9643		case TCP_CONN_NOTIFY_THRESHOLD:
9644			*i1 = tcp->tcp_first_ctimer_threshold;
9645			break;
9646		case TCP_CONN_ABORT_THRESHOLD:
9647			*i1 = tcp->tcp_second_ctimer_threshold;
9648			break;
9649		case TCP_RECVDSTADDR:
9650			*i1 = tcp->tcp_recvdstaddr;
9651			break;
9652		case TCP_ANONPRIVBIND:
9653			*i1 = tcp->tcp_anon_priv_bind;
9654			break;
9655		case TCP_EXCLBIND:
9656			*i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0;
9657			break;
9658		case TCP_INIT_CWND:
9659			*i1 = tcp->tcp_init_cwnd;
9660			break;
9661		case TCP_KEEPALIVE_THRESHOLD:
9662			*i1 = tcp->tcp_ka_interval;
9663			break;
9664		case TCP_KEEPALIVE_ABORT_THRESHOLD:
9665			*i1 = tcp->tcp_ka_abort_thres;
9666			break;
9667		case TCP_CORK:
9668			*i1 = tcp->tcp_cork;
9669			break;
9670		default:
9671			return (-1);
9672		}
9673		break;
9674	case IPPROTO_IP:
9675		if (tcp->tcp_family != AF_INET)
9676			return (-1);
9677		switch (name) {
9678		case IP_OPTIONS:
9679		case T_IP_OPTIONS: {
9680			/*
9681			 * This is compatible with BSD in that in only return
9682			 * the reverse source route with the final destination
9683			 * as the last entry. The first 4 bytes of the option
9684			 * will contain the final destination.
9685			 */
9686			int	opt_len;
9687
9688			opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha;
9689			opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH;
9690			ASSERT(opt_len >= 0);
9691			/* Caller ensures enough space */
9692			if (opt_len > 0) {
9693				/*
9694				 * TODO: Do we have to handle getsockopt on an
9695				 * initiator as well?
9696				 */
9697				return (ip_opt_get_user(tcp->tcp_ipha, ptr));
9698			}
9699			return (0);
9700			}
9701		case IP_TOS:
9702		case T_IP_TOS:
9703			*i1 = (int)tcp->tcp_ipha->ipha_type_of_service;
9704			break;
9705		case IP_TTL:
9706			*i1 = (int)tcp->tcp_ipha->ipha_ttl;
9707			break;
9708		case IP_NEXTHOP:
9709			/* Handled at IP level */
9710			return (-EINVAL);
9711		default:
9712			return (-1);
9713		}
9714		break;
9715	case IPPROTO_IPV6:
9716		/*
9717		 * IPPROTO_IPV6 options are only supported for sockets
9718		 * that are using IPv6 on the wire.
9719		 */
9720		if (tcp->tcp_ipversion != IPV6_VERSION) {
9721			return (-1);
9722		}
9723		switch (name) {
9724		case IPV6_UNICAST_HOPS:
9725			*i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops;
9726			break;	/* goto sizeof (int) option return */
9727		case IPV6_BOUND_IF:
9728			/* Zero if not set */
9729			*i1 = tcp->tcp_bound_if;
9730			break;	/* goto sizeof (int) option return */
9731		case IPV6_RECVPKTINFO:
9732			if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)
9733				*i1 = 1;
9734			else
9735				*i1 = 0;
9736			break;	/* goto sizeof (int) option return */
9737		case IPV6_RECVTCLASS:
9738			if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)
9739				*i1 = 1;
9740			else
9741				*i1 = 0;
9742			break;	/* goto sizeof (int) option return */
9743		case IPV6_RECVHOPLIMIT:
9744			if (tcp->tcp_ipv6_recvancillary &
9745			    TCP_IPV6_RECVHOPLIMIT)
9746				*i1 = 1;
9747			else
9748				*i1 = 0;
9749			break;	/* goto sizeof (int) option return */
9750		case IPV6_RECVHOPOPTS:
9751			if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS)
9752				*i1 = 1;
9753			else
9754				*i1 = 0;
9755			break;	/* goto sizeof (int) option return */
9756		case IPV6_RECVDSTOPTS:
9757			if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS)
9758				*i1 = 1;
9759			else
9760				*i1 = 0;
9761			break;	/* goto sizeof (int) option return */
9762		case _OLD_IPV6_RECVDSTOPTS:
9763			if (tcp->tcp_ipv6_recvancillary &
9764			    TCP_OLD_IPV6_RECVDSTOPTS)
9765				*i1 = 1;
9766			else
9767				*i1 = 0;
9768			break;	/* goto sizeof (int) option return */
9769		case IPV6_RECVRTHDR:
9770			if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR)
9771				*i1 = 1;
9772			else
9773				*i1 = 0;
9774			break;	/* goto sizeof (int) option return */
9775		case IPV6_RECVRTHDRDSTOPTS:
9776			if (tcp->tcp_ipv6_recvancillary &
9777			    TCP_IPV6_RECVRTDSTOPTS)
9778				*i1 = 1;
9779			else
9780				*i1 = 0;
9781			break;	/* goto sizeof (int) option return */
9782		case IPV6_PKTINFO: {
9783			/* XXX assumes that caller has room for max size! */
9784			struct in6_pktinfo *pkti;
9785
9786			pkti = (struct in6_pktinfo *)ptr;
9787			if (ipp->ipp_fields & IPPF_IFINDEX)
9788				pkti->ipi6_ifindex = ipp->ipp_ifindex;
9789			else
9790				pkti->ipi6_ifindex = 0;
9791			if (ipp->ipp_fields & IPPF_ADDR)
9792				pkti->ipi6_addr = ipp->ipp_addr;
9793			else
9794				pkti->ipi6_addr = ipv6_all_zeros;
9795			return (sizeof (struct in6_pktinfo));
9796		}
9797		case IPV6_TCLASS:
9798			if (ipp->ipp_fields & IPPF_TCLASS)
9799				*i1 = ipp->ipp_tclass;
9800			else
9801				*i1 = IPV6_FLOW_TCLASS(
9802				    IPV6_DEFAULT_VERS_AND_FLOW);
9803			break;	/* goto sizeof (int) option return */
9804		case IPV6_NEXTHOP: {
9805			sin6_t *sin6 = (sin6_t *)ptr;
9806
9807			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
9808				return (0);
9809			*sin6 = sin6_null;
9810			sin6->sin6_family = AF_INET6;
9811			sin6->sin6_addr = ipp->ipp_nexthop;
9812			return (sizeof (sin6_t));
9813		}
9814		case IPV6_HOPOPTS:
9815			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
9816				return (0);
9817			if (ipp->ipp_hopoptslen <= tcp->tcp_label_len)
9818				return (0);
9819			bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len,
9820			    ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len);
9821			if (tcp->tcp_label_len > 0) {
9822				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
9823				ptr[1] = (ipp->ipp_hopoptslen -
9824				    tcp->tcp_label_len + 7) / 8 - 1;
9825			}
9826			return (ipp->ipp_hopoptslen - tcp->tcp_label_len);
9827		case IPV6_RTHDRDSTOPTS:
9828			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
9829				return (0);
9830			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
9831			return (ipp->ipp_rtdstoptslen);
9832		case IPV6_RTHDR:
9833			if (!(ipp->ipp_fields & IPPF_RTHDR))
9834				return (0);
9835			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
9836			return (ipp->ipp_rthdrlen);
9837		case IPV6_DSTOPTS:
9838			if (!(ipp->ipp_fields & IPPF_DSTOPTS))
9839				return (0);
9840			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
9841			return (ipp->ipp_dstoptslen);
9842		case IPV6_SRC_PREFERENCES:
9843			return (ip6_get_src_preferences(connp,
9844			    (uint32_t *)ptr));
9845		case IPV6_PATHMTU: {
9846			struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr;
9847
9848			if (tcp->tcp_state < TCPS_ESTABLISHED)
9849				return (-1);
9850
9851			return (ip_fill_mtuinfo(&connp->conn_remv6,
9852				connp->conn_fport, mtuinfo));
9853		}
9854		default:
9855			return (-1);
9856		}
9857		break;
9858	default:
9859		return (-1);
9860	}
9861	return (sizeof (int));
9862}
9863
9864/*
9865 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
9866 * Parameters are assumed to be verified by the caller.
9867 */
9868/* ARGSUSED */
9869int
9870tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
9871    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
9872    void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
9873{
9874	conn_t	*connp = Q_TO_CONN(q);
9875	tcp_t	*tcp = connp->conn_tcp;
9876	int	*i1 = (int *)invalp;
9877	boolean_t onoff = (*i1 == 0) ? 0 : 1;
9878	boolean_t checkonly;
9879	int	reterr;
9880
9881	switch (optset_context) {
9882	case SETFN_OPTCOM_CHECKONLY:
9883		checkonly = B_TRUE;
9884		/*
9885		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
9886		 * inlen != 0 implies value supplied and
9887		 * 	we have to "pretend" to set it.
9888		 * inlen == 0 implies that there is no
9889		 * 	value part in T_CHECK request and just validation
9890		 * done elsewhere should be enough, we just return here.
9891		 */
9892		if (inlen == 0) {
9893			*outlenp = 0;
9894			return (0);
9895		}
9896		break;
9897	case SETFN_OPTCOM_NEGOTIATE:
9898		checkonly = B_FALSE;
9899		break;
9900	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
9901	case SETFN_CONN_NEGOTIATE:
9902		checkonly = B_FALSE;
9903		/*
9904		 * Negotiating local and "association-related" options
9905		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
9906		 * primitives is allowed by XTI, but we choose
9907		 * to not implement this style negotiation for Internet
9908		 * protocols (We interpret it is a must for OSI world but
9909		 * optional for Internet protocols) for all options.
9910		 * [ Will do only for the few options that enable test
9911		 * suites that our XTI implementation of this feature
9912		 * works for transports that do allow it ]
9913		 */
9914		if (!tcp_allow_connopt_set(level, name)) {
9915			*outlenp = 0;
9916			return (EINVAL);
9917		}
9918		break;
9919	default:
9920		/*
9921		 * We should never get here
9922		 */
9923		*outlenp = 0;
9924		return (EINVAL);
9925	}
9926
9927	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
9928	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
9929
9930	/*
9931	 * For TCP, we should have no ancillary data sent down
9932	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
9933	 * has to be zero.
9934	 */
9935	ASSERT(thisdg_attrs == NULL);
9936
9937	/*
9938	 * For fixed length options, no sanity check
9939	 * of passed in length is done. It is assumed *_optcom_req()
9940	 * routines do the right thing.
9941	 */
9942
9943	switch (level) {
9944	case SOL_SOCKET:
9945		switch (name) {
9946		case SO_LINGER: {
9947			struct linger *lgr = (struct linger *)invalp;
9948
9949			if (!checkonly) {
9950				if (lgr->l_onoff) {
9951					tcp->tcp_linger = 1;
9952					tcp->tcp_lingertime = lgr->l_linger;
9953				} else {
9954					tcp->tcp_linger = 0;
9955					tcp->tcp_lingertime = 0;
9956				}
9957				/* struct copy */
9958				*(struct linger *)outvalp = *lgr;
9959			} else {
9960				if (!lgr->l_onoff) {
9961				    ((struct linger *)outvalp)->l_onoff = 0;
9962				    ((struct linger *)outvalp)->l_linger = 0;
9963				} else {
9964				    /* struct copy */
9965				    *(struct linger *)outvalp = *lgr;
9966				}
9967			}
9968			*outlenp = sizeof (struct linger);
9969			return (0);
9970		}
9971		case SO_DEBUG:
9972			if (!checkonly)
9973				tcp->tcp_debug = onoff;
9974			break;
9975		case SO_KEEPALIVE:
9976			if (checkonly) {
9977				/* T_CHECK case */
9978				break;
9979			}
9980
9981			if (!onoff) {
9982				if (tcp->tcp_ka_enabled) {
9983					if (tcp->tcp_ka_tid != 0) {
9984						(void) TCP_TIMER_CANCEL(tcp,
9985						    tcp->tcp_ka_tid);
9986						tcp->tcp_ka_tid = 0;
9987					}
9988					tcp->tcp_ka_enabled = 0;
9989				}
9990				break;
9991			}
9992			if (!tcp->tcp_ka_enabled) {
9993				/* Crank up the keepalive timer */
9994				tcp->tcp_ka_last_intrvl = 0;
9995				tcp->tcp_ka_tid = TCP_TIMER(tcp,
9996				    tcp_keepalive_killer,
9997				    MSEC_TO_TICK(tcp->tcp_ka_interval));
9998				tcp->tcp_ka_enabled = 1;
9999			}
10000			break;
10001		case SO_DONTROUTE:
10002			/*
10003			 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are
10004			 * only of interest to IP.  We track them here only so
10005			 * that we can report their current value.
10006			 */
10007			if (!checkonly) {
10008				tcp->tcp_dontroute = onoff;
10009				tcp->tcp_connp->conn_dontroute = onoff;
10010			}
10011			break;
10012		case SO_USELOOPBACK:
10013			if (!checkonly) {
10014				tcp->tcp_useloopback = onoff;
10015				tcp->tcp_connp->conn_loopback = onoff;
10016			}
10017			break;
10018		case SO_BROADCAST:
10019			if (!checkonly) {
10020				tcp->tcp_broadcast = onoff;
10021				tcp->tcp_connp->conn_broadcast = onoff;
10022			}
10023			break;
10024		case SO_REUSEADDR:
10025			if (!checkonly) {
10026				tcp->tcp_reuseaddr = onoff;
10027				tcp->tcp_connp->conn_reuseaddr = onoff;
10028			}
10029			break;
10030		case SO_OOBINLINE:
10031			if (!checkonly)
10032				tcp->tcp_oobinline = onoff;
10033			break;
10034		case SO_DGRAM_ERRIND:
10035			if (!checkonly)
10036				tcp->tcp_dgram_errind = onoff;
10037			break;
10038		case SO_SNDBUF: {
10039			tcp_t *peer_tcp;
10040
10041			if (*i1 > tcp_max_buf) {
10042				*outlenp = 0;
10043				return (ENOBUFS);
10044			}
10045			if (checkonly)
10046				break;
10047
10048			tcp->tcp_xmit_hiwater = *i1;
10049			if (tcp_snd_lowat_fraction != 0)
10050				tcp->tcp_xmit_lowater =
10051				    tcp->tcp_xmit_hiwater /
10052				    tcp_snd_lowat_fraction;
10053			(void) tcp_maxpsz_set(tcp, B_TRUE);
10054			/*
10055			 * If we are flow-controlled, recheck the condition.
10056			 * There are apps that increase SO_SNDBUF size when
10057			 * flow-controlled (EWOULDBLOCK), and expect the flow
10058			 * control condition to be lifted right away.
10059			 *
10060			 * For the fused tcp loopback case, in order to avoid
10061			 * a race with the peer's tcp_fuse_rrw() we need to
10062			 * hold its fuse_lock while accessing tcp_flow_stopped.
10063			 */
10064			peer_tcp = tcp->tcp_loopback_peer;
10065			ASSERT(!tcp->tcp_fused || peer_tcp != NULL);
10066			if (tcp->tcp_fused)
10067				mutex_enter(&peer_tcp->tcp_fuse_lock);
10068
10069			if (tcp->tcp_flow_stopped &&
10070			    TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
10071				tcp_clrqfull(tcp);
10072			}
10073			if (tcp->tcp_fused)
10074				mutex_exit(&peer_tcp->tcp_fuse_lock);
10075			break;
10076		}
10077		case SO_RCVBUF:
10078			if (*i1 > tcp_max_buf) {
10079				*outlenp = 0;
10080				return (ENOBUFS);
10081			}
10082			/* Silently ignore zero */
10083			if (!checkonly && *i1 != 0) {
10084				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
10085				(void) tcp_rwnd_set(tcp, *i1);
10086			}
10087			/*
10088			 * XXX should we return the rwnd here
10089			 * and tcp_opt_get ?
10090			 */
10091			break;
10092		case SO_SND_COPYAVOID:
10093			if (!checkonly) {
10094				/* we only allow enable at most once for now */
10095				if (tcp->tcp_loopback ||
10096				    (!tcp->tcp_snd_zcopy_aware &&
10097				    (onoff != 1 || !tcp_zcopy_check(tcp)))) {
10098					*outlenp = 0;
10099					return (EOPNOTSUPP);
10100				}
10101				tcp->tcp_snd_zcopy_aware = 1;
10102			}
10103			break;
10104		case SO_ALLZONES:
10105			/* Handled at the IP level */
10106			return (-EINVAL);
10107		case SO_ANON_MLP:
10108			if (!checkonly) {
10109				mutex_enter(&connp->conn_lock);
10110				connp->conn_anon_mlp = onoff;
10111				mutex_exit(&connp->conn_lock);
10112			}
10113			break;
10114		case SO_MAC_EXEMPT:
10115			if (secpolicy_net_mac_aware(cr) != 0 ||
10116			    IPCL_IS_BOUND(connp))
10117				return (EACCES);
10118			if (!checkonly) {
10119				mutex_enter(&connp->conn_lock);
10120				connp->conn_mac_exempt = onoff;
10121				mutex_exit(&connp->conn_lock);
10122			}
10123			break;
10124		case SO_EXCLBIND:
10125			if (!checkonly)
10126				tcp->tcp_exclbind = onoff;
10127			break;
10128		default:
10129			*outlenp = 0;
10130			return (EINVAL);
10131		}
10132		break;
10133	case IPPROTO_TCP:
10134		switch (name) {
10135		case TCP_NODELAY:
10136			if (!checkonly)
10137				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
10138			break;
10139		case TCP_NOTIFY_THRESHOLD:
10140			if (!checkonly)
10141				tcp->tcp_first_timer_threshold = *i1;
10142			break;
10143		case TCP_ABORT_THRESHOLD:
10144			if (!checkonly)
10145				tcp->tcp_second_timer_threshold = *i1;
10146			break;
10147		case TCP_CONN_NOTIFY_THRESHOLD:
10148			if (!checkonly)
10149				tcp->tcp_first_ctimer_threshold = *i1;
10150			break;
10151		case TCP_CONN_ABORT_THRESHOLD:
10152			if (!checkonly)
10153				tcp->tcp_second_ctimer_threshold = *i1;
10154			break;
10155		case TCP_RECVDSTADDR:
10156			if (tcp->tcp_state > TCPS_LISTEN)
10157				return (EOPNOTSUPP);
10158			if (!checkonly)
10159				tcp->tcp_recvdstaddr = onoff;
10160			break;
10161		case TCP_ANONPRIVBIND:
10162			if ((reterr = secpolicy_net_privaddr(cr, 0)) != 0) {
10163				*outlenp = 0;
10164				return (reterr);
10165			}
10166			if (!checkonly) {
10167				tcp->tcp_anon_priv_bind = onoff;
10168			}
10169			break;
10170		case TCP_EXCLBIND:
10171			if (!checkonly)
10172				tcp->tcp_exclbind = onoff;
10173			break;	/* goto sizeof (int) option return */
10174		case TCP_INIT_CWND: {
10175			uint32_t init_cwnd = *((uint32_t *)invalp);
10176
10177			if (checkonly)
10178				break;
10179
10180			/*
10181			 * Only allow socket with network configuration
10182			 * privilege to set the initial cwnd to be larger
10183			 * than allowed by RFC 3390.
10184			 */
10185			if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
10186				tcp->tcp_init_cwnd = init_cwnd;
10187				break;
10188			}
10189			if ((reterr = secpolicy_net_config(cr, B_TRUE)) != 0) {
10190				*outlenp = 0;
10191				return (reterr);
10192			}
10193			if (init_cwnd > TCP_MAX_INIT_CWND) {
10194				*outlenp = 0;
10195				return (EINVAL);
10196			}
10197			tcp->tcp_init_cwnd = init_cwnd;
10198			break;
10199		}
10200		case TCP_KEEPALIVE_THRESHOLD:
10201			if (checkonly)
10202				break;
10203
10204			if (*i1 < tcp_keepalive_interval_low ||
10205			    *i1 > tcp_keepalive_interval_high) {
10206				*outlenp = 0;
10207				return (EINVAL);
10208			}
10209			if (*i1 != tcp->tcp_ka_interval) {
10210				tcp->tcp_ka_interval = *i1;
10211				/*
10212				 * Check if we need to restart the
10213				 * keepalive timer.
10214				 */
10215				if (tcp->tcp_ka_tid != 0) {
10216					ASSERT(tcp->tcp_ka_enabled);
10217					(void) TCP_TIMER_CANCEL(tcp,
10218					    tcp->tcp_ka_tid);
10219					tcp->tcp_ka_last_intrvl = 0;
10220					tcp->tcp_ka_tid = TCP_TIMER(tcp,
10221					    tcp_keepalive_killer,
10222					    MSEC_TO_TICK(tcp->tcp_ka_interval));
10223				}
10224			}
10225			break;
10226		case TCP_KEEPALIVE_ABORT_THRESHOLD:
10227			if (!checkonly) {
10228				if (*i1 < tcp_keepalive_abort_interval_low ||
10229				    *i1 > tcp_keepalive_abort_interval_high) {
10230					*outlenp = 0;
10231					return (EINVAL);
10232				}
10233				tcp->tcp_ka_abort_thres = *i1;
10234			}
10235			break;
10236		case TCP_CORK:
10237			if (!checkonly) {
10238				/*
10239				 * if tcp->tcp_cork was set and is now
10240				 * being unset, we have to make sure that
10241				 * the remaining data gets sent out. Also
10242				 * unset tcp->tcp_cork so that tcp_wput_data()
10243				 * can send data even if it is less than mss
10244				 */
10245				if (tcp->tcp_cork && onoff == 0 &&
10246				    tcp->tcp_unsent > 0) {
10247					tcp->tcp_cork = B_FALSE;
10248					tcp_wput_data(tcp, NULL, B_FALSE);
10249				}
10250				tcp->tcp_cork = onoff;
10251			}
10252			break;
10253		default:
10254			*outlenp = 0;
10255			return (EINVAL);
10256		}
10257		break;
10258	case IPPROTO_IP:
10259		if (tcp->tcp_family != AF_INET) {
10260			*outlenp = 0;
10261			return (ENOPROTOOPT);
10262		}
10263		switch (name) {
10264		case IP_OPTIONS:
10265		case T_IP_OPTIONS:
10266			reterr = tcp_opt_set_header(tcp, checkonly,
10267			    invalp, inlen);
10268			if (reterr) {
10269				*outlenp = 0;
10270				return (reterr);
10271			}
10272			/* OK return - copy input buffer into output buffer */
10273			if (invalp != outvalp) {
10274				/* don't trust bcopy for identical src/dst */
10275				bcopy(invalp, outvalp, inlen);
10276			}
10277			*outlenp = inlen;
10278			return (0);
10279		case IP_TOS:
10280		case T_IP_TOS:
10281			if (!checkonly) {
10282				tcp->tcp_ipha->ipha_type_of_service =
10283				    (uchar_t)*i1;
10284				tcp->tcp_tos = (uchar_t)*i1;
10285			}
10286			break;
10287		case IP_TTL:
10288			if (!checkonly) {
10289				tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1;
10290				tcp->tcp_ttl = (uchar_t)*i1;
10291			}
10292			break;
10293		case IP_BOUND_IF:
10294		case IP_NEXTHOP:
10295			/* Handled at the IP level */
10296			return (-EINVAL);
10297		case IP_SEC_OPT:
10298			/*
10299			 * We should not allow policy setting after
10300			 * we start listening for connections.
10301			 */
10302			if (tcp->tcp_state == TCPS_LISTEN) {
10303				return (EINVAL);
10304			} else {
10305				/* Handled at the IP level */
10306				return (-EINVAL);
10307			}
10308		default:
10309			*outlenp = 0;
10310			return (EINVAL);
10311		}
10312		break;
10313	case IPPROTO_IPV6: {
10314		ip6_pkt_t		*ipp;
10315
10316		/*
10317		 * IPPROTO_IPV6 options are only supported for sockets
10318		 * that are using IPv6 on the wire.
10319		 */
10320		if (tcp->tcp_ipversion != IPV6_VERSION) {
10321			*outlenp = 0;
10322			return (ENOPROTOOPT);
10323		}
10324		/*
10325		 * Only sticky options; no ancillary data
10326		 */
10327		ASSERT(thisdg_attrs == NULL);
10328		ipp = &tcp->tcp_sticky_ipp;
10329
10330		switch (name) {
10331		case IPV6_UNICAST_HOPS:
10332			/* -1 means use default */
10333			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
10334				*outlenp = 0;
10335				return (EINVAL);
10336			}
10337			if (!checkonly) {
10338				if (*i1 == -1) {
10339					tcp->tcp_ip6h->ip6_hops =
10340					    ipp->ipp_unicast_hops =
10341					    (uint8_t)tcp_ipv6_hoplimit;
10342					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
10343					/* Pass modified value to IP. */
10344					*i1 = tcp->tcp_ip6h->ip6_hops;
10345				} else {
10346					tcp->tcp_ip6h->ip6_hops =
10347					    ipp->ipp_unicast_hops =
10348					    (uint8_t)*i1;
10349					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
10350				}
10351				reterr = tcp_build_hdrs(q, tcp);
10352				if (reterr != 0)
10353					return (reterr);
10354			}
10355			break;
10356		case IPV6_BOUND_IF:
10357			if (!checkonly) {
10358				int error = 0;
10359
10360				tcp->tcp_bound_if = *i1;
10361				error = ip_opt_set_ill(tcp->tcp_connp, *i1,
10362				    B_TRUE, checkonly, level, name, mblk);
10363				if (error != 0) {
10364					*outlenp = 0;
10365					return (error);
10366				}
10367			}
10368			break;
10369		/*
10370		 * Set boolean switches for ancillary data delivery
10371		 */
10372		case IPV6_RECVPKTINFO:
10373			if (!checkonly) {
10374				if (onoff)
10375					tcp->tcp_ipv6_recvancillary |=
10376					    TCP_IPV6_RECVPKTINFO;
10377				else
10378					tcp->tcp_ipv6_recvancillary &=
10379					    ~TCP_IPV6_RECVPKTINFO;
10380				/* Force it to be sent up with the next msg */
10381				tcp->tcp_recvifindex = 0;
10382			}
10383			break;
10384		case IPV6_RECVTCLASS:
10385			if (!checkonly) {
10386				if (onoff)
10387					tcp->tcp_ipv6_recvancillary |=
10388					    TCP_IPV6_RECVTCLASS;
10389				else
10390					tcp->tcp_ipv6_recvancillary &=
10391					    ~TCP_IPV6_RECVTCLASS;
10392			}
10393			break;
10394		case IPV6_RECVHOPLIMIT:
10395			if (!checkonly) {
10396				if (onoff)
10397					tcp->tcp_ipv6_recvancillary |=
10398					    TCP_IPV6_RECVHOPLIMIT;
10399				else
10400					tcp->tcp_ipv6_recvancillary &=
10401					    ~TCP_IPV6_RECVHOPLIMIT;
10402				/* Force it to be sent up with the next msg */
10403				tcp->tcp_recvhops = 0xffffffffU;
10404			}
10405			break;
10406		case IPV6_RECVHOPOPTS:
10407			if (!checkonly) {
10408				if (onoff)
10409					tcp->tcp_ipv6_recvancillary |=
10410					    TCP_IPV6_RECVHOPOPTS;
10411				else
10412					tcp->tcp_ipv6_recvancillary &=
10413					    ~TCP_IPV6_RECVHOPOPTS;
10414			}
10415			break;
10416		case IPV6_RECVDSTOPTS:
10417			if (!checkonly) {
10418				if (onoff)
10419					tcp->tcp_ipv6_recvancillary |=
10420					    TCP_IPV6_RECVDSTOPTS;
10421				else
10422					tcp->tcp_ipv6_recvancillary &=
10423					    ~TCP_IPV6_RECVDSTOPTS;
10424			}
10425			break;
10426		case _OLD_IPV6_RECVDSTOPTS:
10427			if (!checkonly) {
10428				if (onoff)
10429					tcp->tcp_ipv6_recvancillary |=
10430					    TCP_OLD_IPV6_RECVDSTOPTS;
10431				else
10432					tcp->tcp_ipv6_recvancillary &=
10433					    ~TCP_OLD_IPV6_RECVDSTOPTS;
10434			}
10435			break;
10436		case IPV6_RECVRTHDR:
10437			if (!checkonly) {
10438				if (onoff)
10439					tcp->tcp_ipv6_recvancillary |=
10440					    TCP_IPV6_RECVRTHDR;
10441				else
10442					tcp->tcp_ipv6_recvancillary &=
10443					    ~TCP_IPV6_RECVRTHDR;
10444			}
10445			break;
10446		case IPV6_RECVRTHDRDSTOPTS:
10447			if (!checkonly) {
10448				if (onoff)
10449					tcp->tcp_ipv6_recvancillary |=
10450					    TCP_IPV6_RECVRTDSTOPTS;
10451				else
10452					tcp->tcp_ipv6_recvancillary &=
10453					    ~TCP_IPV6_RECVRTDSTOPTS;
10454			}
10455			break;
10456		case IPV6_PKTINFO:
10457			if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
10458				return (EINVAL);
10459			if (checkonly)
10460				break;
10461
10462			if (inlen == 0) {
10463				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
10464			} else {
10465				struct in6_pktinfo *pkti;
10466
10467				pkti = (struct in6_pktinfo *)invalp;
10468				/*
10469				 * RFC 3542 states that ipi6_addr must be
10470				 * the unspecified address when setting the
10471				 * IPV6_PKTINFO sticky socket option on a
10472				 * TCP socket.
10473				 */
10474				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
10475					return (EINVAL);
10476				/*
10477				 * ip6_set_pktinfo() validates the source
10478				 * address and interface index.
10479				 */
10480				reterr = ip6_set_pktinfo(cr, tcp->tcp_connp,
10481				    pkti, mblk);
10482				if (reterr != 0)
10483					return (reterr);
10484				ipp->ipp_ifindex = pkti->ipi6_ifindex;
10485				ipp->ipp_addr = pkti->ipi6_addr;
10486				if (ipp->ipp_ifindex != 0)
10487					ipp->ipp_fields |= IPPF_IFINDEX;
10488				else
10489					ipp->ipp_fields &= ~IPPF_IFINDEX;
10490				if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr))
10491					ipp->ipp_fields |= IPPF_ADDR;
10492				else
10493					ipp->ipp_fields &= ~IPPF_ADDR;
10494			}
10495			reterr = tcp_build_hdrs(q, tcp);
10496			if (reterr != 0)
10497				return (reterr);
10498			break;
10499		case IPV6_TCLASS:
10500			if (inlen != 0 && inlen != sizeof (int))
10501				return (EINVAL);
10502			if (checkonly)
10503				break;
10504
10505			if (inlen == 0) {
10506				ipp->ipp_fields &= ~IPPF_TCLASS;
10507			} else {
10508				if (*i1 > 255 || *i1 < -1)
10509					return (EINVAL);
10510				if (*i1 == -1) {
10511					ipp->ipp_tclass = 0;
10512					*i1 = 0;
10513				} else {
10514					ipp->ipp_tclass = *i1;
10515				}
10516				ipp->ipp_fields |= IPPF_TCLASS;
10517			}
10518			reterr = tcp_build_hdrs(q, tcp);
10519			if (reterr != 0)
10520				return (reterr);
10521			break;
10522		case IPV6_NEXTHOP:
10523			/*
10524			 * IP will verify that the nexthop is reachable
10525			 * and fail for sticky options.
10526			 */
10527			if (inlen != 0 && inlen != sizeof (sin6_t))
10528				return (EINVAL);
10529			if (checkonly)
10530				break;
10531
10532			if (inlen == 0) {
10533				ipp->ipp_fields &= ~IPPF_NEXTHOP;
10534			} else {
10535				sin6_t *sin6 = (sin6_t *)invalp;
10536
10537				if (sin6->sin6_family != AF_INET6)
10538					return (EAFNOSUPPORT);
10539				if (IN6_IS_ADDR_V4MAPPED(
10540				    &sin6->sin6_addr))
10541					return (EADDRNOTAVAIL);
10542				ipp->ipp_nexthop = sin6->sin6_addr;
10543				if (!IN6_IS_ADDR_UNSPECIFIED(
10544				    &ipp->ipp_nexthop))
10545					ipp->ipp_fields |= IPPF_NEXTHOP;
10546				else
10547					ipp->ipp_fields &= ~IPPF_NEXTHOP;
10548			}
10549			reterr = tcp_build_hdrs(q, tcp);
10550			if (reterr != 0)
10551				return (reterr);
10552			break;
10553		case IPV6_HOPOPTS: {
10554			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
10555
10556			/*
10557			 * Sanity checks - minimum size, size a multiple of
10558			 * eight bytes, and matching size passed in.
10559			 */
10560			if (inlen != 0 &&
10561			    inlen != (8 * (hopts->ip6h_len + 1)))
10562				return (EINVAL);
10563
10564			if (checkonly)
10565				break;
10566
10567			reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
10568			    (uchar_t **)&ipp->ipp_hopopts,
10569			    &ipp->ipp_hopoptslen, tcp->tcp_label_len);
10570			if (reterr != 0)
10571				return (reterr);
10572			if (ipp->ipp_hopoptslen == 0)
10573				ipp->ipp_fields &= ~IPPF_HOPOPTS;
10574			else
10575				ipp->ipp_fields |= IPPF_HOPOPTS;
10576			reterr = tcp_build_hdrs(q, tcp);
10577			if (reterr != 0)
10578				return (reterr);
10579			break;
10580		}
10581		case IPV6_RTHDRDSTOPTS: {
10582			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
10583
10584			/*
10585			 * Sanity checks - minimum size, size a multiple of
10586			 * eight bytes, and matching size passed in.
10587			 */
10588			if (inlen != 0 &&
10589			    inlen != (8 * (dopts->ip6d_len + 1)))
10590				return (EINVAL);
10591
10592			if (checkonly)
10593				break;
10594
10595			reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
10596			    (uchar_t **)&ipp->ipp_rtdstopts,
10597			    &ipp->ipp_rtdstoptslen, 0);
10598			if (reterr != 0)
10599				return (reterr);
10600			if (ipp->ipp_rtdstoptslen == 0)
10601				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
10602			else
10603				ipp->ipp_fields |= IPPF_RTDSTOPTS;
10604			reterr = tcp_build_hdrs(q, tcp);
10605			if (reterr != 0)
10606				return (reterr);
10607			break;
10608		}
10609		case IPV6_DSTOPTS: {
10610			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
10611
10612			/*
10613			 * Sanity checks - minimum size, size a multiple of
10614			 * eight bytes, and matching size passed in.
10615			 */
10616			if (inlen != 0 &&
10617			    inlen != (8 * (dopts->ip6d_len + 1)))
10618				return (EINVAL);
10619
10620			if (checkonly)
10621				break;
10622
10623			reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
10624			    (uchar_t **)&ipp->ipp_dstopts,
10625			    &ipp->ipp_dstoptslen, 0);
10626			if (reterr != 0)
10627				return (reterr);
10628			if (ipp->ipp_dstoptslen == 0)
10629				ipp->ipp_fields &= ~IPPF_DSTOPTS;
10630			else
10631				ipp->ipp_fields |= IPPF_DSTOPTS;
10632			reterr = tcp_build_hdrs(q, tcp);
10633			if (reterr != 0)
10634				return (reterr);
10635			break;
10636		}
10637		case IPV6_RTHDR: {
10638			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
10639
10640			/*
10641			 * Sanity checks - minimum size, size a multiple of
10642			 * eight bytes, and matching size passed in.
10643			 */
10644			if (inlen != 0 &&
10645			    inlen != (8 * (rt->ip6r_len + 1)))
10646				return (EINVAL);
10647
10648			if (checkonly)
10649				break;
10650
10651			reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
10652			    (uchar_t **)&ipp->ipp_rthdr,
10653			    &ipp->ipp_rthdrlen, 0);
10654			if (reterr != 0)
10655				return (reterr);
10656			if (ipp->ipp_rthdrlen == 0)
10657				ipp->ipp_fields &= ~IPPF_RTHDR;
10658			else
10659				ipp->ipp_fields |= IPPF_RTHDR;
10660			reterr = tcp_build_hdrs(q, tcp);
10661			if (reterr != 0)
10662				return (reterr);
10663			break;
10664		}
10665		case IPV6_V6ONLY:
10666			if (!checkonly)
10667				tcp->tcp_connp->conn_ipv6_v6only = onoff;
10668			break;
10669		case IPV6_USE_MIN_MTU:
10670			if (inlen != sizeof (int))
10671				return (EINVAL);
10672
10673			if (*i1 < -1 || *i1 > 1)
10674				return (EINVAL);
10675
10676			if (checkonly)
10677				break;
10678
10679			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
10680			ipp->ipp_use_min_mtu = *i1;
10681			break;
10682		case IPV6_BOUND_PIF:
10683			/* Handled at the IP level */
10684			return (-EINVAL);
10685		case IPV6_SEC_OPT:
10686			/*
10687			 * We should not allow policy setting after
10688			 * we start listening for connections.
10689			 */
10690			if (tcp->tcp_state == TCPS_LISTEN) {
10691				return (EINVAL);
10692			} else {
10693				/* Handled at the IP level */
10694				return (-EINVAL);
10695			}
10696		case IPV6_SRC_PREFERENCES:
10697			if (inlen != sizeof (uint32_t))
10698				return (EINVAL);
10699			reterr = ip6_set_src_preferences(tcp->tcp_connp,
10700			    *(uint32_t *)invalp);
10701			if (reterr != 0) {
10702				*outlenp = 0;
10703				return (reterr);
10704			}
10705			break;
10706		default:
10707			*outlenp = 0;
10708			return (EINVAL);
10709		}
10710		break;
10711	}		/* end IPPROTO_IPV6 */
10712	default:
10713		*outlenp = 0;
10714		return (EINVAL);
10715	}
10716	/*
10717	 * Common case of OK return with outval same as inval
10718	 */
10719	if (invalp != outvalp) {
10720		/* don't trust bcopy for identical src/dst */
10721		(void) bcopy(invalp, outvalp, inlen);
10722	}
10723	*outlenp = inlen;
10724	return (0);
10725}
10726
10727/*
10728 * Update tcp_sticky_hdrs based on tcp_sticky_ipp.
10729 * The headers include ip6i_t (if needed), ip6_t, any sticky extension
10730 * headers, and the maximum size tcp header (to avoid reallocation
10731 * on the fly for additional tcp options).
10732 * Returns failure if can't allocate memory.
10733 */
10734static int
10735tcp_build_hdrs(queue_t *q, tcp_t *tcp)
10736{
10737	char	*hdrs;
10738	uint_t	hdrs_len;
10739	ip6i_t	*ip6i;
10740	char	buf[TCP_MAX_HDR_LENGTH];
10741	ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
10742	in6_addr_t src, dst;
10743
10744	/*
10745	 * save the existing tcp header and source/dest IP addresses
10746	 */
10747	bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len);
10748	src = tcp->tcp_ip6h->ip6_src;
10749	dst = tcp->tcp_ip6h->ip6_dst;
10750	hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH;
10751	ASSERT(hdrs_len != 0);
10752	if (hdrs_len > tcp->tcp_iphc_len) {
10753		/* Need to reallocate */
10754		hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP);
10755		if (hdrs == NULL)
10756			return (ENOMEM);
10757		if (tcp->tcp_iphc != NULL) {
10758			if (tcp->tcp_hdr_grown) {
10759				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
10760			} else {
10761				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
10762				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
10763			}
10764			tcp->tcp_iphc_len = 0;
10765		}
10766		ASSERT(tcp->tcp_iphc_len == 0);
10767		tcp->tcp_iphc = hdrs;
10768		tcp->tcp_iphc_len = hdrs_len;
10769		tcp->tcp_hdr_grown = B_TRUE;
10770	}
10771	ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc,
10772	    hdrs_len - TCP_MAX_HDR_LENGTH, ipp, IPPROTO_TCP);
10773
10774	/* Set header fields not in ipp */
10775	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
10776		ip6i = (ip6i_t *)tcp->tcp_iphc;
10777		tcp->tcp_ip6h = (ip6_t *)&ip6i[1];
10778	} else {
10779		tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
10780	}
10781	/*
10782	 * tcp->tcp_ip_hdr_len will include ip6i_t if there is one.
10783	 *
10784	 * tcp->tcp_tcp_hdr_len doesn't change here.
10785	 */
10786	tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH;
10787	tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len);
10788	tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len;
10789
10790	bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len);
10791
10792	tcp->tcp_ip6h->ip6_src = src;
10793	tcp->tcp_ip6h->ip6_dst = dst;
10794
10795	/*
10796	 * If the hop limit was not set by ip_build_hdrs_v6(), set it to
10797	 * the default value for TCP.
10798	 */
10799	if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS))
10800		tcp->tcp_ip6h->ip6_hops = tcp_ipv6_hoplimit;
10801
10802	/*
10803	 * If we're setting extension headers after a connection
10804	 * has been established, and if we have a routing header
10805	 * among the extension headers, call ip_massage_options_v6 to
10806	 * manipulate the routing header/ip6_dst set the checksum
10807	 * difference in the tcp header template.
10808	 * (This happens in tcp_connect_ipv6 if the routing header
10809	 * is set prior to the connect.)
10810	 * Set the tcp_sum to zero first in case we've cleared a
10811	 * routing header or don't have one at all.
10812	 */
10813	tcp->tcp_sum = 0;
10814	if ((tcp->tcp_state >= TCPS_SYN_SENT) &&
10815	    (tcp->tcp_ipp_fields & IPPF_RTHDR)) {
10816		ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h,
10817		    (uint8_t *)tcp->tcp_tcph);
10818		if (rth != NULL) {
10819			tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h,
10820			    rth);
10821			tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
10822			    (tcp->tcp_sum >> 16));
10823		}
10824	}
10825
10826	/* Try to get everything in a single mblk */
10827	(void) mi_set_sth_wroff(RD(q), hdrs_len + tcp_wroff_xtra);
10828	return (0);
10829}
10830
10831/*
10832 * Transfer any source route option from ipha to buf/dst in reversed form.
10833 */
10834static int
10835tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst)
10836{
10837	ipoptp_t	opts;
10838	uchar_t		*opt;
10839	uint8_t		optval;
10840	uint8_t		optlen;
10841	uint32_t	len = 0;
10842
10843	for (optval = ipoptp_first(&opts, ipha);
10844	    optval != IPOPT_EOL;
10845	    optval = ipoptp_next(&opts)) {
10846		opt = opts.ipoptp_cur;
10847		optlen = opts.ipoptp_len;
10848		switch (optval) {
10849			int	off1, off2;
10850		case IPOPT_SSRR:
10851		case IPOPT_LSRR:
10852
10853			/* Reverse source route */
10854			/*
10855			 * First entry should be the next to last one in the
10856			 * current source route (the last entry is our
10857			 * address.)
10858			 * The last entry should be the final destination.
10859			 */
10860			buf[IPOPT_OPTVAL] = (uint8_t)optval;
10861			buf[IPOPT_OLEN] = (uint8_t)optlen;
10862			off1 = IPOPT_MINOFF_SR - 1;
10863			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
10864			if (off2 < 0) {
10865				/* No entries in source route */
10866				break;
10867			}
10868			bcopy(opt + off2, dst, IP_ADDR_LEN);
10869			/*
10870			 * Note: use src since ipha has not had its src
10871			 * and dst reversed (it is in the state it was
10872			 * received.
10873			 */
10874			bcopy(&ipha->ipha_src, buf + off2,
10875			    IP_ADDR_LEN);
10876			off2 -= IP_ADDR_LEN;
10877
10878			while (off2 > 0) {
10879				bcopy(opt + off2, buf + off1,
10880				    IP_ADDR_LEN);
10881				off1 += IP_ADDR_LEN;
10882				off2 -= IP_ADDR_LEN;
10883			}
10884			buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
10885			buf += optlen;
10886			len += optlen;
10887			break;
10888		}
10889	}
10890done:
10891	/* Pad the resulting options */
10892	while (len & 0x3) {
10893		*buf++ = IPOPT_EOL;
10894		len++;
10895	}
10896	return (len);
10897}
10898
10899
10900/*
10901 * Extract and revert a source route from ipha (if any)
10902 * and then update the relevant fields in both tcp_t and the standard header.
10903 */
10904static void
10905tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha)
10906{
10907	char	buf[TCP_MAX_HDR_LENGTH];
10908	uint_t	tcph_len;
10909	int	len;
10910
10911	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
10912	len = IPH_HDR_LENGTH(ipha);
10913	if (len == IP_SIMPLE_HDR_LENGTH)
10914		/* Nothing to do */
10915		return;
10916	if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH ||
10917	    (len & 0x3))
10918		return;
10919
10920	tcph_len = tcp->tcp_tcp_hdr_len;
10921	bcopy(tcp->tcp_tcph, buf, tcph_len);
10922	tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) +
10923		(tcp->tcp_ipha->ipha_dst & 0xffff);
10924	len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha +
10925	    IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst);
10926	len += IP_SIMPLE_HDR_LENGTH;
10927	tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
10928	    (tcp->tcp_ipha->ipha_dst & 0xffff));
10929	if ((int)tcp->tcp_sum < 0)
10930		tcp->tcp_sum--;
10931	tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
10932	tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16));
10933	tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len);
10934	bcopy(buf, tcp->tcp_tcph, tcph_len);
10935	tcp->tcp_ip_hdr_len = len;
10936	tcp->tcp_ipha->ipha_version_and_hdr_length =
10937	    (IP_VERSION << 4) | (len >> 2);
10938	len += tcph_len;
10939	tcp->tcp_hdr_len = len;
10940}
10941
10942/*
10943 * Copy the standard header into its new location,
10944 * lay in the new options and then update the relevant
10945 * fields in both tcp_t and the standard header.
10946 */
10947static int
10948tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len)
10949{
10950	uint_t	tcph_len;
10951	uint8_t	*ip_optp;
10952	tcph_t	*new_tcph;
10953
10954	if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3))
10955		return (EINVAL);
10956
10957	if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len)
10958		return (EINVAL);
10959
10960	if (checkonly) {
10961		/*
10962		 * do not really set, just pretend to - T_CHECK
10963		 */
10964		return (0);
10965	}
10966
10967	ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH;
10968	if (tcp->tcp_label_len > 0) {
10969		int padlen;
10970		uint8_t opt;
10971
10972		/* convert list termination to no-ops */
10973		padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN];
10974		ip_optp += ip_optp[IPOPT_OLEN];
10975		opt = len > 0 ? IPOPT_NOP : IPOPT_EOL;
10976		while (--padlen >= 0)
10977			*ip_optp++ = opt;
10978	}
10979	tcph_len = tcp->tcp_tcp_hdr_len;
10980	new_tcph = (tcph_t *)(ip_optp + len);
10981	ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len);
10982	tcp->tcp_tcph = new_tcph;
10983	bcopy(ptr, ip_optp, len);
10984
10985	len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len;
10986
10987	tcp->tcp_ip_hdr_len = len;
10988	tcp->tcp_ipha->ipha_version_and_hdr_length =
10989	    (IP_VERSION << 4) | (len >> 2);
10990	tcp->tcp_hdr_len = len + tcph_len;
10991	if (!TCP_IS_DETACHED(tcp)) {
10992		/* Always allocate room for all options. */
10993		(void) mi_set_sth_wroff(tcp->tcp_rq,
10994		    TCP_MAX_COMBINED_HEADER_LENGTH + tcp_wroff_xtra);
10995	}
10996	return (0);
10997}
10998
10999/* Get callback routine passed to nd_load by tcp_param_register */
11000/* ARGSUSED */
11001static int
11002tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
11003{
11004	tcpparam_t	*tcppa = (tcpparam_t *)cp;
11005
11006	(void) mi_mpprintf(mp, "%u", tcppa->tcp_param_val);
11007	return (0);
11008}
11009
11010/*
11011 * Walk through the param array specified registering each element with the
11012 * named dispatch handler.
11013 */
11014static boolean_t
11015tcp_param_register(tcpparam_t *tcppa, int cnt)
11016{
11017	for (; cnt-- > 0; tcppa++) {
11018		if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) {
11019			if (!nd_load(&tcp_g_nd, tcppa->tcp_param_name,
11020			    tcp_param_get, tcp_param_set,
11021			    (caddr_t)tcppa)) {
11022				nd_free(&tcp_g_nd);
11023				return (B_FALSE);
11024			}
11025		}
11026	}
11027	if (!nd_load(&tcp_g_nd, tcp_wroff_xtra_param.tcp_param_name,
11028	    tcp_param_get, tcp_param_set_aligned,
11029	    (caddr_t)&tcp_wroff_xtra_param)) {
11030		nd_free(&tcp_g_nd);
11031		return (B_FALSE);
11032	}
11033	if (!nd_load(&tcp_g_nd, tcp_mdt_head_param.tcp_param_name,
11034	    tcp_param_get, tcp_param_set_aligned,
11035	    (caddr_t)&tcp_mdt_head_param)) {
11036		nd_free(&tcp_g_nd);
11037		return (B_FALSE);
11038	}
11039	if (!nd_load(&tcp_g_nd, tcp_mdt_tail_param.tcp_param_name,
11040	    tcp_param_get, tcp_param_set_aligned,
11041	    (caddr_t)&tcp_mdt_tail_param)) {
11042		nd_free(&tcp_g_nd);
11043		return (B_FALSE);
11044	}
11045	if (!nd_load(&tcp_g_nd, tcp_mdt_max_pbufs_param.tcp_param_name,
11046	    tcp_param_get, tcp_param_set,
11047	    (caddr_t)&tcp_mdt_max_pbufs_param)) {
11048		nd_free(&tcp_g_nd);
11049		return (B_FALSE);
11050	}
11051	if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports",
11052	    tcp_extra_priv_ports_get, NULL, NULL)) {
11053		nd_free(&tcp_g_nd);
11054		return (B_FALSE);
11055	}
11056	if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_add",
11057	    NULL, tcp_extra_priv_ports_add, NULL)) {
11058		nd_free(&tcp_g_nd);
11059		return (B_FALSE);
11060	}
11061	if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_del",
11062	    NULL, tcp_extra_priv_ports_del, NULL)) {
11063		nd_free(&tcp_g_nd);
11064		return (B_FALSE);
11065	}
11066	if (!nd_load(&tcp_g_nd, "tcp_status", tcp_status_report, NULL,
11067	    NULL)) {
11068		nd_free(&tcp_g_nd);
11069		return (B_FALSE);
11070	}
11071	if (!nd_load(&tcp_g_nd, "tcp_bind_hash", tcp_bind_hash_report,
11072	    NULL, NULL)) {
11073		nd_free(&tcp_g_nd);
11074		return (B_FALSE);
11075	}
11076	if (!nd_load(&tcp_g_nd, "tcp_listen_hash", tcp_listen_hash_report,
11077	    NULL, NULL)) {
11078		nd_free(&tcp_g_nd);
11079		return (B_FALSE);
11080	}
11081	if (!nd_load(&tcp_g_nd, "tcp_conn_hash", tcp_conn_hash_report,
11082	    NULL, NULL)) {
11083		nd_free(&tcp_g_nd);
11084		return (B_FALSE);
11085	}
11086	if (!nd_load(&tcp_g_nd, "tcp_acceptor_hash", tcp_acceptor_hash_report,
11087	    NULL, NULL)) {
11088		nd_free(&tcp_g_nd);
11089		return (B_FALSE);
11090	}
11091	if (!nd_load(&tcp_g_nd, "tcp_host_param", tcp_host_param_report,
11092	    tcp_host_param_set, NULL)) {
11093		nd_free(&tcp_g_nd);
11094		return (B_FALSE);
11095	}
11096	if (!nd_load(&tcp_g_nd, "tcp_host_param_ipv6", tcp_host_param_report,
11097	    tcp_host_param_set_ipv6, NULL)) {
11098		nd_free(&tcp_g_nd);
11099		return (B_FALSE);
11100	}
11101	if (!nd_load(&tcp_g_nd, "tcp_1948_phrase", NULL, tcp_1948_phrase_set,
11102	    NULL)) {
11103		nd_free(&tcp_g_nd);
11104		return (B_FALSE);
11105	}
11106	if (!nd_load(&tcp_g_nd, "tcp_reserved_port_list",
11107	    tcp_reserved_port_list, NULL, NULL)) {
11108		nd_free(&tcp_g_nd);
11109		return (B_FALSE);
11110	}
11111	/*
11112	 * Dummy ndd variables - only to convey obsolescence information
11113	 * through printing of their name (no get or set routines)
11114	 * XXX Remove in future releases ?
11115	 */
11116	if (!nd_load(&tcp_g_nd,
11117	    "tcp_close_wait_interval(obsoleted - "
11118	    "use tcp_time_wait_interval)", NULL, NULL, NULL)) {
11119		nd_free(&tcp_g_nd);
11120		return (B_FALSE);
11121	}
11122	return (B_TRUE);
11123}
11124
11125/* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */
11126/* ARGSUSED */
11127static int
11128tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
11129    cred_t *cr)
11130{
11131	long new_value;
11132	tcpparam_t *tcppa = (tcpparam_t *)cp;
11133
11134	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
11135	    new_value < tcppa->tcp_param_min ||
11136	    new_value > tcppa->tcp_param_max) {
11137		return (EINVAL);
11138	}
11139	/*
11140	 * Need to make sure new_value is a multiple of 4.  If it is not,
11141	 * round it up.  For future 64 bit requirement, we actually make it
11142	 * a multiple of 8.
11143	 */
11144	if (new_value & 0x7) {
11145		new_value = (new_value & ~0x7) + 0x8;
11146	}
11147	tcppa->tcp_param_val = new_value;
11148	return (0);
11149}
11150
11151/* Set callback routine passed to nd_load by tcp_param_register */
11152/* ARGSUSED */
11153static int
11154tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
11155{
11156	long	new_value;
11157	tcpparam_t	*tcppa = (tcpparam_t *)cp;
11158
11159	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
11160	    new_value < tcppa->tcp_param_min ||
11161	    new_value > tcppa->tcp_param_max) {
11162		return (EINVAL);
11163	}
11164	tcppa->tcp_param_val = new_value;
11165	return (0);
11166}
11167
11168/*
11169 * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
11170 * is filled, return as much as we can.  The message passed in may be
11171 * multi-part, chained using b_cont.  "start" is the starting sequence
11172 * number for this piece.
11173 */
11174static mblk_t *
11175tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
11176{
11177	uint32_t	end;
11178	mblk_t		*mp1;
11179	mblk_t		*mp2;
11180	mblk_t		*next_mp;
11181	uint32_t	u1;
11182
11183	/* Walk through all the new pieces. */
11184	do {
11185		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
11186		    (uintptr_t)INT_MAX);
11187		end = start + (int)(mp->b_wptr - mp->b_rptr);
11188		next_mp = mp->b_cont;
11189		if (start == end) {
11190			/* Empty.  Blast it. */
11191			freeb(mp);
11192			continue;
11193		}
11194		mp->b_cont = NULL;
11195		TCP_REASS_SET_SEQ(mp, start);
11196		TCP_REASS_SET_END(mp, end);
11197		mp1 = tcp->tcp_reass_tail;
11198		if (!mp1) {
11199			tcp->tcp_reass_tail = mp;
11200			tcp->tcp_reass_head = mp;
11201			BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs);
11202			UPDATE_MIB(&tcp_mib,
11203			    tcpInDataUnorderBytes, end - start);
11204			continue;
11205		}
11206		/* New stuff completely beyond tail? */
11207		if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
11208			/* Link it on end. */
11209			mp1->b_cont = mp;
11210			tcp->tcp_reass_tail = mp;
11211			BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs);
11212			UPDATE_MIB(&tcp_mib,
11213			    tcpInDataUnorderBytes, end - start);
11214			continue;
11215		}
11216		mp1 = tcp->tcp_reass_head;
11217		u1 = TCP_REASS_SEQ(mp1);
11218		/* New stuff at the front? */
11219		if (SEQ_LT(start, u1)) {
11220			/* Yes... Check for overlap. */
11221			mp->b_cont = mp1;
11222			tcp->tcp_reass_head = mp;
11223			tcp_reass_elim_overlap(tcp, mp);
11224			continue;
11225		}
11226		/*
11227		 * The new piece fits somewhere between the head and tail.
11228		 * We find our slot, where mp1 precedes us and mp2 trails.
11229		 */
11230		for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
11231			u1 = TCP_REASS_SEQ(mp2);
11232			if (SEQ_LEQ(start, u1))
11233				break;
11234		}
11235		/* Link ourselves in */
11236		mp->b_cont = mp2;
11237		mp1->b_cont = mp;
11238
11239		/* Trim overlap with following mblk(s) first */
11240		tcp_reass_elim_overlap(tcp, mp);
11241
11242		/* Trim overlap with preceding mblk */
11243		tcp_reass_elim_overlap(tcp, mp1);
11244
11245	} while (start = end, mp = next_mp);
11246	mp1 = tcp->tcp_reass_head;
11247	/* Anything ready to go? */
11248	if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt)
11249		return (NULL);
11250	/* Eat what we can off the queue */
11251	for (;;) {
11252		mp = mp1->b_cont;
11253		end = TCP_REASS_END(mp1);
11254		TCP_REASS_SET_SEQ(mp1, 0);
11255		TCP_REASS_SET_END(mp1, 0);
11256		if (!mp) {
11257			tcp->tcp_reass_tail = NULL;
11258			break;
11259		}
11260		if (end != TCP_REASS_SEQ(mp)) {
11261			mp1->b_cont = NULL;
11262			break;
11263		}
11264		mp1 = mp;
11265	}
11266	mp1 = tcp->tcp_reass_head;
11267	tcp->tcp_reass_head = mp;
11268	return (mp1);
11269}
11270
11271/* Eliminate any overlap that mp may have over later mblks */
11272static void
11273tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
11274{
11275	uint32_t	end;
11276	mblk_t		*mp1;
11277	uint32_t	u1;
11278
11279	end = TCP_REASS_END(mp);
11280	while ((mp1 = mp->b_cont) != NULL) {
11281		u1 = TCP_REASS_SEQ(mp1);
11282		if (!SEQ_GT(end, u1))
11283			break;
11284		if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) {
11285			mp->b_wptr -= end - u1;
11286			TCP_REASS_SET_END(mp, u1);
11287			BUMP_MIB(&tcp_mib, tcpInDataPartDupSegs);
11288			UPDATE_MIB(&tcp_mib, tcpInDataPartDupBytes, end - u1);
11289			break;
11290		}
11291		mp->b_cont = mp1->b_cont;
11292		TCP_REASS_SET_SEQ(mp1, 0);
11293		TCP_REASS_SET_END(mp1, 0);
11294		freeb(mp1);
11295		BUMP_MIB(&tcp_mib, tcpInDataDupSegs);
11296		UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, end - u1);
11297	}
11298	if (!mp1)
11299		tcp->tcp_reass_tail = mp;
11300}
11301
11302/*
11303 * Send up all messages queued on tcp_rcv_list.
11304 */
11305static uint_t
11306tcp_rcv_drain(queue_t *q, tcp_t *tcp)
11307{
11308	mblk_t *mp;
11309	uint_t ret = 0;
11310	uint_t thwin;
11311#ifdef DEBUG
11312	uint_t cnt = 0;
11313#endif
11314	/* Can't drain on an eager connection */
11315	if (tcp->tcp_listener != NULL)
11316		return (ret);
11317
11318	/*
11319	 * Handle two cases here: we are currently fused or we were
11320	 * previously fused and have some urgent data to be delivered
11321	 * upstream.  The latter happens because we either ran out of
11322	 * memory or were detached and therefore sending the SIGURG was
11323	 * deferred until this point.  In either case we pass control
11324	 * over to tcp_fuse_rcv_drain() since it may need to complete
11325	 * some work.
11326	 */
11327	if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) {
11328		ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
11329		if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL :
11330		    &tcp->tcp_fused_sigurg_mp))
11331			return (ret);
11332	}
11333
11334	while ((mp = tcp->tcp_rcv_list) != NULL) {
11335		tcp->tcp_rcv_list = mp->b_next;
11336		mp->b_next = NULL;
11337#ifdef DEBUG
11338		cnt += msgdsize(mp);
11339#endif
11340		/* Does this need SSL processing first? */
11341		if ((tcp->tcp_kssl_ctx  != NULL) && (DB_TYPE(mp) == M_DATA)) {
11342			tcp_kssl_input(tcp, mp);
11343			continue;
11344		}
11345		putnext(q, mp);
11346	}
11347	ASSERT(cnt == tcp->tcp_rcv_cnt);
11348	tcp->tcp_rcv_last_head = NULL;
11349	tcp->tcp_rcv_last_tail = NULL;
11350	tcp->tcp_rcv_cnt = 0;
11351
11352	/* Learn the latest rwnd information that we sent to the other side. */
11353	thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
11354	    << tcp->tcp_rcv_ws;
11355	/* This is peer's calculated send window (our receive window). */
11356	thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
11357	/*
11358	 * Increase the receive window to max.  But we need to do receiver
11359	 * SWS avoidance.  This means that we need to check the increase of
11360	 * of receive window is at least 1 MSS.
11361	 */
11362	if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) {
11363		/*
11364		 * If the window that the other side knows is less than max
11365		 * deferred acks segments, send an update immediately.
11366		 */
11367		if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
11368			BUMP_MIB(&tcp_mib, tcpOutWinUpdate);
11369			ret = TH_ACK_NEEDED;
11370		}
11371		tcp->tcp_rwnd = q->q_hiwat;
11372	}
11373	/* No need for the push timer now. */
11374	if (tcp->tcp_push_tid != 0) {
11375		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
11376		tcp->tcp_push_tid = 0;
11377	}
11378	return (ret);
11379}
11380
11381/*
11382 * Queue data on tcp_rcv_list which is a b_next chain.
11383 * tcp_rcv_last_head/tail is the last element of this chain.
11384 * Each element of the chain is a b_cont chain.
11385 *
11386 * M_DATA messages are added to the current element.
11387 * Other messages are added as new (b_next) elements.
11388 */
11389void
11390tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
11391{
11392	ASSERT(seg_len == msgdsize(mp));
11393	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL);
11394
11395	if (tcp->tcp_rcv_list == NULL) {
11396		ASSERT(tcp->tcp_rcv_last_head == NULL);
11397		tcp->tcp_rcv_list = mp;
11398		tcp->tcp_rcv_last_head = mp;
11399	} else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) {
11400		tcp->tcp_rcv_last_tail->b_cont = mp;
11401	} else {
11402		tcp->tcp_rcv_last_head->b_next = mp;
11403		tcp->tcp_rcv_last_head = mp;
11404	}
11405
11406	while (mp->b_cont)
11407		mp = mp->b_cont;
11408
11409	tcp->tcp_rcv_last_tail = mp;
11410	tcp->tcp_rcv_cnt += seg_len;
11411	tcp->tcp_rwnd -= seg_len;
11412}
11413
11414/*
11415 * DEFAULT TCP ENTRY POINT via squeue on READ side.
11416 *
11417 * This is the default entry function into TCP on the read side. TCP is
11418 * always entered via squeue i.e. using squeue's for mutual exclusion.
11419 * When classifier does a lookup to find the tcp, it also puts a reference
11420 * on the conn structure associated so the tcp is guaranteed to exist
11421 * when we come here. We still need to check the state because it might
11422 * as well has been closed. The squeue processing function i.e. squeue_enter,
11423 * squeue_enter_nodrain, or squeue_drain is responsible for doing the
11424 * CONN_DEC_REF.
11425 *
11426 * Apart from the default entry point, IP also sends packets directly to
11427 * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming
11428 * connections.
11429 */
11430void
11431tcp_input(void *arg, mblk_t *mp, void *arg2)
11432{
11433	conn_t	*connp = (conn_t *)arg;
11434	tcp_t	*tcp = (tcp_t *)connp->conn_tcp;
11435
11436	/* arg2 is the sqp */
11437	ASSERT(arg2 != NULL);
11438	ASSERT(mp != NULL);
11439
11440	/*
11441	 * Don't accept any input on a closed tcp as this TCP logically does
11442	 * not exist on the system. Don't proceed further with this TCP.
11443	 * For eg. this packet could trigger another close of this tcp
11444	 * which would be disastrous for tcp_refcnt. tcp_close_detached /
11445	 * tcp_clean_death / tcp_closei_local must be called at most once
11446	 * on a TCP. In this case we need to refeed the packet into the
11447	 * classifier and figure out where the packet should go. Need to
11448	 * preserve the recv_ill somehow. Until we figure that out, for
11449	 * now just drop the packet if we can't classify the packet.
11450	 */
11451	if (tcp->tcp_state == TCPS_CLOSED ||
11452	    tcp->tcp_state == TCPS_BOUND) {
11453		conn_t	*new_connp;
11454
11455		new_connp = ipcl_classify(mp, connp->conn_zoneid);
11456		if (new_connp != NULL) {
11457			tcp_reinput(new_connp, mp, arg2);
11458			return;
11459		}
11460		/* We failed to classify. For now just drop the packet */
11461		freemsg(mp);
11462		return;
11463	}
11464
11465	if (DB_TYPE(mp) == M_DATA)
11466		tcp_rput_data(connp, mp, arg2);
11467	else
11468		tcp_rput_common(tcp, mp);
11469}
11470
11471/*
11472 * The read side put procedure.
11473 * The packets passed up by ip are assume to be aligned according to
11474 * OK_32PTR and the IP+TCP headers fitting in the first mblk.
11475 */
11476static void
11477tcp_rput_common(tcp_t *tcp, mblk_t *mp)
11478{
11479	/*
11480	 * tcp_rput_data() does not expect M_CTL except for the case
11481	 * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO
11482	 * type. Need to make sure that any other M_CTLs don't make
11483	 * it to tcp_rput_data since it is not expecting any and doesn't
11484	 * check for it.
11485	 */
11486	if (DB_TYPE(mp) == M_CTL) {
11487		switch (*(uint32_t *)(mp->b_rptr)) {
11488		case TCP_IOC_ABORT_CONN:
11489			/*
11490			 * Handle connection abort request.
11491			 */
11492			tcp_ioctl_abort_handler(tcp, mp);
11493			return;
11494		case IPSEC_IN:
11495			/*
11496			 * Only secure icmp arrive in TCP and they
11497			 * don't go through data path.
11498			 */
11499			tcp_icmp_error(tcp, mp);
11500			return;
11501		case IN_PKTINFO:
11502			/*
11503			 * Handle IPV6_RECVPKTINFO socket option on AF_INET6
11504			 * sockets that are receiving IPv4 traffic. tcp
11505			 */
11506			ASSERT(tcp->tcp_family == AF_INET6);
11507			ASSERT(tcp->tcp_ipv6_recvancillary &
11508			    TCP_IPV6_RECVPKTINFO);
11509			tcp_rput_data(tcp->tcp_connp, mp,
11510			    tcp->tcp_connp->conn_sqp);
11511			return;
11512		case MDT_IOC_INFO_UPDATE:
11513			/*
11514			 * Handle Multidata information update; the
11515			 * following routine will free the message.
11516			 */
11517			if (tcp->tcp_connp->conn_mdt_ok) {
11518				tcp_mdt_update(tcp,
11519				    &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab,
11520				    B_FALSE);
11521			}
11522			freemsg(mp);
11523			return;
11524		default:
11525			break;
11526		}
11527	}
11528
11529	/* No point processing the message if tcp is already closed */
11530	if (TCP_IS_DETACHED_NONEAGER(tcp)) {
11531		freemsg(mp);
11532		return;
11533	}
11534
11535	tcp_rput_other(tcp, mp);
11536}
11537
11538
11539/* The minimum of smoothed mean deviation in RTO calculation. */
11540#define	TCP_SD_MIN	400
11541
11542/*
11543 * Set RTO for this connection.  The formula is from Jacobson and Karels'
11544 * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
11545 * are the same as those in Appendix A.2 of that paper.
11546 *
11547 * m = new measurement
11548 * sa = smoothed RTT average (8 * average estimates).
11549 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
11550 */
11551static void
11552tcp_set_rto(tcp_t *tcp, clock_t rtt)
11553{
11554	long m = TICK_TO_MSEC(rtt);
11555	clock_t sa = tcp->tcp_rtt_sa;
11556	clock_t sv = tcp->tcp_rtt_sd;
11557	clock_t rto;
11558
11559	BUMP_MIB(&tcp_mib, tcpRttUpdate);
11560	tcp->tcp_rtt_update++;
11561
11562	/* tcp_rtt_sa is not 0 means this is a new sample. */
11563	if (sa != 0) {
11564		/*
11565		 * Update average estimator:
11566		 *	new rtt = 7/8 old rtt + 1/8 Error
11567		 */
11568
11569		/* m is now Error in estimate. */
11570		m -= sa >> 3;
11571		if ((sa += m) <= 0) {
11572			/*
11573			 * Don't allow the smoothed average to be negative.
11574			 * We use 0 to denote reinitialization of the
11575			 * variables.
11576			 */
11577			sa = 1;
11578		}
11579
11580		/*
11581		 * Update deviation estimator:
11582		 *	new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
11583		 */
11584		if (m < 0)
11585			m = -m;
11586		m -= sv >> 2;
11587		sv += m;
11588	} else {
11589		/*
11590		 * This follows BSD's implementation.  So the reinitialized
11591		 * RTO is 3 * m.  We cannot go less than 2 because if the
11592		 * link is bandwidth dominated, doubling the window size
11593		 * during slow start means doubling the RTT.  We want to be
11594		 * more conservative when we reinitialize our estimates.  3
11595		 * is just a convenient number.
11596		 */
11597		sa = m << 3;
11598		sv = m << 1;
11599	}
11600	if (sv < TCP_SD_MIN) {
11601		/*
11602		 * We do not know that if sa captures the delay ACK
11603		 * effect as in a long train of segments, a receiver
11604		 * does not delay its ACKs.  So set the minimum of sv
11605		 * to be TCP_SD_MIN, which is default to 400 ms, twice
11606		 * of BSD DATO.  That means the minimum of mean
11607		 * deviation is 100 ms.
11608		 *
11609		 */
11610		sv = TCP_SD_MIN;
11611	}
11612	tcp->tcp_rtt_sa = sa;
11613	tcp->tcp_rtt_sd = sv;
11614	/*
11615	 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
11616	 *
11617	 * Add tcp_rexmit_interval extra in case of extreme environment
11618	 * where the algorithm fails to work.  The default value of
11619	 * tcp_rexmit_interval_extra should be 0.
11620	 *
11621	 * As we use a finer grained clock than BSD and update
11622	 * RTO for every ACKs, add in another .25 of RTT to the
11623	 * deviation of RTO to accomodate burstiness of 1/4 of
11624	 * window size.
11625	 */
11626	rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5);
11627
11628	if (rto > tcp_rexmit_interval_max) {
11629		tcp->tcp_rto = tcp_rexmit_interval_max;
11630	} else if (rto < tcp_rexmit_interval_min) {
11631		tcp->tcp_rto = tcp_rexmit_interval_min;
11632	} else {
11633		tcp->tcp_rto = rto;
11634	}
11635
11636	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
11637	tcp->tcp_timer_backoff = 0;
11638}
11639
11640/*
11641 * tcp_get_seg_mp() is called to get the pointer to a segment in the
11642 * send queue which starts at the given seq. no.
11643 *
11644 * Parameters:
11645 *	tcp_t *tcp: the tcp instance pointer.
11646 *	uint32_t seq: the starting seq. no of the requested segment.
11647 *	int32_t *off: after the execution, *off will be the offset to
11648 *		the returned mblk which points to the requested seq no.
11649 *		It is the caller's responsibility to send in a non-null off.
11650 *
11651 * Return:
11652 *	A mblk_t pointer pointing to the requested segment in send queue.
11653 */
11654static mblk_t *
11655tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
11656{
11657	int32_t	cnt;
11658	mblk_t	*mp;
11659
11660	/* Defensive coding.  Make sure we don't send incorrect data. */
11661	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
11662		return (NULL);
11663
11664	cnt = seq - tcp->tcp_suna;
11665	mp = tcp->tcp_xmit_head;
11666	while (cnt > 0 && mp != NULL) {
11667		cnt -= mp->b_wptr - mp->b_rptr;
11668		if (cnt < 0) {
11669			cnt += mp->b_wptr - mp->b_rptr;
11670			break;
11671		}
11672		mp = mp->b_cont;
11673	}
11674	ASSERT(mp != NULL);
11675	*off = cnt;
11676	return (mp);
11677}
11678
11679/*
11680 * This function handles all retransmissions if SACK is enabled for this
11681 * connection.  First it calculates how many segments can be retransmitted
11682 * based on tcp_pipe.  Then it goes thru the notsack list to find eligible
11683 * segments.  A segment is eligible if sack_cnt for that segment is greater
11684 * than or equal tcp_dupack_fast_retransmit.  After it has retransmitted
11685 * all eligible segments, it checks to see if TCP can send some new segments
11686 * (fast recovery).  If it can, set the appropriate flag for tcp_rput_data().
11687 *
11688 * Parameters:
11689 *	tcp_t *tcp: the tcp structure of the connection.
11690 *	uint_t *flags: in return, appropriate value will be set for
11691 *	tcp_rput_data().
11692 */
11693static void
11694tcp_sack_rxmit(tcp_t *tcp, uint_t *flags)
11695{
11696	notsack_blk_t	*notsack_blk;
11697	int32_t		usable_swnd;
11698	int32_t		mss;
11699	uint32_t	seg_len;
11700	mblk_t		*xmit_mp;
11701
11702	ASSERT(tcp->tcp_sack_info != NULL);
11703	ASSERT(tcp->tcp_notsack_list != NULL);
11704	ASSERT(tcp->tcp_rexmit == B_FALSE);
11705
11706	/* Defensive coding in case there is a bug... */
11707	if (tcp->tcp_notsack_list == NULL) {
11708		return;
11709	}
11710	notsack_blk = tcp->tcp_notsack_list;
11711	mss = tcp->tcp_mss;
11712
11713	/*
11714	 * Limit the num of outstanding data in the network to be
11715	 * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
11716	 */
11717	usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
11718
11719	/* At least retransmit 1 MSS of data. */
11720	if (usable_swnd <= 0) {
11721		usable_swnd = mss;
11722	}
11723
11724	/* Make sure no new RTT samples will be taken. */
11725	tcp->tcp_csuna = tcp->tcp_snxt;
11726
11727	notsack_blk = tcp->tcp_notsack_list;
11728	while (usable_swnd > 0) {
11729		mblk_t		*snxt_mp, *tmp_mp;
11730		tcp_seq		begin = tcp->tcp_sack_snxt;
11731		tcp_seq		end;
11732		int32_t		off;
11733
11734		for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
11735			if (SEQ_GT(notsack_blk->end, begin) &&
11736			    (notsack_blk->sack_cnt >=
11737			    tcp_dupack_fast_retransmit)) {
11738				end = notsack_blk->end;
11739				if (SEQ_LT(begin, notsack_blk->begin)) {
11740					begin = notsack_blk->begin;
11741				}
11742				break;
11743			}
11744		}
11745		/*
11746		 * All holes are filled.  Manipulate tcp_cwnd to send more
11747		 * if we can.  Note that after the SACK recovery, tcp_cwnd is
11748		 * set to tcp_cwnd_ssthresh.
11749		 */
11750		if (notsack_blk == NULL) {
11751			usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
11752			if (usable_swnd <= 0 || tcp->tcp_unsent == 0) {
11753				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
11754				ASSERT(tcp->tcp_cwnd > 0);
11755				return;
11756			} else {
11757				usable_swnd = usable_swnd / mss;
11758				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
11759				    MAX(usable_swnd * mss, mss);
11760				*flags |= TH_XMIT_NEEDED;
11761				return;
11762			}
11763		}
11764
11765		/*
11766		 * Note that we may send more than usable_swnd allows here
11767		 * because of round off, but no more than 1 MSS of data.
11768		 */
11769		seg_len = end - begin;
11770		if (seg_len > mss)
11771			seg_len = mss;
11772		snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
11773		ASSERT(snxt_mp != NULL);
11774		/* This should not happen.  Defensive coding again... */
11775		if (snxt_mp == NULL) {
11776			return;
11777		}
11778
11779		xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
11780		    &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
11781		if (xmit_mp == NULL)
11782			return;
11783
11784		usable_swnd -= seg_len;
11785		tcp->tcp_pipe += seg_len;
11786		tcp->tcp_sack_snxt = begin + seg_len;
11787		TCP_RECORD_TRACE(tcp, xmit_mp, TCP_TRACE_SEND_PKT);
11788		tcp_send_data(tcp, tcp->tcp_wq, xmit_mp);
11789
11790		/*
11791		 * Update the send timestamp to avoid false retransmission.
11792		 */
11793		snxt_mp->b_prev = (mblk_t *)lbolt;
11794
11795		BUMP_MIB(&tcp_mib, tcpRetransSegs);
11796		UPDATE_MIB(&tcp_mib, tcpRetransBytes, seg_len);
11797		BUMP_MIB(&tcp_mib, tcpOutSackRetransSegs);
11798		/*
11799		 * Update tcp_rexmit_max to extend this SACK recovery phase.
11800		 * This happens when new data sent during fast recovery is
11801		 * also lost.  If TCP retransmits those new data, it needs
11802		 * to extend SACK recover phase to avoid starting another
11803		 * fast retransmit/recovery unnecessarily.
11804		 */
11805		if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
11806			tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
11807		}
11808	}
11809}
11810
11811/*
11812 * This function handles policy checking at TCP level for non-hard_bound/
11813 * detached connections.
11814 */
11815static boolean_t
11816tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h,
11817    boolean_t secure, boolean_t mctl_present)
11818{
11819	ipsec_latch_t *ipl = NULL;
11820	ipsec_action_t *act = NULL;
11821	mblk_t *data_mp;
11822	ipsec_in_t *ii;
11823	const char *reason;
11824	kstat_named_t *counter;
11825
11826	ASSERT(mctl_present || !secure);
11827
11828	ASSERT((ipha == NULL && ip6h != NULL) ||
11829	    (ip6h == NULL && ipha != NULL));
11830
11831	/*
11832	 * We don't necessarily have an ipsec_in_act action to verify
11833	 * policy because of assymetrical policy where we have only
11834	 * outbound policy and no inbound policy (possible with global
11835	 * policy).
11836	 */
11837	if (!secure) {
11838		if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS ||
11839		    act->ipa_act.ipa_type == IPSEC_ACT_CLEAR)
11840			return (B_TRUE);
11841		ipsec_log_policy_failure(tcp->tcp_wq, IPSEC_POLICY_MISMATCH,
11842		    "tcp_check_policy", ipha, ip6h, secure);
11843		ip_drop_packet(first_mp, B_TRUE, NULL, NULL,
11844		    &ipdrops_tcp_clear, &tcp_dropper);
11845		return (B_FALSE);
11846	}
11847
11848	/*
11849	 * We have a secure packet.
11850	 */
11851	if (act == NULL) {
11852		ipsec_log_policy_failure(tcp->tcp_wq,
11853		    IPSEC_POLICY_NOT_NEEDED, "tcp_check_policy", ipha, ip6h,
11854		    secure);
11855		ip_drop_packet(first_mp, B_TRUE, NULL, NULL,
11856		    &ipdrops_tcp_secure, &tcp_dropper);
11857		return (B_FALSE);
11858	}
11859
11860	/*
11861	 * XXX This whole routine is currently incorrect.  ipl should
11862	 * be set to the latch pointer, but is currently not set, so
11863	 * we initialize it to NULL to avoid picking up random garbage.
11864	 */
11865	if (ipl == NULL)
11866		return (B_TRUE);
11867
11868	data_mp = first_mp->b_cont;
11869
11870	ii = (ipsec_in_t *)first_mp->b_rptr;
11871
11872	if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason,
11873	    &counter)) {
11874		BUMP_MIB(&ip_mib, ipsecInSucceeded);
11875		return (B_TRUE);
11876	}
11877	(void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
11878	    "tcp inbound policy mismatch: %s, packet dropped\n",
11879	    reason);
11880	BUMP_MIB(&ip_mib, ipsecInFailed);
11881
11882	ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, &tcp_dropper);
11883	return (B_FALSE);
11884}
11885
11886/*
11887 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
11888 * retransmission after a timeout.
11889 *
11890 * To limit the number of duplicate segments, we limit the number of segment
11891 * to be sent in one time to tcp_snd_burst, the burst variable.
11892 */
11893static void
11894tcp_ss_rexmit(tcp_t *tcp)
11895{
11896	uint32_t	snxt;
11897	uint32_t	smax;
11898	int32_t		win;
11899	int32_t		mss;
11900	int32_t		off;
11901	int32_t		burst = tcp->tcp_snd_burst;
11902	mblk_t		*snxt_mp;
11903
11904	/*
11905	 * Note that tcp_rexmit can be set even though TCP has retransmitted
11906	 * all unack'ed segments.
11907	 */
11908	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
11909		smax = tcp->tcp_rexmit_max;
11910		snxt = tcp->tcp_rexmit_nxt;
11911		if (SEQ_LT(snxt, tcp->tcp_suna)) {
11912			snxt = tcp->tcp_suna;
11913		}
11914		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
11915		win -= snxt - tcp->tcp_suna;
11916		mss = tcp->tcp_mss;
11917		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
11918
11919		while (SEQ_LT(snxt, smax) && (win > 0) &&
11920		    (burst > 0) && (snxt_mp != NULL)) {
11921			mblk_t	*xmit_mp;
11922			mblk_t	*old_snxt_mp = snxt_mp;
11923			uint32_t cnt = mss;
11924
11925			if (win < cnt) {
11926				cnt = win;
11927			}
11928			if (SEQ_GT(snxt + cnt, smax)) {
11929				cnt = smax - snxt;
11930			}
11931			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
11932			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
11933			if (xmit_mp == NULL)
11934				return;
11935
11936			tcp_send_data(tcp, tcp->tcp_wq, xmit_mp);
11937
11938			snxt += cnt;
11939			win -= cnt;
11940			/*
11941			 * Update the send timestamp to avoid false
11942			 * retransmission.
11943			 */
11944			old_snxt_mp->b_prev = (mblk_t *)lbolt;
11945			BUMP_MIB(&tcp_mib, tcpRetransSegs);
11946			UPDATE_MIB(&tcp_mib, tcpRetransBytes, cnt);
11947
11948			tcp->tcp_rexmit_nxt = snxt;
11949			burst--;
11950		}
11951		/*
11952		 * If we have transmitted all we have at the time
11953		 * we started the retranmission, we can leave
11954		 * the rest of the job to tcp_wput_data().  But we
11955		 * need to check the send window first.  If the
11956		 * win is not 0, go on with tcp_wput_data().
11957		 */
11958		if (SEQ_LT(snxt, smax) || win == 0) {
11959			return;
11960		}
11961	}
11962	/* Only call tcp_wput_data() if there is data to be sent. */
11963	if (tcp->tcp_unsent) {
11964		tcp_wput_data(tcp, NULL, B_FALSE);
11965	}
11966}
11967
11968/*
11969 * Process all TCP option in SYN segment.  Note that this function should
11970 * be called after tcp_adapt_ire() is called so that the necessary info
11971 * from IRE is already set in the tcp structure.
11972 *
11973 * This function sets up the correct tcp_mss value according to the
11974 * MSS option value and our header size.  It also sets up the window scale
11975 * and timestamp values, and initialize SACK info blocks.  But it does not
11976 * change receive window size after setting the tcp_mss value.  The caller
11977 * should do the appropriate change.
11978 */
11979void
11980tcp_process_options(tcp_t *tcp, tcph_t *tcph)
11981{
11982	int options;
11983	tcp_opt_t tcpopt;
11984	uint32_t mss_max;
11985	char *tmp_tcph;
11986
11987	tcpopt.tcp = NULL;
11988	options = tcp_parse_options(tcph, &tcpopt);
11989
11990	/*
11991	 * Process MSS option.  Note that MSS option value does not account
11992	 * for IP or TCP options.  This means that it is equal to MTU - minimum
11993	 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for
11994	 * IPv6.
11995	 */
11996	if (!(options & TCP_OPT_MSS_PRESENT)) {
11997		if (tcp->tcp_ipversion == IPV4_VERSION)
11998			tcpopt.tcp_opt_mss = tcp_mss_def_ipv4;
11999		else
12000			tcpopt.tcp_opt_mss = tcp_mss_def_ipv6;
12001	} else {
12002		if (tcp->tcp_ipversion == IPV4_VERSION)
12003			mss_max = tcp_mss_max_ipv4;
12004		else
12005			mss_max = tcp_mss_max_ipv6;
12006		if (tcpopt.tcp_opt_mss < tcp_mss_min)
12007			tcpopt.tcp_opt_mss = tcp_mss_min;
12008		else if (tcpopt.tcp_opt_mss > mss_max)
12009			tcpopt.tcp_opt_mss = mss_max;
12010	}
12011
12012	/* Process Window Scale option. */
12013	if (options & TCP_OPT_WSCALE_PRESENT) {
12014		tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale;
12015		tcp->tcp_snd_ws_ok = B_TRUE;
12016	} else {
12017		tcp->tcp_snd_ws = B_FALSE;
12018		tcp->tcp_snd_ws_ok = B_FALSE;
12019		tcp->tcp_rcv_ws = B_FALSE;
12020	}
12021
12022	/* Process Timestamp option. */
12023	if ((options & TCP_OPT_TSTAMP_PRESENT) &&
12024	    (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) {
12025		tmp_tcph = (char *)tcp->tcp_tcph;
12026
12027		tcp->tcp_snd_ts_ok = B_TRUE;
12028		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
12029		tcp->tcp_last_rcv_lbolt = lbolt64;
12030		ASSERT(OK_32PTR(tmp_tcph));
12031		ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
12032
12033		/* Fill in our template header with basic timestamp option. */
12034		tmp_tcph += tcp->tcp_tcp_hdr_len;
12035		tmp_tcph[0] = TCPOPT_NOP;
12036		tmp_tcph[1] = TCPOPT_NOP;
12037		tmp_tcph[2] = TCPOPT_TSTAMP;
12038		tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
12039		tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN;
12040		tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN;
12041		tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4);
12042	} else {
12043		tcp->tcp_snd_ts_ok = B_FALSE;
12044	}
12045
12046	/*
12047	 * Process SACK options.  If SACK is enabled for this connection,
12048	 * then allocate the SACK info structure.  Note the following ways
12049	 * when tcp_snd_sack_ok is set to true.
12050	 *
12051	 * For active connection: in tcp_adapt_ire() called in
12052	 * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted
12053	 * is checked.
12054	 *
12055	 * For passive connection: in tcp_adapt_ire() called in
12056	 * tcp_accept_comm().
12057	 *
12058	 * That's the reason why the extra TCP_IS_DETACHED() check is there.
12059	 * That check makes sure that if we did not send a SACK OK option,
12060	 * we will not enable SACK for this connection even though the other
12061	 * side sends us SACK OK option.  For active connection, the SACK
12062	 * info structure has already been allocated.  So we need to free
12063	 * it if SACK is disabled.
12064	 */
12065	if ((options & TCP_OPT_SACK_OK_PRESENT) &&
12066	    (tcp->tcp_snd_sack_ok ||
12067	    (tcp_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) {
12068		/* This should be true only in the passive case. */
12069		if (tcp->tcp_sack_info == NULL) {
12070			ASSERT(TCP_IS_DETACHED(tcp));
12071			tcp->tcp_sack_info =
12072			    kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP);
12073		}
12074		if (tcp->tcp_sack_info == NULL) {
12075			tcp->tcp_snd_sack_ok = B_FALSE;
12076		} else {
12077			tcp->tcp_snd_sack_ok = B_TRUE;
12078			if (tcp->tcp_snd_ts_ok) {
12079				tcp->tcp_max_sack_blk = 3;
12080			} else {
12081				tcp->tcp_max_sack_blk = 4;
12082			}
12083		}
12084	} else {
12085		/*
12086		 * Resetting tcp_snd_sack_ok to B_FALSE so that
12087		 * no SACK info will be used for this
12088		 * connection.  This assumes that SACK usage
12089		 * permission is negotiated.  This may need
12090		 * to be changed once this is clarified.
12091		 */
12092		if (tcp->tcp_sack_info != NULL) {
12093			ASSERT(tcp->tcp_notsack_list == NULL);
12094			kmem_cache_free(tcp_sack_info_cache,
12095			    tcp->tcp_sack_info);
12096			tcp->tcp_sack_info = NULL;
12097		}
12098		tcp->tcp_snd_sack_ok = B_FALSE;
12099	}
12100
12101	/*
12102	 * Now we know the exact TCP/IP header length, subtract
12103	 * that from tcp_mss to get our side's MSS.
12104	 */
12105	tcp->tcp_mss -= tcp->tcp_hdr_len;
12106	/*
12107	 * Here we assume that the other side's header size will be equal to
12108	 * our header size.  We calculate the real MSS accordingly.  Need to
12109	 * take into additional stuffs IPsec puts in.
12110	 *
12111	 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
12112	 */
12113	tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead -
12114	    ((tcp->tcp_ipversion == IPV4_VERSION ?
12115	    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
12116
12117	/*
12118	 * Set MSS to the smaller one of both ends of the connection.
12119	 * We should not have called tcp_mss_set() before, but our
12120	 * side of the MSS should have been set to a proper value
12121	 * by tcp_adapt_ire().  tcp_mss_set() will also set up the
12122	 * STREAM head parameters properly.
12123	 *
12124	 * If we have a larger-than-16-bit window but the other side
12125	 * didn't want to do window scale, tcp_rwnd_set() will take
12126	 * care of that.
12127	 */
12128	tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
12129}
12130
12131/*
12132 * Sends the T_CONN_IND to the listener. The caller calls this
12133 * functions via squeue to get inside the listener's perimeter
12134 * once the 3 way hand shake is done a T_CONN_IND needs to be
12135 * sent. As an optimization, the caller can call this directly
12136 * if listener's perimeter is same as eager's.
12137 */
12138/* ARGSUSED */
12139void
12140tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
12141{
12142	conn_t			*lconnp = (conn_t *)arg;
12143	tcp_t			*listener = lconnp->conn_tcp;
12144	tcp_t			*tcp;
12145	struct T_conn_ind	*conn_ind;
12146	ipaddr_t 		*addr_cache;
12147	boolean_t		need_send_conn_ind = B_FALSE;
12148
12149	/* retrieve the eager */
12150	conn_ind = (struct T_conn_ind *)mp->b_rptr;
12151	ASSERT(conn_ind->OPT_offset != 0 &&
12152	    conn_ind->OPT_length == sizeof (intptr_t));
12153	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
12154		conn_ind->OPT_length);
12155
12156	/*
12157	 * TLI/XTI applications will get confused by
12158	 * sending eager as an option since it violates
12159	 * the option semantics. So remove the eager as
12160	 * option since TLI/XTI app doesn't need it anyway.
12161	 */
12162	if (!TCP_IS_SOCKET(listener)) {
12163		conn_ind->OPT_length = 0;
12164		conn_ind->OPT_offset = 0;
12165	}
12166	if (listener->tcp_state == TCPS_CLOSED ||
12167	    TCP_IS_DETACHED(listener)) {
12168		/*
12169		 * If listener has closed, it would have caused a
12170		 * a cleanup/blowoff to happen for the eager. We
12171		 * just need to return.
12172		 */
12173		freemsg(mp);
12174		return;
12175	}
12176
12177
12178	/*
12179	 * if the conn_req_q is full defer passing up the
12180	 * T_CONN_IND until space is availabe after t_accept()
12181	 * processing
12182	 */
12183	mutex_enter(&listener->tcp_eager_lock);
12184	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
12185		tcp_t *tail;
12186
12187		/*
12188		 * The eager already has an extra ref put in tcp_rput_data
12189		 * so that it stays till accept comes back even though it
12190		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
12191		 */
12192		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
12193		listener->tcp_conn_req_cnt_q0--;
12194		listener->tcp_conn_req_cnt_q++;
12195
12196		/* Move from SYN_RCVD to ESTABLISHED list  */
12197		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
12198		    tcp->tcp_eager_prev_q0;
12199		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
12200		    tcp->tcp_eager_next_q0;
12201		tcp->tcp_eager_prev_q0 = NULL;
12202		tcp->tcp_eager_next_q0 = NULL;
12203
12204		/*
12205		 * Insert at end of the queue because sockfs
12206		 * sends down T_CONN_RES in chronological
12207		 * order. Leaving the older conn indications
12208		 * at front of the queue helps reducing search
12209		 * time.
12210		 */
12211		tail = listener->tcp_eager_last_q;
12212		if (tail != NULL)
12213			tail->tcp_eager_next_q = tcp;
12214		else
12215			listener->tcp_eager_next_q = tcp;
12216		listener->tcp_eager_last_q = tcp;
12217		tcp->tcp_eager_next_q = NULL;
12218		/*
12219		 * Delay sending up the T_conn_ind until we are
12220		 * done with the eager. Once we have have sent up
12221		 * the T_conn_ind, the accept can potentially complete
12222		 * any time and release the refhold we have on the eager.
12223		 */
12224		need_send_conn_ind = B_TRUE;
12225	} else {
12226		/*
12227		 * Defer connection on q0 and set deferred
12228		 * connection bit true
12229		 */
12230		tcp->tcp_conn_def_q0 = B_TRUE;
12231
12232		/* take tcp out of q0 ... */
12233		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
12234		    tcp->tcp_eager_next_q0;
12235		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
12236		    tcp->tcp_eager_prev_q0;
12237
12238		/* ... and place it at the end of q0 */
12239		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
12240		tcp->tcp_eager_next_q0 = listener;
12241		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
12242		listener->tcp_eager_prev_q0 = tcp;
12243		tcp->tcp_conn.tcp_eager_conn_ind = mp;
12244	}
12245
12246	/* we have timed out before */
12247	if (tcp->tcp_syn_rcvd_timeout != 0) {
12248		tcp->tcp_syn_rcvd_timeout = 0;
12249		listener->tcp_syn_rcvd_timeout--;
12250		if (listener->tcp_syn_defense &&
12251		    listener->tcp_syn_rcvd_timeout <=
12252		    (tcp_conn_req_max_q0 >> 5) &&
12253		    10*MINUTES < TICK_TO_MSEC(lbolt64 -
12254			listener->tcp_last_rcv_lbolt)) {
12255			/*
12256			 * Turn off the defense mode if we
12257			 * believe the SYN attack is over.
12258			 */
12259			listener->tcp_syn_defense = B_FALSE;
12260			if (listener->tcp_ip_addr_cache) {
12261				kmem_free((void *)listener->tcp_ip_addr_cache,
12262				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
12263				listener->tcp_ip_addr_cache = NULL;
12264			}
12265		}
12266	}
12267	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
12268	if (addr_cache != NULL) {
12269		/*
12270		 * We have finished a 3-way handshake with this
12271		 * remote host. This proves the IP addr is good.
12272		 * Cache it!
12273		 */
12274		addr_cache[IP_ADDR_CACHE_HASH(
12275			tcp->tcp_remote)] = tcp->tcp_remote;
12276	}
12277	mutex_exit(&listener->tcp_eager_lock);
12278	if (need_send_conn_ind)
12279		putnext(listener->tcp_rq, mp);
12280}
12281
12282mblk_t *
12283tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp,
12284    uint_t *ifindexp, ip6_pkt_t *ippp)
12285{
12286	in_pktinfo_t	*pinfo;
12287	ip6_t		*ip6h;
12288	uchar_t		*rptr;
12289	mblk_t		*first_mp = mp;
12290	boolean_t	mctl_present = B_FALSE;
12291	uint_t 		ifindex = 0;
12292	ip6_pkt_t	ipp;
12293	uint_t		ipvers;
12294	uint_t		ip_hdr_len;
12295
12296	rptr = mp->b_rptr;
12297	ASSERT(OK_32PTR(rptr));
12298	ASSERT(tcp != NULL);
12299	ipp.ipp_fields = 0;
12300
12301	switch DB_TYPE(mp) {
12302	case M_CTL:
12303		mp = mp->b_cont;
12304		if (mp == NULL) {
12305			freemsg(first_mp);
12306			return (NULL);
12307		}
12308		if (DB_TYPE(mp) != M_DATA) {
12309			freemsg(first_mp);
12310			return (NULL);
12311		}
12312		mctl_present = B_TRUE;
12313		break;
12314	case M_DATA:
12315		break;
12316	default:
12317		cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type");
12318		freemsg(mp);
12319		return (NULL);
12320	}
12321	ipvers = IPH_HDR_VERSION(rptr);
12322	if (ipvers == IPV4_VERSION) {
12323		if (tcp == NULL) {
12324			ip_hdr_len = IPH_HDR_LENGTH(rptr);
12325			goto done;
12326		}
12327
12328		ipp.ipp_fields |= IPPF_HOPLIMIT;
12329		ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl;
12330
12331		/*
12332		 * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary
12333		 * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp.
12334		 */
12335		if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) &&
12336		    mctl_present) {
12337			pinfo = (in_pktinfo_t *)first_mp->b_rptr;
12338			if ((MBLKL(first_mp) == sizeof (in_pktinfo_t)) &&
12339			    (pinfo->in_pkt_ulp_type == IN_PKTINFO) &&
12340			    (pinfo->in_pkt_flags & IPF_RECVIF)) {
12341				ipp.ipp_fields |= IPPF_IFINDEX;
12342				ipp.ipp_ifindex = pinfo->in_pkt_ifindex;
12343				ifindex = pinfo->in_pkt_ifindex;
12344			}
12345			freeb(first_mp);
12346			mctl_present = B_FALSE;
12347		}
12348		ip_hdr_len = IPH_HDR_LENGTH(rptr);
12349	} else {
12350		ip6h = (ip6_t *)rptr;
12351
12352		ASSERT(ipvers == IPV6_VERSION);
12353		ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS;
12354		ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20;
12355		ipp.ipp_hoplimit = ip6h->ip6_hops;
12356
12357		if (ip6h->ip6_nxt != IPPROTO_TCP) {
12358			uint8_t	nexthdrp;
12359
12360			/* Look for ifindex information */
12361			if (ip6h->ip6_nxt == IPPROTO_RAW) {
12362				ip6i_t *ip6i = (ip6i_t *)ip6h;
12363				if ((uchar_t *)&ip6i[1] > mp->b_wptr) {
12364					BUMP_MIB(&ip_mib, tcpInErrs);
12365					freemsg(first_mp);
12366					return (NULL);
12367				}
12368
12369				if (ip6i->ip6i_flags & IP6I_IFINDEX) {
12370					ASSERT(ip6i->ip6i_ifindex != 0);
12371					ipp.ipp_fields |= IPPF_IFINDEX;
12372					ipp.ipp_ifindex = ip6i->ip6i_ifindex;
12373					ifindex = ip6i->ip6i_ifindex;
12374				}
12375				rptr = (uchar_t *)&ip6i[1];
12376				mp->b_rptr = rptr;
12377				if (rptr == mp->b_wptr) {
12378					mblk_t *mp1;
12379					mp1 = mp->b_cont;
12380					freeb(mp);
12381					mp = mp1;
12382					rptr = mp->b_rptr;
12383				}
12384				if (MBLKL(mp) < IPV6_HDR_LEN +
12385				    sizeof (tcph_t)) {
12386					BUMP_MIB(&ip_mib, tcpInErrs);
12387					freemsg(first_mp);
12388					return (NULL);
12389				}
12390				ip6h = (ip6_t *)rptr;
12391			}
12392
12393			/*
12394			 * Find any potentially interesting extension headers
12395			 * as well as the length of the IPv6 + extension
12396			 * headers.
12397			 */
12398			ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp);
12399			/* Verify if this is a TCP packet */
12400			if (nexthdrp != IPPROTO_TCP) {
12401				BUMP_MIB(&ip_mib, tcpInErrs);
12402				freemsg(first_mp);
12403				return (NULL);
12404			}
12405		} else {
12406			ip_hdr_len = IPV6_HDR_LEN;
12407		}
12408	}
12409
12410done:
12411	if (ipversp != NULL)
12412		*ipversp = ipvers;
12413	if (ip_hdr_lenp != NULL)
12414		*ip_hdr_lenp = ip_hdr_len;
12415	if (ippp != NULL)
12416		*ippp = ipp;
12417	if (ifindexp != NULL)
12418		*ifindexp = ifindex;
12419	if (mctl_present) {
12420		freeb(first_mp);
12421	}
12422	return (mp);
12423}
12424
12425/*
12426 * Handle M_DATA messages from IP. Its called directly from IP via
12427 * squeue for AF_INET type sockets fast path. No M_CTL are expected
12428 * in this path.
12429 *
12430 * For everything else (including AF_INET6 sockets with 'tcp_ipversion'
12431 * v4 and v6), we are called through tcp_input() and a M_CTL can
12432 * be present for options but tcp_find_pktinfo() deals with it. We
12433 * only expect M_DATA packets after tcp_find_pktinfo() is done.
12434 *
12435 * The first argument is always the connp/tcp to which the mp belongs.
12436 * There are no exceptions to this rule. The caller has already put
12437 * a reference on this connp/tcp and once tcp_rput_data() returns,
12438 * the squeue will do the refrele.
12439 *
12440 * The TH_SYN for the listener directly go to tcp_conn_request via
12441 * squeue.
12442 *
12443 * sqp: NULL = recursive, sqp != NULL means called from squeue
12444 */
12445void
12446tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
12447{
12448	int32_t		bytes_acked;
12449	int32_t		gap;
12450	mblk_t		*mp1;
12451	uint_t		flags;
12452	uint32_t	new_swnd = 0;
12453	uchar_t		*iphdr;
12454	uchar_t		*rptr;
12455	int32_t		rgap;
12456	uint32_t	seg_ack;
12457	int		seg_len;
12458	uint_t		ip_hdr_len;
12459	uint32_t	seg_seq;
12460	tcph_t		*tcph;
12461	int		urp;
12462	tcp_opt_t	tcpopt;
12463	uint_t		ipvers;
12464	ip6_pkt_t	ipp;
12465	boolean_t	ofo_seg = B_FALSE; /* Out of order segment */
12466	uint32_t	cwnd;
12467	uint32_t	add;
12468	int		npkt;
12469	int		mss;
12470	conn_t		*connp = (conn_t *)arg;
12471	squeue_t	*sqp = (squeue_t *)arg2;
12472	tcp_t		*tcp = connp->conn_tcp;
12473
12474	/*
12475	 * RST from fused tcp loopback peer should trigger an unfuse.
12476	 */
12477	if (tcp->tcp_fused) {
12478		TCP_STAT(tcp_fusion_aborted);
12479		tcp_unfuse(tcp);
12480	}
12481
12482	iphdr = mp->b_rptr;
12483	rptr = mp->b_rptr;
12484	ASSERT(OK_32PTR(rptr));
12485
12486	/*
12487	 * An AF_INET socket is not capable of receiving any pktinfo. Do inline
12488	 * processing here. For rest call tcp_find_pktinfo to fill up the
12489	 * necessary information.
12490	 */
12491	if (IPCL_IS_TCP4(connp)) {
12492		ipvers = IPV4_VERSION;
12493		ip_hdr_len = IPH_HDR_LENGTH(rptr);
12494	} else {
12495		mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len,
12496		    NULL, &ipp);
12497		if (mp == NULL) {
12498			TCP_STAT(tcp_rput_v6_error);
12499			return;
12500		}
12501		iphdr = mp->b_rptr;
12502		rptr = mp->b_rptr;
12503	}
12504	ASSERT(DB_TYPE(mp) == M_DATA);
12505
12506	tcph = (tcph_t *)&rptr[ip_hdr_len];
12507	seg_seq = ABE32_TO_U32(tcph->th_seq);
12508	seg_ack = ABE32_TO_U32(tcph->th_ack);
12509	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
12510	seg_len = (int)(mp->b_wptr - rptr) -
12511	    (ip_hdr_len + TCP_HDR_LENGTH(tcph));
12512	if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) {
12513		do {
12514			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
12515			    (uintptr_t)INT_MAX);
12516			seg_len += (int)(mp1->b_wptr - mp1->b_rptr);
12517		} while ((mp1 = mp1->b_cont) != NULL &&
12518		    mp1->b_datap->db_type == M_DATA);
12519	}
12520
12521	if (tcp->tcp_state == TCPS_TIME_WAIT) {
12522		tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
12523		    seg_len, tcph);
12524		return;
12525	}
12526
12527	if (sqp != NULL) {
12528		/*
12529		 * This is the correct place to update tcp_last_recv_time. Note
12530		 * that it is also updated for tcp structure that belongs to
12531		 * global and listener queues which do not really need updating.
12532		 * But that should not cause any harm.  And it is updated for
12533		 * all kinds of incoming segments, not only for data segments.
12534		 */
12535		tcp->tcp_last_recv_time = lbolt;
12536	}
12537
12538	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
12539
12540	BUMP_LOCAL(tcp->tcp_ibsegs);
12541	TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT);
12542
12543	if ((flags & TH_URG) && sqp != NULL) {
12544		/*
12545		 * TCP can't handle urgent pointers that arrive before
12546		 * the connection has been accept()ed since it can't
12547		 * buffer OOB data.  Discard segment if this happens.
12548		 *
12549		 * Nor can it reassemble urgent pointers, so discard
12550		 * if it's not the next segment expected.
12551		 *
12552		 * Otherwise, collapse chain into one mblk (discard if
12553		 * that fails).  This makes sure the headers, retransmitted
12554		 * data, and new data all are in the same mblk.
12555		 */
12556		ASSERT(mp != NULL);
12557		if (tcp->tcp_listener || !pullupmsg(mp, -1)) {
12558			freemsg(mp);
12559			return;
12560		}
12561		/* Update pointers into message */
12562		iphdr = rptr = mp->b_rptr;
12563		tcph = (tcph_t *)&rptr[ip_hdr_len];
12564		if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
12565			/*
12566			 * Since we can't handle any data with this urgent
12567			 * pointer that is out of sequence, we expunge
12568			 * the data.  This allows us to still register
12569			 * the urgent mark and generate the M_PCSIG,
12570			 * which we can do.
12571			 */
12572			mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
12573			seg_len = 0;
12574		}
12575	}
12576
12577	switch (tcp->tcp_state) {
12578	case TCPS_SYN_SENT:
12579		if (flags & TH_ACK) {
12580			/*
12581			 * Note that our stack cannot send data before a
12582			 * connection is established, therefore the
12583			 * following check is valid.  Otherwise, it has
12584			 * to be changed.
12585			 */
12586			if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
12587			    SEQ_GT(seg_ack, tcp->tcp_snxt)) {
12588				freemsg(mp);
12589				if (flags & TH_RST)
12590					return;
12591				tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
12592				    tcp, seg_ack, 0, TH_RST);
12593				return;
12594			}
12595			ASSERT(tcp->tcp_suna + 1 == seg_ack);
12596		}
12597		if (flags & TH_RST) {
12598			freemsg(mp);
12599			if (flags & TH_ACK)
12600				(void) tcp_clean_death(tcp,
12601				    ECONNREFUSED, 13);
12602			return;
12603		}
12604		if (!(flags & TH_SYN)) {
12605			freemsg(mp);
12606			return;
12607		}
12608
12609		/* Process all TCP options. */
12610		tcp_process_options(tcp, tcph);
12611		/*
12612		 * The following changes our rwnd to be a multiple of the
12613		 * MIN(peer MSS, our MSS) for performance reason.
12614		 */
12615		(void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat,
12616		    tcp->tcp_mss));
12617
12618		/* Is the other end ECN capable? */
12619		if (tcp->tcp_ecn_ok) {
12620			if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) {
12621				tcp->tcp_ecn_ok = B_FALSE;
12622			}
12623		}
12624		/*
12625		 * Clear ECN flags because it may interfere with later
12626		 * processing.
12627		 */
12628		flags &= ~(TH_ECE|TH_CWR);
12629
12630		tcp->tcp_irs = seg_seq;
12631		tcp->tcp_rack = seg_seq;
12632		tcp->tcp_rnxt = seg_seq + 1;
12633		U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
12634		if (!TCP_IS_DETACHED(tcp)) {
12635			/* Allocate room for SACK options if needed. */
12636			if (tcp->tcp_snd_sack_ok) {
12637				(void) mi_set_sth_wroff(tcp->tcp_rq,
12638				    tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
12639				    (tcp->tcp_loopback ? 0 : tcp_wroff_xtra));
12640			} else {
12641				(void) mi_set_sth_wroff(tcp->tcp_rq,
12642				    tcp->tcp_hdr_len +
12643				    (tcp->tcp_loopback ? 0 : tcp_wroff_xtra));
12644			}
12645		}
12646		if (flags & TH_ACK) {
12647			/*
12648			 * If we can't get the confirmation upstream, pretend
12649			 * we didn't even see this one.
12650			 *
12651			 * XXX: how can we pretend we didn't see it if we
12652			 * have updated rnxt et. al.
12653			 *
12654			 * For loopback we defer sending up the T_CONN_CON
12655			 * until after some checks below.
12656			 */
12657			mp1 = NULL;
12658			if (!tcp_conn_con(tcp, iphdr, tcph, mp,
12659			    tcp->tcp_loopback ? &mp1 : NULL)) {
12660				freemsg(mp);
12661				return;
12662			}
12663			/* SYN was acked - making progress */
12664			if (tcp->tcp_ipversion == IPV6_VERSION)
12665				tcp->tcp_ip_forward_progress = B_TRUE;
12666
12667			/* One for the SYN */
12668			tcp->tcp_suna = tcp->tcp_iss + 1;
12669			tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
12670			tcp->tcp_state = TCPS_ESTABLISHED;
12671
12672			/*
12673			 * If SYN was retransmitted, need to reset all
12674			 * retransmission info.  This is because this
12675			 * segment will be treated as a dup ACK.
12676			 */
12677			if (tcp->tcp_rexmit) {
12678				tcp->tcp_rexmit = B_FALSE;
12679				tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
12680				tcp->tcp_rexmit_max = tcp->tcp_snxt;
12681				tcp->tcp_snd_burst = tcp->tcp_localnet ?
12682				    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
12683				tcp->tcp_ms_we_have_waited = 0;
12684
12685				/*
12686				 * Set tcp_cwnd back to 1 MSS, per
12687				 * recommendation from
12688				 * draft-floyd-incr-init-win-01.txt,
12689				 * Increasing TCP's Initial Window.
12690				 */
12691				tcp->tcp_cwnd = tcp->tcp_mss;
12692			}
12693
12694			tcp->tcp_swl1 = seg_seq;
12695			tcp->tcp_swl2 = seg_ack;
12696
12697			new_swnd = BE16_TO_U16(tcph->th_win);
12698			tcp->tcp_swnd = new_swnd;
12699			if (new_swnd > tcp->tcp_max_swnd)
12700				tcp->tcp_max_swnd = new_swnd;
12701
12702			/*
12703			 * Always send the three-way handshake ack immediately
12704			 * in order to make the connection complete as soon as
12705			 * possible on the accepting host.
12706			 */
12707			flags |= TH_ACK_NEEDED;
12708
12709			/*
12710			 * Special case for loopback.  At this point we have
12711			 * received SYN-ACK from the remote endpoint.  In
12712			 * order to ensure that both endpoints reach the
12713			 * fused state prior to any data exchange, the final
12714			 * ACK needs to be sent before we indicate T_CONN_CON
12715			 * to the module upstream.
12716			 */
12717			if (tcp->tcp_loopback) {
12718				mblk_t *ack_mp;
12719
12720				ASSERT(!tcp->tcp_unfusable);
12721				ASSERT(mp1 != NULL);
12722				/*
12723				 * For loopback, we always get a pure SYN-ACK
12724				 * and only need to send back the final ACK
12725				 * with no data (this is because the other
12726				 * tcp is ours and we don't do T/TCP).  This
12727				 * final ACK triggers the passive side to
12728				 * perform fusion in ESTABLISHED state.
12729				 */
12730				if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
12731					if (tcp->tcp_ack_tid != 0) {
12732						(void) TCP_TIMER_CANCEL(tcp,
12733						    tcp->tcp_ack_tid);
12734						tcp->tcp_ack_tid = 0;
12735					}
12736					TCP_RECORD_TRACE(tcp, ack_mp,
12737					    TCP_TRACE_SEND_PKT);
12738					tcp_send_data(tcp, tcp->tcp_wq, ack_mp);
12739					BUMP_LOCAL(tcp->tcp_obsegs);
12740					BUMP_MIB(&tcp_mib, tcpOutAck);
12741
12742					/* Send up T_CONN_CON */
12743					putnext(tcp->tcp_rq, mp1);
12744
12745					freemsg(mp);
12746					return;
12747				}
12748				/*
12749				 * Forget fusion; we need to handle more
12750				 * complex cases below.  Send the deferred
12751				 * T_CONN_CON message upstream and proceed
12752				 * as usual.  Mark this tcp as not capable
12753				 * of fusion.
12754				 */
12755				TCP_STAT(tcp_fusion_unfusable);
12756				tcp->tcp_unfusable = B_TRUE;
12757				putnext(tcp->tcp_rq, mp1);
12758			}
12759
12760			/*
12761			 * Check to see if there is data to be sent.  If
12762			 * yes, set the transmit flag.  Then check to see
12763			 * if received data processing needs to be done.
12764			 * If not, go straight to xmit_check.  This short
12765			 * cut is OK as we don't support T/TCP.
12766			 */
12767			if (tcp->tcp_unsent)
12768				flags |= TH_XMIT_NEEDED;
12769
12770			if (seg_len == 0 && !(flags & TH_URG)) {
12771				freemsg(mp);
12772				goto xmit_check;
12773			}
12774
12775			flags &= ~TH_SYN;
12776			seg_seq++;
12777			break;
12778		}
12779		tcp->tcp_state = TCPS_SYN_RCVD;
12780		mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
12781		    NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
12782		if (mp1) {
12783			DB_CPID(mp1) = tcp->tcp_cpid;
12784			TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
12785			tcp_send_data(tcp, tcp->tcp_wq, mp1);
12786			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
12787		}
12788		freemsg(mp);
12789		return;
12790	case TCPS_SYN_RCVD:
12791		if (flags & TH_ACK) {
12792			/*
12793			 * In this state, a SYN|ACK packet is either bogus
12794			 * because the other side must be ACKing our SYN which
12795			 * indicates it has seen the ACK for their SYN and
12796			 * shouldn't retransmit it or we're crossing SYNs
12797			 * on active open.
12798			 */
12799			if ((flags & TH_SYN) && !tcp->tcp_active_open) {
12800				freemsg(mp);
12801				tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn",
12802				    tcp, seg_ack, 0, TH_RST);
12803				return;
12804			}
12805			/*
12806			 * NOTE: RFC 793 pg. 72 says this should be
12807			 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt
12808			 * but that would mean we have an ack that ignored
12809			 * our SYN.
12810			 */
12811			if (SEQ_LEQ(seg_ack, tcp->tcp_suna) ||
12812			    SEQ_GT(seg_ack, tcp->tcp_snxt)) {
12813				freemsg(mp);
12814				tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack",
12815				    tcp, seg_ack, 0, TH_RST);
12816				return;
12817			}
12818		}
12819		break;
12820	case TCPS_LISTEN:
12821		/*
12822		 * Only a TLI listener can come through this path when a
12823		 * acceptor is going back to be a listener and a packet
12824		 * for the acceptor hits the classifier. For a socket
12825		 * listener, this can never happen because a listener
12826		 * can never accept connection on itself and hence a
12827		 * socket acceptor can not go back to being a listener.
12828		 */
12829		ASSERT(!TCP_IS_SOCKET(tcp));
12830		/*FALLTHRU*/
12831	case TCPS_CLOSED:
12832	case TCPS_BOUND: {
12833		conn_t	*new_connp;
12834
12835		new_connp = ipcl_classify(mp, connp->conn_zoneid);
12836		if (new_connp != NULL) {
12837			tcp_reinput(new_connp, mp, connp->conn_sqp);
12838			return;
12839		}
12840		/* We failed to classify. For now just drop the packet */
12841		freemsg(mp);
12842		return;
12843	}
12844	case TCPS_IDLE:
12845		/*
12846		 * Handle the case where the tcp_clean_death() has happened
12847		 * on a connection (application hasn't closed yet) but a packet
12848		 * was already queued on squeue before tcp_clean_death()
12849		 * was processed. Calling tcp_clean_death() twice on same
12850		 * connection can result in weird behaviour.
12851		 */
12852		freemsg(mp);
12853		return;
12854	default:
12855		break;
12856	}
12857
12858	/*
12859	 * Already on the correct queue/perimeter.
12860	 * If this is a detached connection and not an eager
12861	 * connection hanging off a listener then new data
12862	 * (past the FIN) will cause a reset.
12863	 * We do a special check here where it
12864	 * is out of the main line, rather than check
12865	 * if we are detached every time we see new
12866	 * data down below.
12867	 */
12868	if (TCP_IS_DETACHED_NONEAGER(tcp) &&
12869	    (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) {
12870		BUMP_MIB(&tcp_mib, tcpInClosed);
12871		TCP_RECORD_TRACE(tcp,
12872		    mp, TCP_TRACE_RECV_PKT);
12873
12874		freemsg(mp);
12875		/*
12876		 * This could be an SSL closure alert. We're detached so just
12877		 * acknowledge it this last time.
12878		 */
12879		if (tcp->tcp_kssl_ctx != NULL) {
12880			kssl_release_ctx(tcp->tcp_kssl_ctx);
12881			tcp->tcp_kssl_ctx = NULL;
12882
12883			tcp->tcp_rnxt += seg_len;
12884			U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
12885			flags |= TH_ACK_NEEDED;
12886			goto ack_check;
12887		}
12888
12889		tcp_xmit_ctl("new data when detached", tcp,
12890		    tcp->tcp_snxt, 0, TH_RST);
12891		(void) tcp_clean_death(tcp, EPROTO, 12);
12892		return;
12893	}
12894
12895	mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
12896	urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION;
12897	new_swnd = BE16_TO_U16(tcph->th_win) <<
12898	    ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws);
12899	mss = tcp->tcp_mss;
12900
12901	if (tcp->tcp_snd_ts_ok) {
12902		if (!tcp_paws_check(tcp, tcph, &tcpopt)) {
12903			/*
12904			 * This segment is not acceptable.
12905			 * Drop it and send back an ACK.
12906			 */
12907			freemsg(mp);
12908			flags |= TH_ACK_NEEDED;
12909			goto ack_check;
12910		}
12911	} else if (tcp->tcp_snd_sack_ok) {
12912		ASSERT(tcp->tcp_sack_info != NULL);
12913		tcpopt.tcp = tcp;
12914		/*
12915		 * SACK info in already updated in tcp_parse_options.  Ignore
12916		 * all other TCP options...
12917		 */
12918		(void) tcp_parse_options(tcph, &tcpopt);
12919	}
12920try_again:;
12921	gap = seg_seq - tcp->tcp_rnxt;
12922	rgap = tcp->tcp_rwnd - (gap + seg_len);
12923	/*
12924	 * gap is the amount of sequence space between what we expect to see
12925	 * and what we got for seg_seq.  A positive value for gap means
12926	 * something got lost.  A negative value means we got some old stuff.
12927	 */
12928	if (gap < 0) {
12929		/* Old stuff present.  Is the SYN in there? */
12930		if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) &&
12931		    (seg_len != 0)) {
12932			flags &= ~TH_SYN;
12933			seg_seq++;
12934			urp--;
12935			/* Recompute the gaps after noting the SYN. */
12936			goto try_again;
12937		}
12938		BUMP_MIB(&tcp_mib, tcpInDataDupSegs);
12939		UPDATE_MIB(&tcp_mib, tcpInDataDupBytes,
12940		    (seg_len > -gap ? -gap : seg_len));
12941		/* Remove the old stuff from seg_len. */
12942		seg_len += gap;
12943		/*
12944		 * Anything left?
12945		 * Make sure to check for unack'd FIN when rest of data
12946		 * has been previously ack'd.
12947		 */
12948		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
12949			/*
12950			 * Resets are only valid if they lie within our offered
12951			 * window.  If the RST bit is set, we just ignore this
12952			 * segment.
12953			 */
12954			if (flags & TH_RST) {
12955				freemsg(mp);
12956				return;
12957			}
12958
12959			/*
12960			 * The arriving of dup data packets indicate that we
12961			 * may have postponed an ack for too long, or the other
12962			 * side's RTT estimate is out of shape. Start acking
12963			 * more often.
12964			 */
12965			if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) &&
12966			    tcp->tcp_rack_cnt >= 1 &&
12967			    tcp->tcp_rack_abs_max > 2) {
12968				tcp->tcp_rack_abs_max--;
12969			}
12970			tcp->tcp_rack_cur_max = 1;
12971
12972			/*
12973			 * This segment is "unacceptable".  None of its
12974			 * sequence space lies within our advertized window.
12975			 *
12976			 * Adjust seg_len to the original value for tracing.
12977			 */
12978			seg_len -= gap;
12979			if (tcp->tcp_debug) {
12980				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
12981				    "tcp_rput: unacceptable, gap %d, rgap %d, "
12982				    "flags 0x%x, seg_seq %u, seg_ack %u, "
12983				    "seg_len %d, rnxt %u, snxt %u, %s",
12984				    gap, rgap, flags, seg_seq, seg_ack,
12985				    seg_len, tcp->tcp_rnxt, tcp->tcp_snxt,
12986				    tcp_display(tcp, NULL,
12987				    DISP_ADDR_AND_PORT));
12988			}
12989
12990			/*
12991			 * Arrange to send an ACK in response to the
12992			 * unacceptable segment per RFC 793 page 69. There
12993			 * is only one small difference between ours and the
12994			 * acceptability test in the RFC - we accept ACK-only
12995			 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK
12996			 * will be generated.
12997			 *
12998			 * Note that we have to ACK an ACK-only packet at least
12999			 * for stacks that send 0-length keep-alives with
13000			 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122,
13001			 * section 4.2.3.6. As long as we don't ever generate
13002			 * an unacceptable packet in response to an incoming
13003			 * packet that is unacceptable, it should not cause
13004			 * "ACK wars".
13005			 */
13006			flags |=  TH_ACK_NEEDED;
13007
13008			/*
13009			 * Continue processing this segment in order to use the
13010			 * ACK information it contains, but skip all other
13011			 * sequence-number processing.	Processing the ACK
13012			 * information is necessary in order to
13013			 * re-synchronize connections that may have lost
13014			 * synchronization.
13015			 *
13016			 * We clear seg_len and flag fields related to
13017			 * sequence number processing as they are not
13018			 * to be trusted for an unacceptable segment.
13019			 */
13020			seg_len = 0;
13021			flags &= ~(TH_SYN | TH_FIN | TH_URG);
13022			goto process_ack;
13023		}
13024
13025		/* Fix seg_seq, and chew the gap off the front. */
13026		seg_seq = tcp->tcp_rnxt;
13027		urp += gap;
13028		do {
13029			mblk_t	*mp2;
13030			ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
13031			    (uintptr_t)UINT_MAX);
13032			gap += (uint_t)(mp->b_wptr - mp->b_rptr);
13033			if (gap > 0) {
13034				mp->b_rptr = mp->b_wptr - gap;
13035				break;
13036			}
13037			mp2 = mp;
13038			mp = mp->b_cont;
13039			freeb(mp2);
13040		} while (gap < 0);
13041		/*
13042		 * If the urgent data has already been acknowledged, we
13043		 * should ignore TH_URG below
13044		 */
13045		if (urp < 0)
13046			flags &= ~TH_URG;
13047	}
13048	/*
13049	 * rgap is the amount of stuff received out of window.  A negative
13050	 * value is the amount out of window.
13051	 */
13052	if (rgap < 0) {
13053		mblk_t	*mp2;
13054
13055		if (tcp->tcp_rwnd == 0) {
13056			BUMP_MIB(&tcp_mib, tcpInWinProbe);
13057		} else {
13058			BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs);
13059			UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap);
13060		}
13061
13062		/*
13063		 * seg_len does not include the FIN, so if more than
13064		 * just the FIN is out of window, we act like we don't
13065		 * see it.  (If just the FIN is out of window, rgap
13066		 * will be zero and we will go ahead and acknowledge
13067		 * the FIN.)
13068		 */
13069		flags &= ~TH_FIN;
13070
13071		/* Fix seg_len and make sure there is something left. */
13072		seg_len += rgap;
13073		if (seg_len <= 0) {
13074			/*
13075			 * Resets are only valid if they lie within our offered
13076			 * window.  If the RST bit is set, we just ignore this
13077			 * segment.
13078			 */
13079			if (flags & TH_RST) {
13080				freemsg(mp);
13081				return;
13082			}
13083
13084			/* Per RFC 793, we need to send back an ACK. */
13085			flags |= TH_ACK_NEEDED;
13086
13087			/*
13088			 * Send SIGURG as soon as possible i.e. even
13089			 * if the TH_URG was delivered in a window probe
13090			 * packet (which will be unacceptable).
13091			 *
13092			 * We generate a signal if none has been generated
13093			 * for this connection or if this is a new urgent
13094			 * byte. Also send a zero-length "unmarked" message
13095			 * to inform SIOCATMARK that this is not the mark.
13096			 *
13097			 * tcp_urp_last_valid is cleared when the T_exdata_ind
13098			 * is sent up. This plus the check for old data
13099			 * (gap >= 0) handles the wraparound of the sequence
13100			 * number space without having to always track the
13101			 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks
13102			 * this max in its rcv_up variable).
13103			 *
13104			 * This prevents duplicate SIGURGS due to a "late"
13105			 * zero-window probe when the T_EXDATA_IND has already
13106			 * been sent up.
13107			 */
13108			if ((flags & TH_URG) &&
13109			    (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq,
13110			    tcp->tcp_urp_last))) {
13111				mp1 = allocb(0, BPRI_MED);
13112				if (mp1 == NULL) {
13113					freemsg(mp);
13114					return;
13115				}
13116				if (!TCP_IS_DETACHED(tcp) &&
13117				    !putnextctl1(tcp->tcp_rq, M_PCSIG,
13118				    SIGURG)) {
13119					/* Try again on the rexmit. */
13120					freemsg(mp1);
13121					freemsg(mp);
13122					return;
13123				}
13124				/*
13125				 * If the next byte would be the mark
13126				 * then mark with MARKNEXT else mark
13127				 * with NOTMARKNEXT.
13128				 */
13129				if (gap == 0 && urp == 0)
13130					mp1->b_flag |= MSGMARKNEXT;
13131				else
13132					mp1->b_flag |= MSGNOTMARKNEXT;
13133				freemsg(tcp->tcp_urp_mark_mp);
13134				tcp->tcp_urp_mark_mp = mp1;
13135				flags |= TH_SEND_URP_MARK;
13136				tcp->tcp_urp_last_valid = B_TRUE;
13137				tcp->tcp_urp_last = urp + seg_seq;
13138			}
13139			/*
13140			 * If this is a zero window probe, continue to
13141			 * process the ACK part.  But we need to set seg_len
13142			 * to 0 to avoid data processing.  Otherwise just
13143			 * drop the segment and send back an ACK.
13144			 */
13145			if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) {
13146				flags &= ~(TH_SYN | TH_URG);
13147				seg_len = 0;
13148				goto process_ack;
13149			} else {
13150				freemsg(mp);
13151				goto ack_check;
13152			}
13153		}
13154		/* Pitch out of window stuff off the end. */
13155		rgap = seg_len;
13156		mp2 = mp;
13157		do {
13158			ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
13159			    (uintptr_t)INT_MAX);
13160			rgap -= (int)(mp2->b_wptr - mp2->b_rptr);
13161			if (rgap < 0) {
13162				mp2->b_wptr += rgap;
13163				if ((mp1 = mp2->b_cont) != NULL) {
13164					mp2->b_cont = NULL;
13165					freemsg(mp1);
13166				}
13167				break;
13168			}
13169		} while ((mp2 = mp2->b_cont) != NULL);
13170	}
13171ok:;
13172	/*
13173	 * TCP should check ECN info for segments inside the window only.
13174	 * Therefore the check should be done here.
13175	 */
13176	if (tcp->tcp_ecn_ok) {
13177		if (flags & TH_CWR) {
13178			tcp->tcp_ecn_echo_on = B_FALSE;
13179		}
13180		/*
13181		 * Note that both ECN_CE and CWR can be set in the
13182		 * same segment.  In this case, we once again turn
13183		 * on ECN_ECHO.
13184		 */
13185		if (tcp->tcp_ipversion == IPV4_VERSION) {
13186			uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service;
13187
13188			if ((tos & IPH_ECN_CE) == IPH_ECN_CE) {
13189				tcp->tcp_ecn_echo_on = B_TRUE;
13190			}
13191		} else {
13192			uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf;
13193
13194			if ((vcf & htonl(IPH_ECN_CE << 20)) ==
13195			    htonl(IPH_ECN_CE << 20)) {
13196				tcp->tcp_ecn_echo_on = B_TRUE;
13197			}
13198		}
13199	}
13200
13201	/*
13202	 * Check whether we can update tcp_ts_recent.  This test is
13203	 * NOT the one in RFC 1323 3.4.  It is from Braden, 1993, "TCP
13204	 * Extensions for High Performance: An Update", Internet Draft.
13205	 */
13206	if (tcp->tcp_snd_ts_ok &&
13207	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
13208	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
13209		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
13210		tcp->tcp_last_rcv_lbolt = lbolt64;
13211	}
13212
13213	if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) {
13214		/*
13215		 * FIN in an out of order segment.  We record this in
13216		 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq.
13217		 * Clear the FIN so that any check on FIN flag will fail.
13218		 * Remember that FIN also counts in the sequence number
13219		 * space.  So we need to ack out of order FIN only segments.
13220		 */
13221		if (flags & TH_FIN) {
13222			tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID;
13223			tcp->tcp_ofo_fin_seq = seg_seq + seg_len;
13224			flags &= ~TH_FIN;
13225			flags |= TH_ACK_NEEDED;
13226		}
13227		if (seg_len > 0) {
13228			/* Fill in the SACK blk list. */
13229			if (tcp->tcp_snd_sack_ok) {
13230				ASSERT(tcp->tcp_sack_info != NULL);
13231				tcp_sack_insert(tcp->tcp_sack_list,
13232				    seg_seq, seg_seq + seg_len,
13233				    &(tcp->tcp_num_sack_blk));
13234			}
13235
13236			/*
13237			 * Attempt reassembly and see if we have something
13238			 * ready to go.
13239			 */
13240			mp = tcp_reass(tcp, mp, seg_seq);
13241			/* Always ack out of order packets */
13242			flags |= TH_ACK_NEEDED | TH_PUSH;
13243			if (mp) {
13244				ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
13245				    (uintptr_t)INT_MAX);
13246				seg_len = mp->b_cont ? msgdsize(mp) :
13247					(int)(mp->b_wptr - mp->b_rptr);
13248				seg_seq = tcp->tcp_rnxt;
13249				/*
13250				 * A gap is filled and the seq num and len
13251				 * of the gap match that of a previously
13252				 * received FIN, put the FIN flag back in.
13253				 */
13254				if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
13255				    seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
13256					flags |= TH_FIN;
13257					tcp->tcp_valid_bits &=
13258					    ~TCP_OFO_FIN_VALID;
13259				}
13260			} else {
13261				/*
13262				 * Keep going even with NULL mp.
13263				 * There may be a useful ACK or something else
13264				 * we don't want to miss.
13265				 *
13266				 * But TCP should not perform fast retransmit
13267				 * because of the ack number.  TCP uses
13268				 * seg_len == 0 to determine if it is a pure
13269				 * ACK.  And this is not a pure ACK.
13270				 */
13271				seg_len = 0;
13272				ofo_seg = B_TRUE;
13273			}
13274		}
13275	} else if (seg_len > 0) {
13276		BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
13277		UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len);
13278		/*
13279		 * If an out of order FIN was received before, and the seq
13280		 * num and len of the new segment match that of the FIN,
13281		 * put the FIN flag back in.
13282		 */
13283		if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
13284		    seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
13285			flags |= TH_FIN;
13286			tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
13287		}
13288	}
13289	if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
13290	if (flags & TH_RST) {
13291		freemsg(mp);
13292		switch (tcp->tcp_state) {
13293		case TCPS_SYN_RCVD:
13294			(void) tcp_clean_death(tcp, ECONNREFUSED, 14);
13295			break;
13296		case TCPS_ESTABLISHED:
13297		case TCPS_FIN_WAIT_1:
13298		case TCPS_FIN_WAIT_2:
13299		case TCPS_CLOSE_WAIT:
13300			(void) tcp_clean_death(tcp, ECONNRESET, 15);
13301			break;
13302		case TCPS_CLOSING:
13303		case TCPS_LAST_ACK:
13304			(void) tcp_clean_death(tcp, 0, 16);
13305			break;
13306		default:
13307			ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
13308			(void) tcp_clean_death(tcp, ENXIO, 17);
13309			break;
13310		}
13311		return;
13312	}
13313	if (flags & TH_SYN) {
13314		/*
13315		 * See RFC 793, Page 71
13316		 *
13317		 * The seq number must be in the window as it should
13318		 * be "fixed" above.  If it is outside window, it should
13319		 * be already rejected.  Note that we allow seg_seq to be
13320		 * rnxt + rwnd because we want to accept 0 window probe.
13321		 */
13322		ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) &&
13323		    SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
13324		freemsg(mp);
13325		/*
13326		 * If the ACK flag is not set, just use our snxt as the
13327		 * seq number of the RST segment.
13328		 */
13329		if (!(flags & TH_ACK)) {
13330			seg_ack = tcp->tcp_snxt;
13331		}
13332		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
13333		    TH_RST|TH_ACK);
13334		ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
13335		(void) tcp_clean_death(tcp, ECONNRESET, 18);
13336		return;
13337	}
13338	/*
13339	 * urp could be -1 when the urp field in the packet is 0
13340	 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
13341	 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
13342	 */
13343	if (flags & TH_URG && urp >= 0) {
13344		if (!tcp->tcp_urp_last_valid ||
13345		    SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
13346			/*
13347			 * If we haven't generated the signal yet for this
13348			 * urgent pointer value, do it now.  Also, send up a
13349			 * zero-length M_DATA indicating whether or not this is
13350			 * the mark. The latter is not needed when a
13351			 * T_EXDATA_IND is sent up. However, if there are
13352			 * allocation failures this code relies on the sender
13353			 * retransmitting and the socket code for determining
13354			 * the mark should not block waiting for the peer to
13355			 * transmit. Thus, for simplicity we always send up the
13356			 * mark indication.
13357			 */
13358			mp1 = allocb(0, BPRI_MED);
13359			if (mp1 == NULL) {
13360				freemsg(mp);
13361				return;
13362			}
13363			if (!TCP_IS_DETACHED(tcp) &&
13364			    !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) {
13365				/* Try again on the rexmit. */
13366				freemsg(mp1);
13367				freemsg(mp);
13368				return;
13369			}
13370			/*
13371			 * Mark with NOTMARKNEXT for now.
13372			 * The code below will change this to MARKNEXT
13373			 * if we are at the mark.
13374			 *
13375			 * If there are allocation failures (e.g. in dupmsg
13376			 * below) the next time tcp_rput_data sees the urgent
13377			 * segment it will send up the MSG*MARKNEXT message.
13378			 */
13379			mp1->b_flag |= MSGNOTMARKNEXT;
13380			freemsg(tcp->tcp_urp_mark_mp);
13381			tcp->tcp_urp_mark_mp = mp1;
13382			flags |= TH_SEND_URP_MARK;
13383#ifdef DEBUG
13384			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
13385			    "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
13386			    "last %x, %s",
13387			    seg_seq, urp, tcp->tcp_urp_last,
13388			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
13389#endif /* DEBUG */
13390			tcp->tcp_urp_last_valid = B_TRUE;
13391			tcp->tcp_urp_last = urp + seg_seq;
13392		} else if (tcp->tcp_urp_mark_mp != NULL) {
13393			/*
13394			 * An allocation failure prevented the previous
13395			 * tcp_rput_data from sending up the allocated
13396			 * MSG*MARKNEXT message - send it up this time
13397			 * around.
13398			 */
13399			flags |= TH_SEND_URP_MARK;
13400		}
13401
13402		/*
13403		 * If the urgent byte is in this segment, make sure that it is
13404		 * all by itself.  This makes it much easier to deal with the
13405		 * possibility of an allocation failure on the T_exdata_ind.
13406		 * Note that seg_len is the number of bytes in the segment, and
13407		 * urp is the offset into the segment of the urgent byte.
13408		 * urp < seg_len means that the urgent byte is in this segment.
13409		 */
13410		if (urp < seg_len) {
13411			if (seg_len != 1) {
13412				uint32_t  tmp_rnxt;
13413				/*
13414				 * Break it up and feed it back in.
13415				 * Re-attach the IP header.
13416				 */
13417				mp->b_rptr = iphdr;
13418				if (urp > 0) {
13419					/*
13420					 * There is stuff before the urgent
13421					 * byte.
13422					 */
13423					mp1 = dupmsg(mp);
13424					if (!mp1) {
13425						/*
13426						 * Trim from urgent byte on.
13427						 * The rest will come back.
13428						 */
13429						(void) adjmsg(mp,
13430						    urp - seg_len);
13431						tcp_rput_data(connp,
13432						    mp, NULL);
13433						return;
13434					}
13435					(void) adjmsg(mp1, urp - seg_len);
13436					/* Feed this piece back in. */
13437					tmp_rnxt = tcp->tcp_rnxt;
13438					tcp_rput_data(connp, mp1, NULL);
13439					/*
13440					 * If the data passed back in was not
13441					 * processed (ie: bad ACK) sending
13442					 * the remainder back in will cause a
13443					 * loop. In this case, drop the
13444					 * packet and let the sender try
13445					 * sending a good packet.
13446					 */
13447					if (tmp_rnxt == tcp->tcp_rnxt) {
13448						freemsg(mp);
13449						return;
13450					}
13451				}
13452				if (urp != seg_len - 1) {
13453					uint32_t  tmp_rnxt;
13454					/*
13455					 * There is stuff after the urgent
13456					 * byte.
13457					 */
13458					mp1 = dupmsg(mp);
13459					if (!mp1) {
13460						/*
13461						 * Trim everything beyond the
13462						 * urgent byte.  The rest will
13463						 * come back.
13464						 */
13465						(void) adjmsg(mp,
13466						    urp + 1 - seg_len);
13467						tcp_rput_data(connp,
13468						    mp, NULL);
13469						return;
13470					}
13471					(void) adjmsg(mp1, urp + 1 - seg_len);
13472					tmp_rnxt = tcp->tcp_rnxt;
13473					tcp_rput_data(connp, mp1, NULL);
13474					/*
13475					 * If the data passed back in was not
13476					 * processed (ie: bad ACK) sending
13477					 * the remainder back in will cause a
13478					 * loop. In this case, drop the
13479					 * packet and let the sender try
13480					 * sending a good packet.
13481					 */
13482					if (tmp_rnxt == tcp->tcp_rnxt) {
13483						freemsg(mp);
13484						return;
13485					}
13486				}
13487				tcp_rput_data(connp, mp, NULL);
13488				return;
13489			}
13490			/*
13491			 * This segment contains only the urgent byte.  We
13492			 * have to allocate the T_exdata_ind, if we can.
13493			 */
13494			if (!tcp->tcp_urp_mp) {
13495				struct T_exdata_ind *tei;
13496				mp1 = allocb(sizeof (struct T_exdata_ind),
13497				    BPRI_MED);
13498				if (!mp1) {
13499					/*
13500					 * Sigh... It'll be back.
13501					 * Generate any MSG*MARK message now.
13502					 */
13503					freemsg(mp);
13504					seg_len = 0;
13505					if (flags & TH_SEND_URP_MARK) {
13506
13507
13508						ASSERT(tcp->tcp_urp_mark_mp);
13509						tcp->tcp_urp_mark_mp->b_flag &=
13510							~MSGNOTMARKNEXT;
13511						tcp->tcp_urp_mark_mp->b_flag |=
13512							MSGMARKNEXT;
13513					}
13514					goto ack_check;
13515				}
13516				mp1->b_datap->db_type = M_PROTO;
13517				tei = (struct T_exdata_ind *)mp1->b_rptr;
13518				tei->PRIM_type = T_EXDATA_IND;
13519				tei->MORE_flag = 0;
13520				mp1->b_wptr = (uchar_t *)&tei[1];
13521				tcp->tcp_urp_mp = mp1;
13522#ifdef DEBUG
13523				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
13524				    "tcp_rput: allocated exdata_ind %s",
13525				    tcp_display(tcp, NULL,
13526				    DISP_PORT_ONLY));
13527#endif /* DEBUG */
13528				/*
13529				 * There is no need to send a separate MSG*MARK
13530				 * message since the T_EXDATA_IND will be sent
13531				 * now.
13532				 */
13533				flags &= ~TH_SEND_URP_MARK;
13534				freemsg(tcp->tcp_urp_mark_mp);
13535				tcp->tcp_urp_mark_mp = NULL;
13536			}
13537			/*
13538			 * Now we are all set.  On the next putnext upstream,
13539			 * tcp_urp_mp will be non-NULL and will get prepended
13540			 * to what has to be this piece containing the urgent
13541			 * byte.  If for any reason we abort this segment below,
13542			 * if it comes back, we will have this ready, or it
13543			 * will get blown off in close.
13544			 */
13545		} else if (urp == seg_len) {
13546			/*
13547			 * The urgent byte is the next byte after this sequence
13548			 * number. If there is data it is marked with
13549			 * MSGMARKNEXT and any tcp_urp_mark_mp is discarded
13550			 * since it is not needed. Otherwise, if the code
13551			 * above just allocated a zero-length tcp_urp_mark_mp
13552			 * message, that message is tagged with MSGMARKNEXT.
13553			 * Sending up these MSGMARKNEXT messages makes
13554			 * SIOCATMARK work correctly even though
13555			 * the T_EXDATA_IND will not be sent up until the
13556			 * urgent byte arrives.
13557			 */
13558			if (seg_len != 0) {
13559				flags |= TH_MARKNEXT_NEEDED;
13560				freemsg(tcp->tcp_urp_mark_mp);
13561				tcp->tcp_urp_mark_mp = NULL;
13562				flags &= ~TH_SEND_URP_MARK;
13563			} else if (tcp->tcp_urp_mark_mp != NULL) {
13564				flags |= TH_SEND_URP_MARK;
13565				tcp->tcp_urp_mark_mp->b_flag &=
13566					~MSGNOTMARKNEXT;
13567				tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT;
13568			}
13569#ifdef DEBUG
13570			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
13571			    "tcp_rput: AT MARK, len %d, flags 0x%x, %s",
13572			    seg_len, flags,
13573			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
13574#endif /* DEBUG */
13575		} else {
13576			/* Data left until we hit mark */
13577#ifdef DEBUG
13578			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
13579			    "tcp_rput: URP %d bytes left, %s",
13580			    urp - seg_len, tcp_display(tcp, NULL,
13581			    DISP_PORT_ONLY));
13582#endif /* DEBUG */
13583		}
13584	}
13585
13586process_ack:
13587	if (!(flags & TH_ACK)) {
13588		freemsg(mp);
13589		goto xmit_check;
13590	}
13591	}
13592	bytes_acked = (int)(seg_ack - tcp->tcp_suna);
13593
13594	if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0)
13595		tcp->tcp_ip_forward_progress = B_TRUE;
13596	if (tcp->tcp_state == TCPS_SYN_RCVD) {
13597		if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) &&
13598		    ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) {
13599			/* 3-way handshake complete - pass up the T_CONN_IND */
13600			tcp_t	*listener = tcp->tcp_listener;
13601			mblk_t	*mp = tcp->tcp_conn.tcp_eager_conn_ind;
13602
13603			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
13604			/*
13605			 * We are here means eager is fine but it can
13606			 * get a TH_RST at any point between now and till
13607			 * accept completes and disappear. We need to
13608			 * ensure that reference to eager is valid after
13609			 * we get out of eager's perimeter. So we do
13610			 * an extra refhold.
13611			 */
13612			CONN_INC_REF(connp);
13613
13614			/*
13615			 * The listener also exists because of the refhold
13616			 * done in tcp_conn_request. Its possible that it
13617			 * might have closed. We will check that once we
13618			 * get inside listeners context.
13619			 */
13620			CONN_INC_REF(listener->tcp_connp);
13621			if (listener->tcp_connp->conn_sqp ==
13622			    connp->conn_sqp) {
13623				tcp_send_conn_ind(listener->tcp_connp, mp,
13624				    listener->tcp_connp->conn_sqp);
13625				CONN_DEC_REF(listener->tcp_connp);
13626			} else if (!tcp->tcp_loopback) {
13627				squeue_fill(listener->tcp_connp->conn_sqp, mp,
13628				    tcp_send_conn_ind,
13629				    listener->tcp_connp, SQTAG_TCP_CONN_IND);
13630			} else {
13631				squeue_enter(listener->tcp_connp->conn_sqp, mp,
13632				    tcp_send_conn_ind, listener->tcp_connp,
13633				    SQTAG_TCP_CONN_IND);
13634			}
13635		}
13636
13637		if (tcp->tcp_active_open) {
13638			/*
13639			 * We are seeing the final ack in the three way
13640			 * hand shake of a active open'ed connection
13641			 * so we must send up a T_CONN_CON
13642			 */
13643			if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) {
13644				freemsg(mp);
13645				return;
13646			}
13647			/*
13648			 * Don't fuse the loopback endpoints for
13649			 * simultaneous active opens.
13650			 */
13651			if (tcp->tcp_loopback) {
13652				TCP_STAT(tcp_fusion_unfusable);
13653				tcp->tcp_unfusable = B_TRUE;
13654			}
13655		}
13656
13657		tcp->tcp_suna = tcp->tcp_iss + 1;	/* One for the SYN */
13658		bytes_acked--;
13659		/* SYN was acked - making progress */
13660		if (tcp->tcp_ipversion == IPV6_VERSION)
13661			tcp->tcp_ip_forward_progress = B_TRUE;
13662
13663		/*
13664		 * If SYN was retransmitted, need to reset all
13665		 * retransmission info as this segment will be
13666		 * treated as a dup ACK.
13667		 */
13668		if (tcp->tcp_rexmit) {
13669			tcp->tcp_rexmit = B_FALSE;
13670			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
13671			tcp->tcp_rexmit_max = tcp->tcp_snxt;
13672			tcp->tcp_snd_burst = tcp->tcp_localnet ?
13673			    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
13674			tcp->tcp_ms_we_have_waited = 0;
13675			tcp->tcp_cwnd = mss;
13676		}
13677
13678		/*
13679		 * We set the send window to zero here.
13680		 * This is needed if there is data to be
13681		 * processed already on the queue.
13682		 * Later (at swnd_update label), the
13683		 * "new_swnd > tcp_swnd" condition is satisfied
13684		 * the XMIT_NEEDED flag is set in the current
13685		 * (SYN_RCVD) state. This ensures tcp_wput_data() is
13686		 * called if there is already data on queue in
13687		 * this state.
13688		 */
13689		tcp->tcp_swnd = 0;
13690
13691		if (new_swnd > tcp->tcp_max_swnd)
13692			tcp->tcp_max_swnd = new_swnd;
13693		tcp->tcp_swl1 = seg_seq;
13694		tcp->tcp_swl2 = seg_ack;
13695		tcp->tcp_state = TCPS_ESTABLISHED;
13696		tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
13697
13698		/* Fuse when both sides are in ESTABLISHED state */
13699		if (tcp->tcp_loopback && do_tcp_fusion)
13700			tcp_fuse(tcp, iphdr, tcph);
13701
13702	}
13703	/* This code follows 4.4BSD-Lite2 mostly. */
13704	if (bytes_acked < 0)
13705		goto est;
13706
13707	/*
13708	 * If TCP is ECN capable and the congestion experience bit is
13709	 * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
13710	 * done once per window (or more loosely, per RTT).
13711	 */
13712	if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
13713		tcp->tcp_cwr = B_FALSE;
13714	if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
13715		if (!tcp->tcp_cwr) {
13716			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
13717			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
13718			tcp->tcp_cwnd = npkt * mss;
13719			/*
13720			 * If the cwnd is 0, use the timer to clock out
13721			 * new segments.  This is required by the ECN spec.
13722			 */
13723			if (npkt == 0) {
13724				TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
13725				/*
13726				 * This makes sure that when the ACK comes
13727				 * back, we will increase tcp_cwnd by 1 MSS.
13728				 */
13729				tcp->tcp_cwnd_cnt = 0;
13730			}
13731			tcp->tcp_cwr = B_TRUE;
13732			/*
13733			 * This marks the end of the current window of in
13734			 * flight data.  That is why we don't use
13735			 * tcp_suna + tcp_swnd.  Only data in flight can
13736			 * provide ECN info.
13737			 */
13738			tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
13739			tcp->tcp_ecn_cwr_sent = B_FALSE;
13740		}
13741	}
13742
13743	mp1 = tcp->tcp_xmit_head;
13744	if (bytes_acked == 0) {
13745		if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
13746			int dupack_cnt;
13747
13748			BUMP_MIB(&tcp_mib, tcpInDupAck);
13749			/*
13750			 * Fast retransmit.  When we have seen exactly three
13751			 * identical ACKs while we have unacked data
13752			 * outstanding we take it as a hint that our peer
13753			 * dropped something.
13754			 *
13755			 * If TCP is retransmitting, don't do fast retransmit.
13756			 */
13757			if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
13758			    ! tcp->tcp_rexmit) {
13759				/* Do Limited Transmit */
13760				if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
13761				    tcp_dupack_fast_retransmit) {
13762					/*
13763					 * RFC 3042
13764					 *
13765					 * What we need to do is temporarily
13766					 * increase tcp_cwnd so that new
13767					 * data can be sent if it is allowed
13768					 * by the receive window (tcp_rwnd).
13769					 * tcp_wput_data() will take care of
13770					 * the rest.
13771					 *
13772					 * If the connection is SACK capable,
13773					 * only do limited xmit when there
13774					 * is SACK info.
13775					 *
13776					 * Note how tcp_cwnd is incremented.
13777					 * The first dup ACK will increase
13778					 * it by 1 MSS.  The second dup ACK
13779					 * will increase it by 2 MSS.  This
13780					 * means that only 1 new segment will
13781					 * be sent for each dup ACK.
13782					 */
13783					if (tcp->tcp_unsent > 0 &&
13784					    (!tcp->tcp_snd_sack_ok ||
13785					    (tcp->tcp_snd_sack_ok &&
13786					    tcp->tcp_notsack_list != NULL))) {
13787						tcp->tcp_cwnd += mss <<
13788						    (tcp->tcp_dupack_cnt - 1);
13789						flags |= TH_LIMIT_XMIT;
13790					}
13791				} else if (dupack_cnt ==
13792				    tcp_dupack_fast_retransmit) {
13793
13794				/*
13795				 * If we have reduced tcp_ssthresh
13796				 * because of ECN, do not reduce it again
13797				 * unless it is already one window of data
13798				 * away.  After one window of data, tcp_cwr
13799				 * should then be cleared.  Note that
13800				 * for non ECN capable connection, tcp_cwr
13801				 * should always be false.
13802				 *
13803				 * Adjust cwnd since the duplicate
13804				 * ack indicates that a packet was
13805				 * dropped (due to congestion.)
13806				 */
13807				if (!tcp->tcp_cwr) {
13808					npkt = ((tcp->tcp_snxt -
13809					    tcp->tcp_suna) >> 1) / mss;
13810					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
13811					    mss;
13812					tcp->tcp_cwnd = (npkt +
13813					    tcp->tcp_dupack_cnt) * mss;
13814				}
13815				if (tcp->tcp_ecn_ok) {
13816					tcp->tcp_cwr = B_TRUE;
13817					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
13818					tcp->tcp_ecn_cwr_sent = B_FALSE;
13819				}
13820
13821				/*
13822				 * We do Hoe's algorithm.  Refer to her
13823				 * paper "Improving the Start-up Behavior
13824				 * of a Congestion Control Scheme for TCP,"
13825				 * appeared in SIGCOMM'96.
13826				 *
13827				 * Save highest seq no we have sent so far.
13828				 * Be careful about the invisible FIN byte.
13829				 */
13830				if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
13831				    (tcp->tcp_unsent == 0)) {
13832					tcp->tcp_rexmit_max = tcp->tcp_fss;
13833				} else {
13834					tcp->tcp_rexmit_max = tcp->tcp_snxt;
13835				}
13836
13837				/*
13838				 * Do not allow bursty traffic during.
13839				 * fast recovery.  Refer to Fall and Floyd's
13840				 * paper "Simulation-based Comparisons of
13841				 * Tahoe, Reno and SACK TCP" (in CCR?)
13842				 * This is a best current practise.
13843				 */
13844				tcp->tcp_snd_burst = TCP_CWND_SS;
13845
13846				/*
13847				 * For SACK:
13848				 * Calculate tcp_pipe, which is the
13849				 * estimated number of bytes in
13850				 * network.
13851				 *
13852				 * tcp_fack is the highest sack'ed seq num
13853				 * TCP has received.
13854				 *
13855				 * tcp_pipe is explained in the above quoted
13856				 * Fall and Floyd's paper.  tcp_fack is
13857				 * explained in Mathis and Mahdavi's
13858				 * "Forward Acknowledgment: Refining TCP
13859				 * Congestion Control" in SIGCOMM '96.
13860				 */
13861				if (tcp->tcp_snd_sack_ok) {
13862					ASSERT(tcp->tcp_sack_info != NULL);
13863					if (tcp->tcp_notsack_list != NULL) {
13864						tcp->tcp_pipe = tcp->tcp_snxt -
13865						    tcp->tcp_fack;
13866						tcp->tcp_sack_snxt = seg_ack;
13867						flags |= TH_NEED_SACK_REXMIT;
13868					} else {
13869						/*
13870						 * Always initialize tcp_pipe
13871						 * even though we don't have
13872						 * any SACK info.  If later
13873						 * we get SACK info and
13874						 * tcp_pipe is not initialized,
13875						 * funny things will happen.
13876						 */
13877						tcp->tcp_pipe =
13878						    tcp->tcp_cwnd_ssthresh;
13879					}
13880				} else {
13881					flags |= TH_REXMIT_NEEDED;
13882				} /* tcp_snd_sack_ok */
13883
13884				} else {
13885					/*
13886					 * Here we perform congestion
13887					 * avoidance, but NOT slow start.
13888					 * This is known as the Fast
13889					 * Recovery Algorithm.
13890					 */
13891					if (tcp->tcp_snd_sack_ok &&
13892					    tcp->tcp_notsack_list != NULL) {
13893						flags |= TH_NEED_SACK_REXMIT;
13894						tcp->tcp_pipe -= mss;
13895						if (tcp->tcp_pipe < 0)
13896							tcp->tcp_pipe = 0;
13897					} else {
13898					/*
13899					 * We know that one more packet has
13900					 * left the pipe thus we can update
13901					 * cwnd.
13902					 */
13903					cwnd = tcp->tcp_cwnd + mss;
13904					if (cwnd > tcp->tcp_cwnd_max)
13905						cwnd = tcp->tcp_cwnd_max;
13906					tcp->tcp_cwnd = cwnd;
13907					if (tcp->tcp_unsent > 0)
13908						flags |= TH_XMIT_NEEDED;
13909					}
13910				}
13911			}
13912		} else if (tcp->tcp_zero_win_probe) {
13913			/*
13914			 * If the window has opened, need to arrange
13915			 * to send additional data.
13916			 */
13917			if (new_swnd != 0) {
13918				/* tcp_suna != tcp_snxt */
13919				/* Packet contains a window update */
13920				BUMP_MIB(&tcp_mib, tcpInWinUpdate);
13921				tcp->tcp_zero_win_probe = 0;
13922				tcp->tcp_timer_backoff = 0;
13923				tcp->tcp_ms_we_have_waited = 0;
13924
13925				/*
13926				 * Transmit starting with tcp_suna since
13927				 * the one byte probe is not ack'ed.
13928				 * If TCP has sent more than one identical
13929				 * probe, tcp_rexmit will be set.  That means
13930				 * tcp_ss_rexmit() will send out the one
13931				 * byte along with new data.  Otherwise,
13932				 * fake the retransmission.
13933				 */
13934				flags |= TH_XMIT_NEEDED;
13935				if (!tcp->tcp_rexmit) {
13936					tcp->tcp_rexmit = B_TRUE;
13937					tcp->tcp_dupack_cnt = 0;
13938					tcp->tcp_rexmit_nxt = tcp->tcp_suna;
13939					tcp->tcp_rexmit_max = tcp->tcp_suna + 1;
13940				}
13941			}
13942		}
13943		goto swnd_update;
13944	}
13945
13946	/*
13947	 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73.
13948	 * If the ACK value acks something that we have not yet sent, it might
13949	 * be an old duplicate segment.  Send an ACK to re-synchronize the
13950	 * other side.
13951	 * Note: reset in response to unacceptable ACK in SYN_RECEIVE
13952	 * state is handled above, so we can always just drop the segment and
13953	 * send an ACK here.
13954	 *
13955	 * Should we send ACKs in response to ACK only segments?
13956	 */
13957	if (SEQ_GT(seg_ack, tcp->tcp_snxt)) {
13958		BUMP_MIB(&tcp_mib, tcpInAckUnsent);
13959		/* drop the received segment */
13960		freemsg(mp);
13961
13962		/*
13963		 * Send back an ACK.  If tcp_drop_ack_unsent_cnt is
13964		 * greater than 0, check if the number of such
13965		 * bogus ACks is greater than that count.  If yes,
13966		 * don't send back any ACK.  This prevents TCP from
13967		 * getting into an ACK storm if somehow an attacker
13968		 * successfully spoofs an acceptable segment to our
13969		 * peer.
13970		 */
13971		if (tcp_drop_ack_unsent_cnt > 0 &&
13972		    ++tcp->tcp_in_ack_unsent > tcp_drop_ack_unsent_cnt) {
13973			TCP_STAT(tcp_in_ack_unsent_drop);
13974			return;
13975		}
13976		mp = tcp_ack_mp(tcp);
13977		if (mp != NULL) {
13978			TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
13979			BUMP_LOCAL(tcp->tcp_obsegs);
13980			BUMP_MIB(&tcp_mib, tcpOutAck);
13981			tcp_send_data(tcp, tcp->tcp_wq, mp);
13982		}
13983		return;
13984	}
13985
13986	/*
13987	 * TCP gets a new ACK, update the notsack'ed list to delete those
13988	 * blocks that are covered by this ACK.
13989	 */
13990	if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
13991		tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
13992		    &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
13993	}
13994
13995	/*
13996	 * If we got an ACK after fast retransmit, check to see
13997	 * if it is a partial ACK.  If it is not and the congestion
13998	 * window was inflated to account for the other side's
13999	 * cached packets, retract it.  If it is, do Hoe's algorithm.
14000	 */
14001	if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) {
14002		ASSERT(tcp->tcp_rexmit == B_FALSE);
14003		if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
14004			tcp->tcp_dupack_cnt = 0;
14005			/*
14006			 * Restore the orig tcp_cwnd_ssthresh after
14007			 * fast retransmit phase.
14008			 */
14009			if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
14010				tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
14011			}
14012			tcp->tcp_rexmit_max = seg_ack;
14013			tcp->tcp_cwnd_cnt = 0;
14014			tcp->tcp_snd_burst = tcp->tcp_localnet ?
14015			    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
14016
14017			/*
14018			 * Remove all notsack info to avoid confusion with
14019			 * the next fast retrasnmit/recovery phase.
14020			 */
14021			if (tcp->tcp_snd_sack_ok &&
14022			    tcp->tcp_notsack_list != NULL) {
14023				TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
14024			}
14025		} else {
14026			if (tcp->tcp_snd_sack_ok &&
14027			    tcp->tcp_notsack_list != NULL) {
14028				flags |= TH_NEED_SACK_REXMIT;
14029				tcp->tcp_pipe -= mss;
14030				if (tcp->tcp_pipe < 0)
14031					tcp->tcp_pipe = 0;
14032			} else {
14033				/*
14034				 * Hoe's algorithm:
14035				 *
14036				 * Retransmit the unack'ed segment and
14037				 * restart fast recovery.  Note that we
14038				 * need to scale back tcp_cwnd to the
14039				 * original value when we started fast
14040				 * recovery.  This is to prevent overly
14041				 * aggressive behaviour in sending new
14042				 * segments.
14043				 */
14044				tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
14045					tcp_dupack_fast_retransmit * mss;
14046				tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
14047				flags |= TH_REXMIT_NEEDED;
14048			}
14049		}
14050	} else {
14051		tcp->tcp_dupack_cnt = 0;
14052		if (tcp->tcp_rexmit) {
14053			/*
14054			 * TCP is retranmitting.  If the ACK ack's all
14055			 * outstanding data, update tcp_rexmit_max and
14056			 * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
14057			 * to the correct value.
14058			 *
14059			 * Note that SEQ_LEQ() is used.  This is to avoid
14060			 * unnecessary fast retransmit caused by dup ACKs
14061			 * received when TCP does slow start retransmission
14062			 * after a time out.  During this phase, TCP may
14063			 * send out segments which are already received.
14064			 * This causes dup ACKs to be sent back.
14065			 */
14066			if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
14067				if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
14068					tcp->tcp_rexmit_nxt = seg_ack;
14069				}
14070				if (seg_ack != tcp->tcp_rexmit_max) {
14071					flags |= TH_XMIT_NEEDED;
14072				}
14073			} else {
14074				tcp->tcp_rexmit = B_FALSE;
14075				tcp->tcp_xmit_zc_clean = B_FALSE;
14076				tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
14077				tcp->tcp_snd_burst = tcp->tcp_localnet ?
14078				    TCP_CWND_INFINITE : TCP_CWND_NORMAL;
14079			}
14080			tcp->tcp_ms_we_have_waited = 0;
14081		}
14082	}
14083
14084	BUMP_MIB(&tcp_mib, tcpInAckSegs);
14085	UPDATE_MIB(&tcp_mib, tcpInAckBytes, bytes_acked);
14086	tcp->tcp_suna = seg_ack;
14087	if (tcp->tcp_zero_win_probe != 0) {
14088		tcp->tcp_zero_win_probe = 0;
14089		tcp->tcp_timer_backoff = 0;
14090	}
14091
14092	/*
14093	 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
14094	 * Note that it cannot be the SYN being ack'ed.  The code flow
14095	 * will not reach here.
14096	 */
14097	if (mp1 == NULL) {
14098		goto fin_acked;
14099	}
14100
14101	/*
14102	 * Update the congestion window.
14103	 *
14104	 * If TCP is not ECN capable or TCP is ECN capable but the
14105	 * congestion experience bit is not set, increase the tcp_cwnd as
14106	 * usual.
14107	 */
14108	if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
14109		cwnd = tcp->tcp_cwnd;
14110		add = mss;
14111
14112		if (cwnd >= tcp->tcp_cwnd_ssthresh) {
14113			/*
14114			 * This is to prevent an increase of less than 1 MSS of
14115			 * tcp_cwnd.  With partial increase, tcp_wput_data()
14116			 * may send out tinygrams in order to preserve mblk
14117			 * boundaries.
14118			 *
14119			 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
14120			 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
14121			 * increased by 1 MSS for every RTTs.
14122			 */
14123			if (tcp->tcp_cwnd_cnt <= 0) {
14124				tcp->tcp_cwnd_cnt = cwnd + add;
14125			} else {
14126				tcp->tcp_cwnd_cnt -= add;
14127				add = 0;
14128			}
14129		}
14130		tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
14131	}
14132
14133	/* See if the latest urgent data has been acknowledged */
14134	if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
14135	    SEQ_GT(seg_ack, tcp->tcp_urg))
14136		tcp->tcp_valid_bits &= ~TCP_URG_VALID;
14137
14138	/* Can we update the RTT estimates? */
14139	if (tcp->tcp_snd_ts_ok) {
14140		/* Ignore zero timestamp echo-reply. */
14141		if (tcpopt.tcp_opt_ts_ecr != 0) {
14142			tcp_set_rto(tcp, (int32_t)lbolt -
14143			    (int32_t)tcpopt.tcp_opt_ts_ecr);
14144		}
14145
14146		/* If needed, restart the timer. */
14147		if (tcp->tcp_set_timer == 1) {
14148			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
14149			tcp->tcp_set_timer = 0;
14150		}
14151		/*
14152		 * Update tcp_csuna in case the other side stops sending
14153		 * us timestamps.
14154		 */
14155		tcp->tcp_csuna = tcp->tcp_snxt;
14156	} else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
14157		/*
14158		 * An ACK sequence we haven't seen before, so get the RTT
14159		 * and update the RTO. But first check if the timestamp is
14160		 * valid to use.
14161		 */
14162		if ((mp1->b_next != NULL) &&
14163		    SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
14164			tcp_set_rto(tcp, (int32_t)lbolt -
14165			    (int32_t)(intptr_t)mp1->b_prev);
14166		else
14167			BUMP_MIB(&tcp_mib, tcpRttNoUpdate);
14168
14169		/* Remeber the last sequence to be ACKed */
14170		tcp->tcp_csuna = seg_ack;
14171		if (tcp->tcp_set_timer == 1) {
14172			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
14173			tcp->tcp_set_timer = 0;
14174		}
14175	} else {
14176		BUMP_MIB(&tcp_mib, tcpRttNoUpdate);
14177	}
14178
14179	/* Eat acknowledged bytes off the xmit queue. */
14180	for (;;) {
14181		mblk_t	*mp2;
14182		uchar_t	*wptr;
14183
14184		wptr = mp1->b_wptr;
14185		ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
14186		bytes_acked -= (int)(wptr - mp1->b_rptr);
14187		if (bytes_acked < 0) {
14188			mp1->b_rptr = wptr + bytes_acked;
14189			/*
14190			 * Set a new timestamp if all the bytes timed by the
14191			 * old timestamp have been ack'ed.
14192			 */
14193			if (SEQ_GT(seg_ack,
14194			    (uint32_t)(uintptr_t)(mp1->b_next))) {
14195				mp1->b_prev = (mblk_t *)(uintptr_t)lbolt;
14196				mp1->b_next = NULL;
14197			}
14198			break;
14199		}
14200		mp1->b_next = NULL;
14201		mp1->b_prev = NULL;
14202		mp2 = mp1;
14203		mp1 = mp1->b_cont;
14204
14205		/*
14206		 * This notification is required for some zero-copy
14207		 * clients to maintain a copy semantic. After the data
14208		 * is ack'ed, client is safe to modify or reuse the buffer.
14209		 */
14210		if (tcp->tcp_snd_zcopy_aware &&
14211		    (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
14212			tcp_zcopy_notify(tcp);
14213		freeb(mp2);
14214		if (bytes_acked == 0) {
14215			if (mp1 == NULL) {
14216				/* Everything is ack'ed, clear the tail. */
14217				tcp->tcp_xmit_tail = NULL;
14218				/*
14219				 * Cancel the timer unless we are still
14220				 * waiting for an ACK for the FIN packet.
14221				 */
14222				if (tcp->tcp_timer_tid != 0 &&
14223				    tcp->tcp_snxt == tcp->tcp_suna) {
14224					(void) TCP_TIMER_CANCEL(tcp,
14225					    tcp->tcp_timer_tid);
14226					tcp->tcp_timer_tid = 0;
14227				}
14228				goto pre_swnd_update;
14229			}
14230			if (mp2 != tcp->tcp_xmit_tail)
14231				break;
14232			tcp->tcp_xmit_tail = mp1;
14233			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
14234			    (uintptr_t)INT_MAX);
14235			tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
14236			    mp1->b_rptr);
14237			break;
14238		}
14239		if (mp1 == NULL) {
14240			/*
14241			 * More was acked but there is nothing more
14242			 * outstanding.  This means that the FIN was
14243			 * just acked or that we're talking to a clown.
14244			 */
14245fin_acked:
14246			ASSERT(tcp->tcp_fin_sent);
14247			tcp->tcp_xmit_tail = NULL;
14248			if (tcp->tcp_fin_sent) {
14249				/* FIN was acked - making progress */
14250				if (tcp->tcp_ipversion == IPV6_VERSION &&
14251				    !tcp->tcp_fin_acked)
14252					tcp->tcp_ip_forward_progress = B_TRUE;
14253				tcp->tcp_fin_acked = B_TRUE;
14254				if (tcp->tcp_linger_tid != 0 &&
14255				    TCP_TIMER_CANCEL(tcp,
14256					tcp->tcp_linger_tid) >= 0) {
14257					tcp_stop_lingering(tcp);
14258				}
14259			} else {
14260				/*
14261				 * We should never get here because
14262				 * we have already checked that the
14263				 * number of bytes ack'ed should be
14264				 * smaller than or equal to what we
14265				 * have sent so far (it is the
14266				 * acceptability check of the ACK).
14267				 * We can only get here if the send
14268				 * queue is corrupted.
14269				 *
14270				 * Terminate the connection and
14271				 * panic the system.  It is better
14272				 * for us to panic instead of
14273				 * continuing to avoid other disaster.
14274				 */
14275				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
14276				    tcp->tcp_rnxt, TH_RST|TH_ACK);
14277				panic("Memory corruption "
14278				    "detected for connection %s.",
14279				    tcp_display(tcp, NULL,
14280					DISP_ADDR_AND_PORT));
14281				/*NOTREACHED*/
14282			}
14283			goto pre_swnd_update;
14284		}
14285		ASSERT(mp2 != tcp->tcp_xmit_tail);
14286	}
14287	if (tcp->tcp_unsent) {
14288		flags |= TH_XMIT_NEEDED;
14289	}
14290pre_swnd_update:
14291	tcp->tcp_xmit_head = mp1;
14292swnd_update:
14293	/*
14294	 * The following check is different from most other implementations.
14295	 * For bi-directional transfer, when segments are dropped, the
14296	 * "normal" check will not accept a window update in those
14297	 * retransmitted segemnts.  Failing to do that, TCP may send out
14298	 * segments which are outside receiver's window.  As TCP accepts
14299	 * the ack in those retransmitted segments, if the window update in
14300	 * the same segment is not accepted, TCP will incorrectly calculates
14301	 * that it can send more segments.  This can create a deadlock
14302	 * with the receiver if its window becomes zero.
14303	 */
14304	if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
14305	    SEQ_LT(tcp->tcp_swl1, seg_seq) ||
14306	    (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
14307		/*
14308		 * The criteria for update is:
14309		 *
14310		 * 1. the segment acknowledges some data.  Or
14311		 * 2. the segment is new, i.e. it has a higher seq num. Or
14312		 * 3. the segment is not old and the advertised window is
14313		 * larger than the previous advertised window.
14314		 */
14315		if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
14316			flags |= TH_XMIT_NEEDED;
14317		tcp->tcp_swnd = new_swnd;
14318		if (new_swnd > tcp->tcp_max_swnd)
14319			tcp->tcp_max_swnd = new_swnd;
14320		tcp->tcp_swl1 = seg_seq;
14321		tcp->tcp_swl2 = seg_ack;
14322	}
14323est:
14324	if (tcp->tcp_state > TCPS_ESTABLISHED) {
14325
14326		switch (tcp->tcp_state) {
14327		case TCPS_FIN_WAIT_1:
14328			if (tcp->tcp_fin_acked) {
14329				tcp->tcp_state = TCPS_FIN_WAIT_2;
14330				/*
14331				 * We implement the non-standard BSD/SunOS
14332				 * FIN_WAIT_2 flushing algorithm.
14333				 * If there is no user attached to this
14334				 * TCP endpoint, then this TCP struct
14335				 * could hang around forever in FIN_WAIT_2
14336				 * state if the peer forgets to send us
14337				 * a FIN.  To prevent this, we wait only
14338				 * 2*MSL (a convenient time value) for
14339				 * the FIN to arrive.  If it doesn't show up,
14340				 * we flush the TCP endpoint.  This algorithm,
14341				 * though a violation of RFC-793, has worked
14342				 * for over 10 years in BSD systems.
14343				 * Note: SunOS 4.x waits 675 seconds before
14344				 * flushing the FIN_WAIT_2 connection.
14345				 */
14346				TCP_TIMER_RESTART(tcp,
14347				    tcp_fin_wait_2_flush_interval);
14348			}
14349			break;
14350		case TCPS_FIN_WAIT_2:
14351			break;	/* Shutdown hook? */
14352		case TCPS_LAST_ACK:
14353			freemsg(mp);
14354			if (tcp->tcp_fin_acked) {
14355				(void) tcp_clean_death(tcp, 0, 19);
14356				return;
14357			}
14358			goto xmit_check;
14359		case TCPS_CLOSING:
14360			if (tcp->tcp_fin_acked) {
14361				tcp->tcp_state = TCPS_TIME_WAIT;
14362				/*
14363				 * Unconditionally clear the exclusive binding
14364				 * bit so this TIME-WAIT connection won't
14365				 * interfere with new ones.
14366				 */
14367				tcp->tcp_exclbind = 0;
14368				if (!TCP_IS_DETACHED(tcp)) {
14369					TCP_TIMER_RESTART(tcp,
14370					    tcp_time_wait_interval);
14371				} else {
14372					tcp_time_wait_append(tcp);
14373					TCP_DBGSTAT(tcp_rput_time_wait);
14374				}
14375			}
14376			/*FALLTHRU*/
14377		case TCPS_CLOSE_WAIT:
14378			freemsg(mp);
14379			goto xmit_check;
14380		default:
14381			ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
14382			break;
14383		}
14384	}
14385	if (flags & TH_FIN) {
14386		/* Make sure we ack the fin */
14387		flags |= TH_ACK_NEEDED;
14388		if (!tcp->tcp_fin_rcvd) {
14389			tcp->tcp_fin_rcvd = B_TRUE;
14390			tcp->tcp_rnxt++;
14391			tcph = tcp->tcp_tcph;
14392			U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
14393
14394			/*
14395			 * Generate the ordrel_ind at the end unless we
14396			 * are an eager guy.
14397			 * In the eager case tcp_rsrv will do this when run
14398			 * after tcp_accept is done.
14399			 */
14400			if (tcp->tcp_listener == NULL &&
14401			    !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding))
14402				flags |= TH_ORDREL_NEEDED;
14403			switch (tcp->tcp_state) {
14404			case TCPS_SYN_RCVD:
14405			case TCPS_ESTABLISHED:
14406				tcp->tcp_state = TCPS_CLOSE_WAIT;
14407				/* Keepalive? */
14408				break;
14409			case TCPS_FIN_WAIT_1:
14410				if (!tcp->tcp_fin_acked) {
14411					tcp->tcp_state = TCPS_CLOSING;
14412					break;
14413				}
14414				/* FALLTHRU */
14415			case TCPS_FIN_WAIT_2:
14416				tcp->tcp_state = TCPS_TIME_WAIT;
14417				/*
14418				 * Unconditionally clear the exclusive binding
14419				 * bit so this TIME-WAIT connection won't
14420				 * interfere with new ones.
14421				 */
14422				tcp->tcp_exclbind = 0;
14423				if (!TCP_IS_DETACHED(tcp)) {
14424					TCP_TIMER_RESTART(tcp,
14425					    tcp_time_wait_interval);
14426				} else {
14427					tcp_time_wait_append(tcp);
14428					TCP_DBGSTAT(tcp_rput_time_wait);
14429				}
14430				if (seg_len) {
14431					/*
14432					 * implies data piggybacked on FIN.
14433					 * break to handle data.
14434					 */
14435					break;
14436				}
14437				freemsg(mp);
14438				goto ack_check;
14439			}
14440		}
14441	}
14442	if (mp == NULL)
14443		goto xmit_check;
14444	if (seg_len == 0) {
14445		freemsg(mp);
14446		goto xmit_check;
14447	}
14448	if (mp->b_rptr == mp->b_wptr) {
14449		/*
14450		 * The header has been consumed, so we remove the
14451		 * zero-length mblk here.
14452		 */
14453		mp1 = mp;
14454		mp = mp->b_cont;
14455		freeb(mp1);
14456	}
14457	tcph = tcp->tcp_tcph;
14458	tcp->tcp_rack_cnt++;
14459	{
14460		uint32_t cur_max;
14461
14462		cur_max = tcp->tcp_rack_cur_max;
14463		if (tcp->tcp_rack_cnt >= cur_max) {
14464			/*
14465			 * We have more unacked data than we should - send
14466			 * an ACK now.
14467			 */
14468			flags |= TH_ACK_NEEDED;
14469			cur_max++;
14470			if (cur_max > tcp->tcp_rack_abs_max)
14471				tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max;
14472			else
14473				tcp->tcp_rack_cur_max = cur_max;
14474		} else if (TCP_IS_DETACHED(tcp)) {
14475			/* We don't have an ACK timer for detached TCP. */
14476			flags |= TH_ACK_NEEDED;
14477		} else if (seg_len < mss) {
14478			/*
14479			 * If we get a segment that is less than an mss, and we
14480			 * already have unacknowledged data, and the amount
14481			 * unacknowledged is not a multiple of mss, then we
14482			 * better generate an ACK now.  Otherwise, this may be
14483			 * the tail piece of a transaction, and we would rather
14484			 * wait for the response.
14485			 */
14486			uint32_t udif;
14487			ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <=
14488			    (uintptr_t)INT_MAX);
14489			udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack);
14490			if (udif && (udif % mss))
14491				flags |= TH_ACK_NEEDED;
14492			else
14493				flags |= TH_ACK_TIMER_NEEDED;
14494		} else {
14495			/* Start delayed ack timer */
14496			flags |= TH_ACK_TIMER_NEEDED;
14497		}
14498	}
14499	tcp->tcp_rnxt += seg_len;
14500	U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
14501
14502	/* Update SACK list */
14503	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
14504		tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
14505		    &(tcp->tcp_num_sack_blk));
14506	}
14507
14508	if (tcp->tcp_urp_mp) {
14509		tcp->tcp_urp_mp->b_cont = mp;
14510		mp = tcp->tcp_urp_mp;
14511		tcp->tcp_urp_mp = NULL;
14512		/* Ready for a new signal. */
14513		tcp->tcp_urp_last_valid = B_FALSE;
14514#ifdef DEBUG
14515		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
14516		    "tcp_rput: sending exdata_ind %s",
14517		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
14518#endif /* DEBUG */
14519	}
14520
14521	/*
14522	 * Check for ancillary data changes compared to last segment.
14523	 */
14524	if (tcp->tcp_ipv6_recvancillary != 0) {
14525		mp = tcp_rput_add_ancillary(tcp, mp, &ipp);
14526		if (mp == NULL)
14527			return;
14528	}
14529
14530	if (tcp->tcp_listener || tcp->tcp_hard_binding) {
14531		/*
14532		 * Side queue inbound data until the accept happens.
14533		 * tcp_accept/tcp_rput drains this when the accept happens.
14534		 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
14535		 * T_EXDATA_IND) it is queued on b_next.
14536		 * XXX Make urgent data use this. Requires:
14537		 *	Removing tcp_listener check for TH_URG
14538		 *	Making M_PCPROTO and MARK messages skip the eager case
14539		 */
14540
14541		if (tcp->tcp_kssl_pending) {
14542			tcp_kssl_input(tcp, mp);
14543		} else {
14544			tcp_rcv_enqueue(tcp, mp, seg_len);
14545		}
14546	} else {
14547		if (mp->b_datap->db_type != M_DATA ||
14548		    (flags & TH_MARKNEXT_NEEDED)) {
14549			if (tcp->tcp_rcv_list != NULL) {
14550				flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
14551			}
14552			ASSERT(tcp->tcp_rcv_list == NULL ||
14553			    tcp->tcp_fused_sigurg);
14554			if (flags & TH_MARKNEXT_NEEDED) {
14555#ifdef DEBUG
14556				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
14557				    "tcp_rput: sending MSGMARKNEXT %s",
14558				    tcp_display(tcp, NULL,
14559				    DISP_PORT_ONLY));
14560#endif /* DEBUG */
14561				mp->b_flag |= MSGMARKNEXT;
14562				flags &= ~TH_MARKNEXT_NEEDED;
14563			}
14564
14565			/* Does this need SSL processing first? */
14566			if ((tcp->tcp_kssl_ctx  != NULL) &&
14567			    (DB_TYPE(mp) == M_DATA)) {
14568				tcp_kssl_input(tcp, mp);
14569			} else {
14570				putnext(tcp->tcp_rq, mp);
14571				if (!canputnext(tcp->tcp_rq))
14572					tcp->tcp_rwnd -= seg_len;
14573			}
14574		} else if ((flags & (TH_PUSH|TH_FIN)) ||
14575		    tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) {
14576			if (tcp->tcp_rcv_list != NULL) {
14577				/*
14578				 * Enqueue the new segment first and then
14579				 * call tcp_rcv_drain() to send all data
14580				 * up.  The other way to do this is to
14581				 * send all queued data up and then call
14582				 * putnext() to send the new segment up.
14583				 * This way can remove the else part later
14584				 * on.
14585				 *
14586				 * We don't this to avoid one more call to
14587				 * canputnext() as tcp_rcv_drain() needs to
14588				 * call canputnext().
14589				 */
14590				tcp_rcv_enqueue(tcp, mp, seg_len);
14591				flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
14592			} else {
14593				/* Does this need SSL processing first? */
14594				if ((tcp->tcp_kssl_ctx  != NULL) &&
14595				    (DB_TYPE(mp) == M_DATA)) {
14596					tcp_kssl_input(tcp, mp);
14597				} else {
14598					putnext(tcp->tcp_rq, mp);
14599					if (!canputnext(tcp->tcp_rq))
14600						tcp->tcp_rwnd -= seg_len;
14601				}
14602			}
14603		} else {
14604			/*
14605			 * Enqueue all packets when processing an mblk
14606			 * from the co queue and also enqueue normal packets.
14607			 */
14608			tcp_rcv_enqueue(tcp, mp, seg_len);
14609		}
14610		/*
14611		 * Make sure the timer is running if we have data waiting
14612		 * for a push bit. This provides resiliency against
14613		 * implementations that do not correctly generate push bits.
14614		 */
14615		if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) {
14616			/*
14617			 * The connection may be closed at this point, so don't
14618			 * do anything for a detached tcp.
14619			 */
14620			if (!TCP_IS_DETACHED(tcp))
14621				tcp->tcp_push_tid = TCP_TIMER(tcp,
14622				    tcp_push_timer,
14623				    MSEC_TO_TICK(tcp_push_timer_interval));
14624		}
14625	}
14626xmit_check:
14627	/* Is there anything left to do? */
14628	ASSERT(!(flags & TH_MARKNEXT_NEEDED));
14629	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
14630	    TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
14631	    TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
14632		goto done;
14633
14634	/* Any transmit work to do and a non-zero window? */
14635	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
14636	    TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
14637		if (flags & TH_REXMIT_NEEDED) {
14638			uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
14639
14640			BUMP_MIB(&tcp_mib, tcpOutFastRetrans);
14641			if (snd_size > mss)
14642				snd_size = mss;
14643			if (snd_size > tcp->tcp_swnd)
14644				snd_size = tcp->tcp_swnd;
14645			mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
14646			    NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
14647			    B_TRUE);
14648
14649			if (mp1 != NULL) {
14650				tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt;
14651				tcp->tcp_csuna = tcp->tcp_snxt;
14652				BUMP_MIB(&tcp_mib, tcpRetransSegs);
14653				UPDATE_MIB(&tcp_mib, tcpRetransBytes, snd_size);
14654				TCP_RECORD_TRACE(tcp, mp1,
14655				    TCP_TRACE_SEND_PKT);
14656				tcp_send_data(tcp, tcp->tcp_wq, mp1);
14657			}
14658		}
14659		if (flags & TH_NEED_SACK_REXMIT) {
14660			tcp_sack_rxmit(tcp, &flags);
14661		}
14662		/*
14663		 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
14664		 * out new segment.  Note that tcp_rexmit should not be
14665		 * set, otherwise TH_LIMIT_XMIT should not be set.
14666		 */
14667		if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
14668			if (!tcp->tcp_rexmit) {
14669				tcp_wput_data(tcp, NULL, B_FALSE);
14670			} else {
14671				tcp_ss_rexmit(tcp);
14672			}
14673		}
14674		/*
14675		 * Adjust tcp_cwnd back to normal value after sending
14676		 * new data segments.
14677		 */
14678		if (flags & TH_LIMIT_XMIT) {
14679			tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
14680			/*
14681			 * This will restart the timer.  Restarting the
14682			 * timer is used to avoid a timeout before the
14683			 * limited transmitted segment's ACK gets back.
14684			 */
14685			if (tcp->tcp_xmit_head != NULL)
14686				tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt;
14687		}
14688
14689		/* Anything more to do? */
14690		if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
14691		    TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
14692			goto done;
14693	}
14694ack_check:
14695	if (flags & TH_SEND_URP_MARK) {
14696		ASSERT(tcp->tcp_urp_mark_mp);
14697		/*
14698		 * Send up any queued data and then send the mark message
14699		 */
14700		if (tcp->tcp_rcv_list != NULL) {
14701			flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
14702		}
14703		ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
14704
14705		mp1 = tcp->tcp_urp_mark_mp;
14706		tcp->tcp_urp_mark_mp = NULL;
14707#ifdef DEBUG
14708		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
14709		    "tcp_rput: sending zero-length %s %s",
14710		    ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
14711		    "MSGNOTMARKNEXT"),
14712		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
14713#endif /* DEBUG */
14714		putnext(tcp->tcp_rq, mp1);
14715		flags &= ~TH_SEND_URP_MARK;
14716	}
14717	if (flags & TH_ACK_NEEDED) {
14718		/*
14719		 * Time to send an ack for some reason.
14720		 */
14721		mp1 = tcp_ack_mp(tcp);
14722
14723		if (mp1 != NULL) {
14724			TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
14725			tcp_send_data(tcp, tcp->tcp_wq, mp1);
14726			BUMP_LOCAL(tcp->tcp_obsegs);
14727			BUMP_MIB(&tcp_mib, tcpOutAck);
14728		}
14729		if (tcp->tcp_ack_tid != 0) {
14730			(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
14731			tcp->tcp_ack_tid = 0;
14732		}
14733	}
14734	if (flags & TH_ACK_TIMER_NEEDED) {
14735		/*
14736		 * Arrange for deferred ACK or push wait timeout.
14737		 * Start timer if it is not already running.
14738		 */
14739		if (tcp->tcp_ack_tid == 0) {
14740			tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
14741			    MSEC_TO_TICK(tcp->tcp_localnet ?
14742			    (clock_t)tcp_local_dack_interval :
14743			    (clock_t)tcp_deferred_ack_interval));
14744		}
14745	}
14746	if (flags & TH_ORDREL_NEEDED) {
14747		/*
14748		 * Send up the ordrel_ind unless we are an eager guy.
14749		 * In the eager case tcp_rsrv will do this when run
14750		 * after tcp_accept is done.
14751		 */
14752		ASSERT(tcp->tcp_listener == NULL);
14753		if (tcp->tcp_rcv_list != NULL) {
14754			/*
14755			 * Push any mblk(s) enqueued from co processing.
14756			 */
14757			flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
14758		}
14759		ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
14760		if ((mp1 = mi_tpi_ordrel_ind()) != NULL) {
14761			tcp->tcp_ordrel_done = B_TRUE;
14762			putnext(tcp->tcp_rq, mp1);
14763			if (tcp->tcp_deferred_clean_death) {
14764				/*
14765				 * tcp_clean_death was deferred
14766				 * for T_ORDREL_IND - do it now
14767				 */
14768				(void) tcp_clean_death(tcp,
14769				    tcp->tcp_client_errno, 20);
14770				tcp->tcp_deferred_clean_death =	B_FALSE;
14771			}
14772		} else {
14773			/*
14774			 * Run the orderly release in the
14775			 * service routine.
14776			 */
14777			qenable(tcp->tcp_rq);
14778			/*
14779			 * Caveat(XXX): The machine may be so
14780			 * overloaded that tcp_rsrv() is not scheduled
14781			 * until after the endpoint has transitioned
14782			 * to TCPS_TIME_WAIT
14783			 * and tcp_time_wait_interval expires. Then
14784			 * tcp_timer() will blow away state in tcp_t
14785			 * and T_ORDREL_IND will never be delivered
14786			 * upstream. Unlikely but potentially
14787			 * a problem.
14788			 */
14789		}
14790	}
14791done:
14792	ASSERT(!(flags & TH_MARKNEXT_NEEDED));
14793}
14794
14795/*
14796 * This function does PAWS protection check. Returns B_TRUE if the
14797 * segment passes the PAWS test, else returns B_FALSE.
14798 */
14799boolean_t
14800tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
14801{
14802	uint8_t	flags;
14803	int	options;
14804	uint8_t *up;
14805
14806	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
14807	/*
14808	 * If timestamp option is aligned nicely, get values inline,
14809	 * otherwise call general routine to parse.  Only do that
14810	 * if timestamp is the only option.
14811	 */
14812	if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH +
14813	    TCPOPT_REAL_TS_LEN &&
14814	    OK_32PTR((up = ((uint8_t *)tcph) +
14815	    TCP_MIN_HEADER_LENGTH)) &&
14816	    *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
14817		tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
14818		tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));
14819
14820		options = TCP_OPT_TSTAMP_PRESENT;
14821	} else {
14822		if (tcp->tcp_snd_sack_ok) {
14823			tcpoptp->tcp = tcp;
14824		} else {
14825			tcpoptp->tcp = NULL;
14826		}
14827		options = tcp_parse_options(tcph, tcpoptp);
14828	}
14829
14830	if (options & TCP_OPT_TSTAMP_PRESENT) {
14831		/*
14832		 * Do PAWS per RFC 1323 section 4.2.  Accept RST
14833		 * regardless of the timestamp, page 18 RFC 1323.bis.
14834		 */
14835		if ((flags & TH_RST) == 0 &&
14836		    TSTMP_LT(tcpoptp->tcp_opt_ts_val,
14837		    tcp->tcp_ts_recent)) {
14838			if (TSTMP_LT(lbolt64, tcp->tcp_last_rcv_lbolt +
14839			    PAWS_TIMEOUT)) {
14840				/* This segment is not acceptable. */
14841				return (B_FALSE);
14842			} else {
14843				/*
14844				 * Connection has been idle for
14845				 * too long.  Reset the timestamp
14846				 * and assume the segment is valid.
14847				 */
14848				tcp->tcp_ts_recent =
14849				    tcpoptp->tcp_opt_ts_val;
14850			}
14851		}
14852	} else {
14853		/*
14854		 * If we don't get a timestamp on every packet, we
14855		 * figure we can't really trust 'em, so we stop sending
14856		 * and parsing them.
14857		 */
14858		tcp->tcp_snd_ts_ok = B_FALSE;
14859
14860		tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
14861		tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
14862		tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4);
14863		tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN);
14864		if (tcp->tcp_snd_sack_ok) {
14865			ASSERT(tcp->tcp_sack_info != NULL);
14866			tcp->tcp_max_sack_blk = 4;
14867		}
14868	}
14869	return (B_TRUE);
14870}
14871
14872/*
14873 * Attach ancillary data to a received TCP segments for the
14874 * ancillary pieces requested by the application that are
14875 * different than they were in the previous data segment.
14876 *
14877 * Save the "current" values once memory allocation is ok so that
14878 * when memory allocation fails we can just wait for the next data segment.
14879 */
14880static mblk_t *
14881tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
14882{
14883	struct T_optdata_ind *todi;
14884	int optlen;
14885	uchar_t *optptr;
14886	struct T_opthdr *toh;
14887	uint_t addflag;	/* Which pieces to add */
14888	mblk_t *mp1;
14889
14890	optlen = 0;
14891	addflag = 0;
14892	/* If app asked for pktinfo and the index has changed ... */
14893	if ((ipp->ipp_fields & IPPF_IFINDEX) &&
14894	    ipp->ipp_ifindex != tcp->tcp_recvifindex &&
14895	    (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) {
14896		optlen += sizeof (struct T_opthdr) +
14897		    sizeof (struct in6_pktinfo);
14898		addflag |= TCP_IPV6_RECVPKTINFO;
14899	}
14900	/* If app asked for hoplimit and it has changed ... */
14901	if ((ipp->ipp_fields & IPPF_HOPLIMIT) &&
14902	    ipp->ipp_hoplimit != tcp->tcp_recvhops &&
14903	    (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) {
14904		optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
14905		addflag |= TCP_IPV6_RECVHOPLIMIT;
14906	}
14907	/* If app asked for tclass and it has changed ... */
14908	if ((ipp->ipp_fields & IPPF_TCLASS) &&
14909	    ipp->ipp_tclass != tcp->tcp_recvtclass &&
14910	    (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) {
14911		optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
14912		addflag |= TCP_IPV6_RECVTCLASS;
14913	}
14914	/*
14915	 * If app asked for hopbyhop headers and it has changed ...
14916	 * For security labels, note that (1) security labels can't change on
14917	 * a connected socket at all, (2) we're connected to at most one peer,
14918	 * (3) if anything changes, then it must be some other extra option.
14919	 */
14920	if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) &&
14921	    ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen,
14922	    (ipp->ipp_fields & IPPF_HOPOPTS),
14923	    ipp->ipp_hopopts, ipp->ipp_hopoptslen)) {
14924		optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen -
14925		    tcp->tcp_label_len;
14926		addflag |= TCP_IPV6_RECVHOPOPTS;
14927		if (!ip_allocbuf((void **)&tcp->tcp_hopopts,
14928		    &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS),
14929		    ipp->ipp_hopopts, ipp->ipp_hopoptslen))
14930			return (mp);
14931	}
14932	/* If app asked for dst headers before routing headers ... */
14933	if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) &&
14934	    ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen,
14935		(ipp->ipp_fields & IPPF_RTDSTOPTS),
14936		ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) {
14937		optlen += sizeof (struct T_opthdr) +
14938		    ipp->ipp_rtdstoptslen;
14939		addflag |= TCP_IPV6_RECVRTDSTOPTS;
14940		if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts,
14941		    &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS),
14942		    ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen))
14943			return (mp);
14944	}
14945	/* If app asked for routing headers and it has changed ... */
14946	if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) &&
14947	    ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen,
14948	    (ipp->ipp_fields & IPPF_RTHDR),
14949	    ipp->ipp_rthdr, ipp->ipp_rthdrlen)) {
14950		optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen;
14951		addflag |= TCP_IPV6_RECVRTHDR;
14952		if (!ip_allocbuf((void **)&tcp->tcp_rthdr,
14953		    &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR),
14954		    ipp->ipp_rthdr, ipp->ipp_rthdrlen))
14955			return (mp);
14956	}
14957	/* If app asked for dest headers and it has changed ... */
14958	if ((tcp->tcp_ipv6_recvancillary &
14959	    (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) &&
14960	    ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen,
14961	    (ipp->ipp_fields & IPPF_DSTOPTS),
14962	    ipp->ipp_dstopts, ipp->ipp_dstoptslen)) {
14963		optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen;
14964		addflag |= TCP_IPV6_RECVDSTOPTS;
14965		if (!ip_allocbuf((void **)&tcp->tcp_dstopts,
14966		    &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS),
14967		    ipp->ipp_dstopts, ipp->ipp_dstoptslen))
14968			return (mp);
14969	}
14970
14971	if (optlen == 0) {
14972		/* Nothing to add */
14973		return (mp);
14974	}
14975	mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED);
14976	if (mp1 == NULL) {
14977		/*
14978		 * Defer sending ancillary data until the next TCP segment
14979		 * arrives.
14980		 */
14981		return (mp);
14982	}
14983	mp1->b_cont = mp;
14984	mp = mp1;
14985	mp->b_wptr += sizeof (*todi) + optlen;
14986	mp->b_datap->db_type = M_PROTO;
14987	todi = (struct T_optdata_ind *)mp->b_rptr;
14988	todi->PRIM_type = T_OPTDATA_IND;
14989	todi->DATA_flag = 1;	/* MORE data */
14990	todi->OPT_length = optlen;
14991	todi->OPT_offset = sizeof (*todi);
14992	optptr = (uchar_t *)&todi[1];
14993	/*
14994	 * If app asked for pktinfo and the index has changed ...
14995	 * Note that the local address never changes for the connection.
14996	 */
14997	if (addflag & TCP_IPV6_RECVPKTINFO) {
14998		struct in6_pktinfo *pkti;
14999
15000		toh = (struct T_opthdr *)optptr;
15001		toh->level = IPPROTO_IPV6;
15002		toh->name = IPV6_PKTINFO;
15003		toh->len = sizeof (*toh) + sizeof (*pkti);
15004		toh->status = 0;
15005		optptr += sizeof (*toh);
15006		pkti = (struct in6_pktinfo *)optptr;
15007		if (tcp->tcp_ipversion == IPV6_VERSION)
15008			pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src;
15009		else
15010			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
15011			    &pkti->ipi6_addr);
15012		pkti->ipi6_ifindex = ipp->ipp_ifindex;
15013		optptr += sizeof (*pkti);
15014		ASSERT(OK_32PTR(optptr));
15015		/* Save as "last" value */
15016		tcp->tcp_recvifindex = ipp->ipp_ifindex;
15017	}
15018	/* If app asked for hoplimit and it has changed ... */
15019	if (addflag & TCP_IPV6_RECVHOPLIMIT) {
15020		toh = (struct T_opthdr *)optptr;
15021		toh->level = IPPROTO_IPV6;
15022		toh->name = IPV6_HOPLIMIT;
15023		toh->len = sizeof (*toh) + sizeof (uint_t);
15024		toh->status = 0;
15025		optptr += sizeof (*toh);
15026		*(uint_t *)optptr = ipp->ipp_hoplimit;
15027		optptr += sizeof (uint_t);
15028		ASSERT(OK_32PTR(optptr));
15029		/* Save as "last" value */
15030		tcp->tcp_recvhops = ipp->ipp_hoplimit;
15031	}
15032	/* If app asked for tclass and it has changed ... */
15033	if (addflag & TCP_IPV6_RECVTCLASS) {
15034		toh = (struct T_opthdr *)optptr;
15035		toh->level = IPPROTO_IPV6;
15036		toh->name = IPV6_TCLASS;
15037		toh->len = sizeof (*toh) + sizeof (uint_t);
15038		toh->status = 0;
15039		optptr += sizeof (*toh);
15040		*(uint_t *)optptr = ipp->ipp_tclass;
15041		optptr += sizeof (uint_t);
15042		ASSERT(OK_32PTR(optptr));
15043		/* Save as "last" value */
15044		tcp->tcp_recvtclass = ipp->ipp_tclass;
15045	}
15046	if (addflag & TCP_IPV6_RECVHOPOPTS) {
15047		toh = (struct T_opthdr *)optptr;
15048		toh->level = IPPROTO_IPV6;
15049		toh->name = IPV6_HOPOPTS;
15050		toh->len = sizeof (*toh) + ipp->ipp_hopoptslen -
15051		    tcp->tcp_label_len;
15052		toh->status = 0;
15053		optptr += sizeof (*toh);
15054		bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr,
15055		    ipp->ipp_hopoptslen - tcp->tcp_label_len);
15056		optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len;
15057		ASSERT(OK_32PTR(optptr));
15058		/* Save as last value */
15059		ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen,
15060		    (ipp->ipp_fields & IPPF_HOPOPTS),
15061		    ipp->ipp_hopopts, ipp->ipp_hopoptslen);
15062	}
15063	if (addflag & TCP_IPV6_RECVRTDSTOPTS) {
15064		toh = (struct T_opthdr *)optptr;
15065		toh->level = IPPROTO_IPV6;
15066		toh->name = IPV6_RTHDRDSTOPTS;
15067		toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen;
15068		toh->status = 0;
15069		optptr += sizeof (*toh);
15070		bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen);
15071		optptr += ipp->ipp_rtdstoptslen;
15072		ASSERT(OK_32PTR(optptr));
15073		/* Save as last value */
15074		ip_savebuf((void **)&tcp->tcp_rtdstopts,
15075		    &tcp->tcp_rtdstoptslen,
15076		    (ipp->ipp_fields & IPPF_RTDSTOPTS),
15077		    ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen);
15078	}
15079	if (addflag & TCP_IPV6_RECVRTHDR) {
15080		toh = (struct T_opthdr *)optptr;
15081		toh->level = IPPROTO_IPV6;
15082		toh->name = IPV6_RTHDR;
15083		toh->len = sizeof (*toh) + ipp->ipp_rthdrlen;
15084		toh->status = 0;
15085		optptr += sizeof (*toh);
15086		bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen);
15087		optptr += ipp->ipp_rthdrlen;
15088		ASSERT(OK_32PTR(optptr));
15089		/* Save as last value */
15090		ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen,
15091		    (ipp->ipp_fields & IPPF_RTHDR),
15092		    ipp->ipp_rthdr, ipp->ipp_rthdrlen);
15093	}
15094	if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) {
15095		toh = (struct T_opthdr *)optptr;
15096		toh->level = IPPROTO_IPV6;
15097		toh->name = IPV6_DSTOPTS;
15098		toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
15099		toh->status = 0;
15100		optptr += sizeof (*toh);
15101		bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
15102		optptr += ipp->ipp_dstoptslen;
15103		ASSERT(OK_32PTR(optptr));
15104		/* Save as last value */
15105		ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
15106		    (ipp->ipp_fields & IPPF_DSTOPTS),
15107		    ipp->ipp_dstopts, ipp->ipp_dstoptslen);
15108	}
15109	ASSERT(optptr == mp->b_wptr);
15110	return (mp);
15111}
15112
15113
15114/*
15115 * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK
15116 * or a "bad" IRE detected by tcp_adapt_ire.
15117 * We can't tell if the failure was due to the laddr or the faddr
15118 * thus we clear out all addresses and ports.
15119 */
15120static void
15121tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error)
15122{
15123	queue_t	*q = tcp->tcp_rq;
15124	tcph_t	*tcph;
15125	struct T_error_ack *tea;
15126	conn_t	*connp = tcp->tcp_connp;
15127
15128
15129	ASSERT(mp->b_datap->db_type == M_PCPROTO);
15130
15131	if (mp->b_cont) {
15132		freemsg(mp->b_cont);
15133		mp->b_cont = NULL;
15134	}
15135	tea = (struct T_error_ack *)mp->b_rptr;
15136	switch (tea->PRIM_type) {
15137	case T_BIND_ACK:
15138		/*
15139		 * Need to unbind with classifier since we were just told that
15140		 * our bind succeeded.
15141		 */
15142		tcp->tcp_hard_bound = B_FALSE;
15143		tcp->tcp_hard_binding = B_FALSE;
15144
15145		ipcl_hash_remove(connp);
15146		/* Reuse the mblk if possible */
15147		ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >=
15148			sizeof (*tea));
15149		mp->b_rptr = mp->b_datap->db_base;
15150		mp->b_wptr = mp->b_rptr + sizeof (*tea);
15151		tea = (struct T_error_ack *)mp->b_rptr;
15152		tea->PRIM_type = T_ERROR_ACK;
15153		tea->TLI_error = TSYSERR;
15154		tea->UNIX_error = error;
15155		if (tcp->tcp_state >= TCPS_SYN_SENT) {
15156			tea->ERROR_prim = T_CONN_REQ;
15157		} else {
15158			tea->ERROR_prim = O_T_BIND_REQ;
15159		}
15160		break;
15161
15162	case T_ERROR_ACK:
15163		if (tcp->tcp_state >= TCPS_SYN_SENT)
15164			tea->ERROR_prim = T_CONN_REQ;
15165		break;
15166	default:
15167		panic("tcp_bind_failed: unexpected TPI type");
15168		/*NOTREACHED*/
15169	}
15170
15171	tcp->tcp_state = TCPS_IDLE;
15172	if (tcp->tcp_ipversion == IPV4_VERSION)
15173		tcp->tcp_ipha->ipha_src = 0;
15174	else
15175		V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
15176	/*
15177	 * Copy of the src addr. in tcp_t is needed since
15178	 * the lookup funcs. can only look at tcp_t
15179	 */
15180	V6_SET_ZERO(tcp->tcp_ip_src_v6);
15181
15182	tcph = tcp->tcp_tcph;
15183	tcph->th_lport[0] = 0;
15184	tcph->th_lport[1] = 0;
15185	tcp_bind_hash_remove(tcp);
15186	bzero(&connp->u_port, sizeof (connp->u_port));
15187	/* blow away saved option results if any */
15188	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
15189		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
15190
15191	conn_delete_ire(tcp->tcp_connp, NULL);
15192	putnext(q, mp);
15193}
15194
15195/*
15196 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA
15197 * messages.
15198 */
15199void
15200tcp_rput_other(tcp_t *tcp, mblk_t *mp)
15201{
15202	mblk_t	*mp1;
15203	uchar_t	*rptr = mp->b_rptr;
15204	queue_t	*q = tcp->tcp_rq;
15205	struct T_error_ack *tea;
15206	uint32_t mss;
15207	mblk_t *syn_mp;
15208	mblk_t *mdti;
15209	int	retval;
15210	mblk_t *ire_mp;
15211
15212	switch (mp->b_datap->db_type) {
15213	case M_PROTO:
15214	case M_PCPROTO:
15215		ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
15216		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t))
15217			break;
15218		tea = (struct T_error_ack *)rptr;
15219		switch (tea->PRIM_type) {
15220		case T_BIND_ACK:
15221			/*
15222			 * Adapt Multidata information, if any.  The
15223			 * following tcp_mdt_update routine will free
15224			 * the message.
15225			 */
15226			if ((mdti = tcp_mdt_info_mp(mp)) != NULL) {
15227				tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
15228				    b_rptr)->mdt_capab, B_TRUE);
15229				freemsg(mdti);
15230			}
15231
15232			/* Get the IRE, if we had requested for it */
15233			ire_mp = tcp_ire_mp(mp);
15234
15235			if (tcp->tcp_hard_binding) {
15236				tcp->tcp_hard_binding = B_FALSE;
15237				tcp->tcp_hard_bound = B_TRUE;
15238				CL_INET_CONNECT(tcp);
15239			} else {
15240				if (ire_mp != NULL)
15241					freeb(ire_mp);
15242				goto after_syn_sent;
15243			}
15244
15245			retval = tcp_adapt_ire(tcp, ire_mp);
15246			if (ire_mp != NULL)
15247				freeb(ire_mp);
15248			if (retval == 0) {
15249				tcp_bind_failed(tcp, mp,
15250				    (int)((tcp->tcp_state >= TCPS_SYN_SENT) ?
15251				    ENETUNREACH : EADDRNOTAVAIL));
15252				return;
15253			}
15254			/*
15255			 * Don't let an endpoint connect to itself.
15256			 * Also checked in tcp_connect() but that
15257			 * check can't handle the case when the
15258			 * local IP address is INADDR_ANY.
15259			 */
15260			if (tcp->tcp_ipversion == IPV4_VERSION) {
15261				if ((tcp->tcp_ipha->ipha_dst ==
15262				    tcp->tcp_ipha->ipha_src) &&
15263				    (BE16_EQL(tcp->tcp_tcph->th_lport,
15264				    tcp->tcp_tcph->th_fport))) {
15265					tcp_bind_failed(tcp, mp, EADDRNOTAVAIL);
15266					return;
15267				}
15268			} else {
15269				if (IN6_ARE_ADDR_EQUAL(
15270				    &tcp->tcp_ip6h->ip6_dst,
15271				    &tcp->tcp_ip6h->ip6_src) &&
15272				    (BE16_EQL(tcp->tcp_tcph->th_lport,
15273				    tcp->tcp_tcph->th_fport))) {
15274					tcp_bind_failed(tcp, mp, EADDRNOTAVAIL);
15275					return;
15276				}
15277			}
15278			ASSERT(tcp->tcp_state == TCPS_SYN_SENT);
15279			/*
15280			 * This should not be possible!  Just for
15281			 * defensive coding...
15282			 */
15283			if (tcp->tcp_state != TCPS_SYN_SENT)
15284				goto after_syn_sent;
15285
15286			if (is_system_labeled() &&
15287			    !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) {
15288				tcp_bind_failed(tcp, mp, EHOSTUNREACH);
15289				return;
15290			}
15291
15292			ASSERT(q == tcp->tcp_rq);
15293			/*
15294			 * tcp_adapt_ire() does not adjust
15295			 * for TCP/IP header length.
15296			 */
15297			mss = tcp->tcp_mss - tcp->tcp_hdr_len;
15298
15299			/*
15300			 * Just make sure our rwnd is at
15301			 * least tcp_recv_hiwat_mss * MSS
15302			 * large, and round up to the nearest
15303			 * MSS.
15304			 *
15305			 * We do the round up here because
15306			 * we need to get the interface
15307			 * MTU first before we can do the
15308			 * round up.
15309			 */
15310			tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
15311			    tcp_recv_hiwat_minmss * mss);
15312			q->q_hiwat = tcp->tcp_rwnd;
15313			tcp_set_ws_value(tcp);
15314			U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws),
15315			    tcp->tcp_tcph->th_win);
15316			if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always)
15317				tcp->tcp_snd_ws_ok = B_TRUE;
15318
15319			/*
15320			 * Set tcp_snd_ts_ok to true
15321			 * so that tcp_xmit_mp will
15322			 * include the timestamp
15323			 * option in the SYN segment.
15324			 */
15325			if (tcp_tstamp_always ||
15326			    (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) {
15327				tcp->tcp_snd_ts_ok = B_TRUE;
15328			}
15329
15330			/*
15331			 * tcp_snd_sack_ok can be set in
15332			 * tcp_adapt_ire() if the sack metric
15333			 * is set.  So check it here also.
15334			 */
15335			if (tcp_sack_permitted == 2 ||
15336			    tcp->tcp_snd_sack_ok) {
15337				if (tcp->tcp_sack_info == NULL) {
15338					tcp->tcp_sack_info =
15339					kmem_cache_alloc(tcp_sack_info_cache,
15340					    KM_SLEEP);
15341				}
15342				tcp->tcp_snd_sack_ok = B_TRUE;
15343			}
15344
15345			/*
15346			 * Should we use ECN?  Note that the current
15347			 * default value (SunOS 5.9) of tcp_ecn_permitted
15348			 * is 1.  The reason for doing this is that there
15349			 * are equipments out there that will drop ECN
15350			 * enabled IP packets.  Setting it to 1 avoids
15351			 * compatibility problems.
15352			 */
15353			if (tcp_ecn_permitted == 2)
15354				tcp->tcp_ecn_ok = B_TRUE;
15355
15356			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
15357			syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
15358			    tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
15359			if (syn_mp) {
15360				cred_t *cr;
15361				pid_t pid;
15362
15363				/*
15364				 * Obtain the credential from the
15365				 * thread calling connect(); the credential
15366				 * lives on in the second mblk which
15367				 * originated from T_CONN_REQ and is echoed
15368				 * with the T_BIND_ACK from ip.  If none
15369				 * can be found, default to the creator
15370				 * of the socket.
15371				 */
15372				if (mp->b_cont == NULL ||
15373				    (cr = DB_CRED(mp->b_cont)) == NULL) {
15374					cr = tcp->tcp_cred;
15375					pid = tcp->tcp_cpid;
15376				} else {
15377					pid = DB_CPID(mp->b_cont);
15378				}
15379
15380				TCP_RECORD_TRACE(tcp, syn_mp,
15381				    TCP_TRACE_SEND_PKT);
15382				mblk_setcred(syn_mp, cr);
15383				DB_CPID(syn_mp) = pid;
15384				tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
15385			}
15386		after_syn_sent:
15387			/*
15388			 * A trailer mblk indicates a waiting client upstream.
15389			 * We complete here the processing begun in
15390			 * either tcp_bind() or tcp_connect() by passing
15391			 * upstream the reply message they supplied.
15392			 */
15393			mp1 = mp;
15394			mp = mp->b_cont;
15395			freeb(mp1);
15396			if (mp)
15397				break;
15398			return;
15399		case T_ERROR_ACK:
15400			if (tcp->tcp_debug) {
15401				(void) strlog(TCP_MOD_ID, 0, 1,
15402				    SL_TRACE|SL_ERROR,
15403				    "tcp_rput_other: case T_ERROR_ACK, "
15404				    "ERROR_prim == %d",
15405				    tea->ERROR_prim);
15406			}
15407			switch (tea->ERROR_prim) {
15408			case O_T_BIND_REQ:
15409			case T_BIND_REQ:
15410				tcp_bind_failed(tcp, mp,
15411				    (int)((tcp->tcp_state >= TCPS_SYN_SENT) ?
15412				    ENETUNREACH : EADDRNOTAVAIL));
15413				return;
15414			case T_UNBIND_REQ:
15415				tcp->tcp_hard_binding = B_FALSE;
15416				tcp->tcp_hard_bound = B_FALSE;
15417				if (mp->b_cont) {
15418					freemsg(mp->b_cont);
15419					mp->b_cont = NULL;
15420				}
15421				if (tcp->tcp_unbind_pending)
15422					tcp->tcp_unbind_pending = 0;
15423				else {
15424					/* From tcp_ip_unbind() - free */
15425					freemsg(mp);
15426					return;
15427				}
15428				break;
15429			case T_SVR4_OPTMGMT_REQ:
15430				if (tcp->tcp_drop_opt_ack_cnt > 0) {
15431					/* T_OPTMGMT_REQ generated by TCP */
15432					printf("T_SVR4_OPTMGMT_REQ failed "
15433					    "%d/%d - dropped (cnt %d)\n",
15434					    tea->TLI_error, tea->UNIX_error,
15435					    tcp->tcp_drop_opt_ack_cnt);
15436					freemsg(mp);
15437					tcp->tcp_drop_opt_ack_cnt--;
15438					return;
15439				}
15440				break;
15441			}
15442			if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ &&
15443			    tcp->tcp_drop_opt_ack_cnt > 0) {
15444				printf("T_SVR4_OPTMGMT_REQ failed %d/%d "
15445				    "- dropped (cnt %d)\n",
15446				    tea->TLI_error, tea->UNIX_error,
15447				    tcp->tcp_drop_opt_ack_cnt);
15448				freemsg(mp);
15449				tcp->tcp_drop_opt_ack_cnt--;
15450				return;
15451			}
15452			break;
15453		case T_OPTMGMT_ACK:
15454			if (tcp->tcp_drop_opt_ack_cnt > 0) {
15455				/* T_OPTMGMT_REQ generated by TCP */
15456				freemsg(mp);
15457				tcp->tcp_drop_opt_ack_cnt--;
15458				return;
15459			}
15460			break;
15461		default:
15462			break;
15463		}
15464		break;
15465	case M_CTL:
15466		/*
15467		 * ICMP messages.
15468		 */
15469		tcp_icmp_error(tcp, mp);
15470		return;
15471	case M_FLUSH:
15472		if (*rptr & FLUSHR)
15473			flushq(q, FLUSHDATA);
15474		break;
15475	default:
15476		break;
15477	}
15478	/*
15479	 * Make sure we set this bit before sending the ACK for
15480	 * bind. Otherwise accept could possibly run and free
15481	 * this tcp struct.
15482	 */
15483	putnext(q, mp);
15484}
15485
15486/*
15487 * Called as the result of a qbufcall or a qtimeout to remedy a failure
15488 * to allocate a T_ordrel_ind in tcp_rsrv().  qenable(q) will make
15489 * tcp_rsrv() try again.
15490 */
15491static void
15492tcp_ordrel_kick(void *arg)
15493{
15494	conn_t 	*connp = (conn_t *)arg;
15495	tcp_t	*tcp = connp->conn_tcp;
15496
15497	tcp->tcp_ordrelid = 0;
15498	tcp->tcp_timeout = B_FALSE;
15499	if (!TCP_IS_DETACHED(tcp) && tcp->tcp_rq != NULL &&
15500	    tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
15501		qenable(tcp->tcp_rq);
15502	}
15503}
15504
15505/* ARGSUSED */
15506static void
15507tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
15508{
15509	conn_t	*connp = (conn_t *)arg;
15510	tcp_t	*tcp = connp->conn_tcp;
15511	queue_t	*q = tcp->tcp_rq;
15512	uint_t	thwin;
15513
15514	freeb(mp);
15515
15516	TCP_STAT(tcp_rsrv_calls);
15517
15518	if (TCP_IS_DETACHED(tcp) || q == NULL) {
15519		return;
15520	}
15521
15522	if (tcp->tcp_fused) {
15523		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
15524
15525		ASSERT(tcp->tcp_fused);
15526		ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused);
15527		ASSERT(peer_tcp->tcp_loopback_peer == tcp);
15528		ASSERT(!TCP_IS_DETACHED(tcp));
15529		ASSERT(tcp->tcp_connp->conn_sqp ==
15530		    peer_tcp->tcp_connp->conn_sqp);
15531
15532		/*
15533		 * Normally we would not get backenabled in synchronous
15534		 * streams mode, but in case this happens, we need to stop
15535		 * synchronous streams temporarily to prevent a race with
15536		 * tcp_fuse_rrw() or tcp_fuse_rinfop().  It is safe to access
15537		 * tcp_rcv_list here because those entry points will return
15538		 * right away when synchronous streams is stopped.
15539		 */
15540		TCP_FUSE_SYNCSTR_STOP(tcp);
15541		if (tcp->tcp_rcv_list != NULL)
15542			(void) tcp_rcv_drain(tcp->tcp_rq, tcp);
15543
15544		tcp_clrqfull(peer_tcp);
15545		TCP_FUSE_SYNCSTR_RESUME(tcp);
15546		TCP_STAT(tcp_fusion_backenabled);
15547		return;
15548	}
15549
15550	if (canputnext(q)) {
15551		tcp->tcp_rwnd = q->q_hiwat;
15552		thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
15553		    << tcp->tcp_rcv_ws;
15554		thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
15555		/*
15556		 * Send back a window update immediately if TCP is above
15557		 * ESTABLISHED state and the increase of the rcv window
15558		 * that the other side knows is at least 1 MSS after flow
15559		 * control is lifted.
15560		 */
15561		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
15562		    (q->q_hiwat - thwin >= tcp->tcp_mss)) {
15563			tcp_xmit_ctl(NULL, tcp,
15564			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
15565			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
15566			BUMP_MIB(&tcp_mib, tcpOutWinUpdate);
15567		}
15568	}
15569	/* Handle a failure to allocate a T_ORDREL_IND here */
15570	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
15571		ASSERT(tcp->tcp_listener == NULL);
15572		if (tcp->tcp_rcv_list != NULL) {
15573			(void) tcp_rcv_drain(q, tcp);
15574		}
15575		ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
15576		mp = mi_tpi_ordrel_ind();
15577		if (mp) {
15578			tcp->tcp_ordrel_done = B_TRUE;
15579			putnext(q, mp);
15580			if (tcp->tcp_deferred_clean_death) {
15581				/*
15582				 * tcp_clean_death was deferred for
15583				 * T_ORDREL_IND - do it now
15584				 */
15585				tcp->tcp_deferred_clean_death = B_FALSE;
15586				(void) tcp_clean_death(tcp,
15587				    tcp->tcp_client_errno, 22);
15588			}
15589		} else if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) {
15590			/*
15591			 * If there isn't already a timer running
15592			 * start one.  Use a 4 second
15593			 * timer as a fallback since it can't fail.
15594			 */
15595			tcp->tcp_timeout = B_TRUE;
15596			tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick,
15597			    MSEC_TO_TICK(4000));
15598		}
15599	}
15600}
15601
15602/*
15603 * The read side service routine is called mostly when we get back-enabled as a
15604 * result of flow control relief.  Since we don't actually queue anything in
15605 * TCP, we have no data to send out of here.  What we do is clear the receive
15606 * window, and send out a window update.
15607 * This routine is also called to drive an orderly release message upstream
15608 * if the attempt in tcp_rput failed.
15609 */
15610static void
15611tcp_rsrv(queue_t *q)
15612{
15613	conn_t *connp = Q_TO_CONN(q);
15614	tcp_t	*tcp = connp->conn_tcp;
15615	mblk_t	*mp;
15616
15617	/* No code does a putq on the read side */
15618	ASSERT(q->q_first == NULL);
15619
15620	/* Nothing to do for the default queue */
15621	if (q == tcp_g_q) {
15622		return;
15623	}
15624
15625	mp = allocb(0, BPRI_HI);
15626	if (mp == NULL) {
15627		/*
15628		 * We are under memory pressure. Return for now and we
15629		 * we will be called again later.
15630		 */
15631		if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) {
15632			/*
15633			 * If there isn't already a timer running
15634			 * start one.  Use a 4 second
15635			 * timer as a fallback since it can't fail.
15636			 */
15637			tcp->tcp_timeout = B_TRUE;
15638			tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick,
15639			    MSEC_TO_TICK(4000));
15640		}
15641		return;
15642	}
15643	CONN_INC_REF(connp);
15644	squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp,
15645	    SQTAG_TCP_RSRV);
15646}
15647
15648/*
15649 * tcp_rwnd_set() is called to adjust the receive window to a desired value.
15650 * We do not allow the receive window to shrink.  After setting rwnd,
15651 * set the flow control hiwat of the stream.
15652 *
15653 * This function is called in 2 cases:
15654 *
15655 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a
15656 *    connection (passive open) and in tcp_rput_data() for active connect.
15657 *    This is called after tcp_mss_set() when the desired MSS value is known.
15658 *    This makes sure that our window size is a mutiple of the other side's
15659 *    MSS.
15660 * 2) Handling SO_RCVBUF option.
15661 *
15662 * It is ASSUMED that the requested size is a multiple of the current MSS.
15663 *
15664 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
15665 * user requests so.
15666 */
15667static int
15668tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
15669{
15670	uint32_t	mss = tcp->tcp_mss;
15671	uint32_t	old_max_rwnd;
15672	uint32_t	max_transmittable_rwnd;
15673	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
15674
15675	if (tcp->tcp_fused) {
15676		size_t sth_hiwat;
15677		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
15678
15679		ASSERT(peer_tcp != NULL);
15680		/*
15681		 * Record the stream head's high water mark for
15682		 * this endpoint; this is used for flow-control
15683		 * purposes in tcp_fuse_output().
15684		 */
15685		sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
15686		if (!tcp_detached)
15687			(void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat);
15688
15689		/*
15690		 * In the fusion case, the maxpsz stream head value of
15691		 * our peer is set according to its send buffer size
15692		 * and our receive buffer size; since the latter may
15693		 * have changed we need to update the peer's maxpsz.
15694		 */
15695		(void) tcp_maxpsz_set(peer_tcp, B_TRUE);
15696		return (rwnd);
15697	}
15698
15699	if (tcp_detached)
15700		old_max_rwnd = tcp->tcp_rwnd;
15701	else
15702		old_max_rwnd = tcp->tcp_rq->q_hiwat;
15703
15704	/*
15705	 * Insist on a receive window that is at least
15706	 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
15707	 * funny TCP interactions of Nagle algorithm, SWS avoidance
15708	 * and delayed acknowledgement.
15709	 */
15710	rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss);
15711
15712	/*
15713	 * If window size info has already been exchanged, TCP should not
15714	 * shrink the window.  Shrinking window is doable if done carefully.
15715	 * We may add that support later.  But so far there is not a real
15716	 * need to do that.
15717	 */
15718	if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
15719		/* MSS may have changed, do a round up again. */
15720		rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
15721	}
15722
15723	/*
15724	 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
15725	 * can be applied even before the window scale option is decided.
15726	 */
15727	max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws;
15728	if (rwnd > max_transmittable_rwnd) {
15729		rwnd = max_transmittable_rwnd -
15730		    (max_transmittable_rwnd % mss);
15731		if (rwnd < mss)
15732			rwnd = max_transmittable_rwnd;
15733		/*
15734		 * If we're over the limit we may have to back down tcp_rwnd.
15735		 * The increment below won't work for us. So we set all three
15736		 * here and the increment below will have no effect.
15737		 */
15738		tcp->tcp_rwnd = old_max_rwnd = rwnd;
15739	}
15740	if (tcp->tcp_localnet) {
15741		tcp->tcp_rack_abs_max =
15742		    MIN(tcp_local_dacks_max, rwnd / mss / 2);
15743	} else {
15744		/*
15745		 * For a remote host on a different subnet (through a router),
15746		 * we ack every other packet to be conforming to RFC1122.
15747		 * tcp_deferred_acks_max is default to 2.
15748		 */
15749		tcp->tcp_rack_abs_max =
15750		    MIN(tcp_deferred_acks_max, rwnd / mss / 2);
15751	}
15752	if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max)
15753		tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max;
15754	else
15755		tcp->tcp_rack_cur_max = 0;
15756	/*
15757	 * Increment the current rwnd by the amount the maximum grew (we
15758	 * can not overwrite it since we might be in the middle of a
15759	 * connection.)
15760	 */
15761	tcp->tcp_rwnd += rwnd - old_max_rwnd;
15762	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
15763	if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
15764		tcp->tcp_cwnd_max = rwnd;
15765
15766	if (tcp_detached)
15767		return (rwnd);
15768	/*
15769	 * We set the maximum receive window into rq->q_hiwat.
15770	 * This is not actually used for flow control.
15771	 */
15772	tcp->tcp_rq->q_hiwat = rwnd;
15773	/*
15774	 * Set the Stream head high water mark. This doesn't have to be
15775	 * here, since we are simply using default values, but we would
15776	 * prefer to choose these values algorithmically, with a likely
15777	 * relationship to rwnd.
15778	 */
15779	(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat));
15780	return (rwnd);
15781}
15782
15783/*
15784 * Return SNMP stuff in buffer in mpdata.
15785 */
15786int
15787tcp_snmp_get(queue_t *q, mblk_t *mpctl)
15788{
15789	mblk_t			*mpdata;
15790	mblk_t			*mp_conn_ctl = NULL;
15791	mblk_t			*mp_conn_tail;
15792	mblk_t			*mp_attr_ctl = NULL;
15793	mblk_t			*mp_attr_tail;
15794	mblk_t			*mp6_conn_ctl = NULL;
15795	mblk_t			*mp6_conn_tail;
15796	mblk_t			*mp6_attr_ctl = NULL;
15797	mblk_t			*mp6_attr_tail;
15798	struct opthdr		*optp;
15799	mib2_tcpConnEntry_t	tce;
15800	mib2_tcp6ConnEntry_t	tce6;
15801	mib2_transportMLPEntry_t mlp;
15802	connf_t			*connfp;
15803	conn_t			*connp;
15804	int			i;
15805	boolean_t 		ispriv;
15806	zoneid_t 		zoneid;
15807	int			v4_conn_idx;
15808	int			v6_conn_idx;
15809
15810	if (mpctl == NULL ||
15811	    (mpdata = mpctl->b_cont) == NULL ||
15812	    (mp_conn_ctl = copymsg(mpctl)) == NULL ||
15813	    (mp_attr_ctl = copymsg(mpctl)) == NULL ||
15814	    (mp6_conn_ctl = copymsg(mpctl)) == NULL ||
15815	    (mp6_attr_ctl = copymsg(mpctl)) == NULL) {
15816		freemsg(mp_conn_ctl);
15817		freemsg(mp_attr_ctl);
15818		freemsg(mp6_conn_ctl);
15819		freemsg(mp6_attr_ctl);
15820		return (0);
15821	}
15822
15823	/* build table of connections -- need count in fixed part */
15824	SET_MIB(tcp_mib.tcpRtoAlgorithm, 4);   /* vanj */
15825	SET_MIB(tcp_mib.tcpRtoMin, tcp_rexmit_interval_min);
15826	SET_MIB(tcp_mib.tcpRtoMax, tcp_rexmit_interval_max);
15827	SET_MIB(tcp_mib.tcpMaxConn, -1);
15828	SET_MIB(tcp_mib.tcpCurrEstab, 0);
15829
15830	ispriv =
15831	    secpolicy_net_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0;
15832	zoneid = Q_TO_CONN(q)->conn_zoneid;
15833
15834	v4_conn_idx = v6_conn_idx = 0;
15835	mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL;
15836
15837	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
15838
15839		connfp = &ipcl_globalhash_fanout[i];
15840
15841		connp = NULL;
15842
15843		while ((connp =
15844		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
15845			tcp_t *tcp;
15846			boolean_t needattr;
15847
15848			if (connp->conn_zoneid != zoneid)
15849				continue;	/* not in this zone */
15850
15851			tcp = connp->conn_tcp;
15852			UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs);
15853			tcp->tcp_ibsegs = 0;
15854			UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs);
15855			tcp->tcp_obsegs = 0;
15856
15857			tce6.tcp6ConnState = tce.tcpConnState =
15858			    tcp_snmp_state(tcp);
15859			if (tce.tcpConnState == MIB2_TCP_established ||
15860			    tce.tcpConnState == MIB2_TCP_closeWait)
15861				BUMP_MIB(&tcp_mib, tcpCurrEstab);
15862
15863			needattr = B_FALSE;
15864			bzero(&mlp, sizeof (mlp));
15865			if (connp->conn_mlp_type != mlptSingle) {
15866				if (connp->conn_mlp_type == mlptShared ||
15867				    connp->conn_mlp_type == mlptBoth)
15868					mlp.tme_flags |= MIB2_TMEF_SHARED;
15869				if (connp->conn_mlp_type == mlptPrivate ||
15870				    connp->conn_mlp_type == mlptBoth)
15871					mlp.tme_flags |= MIB2_TMEF_PRIVATE;
15872				needattr = B_TRUE;
15873			}
15874			if (connp->conn_peercred != NULL) {
15875				ts_label_t *tsl;
15876
15877				tsl = crgetlabel(connp->conn_peercred);
15878				mlp.tme_doi = label2doi(tsl);
15879				mlp.tme_label = *label2bslabel(tsl);
15880				needattr = B_TRUE;
15881			}
15882
15883			/* Create a message to report on IPv6 entries */
15884			if (tcp->tcp_ipversion == IPV6_VERSION) {
15885			tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6;
15886			tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6;
15887			tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport);
15888			tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport);
15889			tce6.tcp6ConnIfIndex = tcp->tcp_bound_if;
15890			/* Don't want just anybody seeing these... */
15891			if (ispriv) {
15892				tce6.tcp6ConnEntryInfo.ce_snxt =
15893				    tcp->tcp_snxt;
15894				tce6.tcp6ConnEntryInfo.ce_suna =
15895				    tcp->tcp_suna;
15896				tce6.tcp6ConnEntryInfo.ce_rnxt =
15897				    tcp->tcp_rnxt;
15898				tce6.tcp6ConnEntryInfo.ce_rack =
15899				    tcp->tcp_rack;
15900			} else {
15901				/*
15902				 * Netstat, unfortunately, uses this to
15903				 * get send/receive queue sizes.  How to fix?
15904				 * Why not compute the difference only?
15905				 */
15906				tce6.tcp6ConnEntryInfo.ce_snxt =
15907				    tcp->tcp_snxt - tcp->tcp_suna;
15908				tce6.tcp6ConnEntryInfo.ce_suna = 0;
15909				tce6.tcp6ConnEntryInfo.ce_rnxt =
15910				    tcp->tcp_rnxt - tcp->tcp_rack;
15911				tce6.tcp6ConnEntryInfo.ce_rack = 0;
15912			}
15913
15914			tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd;
15915			tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd;
15916			tce6.tcp6ConnEntryInfo.ce_rto =  tcp->tcp_rto;
15917			tce6.tcp6ConnEntryInfo.ce_mss =  tcp->tcp_mss;
15918			tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state;
15919
15920			(void) snmp_append_data2(mp6_conn_ctl->b_cont,
15921			    &mp6_conn_tail, (char *)&tce6, sizeof (tce6));
15922
15923			mlp.tme_connidx = v6_conn_idx++;
15924			if (needattr)
15925				(void) snmp_append_data2(mp6_attr_ctl->b_cont,
15926				    &mp6_attr_tail, (char *)&mlp, sizeof (mlp));
15927			}
15928			/*
15929			 * Create an IPv4 table entry for IPv4 entries and also
15930			 * for IPv6 entries which are bound to in6addr_any
15931			 * but don't have IPV6_V6ONLY set.
15932			 * (i.e. anything an IPv4 peer could connect to)
15933			 */
15934			if (tcp->tcp_ipversion == IPV4_VERSION ||
15935			    (tcp->tcp_state <= TCPS_LISTEN &&
15936			    !tcp->tcp_connp->conn_ipv6_v6only &&
15937			    IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) {
15938				if (tcp->tcp_ipversion == IPV6_VERSION) {
15939					tce.tcpConnRemAddress = INADDR_ANY;
15940					tce.tcpConnLocalAddress = INADDR_ANY;
15941				} else {
15942					tce.tcpConnRemAddress =
15943					    tcp->tcp_remote;
15944					tce.tcpConnLocalAddress =
15945					    tcp->tcp_ip_src;
15946				}
15947				tce.tcpConnLocalPort = ntohs(tcp->tcp_lport);
15948				tce.tcpConnRemPort = ntohs(tcp->tcp_fport);
15949				/* Don't want just anybody seeing these... */
15950				if (ispriv) {
15951					tce.tcpConnEntryInfo.ce_snxt =
15952					    tcp->tcp_snxt;
15953					tce.tcpConnEntryInfo.ce_suna =
15954					    tcp->tcp_suna;
15955					tce.tcpConnEntryInfo.ce_rnxt =
15956					    tcp->tcp_rnxt;
15957					tce.tcpConnEntryInfo.ce_rack =
15958					    tcp->tcp_rack;
15959				} else {
15960					/*
15961					 * Netstat, unfortunately, uses this to
15962					 * get send/receive queue sizes.  How
15963					 * to fix?
15964					 * Why not compute the difference only?
15965					 */
15966					tce.tcpConnEntryInfo.ce_snxt =
15967					    tcp->tcp_snxt - tcp->tcp_suna;
15968					tce.tcpConnEntryInfo.ce_suna = 0;
15969					tce.tcpConnEntryInfo.ce_rnxt =
15970					    tcp->tcp_rnxt - tcp->tcp_rack;
15971					tce.tcpConnEntryInfo.ce_rack = 0;
15972				}
15973
15974				tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd;
15975				tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd;
15976				tce.tcpConnEntryInfo.ce_rto =  tcp->tcp_rto;
15977				tce.tcpConnEntryInfo.ce_mss =  tcp->tcp_mss;
15978				tce.tcpConnEntryInfo.ce_state =
15979				    tcp->tcp_state;
15980
15981				(void) snmp_append_data2(mp_conn_ctl->b_cont,
15982				    &mp_conn_tail, (char *)&tce, sizeof (tce));
15983
15984				mlp.tme_connidx = v4_conn_idx++;
15985				if (needattr)
15986					(void) snmp_append_data2(
15987					    mp_attr_ctl->b_cont,
15988					    &mp_attr_tail, (char *)&mlp,
15989					    sizeof (mlp));
15990			}
15991		}
15992	}
15993
15994	/* fixed length structure for IPv4 and IPv6 counters */
15995	SET_MIB(tcp_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t));
15996	SET_MIB(tcp_mib.tcp6ConnTableSize, sizeof (mib2_tcp6ConnEntry_t));
15997	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
15998	optp->level = MIB2_TCP;
15999	optp->name = 0;
16000	(void) snmp_append_data(mpdata, (char *)&tcp_mib, sizeof (tcp_mib));
16001	optp->len = msgdsize(mpdata);
16002	qreply(q, mpctl);
16003
16004	/* table of connections... */
16005	optp = (struct opthdr *)&mp_conn_ctl->b_rptr[
16006	    sizeof (struct T_optmgmt_ack)];
16007	optp->level = MIB2_TCP;
16008	optp->name = MIB2_TCP_CONN;
16009	optp->len = msgdsize(mp_conn_ctl->b_cont);
16010	qreply(q, mp_conn_ctl);
16011
16012	/* table of MLP attributes... */
16013	optp = (struct opthdr *)&mp_attr_ctl->b_rptr[
16014	    sizeof (struct T_optmgmt_ack)];
16015	optp->level = MIB2_TCP;
16016	optp->name = EXPER_XPORT_MLP;
16017	optp->len = msgdsize(mp_attr_ctl->b_cont);
16018	if (optp->len == 0)
16019		freemsg(mp_attr_ctl);
16020	else
16021		qreply(q, mp_attr_ctl);
16022
16023	/* table of IPv6 connections... */
16024	optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[
16025	    sizeof (struct T_optmgmt_ack)];
16026	optp->level = MIB2_TCP6;
16027	optp->name = MIB2_TCP6_CONN;
16028	optp->len = msgdsize(mp6_conn_ctl->b_cont);
16029	qreply(q, mp6_conn_ctl);
16030
16031	/* table of IPv6 MLP attributes... */
16032	optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[
16033	    sizeof (struct T_optmgmt_ack)];
16034	optp->level = MIB2_TCP6;
16035	optp->name = EXPER_XPORT_MLP;
16036	optp->len = msgdsize(mp6_attr_ctl->b_cont);
16037	if (optp->len == 0)
16038		freemsg(mp6_attr_ctl);
16039	else
16040		qreply(q, mp6_attr_ctl);
16041	return (1);
16042}
16043
16044/* Return 0 if invalid set request, 1 otherwise, including non-tcp requests  */
16045/* ARGSUSED */
16046int
16047tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
16048{
16049	mib2_tcpConnEntry_t	*tce = (mib2_tcpConnEntry_t *)ptr;
16050
16051	switch (level) {
16052	case MIB2_TCP:
16053		switch (name) {
16054		case 13:
16055			if (tce->tcpConnState != MIB2_TCP_deleteTCB)
16056				return (0);
16057			/* TODO: delete entry defined by tce */
16058			return (1);
16059		default:
16060			return (0);
16061		}
16062	default:
16063		return (1);
16064	}
16065}
16066
16067/* Translate TCP state to MIB2 TCP state. */
16068static int
16069tcp_snmp_state(tcp_t *tcp)
16070{
16071	if (tcp == NULL)
16072		return (0);
16073
16074	switch (tcp->tcp_state) {
16075	case TCPS_CLOSED:
16076	case TCPS_IDLE:	/* RFC1213 doesn't have analogue for IDLE & BOUND */
16077	case TCPS_BOUND:
16078		return (MIB2_TCP_closed);
16079	case TCPS_LISTEN:
16080		return (MIB2_TCP_listen);
16081	case TCPS_SYN_SENT:
16082		return (MIB2_TCP_synSent);
16083	case TCPS_SYN_RCVD:
16084		return (MIB2_TCP_synReceived);
16085	case TCPS_ESTABLISHED:
16086		return (MIB2_TCP_established);
16087	case TCPS_CLOSE_WAIT:
16088		return (MIB2_TCP_closeWait);
16089	case TCPS_FIN_WAIT_1:
16090		return (MIB2_TCP_finWait1);
16091	case TCPS_CLOSING:
16092		return (MIB2_TCP_closing);
16093	case TCPS_LAST_ACK:
16094		return (MIB2_TCP_lastAck);
16095	case TCPS_FIN_WAIT_2:
16096		return (MIB2_TCP_finWait2);
16097	case TCPS_TIME_WAIT:
16098		return (MIB2_TCP_timeWait);
16099	default:
16100		return (0);
16101	}
16102}
16103
16104static char tcp_report_header[] =
16105	"TCP     " MI_COL_HDRPAD_STR
16106	"zone dest            snxt     suna     "
16107	"swnd       rnxt     rack     rwnd       rto   mss   w sw rw t "
16108	"recent   [lport,fport] state";
16109
16110/*
16111 * TCP status report triggered via the Named Dispatch mechanism.
16112 */
16113/* ARGSUSED */
16114static void
16115tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream,
16116    cred_t *cr)
16117{
16118	char hash[10], addrbuf[INET6_ADDRSTRLEN];
16119	boolean_t ispriv = secpolicy_net_config(cr, B_TRUE) == 0;
16120	char cflag;
16121	in6_addr_t	v6dst;
16122	char buf[80];
16123	uint_t print_len, buf_len;
16124
16125	buf_len = mp->b_datap->db_lim - mp->b_wptr;
16126	if (buf_len <= 0)
16127		return;
16128
16129	if (hashval >= 0)
16130		(void) sprintf(hash, "%03d ", hashval);
16131	else
16132		hash[0] = '\0';
16133
16134	/*
16135	 * Note that we use the remote address in the tcp_b  structure.
16136	 * This means that it will print out the real destination address,
16137	 * not the next hop's address if source routing is used.  This
16138	 * avoid the confusion on the output because user may not
16139	 * know that source routing is used for a connection.
16140	 */
16141	if (tcp->tcp_ipversion == IPV4_VERSION) {
16142		IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &v6dst);
16143	} else {
16144		v6dst = tcp->tcp_remote_v6;
16145	}
16146	(void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf));
16147	/*
16148	 * the ispriv checks are so that normal users cannot determine
16149	 * sequence number information using NDD.
16150	 */
16151
16152	if (TCP_IS_DETACHED(tcp))
16153		cflag = '*';
16154	else
16155		cflag = ' ';
16156	print_len = snprintf((char *)mp->b_wptr, buf_len,
16157	    "%s " MI_COL_PTRFMT_STR "%d %s %08x %08x %010d %08x %08x "
16158	    "%010d %05ld %05d %1d %02d %02d %1d %08x %s%c\n",
16159	    hash,
16160	    (void *)tcp,
16161	    tcp->tcp_connp->conn_zoneid,
16162	    addrbuf,
16163	    (ispriv) ? tcp->tcp_snxt : 0,
16164	    (ispriv) ? tcp->tcp_suna : 0,
16165	    tcp->tcp_swnd,
16166	    (ispriv) ? tcp->tcp_rnxt : 0,
16167	    (ispriv) ? tcp->tcp_rack : 0,
16168	    tcp->tcp_rwnd,
16169	    tcp->tcp_rto,
16170	    tcp->tcp_mss,
16171	    tcp->tcp_snd_ws_ok,
16172	    tcp->tcp_snd_ws,
16173	    tcp->tcp_rcv_ws,
16174	    tcp->tcp_snd_ts_ok,
16175	    tcp->tcp_ts_recent,
16176	    tcp_display(tcp, buf, DISP_PORT_ONLY), cflag);
16177	if (print_len < buf_len) {
16178		((mblk_t *)mp)->b_wptr += print_len;
16179	} else {
16180		((mblk_t *)mp)->b_wptr += buf_len;
16181	}
16182}
16183
16184/*
16185 * TCP status report (for listeners only) triggered via the Named Dispatch
16186 * mechanism.
16187 */
16188/* ARGSUSED */
16189static void
16190tcp_report_listener(mblk_t *mp, tcp_t *tcp, int hashval)
16191{
16192	char addrbuf[INET6_ADDRSTRLEN];
16193	in6_addr_t	v6dst;
16194	uint_t print_len, buf_len;
16195
16196	buf_len = mp->b_datap->db_lim - mp->b_wptr;
16197	if (buf_len <= 0)
16198		return;
16199
16200	if (tcp->tcp_ipversion == IPV4_VERSION) {
16201		IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6dst);
16202		(void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf));
16203	} else {
16204		(void) inet_ntop(AF_INET6, &tcp->tcp_ip6h->ip6_src,
16205		    addrbuf, sizeof (addrbuf));
16206	}
16207	print_len = snprintf((char *)mp->b_wptr, buf_len,
16208	    "%03d "
16209	    MI_COL_PTRFMT_STR
16210	    "%d %s %05u %08u %d/%d/%d%c\n",
16211	    hashval, (void *)tcp,
16212	    tcp->tcp_connp->conn_zoneid,
16213	    addrbuf,
16214	    (uint_t)BE16_TO_U16(tcp->tcp_tcph->th_lport),
16215	    tcp->tcp_conn_req_seqnum,
16216	    tcp->tcp_conn_req_cnt_q0, tcp->tcp_conn_req_cnt_q,
16217	    tcp->tcp_conn_req_max,
16218	    tcp->tcp_syn_defense ? '*' : ' ');
16219	if (print_len < buf_len) {
16220		((mblk_t *)mp)->b_wptr += print_len;
16221	} else {
16222		((mblk_t *)mp)->b_wptr += buf_len;
16223	}
16224}
16225
16226/* TCP status report triggered via the Named Dispatch mechanism. */
16227/* ARGSUSED */
16228static int
16229tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
16230{
16231	tcp_t	*tcp;
16232	int	i;
16233	conn_t	*connp;
16234	connf_t	*connfp;
16235	zoneid_t zoneid;
16236
16237	/*
16238	 * Because of the ndd constraint, at most we can have 64K buffer
16239	 * to put in all TCP info.  So to be more efficient, just
16240	 * allocate a 64K buffer here, assuming we need that large buffer.
16241	 * This may be a problem as any user can read tcp_status.  Therefore
16242	 * we limit the rate of doing this using tcp_ndd_get_info_interval.
16243	 * This should be OK as normal users should not do this too often.
16244	 */
16245	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
16246		if (ddi_get_lbolt() - tcp_last_ndd_get_info_time <
16247		    drv_usectohz(tcp_ndd_get_info_interval * 1000)) {
16248			(void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG);
16249			return (0);
16250		}
16251	}
16252	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
16253		/* The following may work even if we cannot get a large buf. */
16254		(void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG);
16255		return (0);
16256	}
16257
16258	(void) mi_mpprintf(mp, "%s", tcp_report_header);
16259
16260	zoneid = Q_TO_CONN(q)->conn_zoneid;
16261	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
16262
16263		connfp = &ipcl_globalhash_fanout[i];
16264
16265		connp = NULL;
16266
16267		while ((connp =
16268		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
16269			tcp = connp->conn_tcp;
16270			if (zoneid != GLOBAL_ZONEID &&
16271			    zoneid != connp->conn_zoneid)
16272				continue;
16273			tcp_report_item(mp->b_cont, tcp, -1, tcp,
16274			    cr);
16275		}
16276
16277	}
16278
16279	tcp_last_ndd_get_info_time = ddi_get_lbolt();
16280	return (0);
16281}
16282
16283/* TCP status report triggered via the Named Dispatch mechanism. */
16284/* ARGSUSED */
16285static int
16286tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
16287{
16288	tf_t	*tbf;
16289	tcp_t	*tcp;
16290	int	i;
16291	zoneid_t zoneid;
16292
16293	/* Refer to comments in tcp_status_report(). */
16294	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
16295		if (ddi_get_lbolt() - tcp_last_ndd_get_info_time <
16296		    drv_usectohz(tcp_ndd_get_info_interval * 1000)) {
16297			(void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG);
16298			return (0);
16299		}
16300	}
16301	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
16302		/* The following may work even if we cannot get a large buf. */
16303		(void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG);
16304		return (0);
16305	}
16306
16307	(void) mi_mpprintf(mp, "    %s", tcp_report_header);
16308
16309	zoneid = Q_TO_CONN(q)->conn_zoneid;
16310
16311	for (i = 0; i < A_CNT(tcp_bind_fanout); i++) {
16312		tbf = &tcp_bind_fanout[i];
16313		mutex_enter(&tbf->tf_lock);
16314		for (tcp = tbf->tf_tcp; tcp != NULL;
16315		    tcp = tcp->tcp_bind_hash) {
16316			if (zoneid != GLOBAL_ZONEID &&
16317			    zoneid != tcp->tcp_connp->conn_zoneid)
16318				continue;
16319			CONN_INC_REF(tcp->tcp_connp);
16320			tcp_report_item(mp->b_cont, tcp, i,
16321			    Q_TO_TCP(q), cr);
16322			CONN_DEC_REF(tcp->tcp_connp);
16323		}
16324		mutex_exit(&tbf->tf_lock);
16325	}
16326	tcp_last_ndd_get_info_time = ddi_get_lbolt();
16327	return (0);
16328}
16329
16330/* TCP status report triggered via the Named Dispatch mechanism. */
16331/* ARGSUSED */
16332static int
16333tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
16334{
16335	connf_t	*connfp;
16336	conn_t	*connp;
16337	tcp_t	*tcp;
16338	int	i;
16339	zoneid_t zoneid;
16340
16341	/* Refer to comments in tcp_status_report(). */
16342	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
16343		if (ddi_get_lbolt() - tcp_last_ndd_get_info_time <
16344		    drv_usectohz(tcp_ndd_get_info_interval * 1000)) {
16345			(void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG);
16346			return (0);
16347		}
16348	}
16349	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
16350		/* The following may work even if we cannot get a large buf. */
16351		(void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG);
16352		return (0);
16353	}
16354
16355	(void) mi_mpprintf(mp,
16356	    "    TCP    " MI_COL_HDRPAD_STR
16357	    "zone IP addr         port  seqnum   backlog (q0/q/max)");
16358
16359	zoneid = Q_TO_CONN(q)->conn_zoneid;
16360
16361	for (i = 0; i < ipcl_bind_fanout_size; i++) {
16362		connfp =  &ipcl_bind_fanout[i];
16363		connp = NULL;
16364		while ((connp =
16365		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
16366			tcp = connp->conn_tcp;
16367			if (zoneid != GLOBAL_ZONEID &&
16368			    zoneid != connp->conn_zoneid)
16369				continue;
16370			tcp_report_listener(mp->b_cont, tcp, i);
16371		}
16372	}
16373
16374	tcp_last_ndd_get_info_time = ddi_get_lbolt();
16375	return (0);
16376}
16377
16378/* TCP status report triggered via the Named Dispatch mechanism. */
16379/* ARGSUSED */
16380static int
16381tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
16382{
16383	connf_t	*connfp;
16384	conn_t	*connp;
16385	tcp_t	*tcp;
16386	int	i;
16387	zoneid_t zoneid;
16388
16389	/* Refer to comments in tcp_status_report(). */
16390	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
16391		if (ddi_get_lbolt() - tcp_last_ndd_get_info_time <
16392		    drv_usectohz(tcp_ndd_get_info_interval * 1000)) {
16393			(void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG);
16394			return (0);
16395		}
16396	}
16397	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
16398		/* The following may work even if we cannot get a large buf. */
16399		(void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG);
16400		return (0);
16401	}
16402
16403	(void) mi_mpprintf(mp, "tcp_conn_hash_size = %d",
16404	    ipcl_conn_fanout_size);
16405	(void) mi_mpprintf(mp, "    %s", tcp_report_header);
16406
16407	zoneid = Q_TO_CONN(q)->conn_zoneid;
16408
16409	for (i = 0; i < ipcl_conn_fanout_size; i++) {
16410		connfp =  &ipcl_conn_fanout[i];
16411		connp = NULL;
16412		while ((connp =
16413		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
16414			tcp = connp->conn_tcp;
16415			if (zoneid != GLOBAL_ZONEID &&
16416			    zoneid != connp->conn_zoneid)
16417				continue;
16418			tcp_report_item(mp->b_cont, tcp, i,
16419			    Q_TO_TCP(q), cr);
16420		}
16421	}
16422
16423	tcp_last_ndd_get_info_time = ddi_get_lbolt();
16424	return (0);
16425}
16426
16427/* TCP status report triggered via the Named Dispatch mechanism. */
16428/* ARGSUSED */
16429static int
16430tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
16431{
16432	tf_t	*tf;
16433	tcp_t	*tcp;
16434	int	i;
16435	zoneid_t zoneid;
16436
16437	/* Refer to comments in tcp_status_report(). */
16438	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
16439		if (ddi_get_lbolt() - tcp_last_ndd_get_info_time <
16440		    drv_usectohz(tcp_ndd_get_info_interval * 1000)) {
16441			(void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG);
16442			return (0);
16443		}
16444	}
16445	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
16446		/* The following may work even if we cannot get a large buf. */
16447		(void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG);
16448		return (0);
16449	}
16450
16451	(void) mi_mpprintf(mp, "    %s", tcp_report_header);
16452
16453	zoneid = Q_TO_CONN(q)->conn_zoneid;
16454
16455	for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) {
16456		tf = &tcp_acceptor_fanout[i];
16457		mutex_enter(&tf->tf_lock);
16458		for (tcp = tf->tf_tcp; tcp != NULL;
16459		    tcp = tcp->tcp_acceptor_hash) {
16460			if (zoneid != GLOBAL_ZONEID &&
16461			    zoneid != tcp->tcp_connp->conn_zoneid)
16462				continue;
16463			tcp_report_item(mp->b_cont, tcp, i,
16464			    Q_TO_TCP(q), cr);
16465		}
16466		mutex_exit(&tf->tf_lock);
16467	}
16468	tcp_last_ndd_get_info_time = ddi_get_lbolt();
16469	return (0);
16470}
16471
16472/*
16473 * tcp_timer is the timer service routine.  It handles the retransmission,
16474 * FIN_WAIT_2 flush, and zero window probe timeout events.  It figures out
16475 * from the state of the tcp instance what kind of action needs to be done
16476 * at the time it is called.
16477 */
16478static void
16479tcp_timer(void *arg)
16480{
16481	mblk_t		*mp;
16482	clock_t		first_threshold;
16483	clock_t		second_threshold;
16484	clock_t		ms;
16485	uint32_t	mss;
16486	conn_t		*connp = (conn_t *)arg;
16487	tcp_t		*tcp = connp->conn_tcp;
16488
16489	tcp->tcp_timer_tid = 0;
16490
16491	if (tcp->tcp_fused)
16492		return;
16493
16494	first_threshold =  tcp->tcp_first_timer_threshold;
16495	second_threshold = tcp->tcp_second_timer_threshold;
16496	switch (tcp->tcp_state) {
16497	case TCPS_IDLE:
16498	case TCPS_BOUND:
16499	case TCPS_LISTEN:
16500		return;
16501	case TCPS_SYN_RCVD: {
16502		tcp_t	*listener = tcp->tcp_listener;
16503
16504		if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
16505			ASSERT(tcp->tcp_rq == listener->tcp_rq);
16506			/* it's our first timeout */
16507			tcp->tcp_syn_rcvd_timeout = 1;
16508			mutex_enter(&listener->tcp_eager_lock);
16509			listener->tcp_syn_rcvd_timeout++;
16510			if (!listener->tcp_syn_defense &&
16511			    (listener->tcp_syn_rcvd_timeout >
16512			    (tcp_conn_req_max_q0 >> 2)) &&
16513			    (tcp_conn_req_max_q0 > 200)) {
16514				/* We may be under attack. Put on a defense. */
16515				listener->tcp_syn_defense = B_TRUE;
16516				cmn_err(CE_WARN, "High TCP connect timeout "
16517				    "rate! System (port %d) may be under a "
16518				    "SYN flood attack!",
16519				    BE16_TO_U16(listener->tcp_tcph->th_lport));
16520
16521				listener->tcp_ip_addr_cache = kmem_zalloc(
16522				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
16523				    KM_NOSLEEP);
16524			}
16525			mutex_exit(&listener->tcp_eager_lock);
16526		}
16527	}
16528		/* FALLTHRU */
16529	case TCPS_SYN_SENT:
16530		first_threshold =  tcp->tcp_first_ctimer_threshold;
16531		second_threshold = tcp->tcp_second_ctimer_threshold;
16532		break;
16533	case TCPS_ESTABLISHED:
16534	case TCPS_FIN_WAIT_1:
16535	case TCPS_CLOSING:
16536	case TCPS_CLOSE_WAIT:
16537	case TCPS_LAST_ACK:
16538		/* If we have data to rexmit */
16539		if (tcp->tcp_suna != tcp->tcp_snxt) {
16540			clock_t	time_to_wait;
16541
16542			BUMP_MIB(&tcp_mib, tcpTimRetrans);
16543			if (!tcp->tcp_xmit_head)
16544				break;
16545			time_to_wait = lbolt -
16546			    (clock_t)tcp->tcp_xmit_head->b_prev;
16547			time_to_wait = tcp->tcp_rto -
16548			    TICK_TO_MSEC(time_to_wait);
16549			/*
16550			 * If the timer fires too early, 1 clock tick earlier,
16551			 * restart the timer.
16552			 */
16553			if (time_to_wait > msec_per_tick) {
16554				TCP_STAT(tcp_timer_fire_early);
16555				TCP_TIMER_RESTART(tcp, time_to_wait);
16556				return;
16557			}
16558			/*
16559			 * When we probe zero windows, we force the swnd open.
16560			 * If our peer acks with a closed window swnd will be
16561			 * set to zero by tcp_rput(). As long as we are
16562			 * receiving acks tcp_rput will
16563			 * reset 'tcp_ms_we_have_waited' so as not to trip the
16564			 * first and second interval actions.  NOTE: the timer
16565			 * interval is allowed to continue its exponential
16566			 * backoff.
16567			 */
16568			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
16569				if (tcp->tcp_debug) {
16570					(void) strlog(TCP_MOD_ID, 0, 1,
16571					    SL_TRACE, "tcp_timer: zero win");
16572				}
16573			} else {
16574				/*
16575				 * After retransmission, we need to do
16576				 * slow start.  Set the ssthresh to one
16577				 * half of current effective window and
16578				 * cwnd to one MSS.  Also reset
16579				 * tcp_cwnd_cnt.
16580				 *
16581				 * Note that if tcp_ssthresh is reduced because
16582				 * of ECN, do not reduce it again unless it is
16583				 * already one window of data away (tcp_cwr
16584				 * should then be cleared) or this is a
16585				 * timeout for a retransmitted segment.
16586				 */
16587				uint32_t npkt;
16588
16589				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
16590					npkt = ((tcp->tcp_timer_backoff ?
16591					    tcp->tcp_cwnd_ssthresh :
16592					    tcp->tcp_snxt -
16593					    tcp->tcp_suna) >> 1) / tcp->tcp_mss;
16594					tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
16595					    tcp->tcp_mss;
16596				}
16597				tcp->tcp_cwnd = tcp->tcp_mss;
16598				tcp->tcp_cwnd_cnt = 0;
16599				if (tcp->tcp_ecn_ok) {
16600					tcp->tcp_cwr = B_TRUE;
16601					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
16602					tcp->tcp_ecn_cwr_sent = B_FALSE;
16603				}
16604			}
16605			break;
16606		}
16607		/*
16608		 * We have something to send yet we cannot send.  The
16609		 * reason can be:
16610		 *
16611		 * 1. Zero send window: we need to do zero window probe.
16612		 * 2. Zero cwnd: because of ECN, we need to "clock out
16613		 * segments.
16614		 * 3. SWS avoidance: receiver may have shrunk window,
16615		 * reset our knowledge.
16616		 *
16617		 * Note that condition 2 can happen with either 1 or
16618		 * 3.  But 1 and 3 are exclusive.
16619		 */
16620		if (tcp->tcp_unsent != 0) {
16621			if (tcp->tcp_cwnd == 0) {
16622				/*
16623				 * Set tcp_cwnd to 1 MSS so that a
16624				 * new segment can be sent out.  We
16625				 * are "clocking out" new data when
16626				 * the network is really congested.
16627				 */
16628				ASSERT(tcp->tcp_ecn_ok);
16629				tcp->tcp_cwnd = tcp->tcp_mss;
16630			}
16631			if (tcp->tcp_swnd == 0) {
16632				/* Extend window for zero window probe */
16633				tcp->tcp_swnd++;
16634				tcp->tcp_zero_win_probe = B_TRUE;
16635				BUMP_MIB(&tcp_mib, tcpOutWinProbe);
16636			} else {
16637				/*
16638				 * Handle timeout from sender SWS avoidance.
16639				 * Reset our knowledge of the max send window
16640				 * since the receiver might have reduced its
16641				 * receive buffer.  Avoid setting tcp_max_swnd
16642				 * to one since that will essentially disable
16643				 * the SWS checks.
16644				 *
16645				 * Note that since we don't have a SWS
16646				 * state variable, if the timeout is set
16647				 * for ECN but not for SWS, this
16648				 * code will also be executed.  This is
16649				 * fine as tcp_max_swnd is updated
16650				 * constantly and it will not affect
16651				 * anything.
16652				 */
16653				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
16654			}
16655			tcp_wput_data(tcp, NULL, B_FALSE);
16656			return;
16657		}
16658		/* Is there a FIN that needs to be to re retransmitted? */
16659		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
16660		    !tcp->tcp_fin_acked)
16661			break;
16662		/* Nothing to do, return without restarting timer. */
16663		TCP_STAT(tcp_timer_fire_miss);
16664		return;
16665	case TCPS_FIN_WAIT_2:
16666		/*
16667		 * User closed the TCP endpoint and peer ACK'ed our FIN.
16668		 * We waited some time for for peer's FIN, but it hasn't
16669		 * arrived.  We flush the connection now to avoid
16670		 * case where the peer has rebooted.
16671		 */
16672		if (TCP_IS_DETACHED(tcp)) {
16673			(void) tcp_clean_death(tcp, 0, 23);
16674		} else {
16675			TCP_TIMER_RESTART(tcp, tcp_fin_wait_2_flush_interval);
16676		}
16677		return;
16678	case TCPS_TIME_WAIT:
16679		(void) tcp_clean_death(tcp, 0, 24);
16680		return;
16681	default:
16682		if (tcp->tcp_debug) {
16683			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
16684			    "tcp_timer: strange state (%d) %s",
16685			    tcp->tcp_state, tcp_display(tcp, NULL,
16686			    DISP_PORT_ONLY));
16687		}
16688		return;
16689	}
16690	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
16691		/*
16692		 * For zero window probe, we need to send indefinitely,
16693		 * unless we have not heard from the other side for some
16694		 * time...
16695		 */
16696		if ((tcp->tcp_zero_win_probe == 0) ||
16697		    (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >
16698		    second_threshold)) {
16699			BUMP_MIB(&tcp_mib, tcpTimRetransDrop);
16700			/*
16701			 * If TCP is in SYN_RCVD state, send back a
16702			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
16703			 * should be zero in TCPS_SYN_RCVD state.
16704			 */
16705			if (tcp->tcp_state == TCPS_SYN_RCVD) {
16706				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
16707				    "in SYN_RCVD",
16708				    tcp, tcp->tcp_snxt,
16709				    tcp->tcp_rnxt, TH_RST | TH_ACK);
16710			}
16711			(void) tcp_clean_death(tcp,
16712			    tcp->tcp_client_errno ?
16713			    tcp->tcp_client_errno : ETIMEDOUT, 25);
16714			return;
16715		} else {
16716			/*
16717			 * Set tcp_ms_we_have_waited to second_threshold
16718			 * so that in next timeout, we will do the above
16719			 * check (lbolt - tcp_last_recv_time).  This is
16720			 * also to avoid overflow.
16721			 *
16722			 * We don't need to decrement tcp_timer_backoff
16723			 * to avoid overflow because it will be decremented
16724			 * later if new timeout value is greater than
16725			 * tcp_rexmit_interval_max.  In the case when
16726			 * tcp_rexmit_interval_max is greater than
16727			 * second_threshold, it means that we will wait
16728			 * longer than second_threshold to send the next
16729			 * window probe.
16730			 */
16731			tcp->tcp_ms_we_have_waited = second_threshold;
16732		}
16733	} else if (ms > first_threshold) {
16734		if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) &&
16735		    tcp->tcp_xmit_head != NULL) {
16736			tcp->tcp_xmit_head =
16737			    tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1);
16738		}
16739		/*
16740		 * We have been retransmitting for too long...  The RTT
16741		 * we calculated is probably incorrect.  Reinitialize it.
16742		 * Need to compensate for 0 tcp_rtt_sa.  Reset
16743		 * tcp_rtt_update so that we won't accidentally cache a
16744		 * bad value.  But only do this if this is not a zero
16745		 * window probe.
16746		 */
16747		if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
16748			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
16749			    (tcp->tcp_rtt_sa >> 5);
16750			tcp->tcp_rtt_sa = 0;
16751			tcp_ip_notify(tcp);
16752			tcp->tcp_rtt_update = 0;
16753		}
16754	}
16755	tcp->tcp_timer_backoff++;
16756	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
16757	    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
16758	    tcp_rexmit_interval_min) {
16759		/*
16760		 * This means the original RTO is tcp_rexmit_interval_min.
16761		 * So we will use tcp_rexmit_interval_min as the RTO value
16762		 * and do the backoff.
16763		 */
16764		ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff;
16765	} else {
16766		ms <<= tcp->tcp_timer_backoff;
16767	}
16768	if (ms > tcp_rexmit_interval_max) {
16769		ms = tcp_rexmit_interval_max;
16770		/*
16771		 * ms is at max, decrement tcp_timer_backoff to avoid
16772		 * overflow.
16773		 */
16774		tcp->tcp_timer_backoff--;
16775	}
16776	tcp->tcp_ms_we_have_waited += ms;
16777	if (tcp->tcp_zero_win_probe == 0) {
16778		tcp->tcp_rto = ms;
16779	}
16780	TCP_TIMER_RESTART(tcp, ms);
16781	/*
16782	 * This is after a timeout and tcp_rto is backed off.  Set
16783	 * tcp_set_timer to 1 so that next time RTO is updated, we will
16784	 * restart the timer with a correct value.
16785	 */
16786	tcp->tcp_set_timer = 1;
16787	mss = tcp->tcp_snxt - tcp->tcp_suna;
16788	if (mss > tcp->tcp_mss)
16789		mss = tcp->tcp_mss;
16790	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
16791		mss = tcp->tcp_swnd;
16792
16793	if ((mp = tcp->tcp_xmit_head) != NULL)
16794		mp->b_prev = (mblk_t *)lbolt;
16795	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
16796	    B_TRUE);
16797
16798	/*
16799	 * When slow start after retransmission begins, start with
16800	 * this seq no.  tcp_rexmit_max marks the end of special slow
16801	 * start phase.  tcp_snd_burst controls how many segments
16802	 * can be sent because of an ack.
16803	 */
16804	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
16805	tcp->tcp_snd_burst = TCP_CWND_SS;
16806	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
16807	    (tcp->tcp_unsent == 0)) {
16808		tcp->tcp_rexmit_max = tcp->tcp_fss;
16809	} else {
16810		tcp->tcp_rexmit_max = tcp->tcp_snxt;
16811	}
16812	tcp->tcp_rexmit = B_TRUE;
16813	tcp->tcp_dupack_cnt = 0;
16814
16815	/*
16816	 * Remove all rexmit SACK blk to start from fresh.
16817	 */
16818	if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
16819		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
16820		tcp->tcp_num_notsack_blk = 0;
16821		tcp->tcp_cnt_notsack_list = 0;
16822	}
16823	if (mp == NULL) {
16824		return;
16825	}
16826	/* Attach credentials to retransmitted initial SYNs. */
16827	if (tcp->tcp_state == TCPS_SYN_SENT) {
16828		mblk_setcred(mp, tcp->tcp_cred);
16829		DB_CPID(mp) = tcp->tcp_cpid;
16830	}
16831
16832	tcp->tcp_csuna = tcp->tcp_snxt;
16833	BUMP_MIB(&tcp_mib, tcpRetransSegs);
16834	UPDATE_MIB(&tcp_mib, tcpRetransBytes, mss);
16835	TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
16836	tcp_send_data(tcp, tcp->tcp_wq, mp);
16837
16838}
16839
16840/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
16841static void
16842tcp_unbind(tcp_t *tcp, mblk_t *mp)
16843{
16844	conn_t	*connp;
16845
16846	switch (tcp->tcp_state) {
16847	case TCPS_BOUND:
16848	case TCPS_LISTEN:
16849		break;
16850	default:
16851		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
16852		return;
16853	}
16854
16855	/*
16856	 * Need to clean up all the eagers since after the unbind, segments
16857	 * will no longer be delivered to this listener stream.
16858	 */
16859	mutex_enter(&tcp->tcp_eager_lock);
16860	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
16861		tcp_eager_cleanup(tcp, 0);
16862	}
16863	mutex_exit(&tcp->tcp_eager_lock);
16864
16865	if (tcp->tcp_ipversion == IPV4_VERSION) {
16866		tcp->tcp_ipha->ipha_src = 0;
16867	} else {
16868		V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
16869	}
16870	V6_SET_ZERO(tcp->tcp_ip_src_v6);
16871	bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport));
16872	tcp_bind_hash_remove(tcp);
16873	tcp->tcp_state = TCPS_IDLE;
16874	tcp->tcp_mdt = B_FALSE;
16875	/* Send M_FLUSH according to TPI */
16876	(void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
16877	connp = tcp->tcp_connp;
16878	connp->conn_mdt_ok = B_FALSE;
16879	ipcl_hash_remove(connp);
16880	bzero(&connp->conn_ports, sizeof (connp->conn_ports));
16881	mp = mi_tpi_ok_ack_alloc(mp);
16882	putnext(tcp->tcp_rq, mp);
16883}
16884
16885/*
16886 * Don't let port fall into the privileged range.
16887 * Since the extra privileged ports can be arbitrary we also
16888 * ensure that we exclude those from consideration.
16889 * tcp_g_epriv_ports is not sorted thus we loop over it until
16890 * there are no changes.
16891 *
16892 * Note: No locks are held when inspecting tcp_g_*epriv_ports
16893 * but instead the code relies on:
16894 * - the fact that the address of the array and its size never changes
16895 * - the atomic assignment of the elements of the array
16896 *
16897 * Returns 0 if there are no more ports available.
16898 *
16899 * TS note: skip multilevel ports.
16900 */
16901static in_port_t
16902tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
16903{
16904	int i;
16905	boolean_t restart = B_FALSE;
16906
16907	if (random && tcp_random_anon_port != 0) {
16908		(void) random_get_pseudo_bytes((uint8_t *)&port,
16909		    sizeof (in_port_t));
16910		/*
16911		 * Unless changed by a sys admin, the smallest anon port
16912		 * is 32768 and the largest anon port is 65535.  It is
16913		 * very likely (50%) for the random port to be smaller
16914		 * than the smallest anon port.  When that happens,
16915		 * add port % (anon port range) to the smallest anon
16916		 * port to get the random port.  It should fall into the
16917		 * valid anon port range.
16918		 */
16919		if (port < tcp_smallest_anon_port) {
16920			port = tcp_smallest_anon_port +
16921			    port % (tcp_largest_anon_port -
16922				tcp_smallest_anon_port);
16923		}
16924	}
16925
16926retry:
16927	if (port < tcp_smallest_anon_port)
16928		port = (in_port_t)tcp_smallest_anon_port;
16929
16930	if (port > tcp_largest_anon_port) {
16931		if (restart)
16932			return (0);
16933		restart = B_TRUE;
16934		port = (in_port_t)tcp_smallest_anon_port;
16935	}
16936
16937	if (port < tcp_smallest_nonpriv_port)
16938		port = (in_port_t)tcp_smallest_nonpriv_port;
16939
16940	for (i = 0; i < tcp_g_num_epriv_ports; i++) {
16941		if (port == tcp_g_epriv_ports[i]) {
16942			port++;
16943			/*
16944			 * Make sure whether the port is in the
16945			 * valid range.
16946			 */
16947			goto retry;
16948		}
16949	}
16950	if (is_system_labeled() &&
16951	    (i = tsol_next_port(crgetzone(tcp->tcp_cred), port,
16952	    IPPROTO_TCP, B_TRUE)) != 0) {
16953		port = i;
16954		goto retry;
16955	}
16956	return (port);
16957}
16958
16959/*
16960 * Return the next anonymous port in the privileged port range for
16961 * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
16962 * downwards.  This is the same behavior as documented in the userland
16963 * library call rresvport(3N).
16964 *
16965 * TS note: skip multilevel ports.
16966 */
16967static in_port_t
16968tcp_get_next_priv_port(const tcp_t *tcp)
16969{
16970	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
16971	in_port_t nextport;
16972	boolean_t restart = B_FALSE;
16973
16974retry:
16975	if (next_priv_port < tcp_min_anonpriv_port ||
16976	    next_priv_port >= IPPORT_RESERVED) {
16977		next_priv_port = IPPORT_RESERVED - 1;
16978		if (restart)
16979			return (0);
16980		restart = B_TRUE;
16981	}
16982	if (is_system_labeled() &&
16983	    (nextport = tsol_next_port(crgetzone(tcp->tcp_cred),
16984	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
16985		next_priv_port = nextport;
16986		goto retry;
16987	}
16988	return (next_priv_port--);
16989}
16990
16991/* The write side r/w procedure. */
16992
16993#if CCS_STATS
16994struct {
16995	struct {
16996		int64_t count, bytes;
16997	} tot, hit;
16998} wrw_stats;
16999#endif
17000
17001/*
17002 * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO,
17003 * messages.
17004 */
17005/* ARGSUSED */
17006static void
17007tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2)
17008{
17009	conn_t	*connp = (conn_t *)arg;
17010	tcp_t	*tcp = connp->conn_tcp;
17011	queue_t	*q = tcp->tcp_wq;
17012
17013	ASSERT(DB_TYPE(mp) != M_IOCTL);
17014	/*
17015	 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close.
17016	 * Once the close starts, streamhead and sockfs will not let any data
17017	 * packets come down (close ensures that there are no threads using the
17018	 * queue and no new threads will come down) but since qprocsoff()
17019	 * hasn't happened yet, a M_FLUSH or some non data message might
17020	 * get reflected back (in response to our own FLUSHRW) and get
17021	 * processed after tcp_close() is done. The conn would still be valid
17022	 * because a ref would have added but we need to check the state
17023	 * before actually processing the packet.
17024	 */
17025	if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
17026		freemsg(mp);
17027		return;
17028	}
17029
17030	switch (DB_TYPE(mp)) {
17031	case M_IOCDATA:
17032		tcp_wput_iocdata(tcp, mp);
17033		break;
17034	case M_FLUSH:
17035		tcp_wput_flush(tcp, mp);
17036		break;
17037	default:
17038		CALL_IP_WPUT(connp, q, mp);
17039		break;
17040	}
17041}
17042
17043/*
17044 * The TCP fast path write put procedure.
17045 * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
17046 */
17047/* ARGSUSED */
17048void
17049tcp_output(void *arg, mblk_t *mp, void *arg2)
17050{
17051	int		len;
17052	int		hdrlen;
17053	int		plen;
17054	mblk_t		*mp1;
17055	uchar_t		*rptr;
17056	uint32_t	snxt;
17057	tcph_t		*tcph;
17058	struct datab	*db;
17059	uint32_t	suna;
17060	uint32_t	mss;
17061	ipaddr_t	*dst;
17062	ipaddr_t	*src;
17063	uint32_t	sum;
17064	int		usable;
17065	conn_t		*connp = (conn_t *)arg;
17066	tcp_t		*tcp = connp->conn_tcp;
17067	uint32_t	msize;
17068
17069	/*
17070	 * Try and ASSERT the minimum possible references on the
17071	 * conn early enough. Since we are executing on write side,
17072	 * the connection is obviously not detached and that means
17073	 * there is a ref each for TCP and IP. Since we are behind
17074	 * the squeue, the minimum references needed are 3. If the
17075	 * conn is in classifier hash list, there should be an
17076	 * extra ref for that (we check both the possibilities).
17077	 */
17078	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
17079	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
17080
17081	ASSERT(DB_TYPE(mp) == M_DATA);
17082	msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
17083
17084	mutex_enter(&connp->conn_lock);
17085	tcp->tcp_squeue_bytes -= msize;
17086	mutex_exit(&connp->conn_lock);
17087
17088	/* Bypass tcp protocol for fused tcp loopback */
17089	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
17090		return;
17091
17092	mss = tcp->tcp_mss;
17093	if (tcp->tcp_xmit_zc_clean)
17094		mp = tcp_zcopy_backoff(tcp, mp, 0);
17095
17096	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
17097	len = (int)(mp->b_wptr - mp->b_rptr);
17098
17099	/*
17100	 * Criteria for fast path:
17101	 *
17102	 *   1. no unsent data
17103	 *   2. single mblk in request
17104	 *   3. connection established
17105	 *   4. data in mblk
17106	 *   5. len <= mss
17107	 *   6. no tcp_valid bits
17108	 */
17109	if ((tcp->tcp_unsent != 0) ||
17110	    (tcp->tcp_cork) ||
17111	    (mp->b_cont != NULL) ||
17112	    (tcp->tcp_state != TCPS_ESTABLISHED) ||
17113	    (len == 0) ||
17114	    (len > mss) ||
17115	    (tcp->tcp_valid_bits != 0)) {
17116		tcp_wput_data(tcp, mp, B_FALSE);
17117		return;
17118	}
17119
17120	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
17121	ASSERT(tcp->tcp_fin_sent == 0);
17122
17123	/* queue new packet onto retransmission queue */
17124	if (tcp->tcp_xmit_head == NULL) {
17125		tcp->tcp_xmit_head = mp;
17126	} else {
17127		tcp->tcp_xmit_last->b_cont = mp;
17128	}
17129	tcp->tcp_xmit_last = mp;
17130	tcp->tcp_xmit_tail = mp;
17131
17132	/* find out how much we can send */
17133	/* BEGIN CSTYLED */
17134	/*
17135	 *    un-acked           usable
17136	 *  |--------------|-----------------|
17137	 *  tcp_suna       tcp_snxt          tcp_suna+tcp_swnd
17138	 */
17139	/* END CSTYLED */
17140
17141	/* start sending from tcp_snxt */
17142	snxt = tcp->tcp_snxt;
17143
17144	/*
17145	 * Check to see if this connection has been idled for some
17146	 * time and no ACK is expected.  If it is, we need to slow
17147	 * start again to get back the connection's "self-clock" as
17148	 * described in VJ's paper.
17149	 *
17150	 * Refer to the comment in tcp_mss_set() for the calculation
17151	 * of tcp_cwnd after idle.
17152	 */
17153	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
17154	    (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
17155		SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle);
17156	}
17157
17158	usable = tcp->tcp_swnd;		/* tcp window size */
17159	if (usable > tcp->tcp_cwnd)
17160		usable = tcp->tcp_cwnd;	/* congestion window smaller */
17161	usable -= snxt;		/* subtract stuff already sent */
17162	suna = tcp->tcp_suna;
17163	usable += suna;
17164	/* usable can be < 0 if the congestion window is smaller */
17165	if (len > usable) {
17166		/* Can't send complete M_DATA in one shot */
17167		goto slow;
17168	}
17169
17170	if (tcp->tcp_flow_stopped &&
17171	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
17172		tcp_clrqfull(tcp);
17173	}
17174
17175	/*
17176	 * determine if anything to send (Nagle).
17177	 *
17178	 *   1. len < tcp_mss (i.e. small)
17179	 *   2. unacknowledged data present
17180	 *   3. len < nagle limit
17181	 *   4. last packet sent < nagle limit (previous packet sent)
17182	 */
17183	if ((len < mss) && (snxt != suna) &&
17184	    (len < (int)tcp->tcp_naglim) &&
17185	    (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
17186		/*
17187		 * This was the first unsent packet and normally
17188		 * mss < xmit_hiwater so there is no need to worry
17189		 * about flow control. The next packet will go
17190		 * through the flow control check in tcp_wput_data().
17191		 */
17192		/* leftover work from above */
17193		tcp->tcp_unsent = len;
17194		tcp->tcp_xmit_tail_unsent = len;
17195
17196		return;
17197	}
17198
17199	/* len <= tcp->tcp_mss && len == unsent so no silly window */
17200
17201	if (snxt == suna) {
17202		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
17203	}
17204
17205	/* we have always sent something */
17206	tcp->tcp_rack_cnt = 0;
17207
17208	tcp->tcp_snxt = snxt + len;
17209	tcp->tcp_rack = tcp->tcp_rnxt;
17210
17211	if ((mp1 = dupb(mp)) == 0)
17212		goto no_memory;
17213	mp->b_prev = (mblk_t *)(uintptr_t)lbolt;
17214	mp->b_next = (mblk_t *)(uintptr_t)snxt;
17215
17216	/* adjust tcp header information */
17217	tcph = tcp->tcp_tcph;
17218	tcph->th_flags[0] = (TH_ACK|TH_PUSH);
17219
17220	sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum;
17221	sum = (sum >> 16) + (sum & 0xFFFF);
17222	U16_TO_ABE16(sum, tcph->th_sum);
17223
17224	U32_TO_ABE32(snxt, tcph->th_seq);
17225
17226	BUMP_MIB(&tcp_mib, tcpOutDataSegs);
17227	UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len);
17228	BUMP_LOCAL(tcp->tcp_obsegs);
17229
17230	/* Update the latest receive window size in TCP header. */
17231	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
17232	    tcph->th_win);
17233
17234	tcp->tcp_last_sent_len = (ushort_t)len;
17235
17236	plen = len + tcp->tcp_hdr_len;
17237
17238	if (tcp->tcp_ipversion == IPV4_VERSION) {
17239		tcp->tcp_ipha->ipha_length = htons(plen);
17240	} else {
17241		tcp->tcp_ip6h->ip6_plen = htons(plen -
17242		    ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
17243	}
17244
17245	/* see if we need to allocate a mblk for the headers */
17246	hdrlen = tcp->tcp_hdr_len;
17247	rptr = mp1->b_rptr - hdrlen;
17248	db = mp1->b_datap;
17249	if ((db->db_ref != 2) || rptr < db->db_base ||
17250	    (!OK_32PTR(rptr))) {
17251		/* NOTE: we assume allocb returns an OK_32PTR */
17252		mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
17253		    tcp_wroff_xtra, BPRI_MED);
17254		if (!mp) {
17255			freemsg(mp1);
17256			goto no_memory;
17257		}
17258		mp->b_cont = mp1;
17259		mp1 = mp;
17260		/* Leave room for Link Level header */
17261		/* hdrlen = tcp->tcp_hdr_len; */
17262		rptr = &mp1->b_rptr[tcp_wroff_xtra];
17263		mp1->b_wptr = &rptr[hdrlen];
17264	}
17265	mp1->b_rptr = rptr;
17266
17267	/* Fill in the timestamp option. */
17268	if (tcp->tcp_snd_ts_ok) {
17269		U32_TO_BE32((uint32_t)lbolt,
17270		    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
17271		U32_TO_BE32(tcp->tcp_ts_recent,
17272		    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
17273	} else {
17274		ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
17275	}
17276
17277	/* copy header into outgoing packet */
17278	dst = (ipaddr_t *)rptr;
17279	src = (ipaddr_t *)tcp->tcp_iphc;
17280	dst[0] = src[0];
17281	dst[1] = src[1];
17282	dst[2] = src[2];
17283	dst[3] = src[3];
17284	dst[4] = src[4];
17285	dst[5] = src[5];
17286	dst[6] = src[6];
17287	dst[7] = src[7];
17288	dst[8] = src[8];
17289	dst[9] = src[9];
17290	if (hdrlen -= 40) {
17291		hdrlen >>= 2;
17292		dst += 10;
17293		src += 10;
17294		do {
17295			*dst++ = *src++;
17296		} while (--hdrlen);
17297	}
17298
17299	/*
17300	 * Set the ECN info in the TCP header.  Note that this
17301	 * is not the template header.
17302	 */
17303	if (tcp->tcp_ecn_ok) {
17304		SET_ECT(tcp, rptr);
17305
17306		tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);
17307		if (tcp->tcp_ecn_echo_on)
17308			tcph->th_flags[0] |= TH_ECE;
17309		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
17310			tcph->th_flags[0] |= TH_CWR;
17311			tcp->tcp_ecn_cwr_sent = B_TRUE;
17312		}
17313	}
17314
17315	if (tcp->tcp_ip_forward_progress) {
17316		ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
17317		*(uint32_t *)mp1->b_rptr  |= IP_FORWARD_PROG;
17318		tcp->tcp_ip_forward_progress = B_FALSE;
17319	}
17320	TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
17321	tcp_send_data(tcp, tcp->tcp_wq, mp1);
17322	return;
17323
17324	/*
17325	 * If we ran out of memory, we pretend to have sent the packet
17326	 * and that it was lost on the wire.
17327	 */
17328no_memory:
17329	return;
17330
17331slow:
17332	/* leftover work from above */
17333	tcp->tcp_unsent = len;
17334	tcp->tcp_xmit_tail_unsent = len;
17335	tcp_wput_data(tcp, NULL, B_FALSE);
17336}
17337
17338/*
17339 * The function called through squeue to get behind eager's perimeter to
17340 * finish the accept processing.
17341 */
17342/* ARGSUSED */
17343void
17344tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
17345{
17346	conn_t			*connp = (conn_t *)arg;
17347	tcp_t			*tcp = connp->conn_tcp;
17348	queue_t			*q = tcp->tcp_rq;
17349	mblk_t			*mp1;
17350	mblk_t			*stropt_mp = mp;
17351	struct  stroptions	*stropt;
17352	uint_t			thwin;
17353
17354	/*
17355	 * Drop the eager's ref on the listener, that was placed when
17356	 * this eager began life in tcp_conn_request.
17357	 */
17358	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
17359
17360	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
17361		/*
17362		 * Someone blewoff the eager before we could finish
17363		 * the accept.
17364		 *
17365		 * The only reason eager exists it because we put in
17366		 * a ref on it when conn ind went up. We need to send
17367		 * a disconnect indication up while the last reference
17368		 * on the eager will be dropped by the squeue when we
17369		 * return.
17370		 */
17371		ASSERT(tcp->tcp_listener == NULL);
17372		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
17373			struct	T_discon_ind	*tdi;
17374
17375			(void) putnextctl1(q, M_FLUSH, FLUSHRW);
17376			/*
17377			 * Let us reuse the incoming mblk to avoid memory
17378			 * allocation failure problems. We know that the
17379			 * size of the incoming mblk i.e. stroptions is greater
17380			 * than sizeof T_discon_ind. So the reallocb below
17381			 * can't fail.
17382			 */
17383			freemsg(mp->b_cont);
17384			mp->b_cont = NULL;
17385			ASSERT(DB_REF(mp) == 1);
17386			mp = reallocb(mp, sizeof (struct T_discon_ind),
17387			    B_FALSE);
17388			ASSERT(mp != NULL);
17389			DB_TYPE(mp) = M_PROTO;
17390			((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND;
17391			tdi = (struct T_discon_ind *)mp->b_rptr;
17392			if (tcp->tcp_issocket) {
17393				tdi->DISCON_reason = ECONNREFUSED;
17394				tdi->SEQ_number = 0;
17395			} else {
17396				tdi->DISCON_reason = ENOPROTOOPT;
17397				tdi->SEQ_number =
17398				    tcp->tcp_conn_req_seqnum;
17399			}
17400			mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind);
17401			putnext(q, mp);
17402		} else {
17403			freemsg(mp);
17404		}
17405		if (tcp->tcp_hard_binding) {
17406			tcp->tcp_hard_binding = B_FALSE;
17407			tcp->tcp_hard_bound = B_TRUE;
17408		}
17409		tcp->tcp_detached = B_FALSE;
17410		return;
17411	}
17412
17413	mp1 = stropt_mp->b_cont;
17414	stropt_mp->b_cont = NULL;
17415	ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS);
17416	stropt = (struct stroptions *)stropt_mp->b_rptr;
17417
17418	while (mp1 != NULL) {
17419		mp = mp1;
17420		mp1 = mp1->b_cont;
17421		mp->b_cont = NULL;
17422		tcp->tcp_drop_opt_ack_cnt++;
17423		CALL_IP_WPUT(connp, tcp->tcp_wq, mp);
17424	}
17425	mp = NULL;
17426
17427	/*
17428	 * For a loopback connection with tcp_direct_sockfs on, note that
17429	 * we don't have to protect tcp_rcv_list yet because synchronous
17430	 * streams has not yet been enabled and tcp_fuse_rrw() cannot
17431	 * possibly race with us.
17432	 */
17433
17434	/*
17435	 * Set the max window size (tcp_rq->q_hiwat) of the acceptor
17436	 * properly.  This is the first time we know of the acceptor'
17437	 * queue.  So we do it here.
17438	 */
17439	if (tcp->tcp_rcv_list == NULL) {
17440		/*
17441		 * Recv queue is empty, tcp_rwnd should not have changed.
17442		 * That means it should be equal to the listener's tcp_rwnd.
17443		 */
17444		tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd;
17445	} else {
17446#ifdef DEBUG
17447		uint_t cnt = 0;
17448
17449		mp1 = tcp->tcp_rcv_list;
17450		while ((mp = mp1) != NULL) {
17451			mp1 = mp->b_next;
17452			cnt += msgdsize(mp);
17453		}
17454		ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt);
17455#endif
17456		/* There is some data, add them back to get the max. */
17457		tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
17458	}
17459
17460	stropt->so_flags = SO_HIWAT;
17461	stropt->so_hiwat = MAX(q->q_hiwat, tcp_sth_rcv_hiwat);
17462
17463	stropt->so_flags |= SO_MAXBLK;
17464	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
17465
17466	/*
17467	 * This is the first time we run on the correct
17468	 * queue after tcp_accept. So fix all the q parameters
17469	 * here.
17470	 */
17471	/* Allocate room for SACK options if needed. */
17472	stropt->so_flags |= SO_WROFF;
17473	if (tcp->tcp_fused) {
17474		ASSERT(tcp->tcp_loopback);
17475		ASSERT(tcp->tcp_loopback_peer != NULL);
17476		/*
17477		 * For fused tcp loopback, set the stream head's write
17478		 * offset value to zero since we won't be needing any room
17479		 * for TCP/IP headers.  This would also improve performance
17480		 * since it would reduce the amount of work done by kmem.
17481		 * Non-fused tcp loopback case is handled separately below.
17482		 */
17483		stropt->so_wroff = 0;
17484		/*
17485		 * Record the stream head's high water mark for this endpoint;
17486		 * this is used for flow-control purposes in tcp_fuse_output().
17487		 */
17488		stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat);
17489		/*
17490		 * Update the peer's transmit parameters according to
17491		 * our recently calculated high water mark value.
17492		 */
17493		(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
17494	} else if (tcp->tcp_snd_sack_ok) {
17495		stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
17496		    (tcp->tcp_loopback ? 0 : tcp_wroff_xtra);
17497	} else {
17498		stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 :
17499		    tcp_wroff_xtra);
17500	}
17501
17502	/*
17503	 * If this is endpoint is handling SSL, then reserve extra
17504	 * offset and space at the end.
17505	 * Also have the stream head allocate SSL3_MAX_RECORD_LEN packets,
17506	 * overriding the previous setting. The extra cost of signing and
17507	 * encrypting multiple MSS-size records (12 of them with Ethernet),
17508	 * instead of a single contiguous one by the stream head
17509	 * largely outweighs the statistical reduction of ACKs, when
17510	 * applicable. The peer will also save on decyption and verification
17511	 * costs.
17512	 */
17513	if (tcp->tcp_kssl_ctx != NULL) {
17514		stropt->so_wroff += SSL3_WROFFSET;
17515
17516		stropt->so_flags |= SO_TAIL;
17517		stropt->so_tail = SSL3_MAX_TAIL_LEN;
17518
17519		stropt->so_maxblk = SSL3_MAX_RECORD_LEN;
17520	}
17521
17522	/* Send the options up */
17523	putnext(q, stropt_mp);
17524
17525	/*
17526	 * Pass up any data and/or a fin that has been received.
17527	 *
17528	 * Adjust receive window in case it had decreased
17529	 * (because there is data <=> tcp_rcv_list != NULL)
17530	 * while the connection was detached. Note that
17531	 * in case the eager was flow-controlled, w/o this
17532	 * code, the rwnd may never open up again!
17533	 */
17534	if (tcp->tcp_rcv_list != NULL) {
17535		/* We drain directly in case of fused tcp loopback */
17536		if (!tcp->tcp_fused && canputnext(q)) {
17537			tcp->tcp_rwnd = q->q_hiwat;
17538			thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
17539			    << tcp->tcp_rcv_ws;
17540			thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
17541			if (tcp->tcp_state >= TCPS_ESTABLISHED &&
17542			    (q->q_hiwat - thwin >= tcp->tcp_mss)) {
17543				tcp_xmit_ctl(NULL,
17544				    tcp, (tcp->tcp_swnd == 0) ?
17545				    tcp->tcp_suna : tcp->tcp_snxt,
17546				    tcp->tcp_rnxt, TH_ACK);
17547				BUMP_MIB(&tcp_mib, tcpOutWinUpdate);
17548			}
17549
17550		}
17551		(void) tcp_rcv_drain(q, tcp);
17552
17553		/*
17554		 * For fused tcp loopback, back-enable peer endpoint
17555		 * if it's currently flow-controlled.
17556		 */
17557		if (tcp->tcp_fused &&
17558		    tcp->tcp_loopback_peer->tcp_flow_stopped) {
17559			tcp_t *peer_tcp = tcp->tcp_loopback_peer;
17560
17561			ASSERT(peer_tcp != NULL);
17562			ASSERT(peer_tcp->tcp_fused);
17563
17564			tcp_clrqfull(peer_tcp);
17565			TCP_STAT(tcp_fusion_backenabled);
17566		}
17567	}
17568	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
17569	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
17570		mp = mi_tpi_ordrel_ind();
17571		if (mp) {
17572			tcp->tcp_ordrel_done = B_TRUE;
17573			putnext(q, mp);
17574			if (tcp->tcp_deferred_clean_death) {
17575				/*
17576				 * tcp_clean_death was deferred
17577				 * for T_ORDREL_IND - do it now
17578				 */
17579				(void) tcp_clean_death(tcp,
17580				    tcp->tcp_client_errno, 21);
17581				tcp->tcp_deferred_clean_death = B_FALSE;
17582			}
17583		} else {
17584			/*
17585			 * Run the orderly release in the
17586			 * service routine.
17587			 */
17588			qenable(q);
17589		}
17590	}
17591	if (tcp->tcp_hard_binding) {
17592		tcp->tcp_hard_binding = B_FALSE;
17593		tcp->tcp_hard_bound = B_TRUE;
17594	}
17595
17596	tcp->tcp_detached = B_FALSE;
17597
17598	/* We can enable synchronous streams now */
17599	if (tcp->tcp_fused) {
17600		tcp_fuse_syncstr_enable_pair(tcp);
17601	}
17602
17603	if (tcp->tcp_ka_enabled) {
17604		tcp->tcp_ka_last_intrvl = 0;
17605		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
17606		    MSEC_TO_TICK(tcp->tcp_ka_interval));
17607	}
17608
17609	/*
17610	 * At this point, eager is fully established and will
17611	 * have the following references -
17612	 *
17613	 * 2 references for connection to exist (1 for TCP and 1 for IP).
17614	 * 1 reference for the squeue which will be dropped by the squeue as
17615	 *	soon as this function returns.
17616	 * There will be 1 additonal reference for being in classifier
17617	 *	hash list provided something bad hasn't happened.
17618	 */
17619	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
17620	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
17621}
17622
17623/*
17624 * The function called through squeue to get behind listener's perimeter to
17625 * send a deffered conn_ind.
17626 */
17627/* ARGSUSED */
17628void
17629tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
17630{
17631	conn_t	*connp = (conn_t *)arg;
17632	tcp_t *listener = connp->conn_tcp;
17633
17634	if (listener->tcp_state == TCPS_CLOSED ||
17635	    TCP_IS_DETACHED(listener)) {
17636		/*
17637		 * If listener has closed, it would have caused a
17638		 * a cleanup/blowoff to happen for the eager.
17639		 */
17640		tcp_t *tcp;
17641		struct T_conn_ind	*conn_ind;
17642
17643		conn_ind = (struct T_conn_ind *)mp->b_rptr;
17644		bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
17645		    conn_ind->OPT_length);
17646		/*
17647		 * We need to drop the ref on eager that was put
17648		 * tcp_rput_data() before trying to send the conn_ind
17649		 * to listener. The conn_ind was deferred in tcp_send_conn_ind
17650		 * and tcp_wput_accept() is sending this deferred conn_ind but
17651		 * listener is closed so we drop the ref.
17652		 */
17653		CONN_DEC_REF(tcp->tcp_connp);
17654		freemsg(mp);
17655		return;
17656	}
17657	putnext(listener->tcp_rq, mp);
17658}
17659
17660
17661/*
17662 * This is the STREAMS entry point for T_CONN_RES coming down on
17663 * Acceptor STREAM when  sockfs listener does accept processing.
17664 * Read the block comment on top pf tcp_conn_request().
17665 */
17666void
17667tcp_wput_accept(queue_t *q, mblk_t *mp)
17668{
17669	queue_t *rq = RD(q);
17670	struct T_conn_res *conn_res;
17671	tcp_t *eager;
17672	tcp_t *listener;
17673	struct T_ok_ack *ok;
17674	t_scalar_t PRIM_type;
17675	mblk_t *opt_mp;
17676	conn_t *econnp;
17677
17678	ASSERT(DB_TYPE(mp) == M_PROTO);
17679
17680	conn_res = (struct T_conn_res *)mp->b_rptr;
17681	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
17682	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
17683		mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
17684		if (mp != NULL)
17685			putnext(rq, mp);
17686		return;
17687	}
17688	switch (conn_res->PRIM_type) {
17689	case O_T_CONN_RES:
17690	case T_CONN_RES:
17691		/*
17692		 * We pass up an err ack if allocb fails. This will
17693		 * cause sockfs to issue a T_DISCON_REQ which will cause
17694		 * tcp_eager_blowoff to be called. sockfs will then call
17695		 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
17696		 * we need to do the allocb up here because we have to
17697		 * make sure rq->q_qinfo->qi_qclose still points to the
17698		 * correct function (tcpclose_accept) in case allocb
17699		 * fails.
17700		 */
17701		opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
17702		if (opt_mp == NULL) {
17703			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
17704			if (mp != NULL)
17705				putnext(rq, mp);
17706			return;
17707		}
17708
17709		bcopy(mp->b_rptr + conn_res->OPT_offset,
17710		    &eager, conn_res->OPT_length);
17711		PRIM_type = conn_res->PRIM_type;
17712		mp->b_datap->db_type = M_PCPROTO;
17713		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
17714		ok = (struct T_ok_ack *)mp->b_rptr;
17715		ok->PRIM_type = T_OK_ACK;
17716		ok->CORRECT_prim = PRIM_type;
17717		econnp = eager->tcp_connp;
17718		econnp->conn_dev = (dev_t)q->q_ptr;
17719		eager->tcp_rq = rq;
17720		eager->tcp_wq = q;
17721		rq->q_ptr = econnp;
17722		rq->q_qinfo = &tcp_rinit;
17723		q->q_ptr = econnp;
17724		q->q_qinfo = &tcp_winit;
17725		listener = eager->tcp_listener;
17726		eager->tcp_issocket = B_TRUE;
17727		econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
17728
17729		/* Put the ref for IP */
17730		CONN_INC_REF(econnp);
17731
17732		/*
17733		 * We should have minimum of 3 references on the conn
17734		 * at this point. One each for TCP and IP and one for
17735		 * the T_conn_ind that was sent up when the 3-way handshake
17736		 * completed. In the normal case we would also have another
17737		 * reference (making a total of 4) for the conn being in the
17738		 * classifier hash list. However the eager could have received
17739		 * an RST subsequently and tcp_closei_local could have removed
17740		 * the eager from the classifier hash list, hence we can't
17741		 * assert that reference.
17742		 */
17743		ASSERT(econnp->conn_ref >= 3);
17744
17745		/*
17746		 * Send the new local address also up to sockfs. There
17747		 * should already be enough space in the mp that came
17748		 * down from soaccept().
17749		 */
17750		if (eager->tcp_family == AF_INET) {
17751			sin_t *sin;
17752
17753			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
17754			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
17755			sin = (sin_t *)mp->b_wptr;
17756			mp->b_wptr += sizeof (sin_t);
17757			sin->sin_family = AF_INET;
17758			sin->sin_port = eager->tcp_lport;
17759			sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src;
17760		} else {
17761			sin6_t *sin6;
17762
17763			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
17764			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
17765			sin6 = (sin6_t *)mp->b_wptr;
17766			mp->b_wptr += sizeof (sin6_t);
17767			sin6->sin6_family = AF_INET6;
17768			sin6->sin6_port = eager->tcp_lport;
17769			if (eager->tcp_ipversion == IPV4_VERSION) {
17770				sin6->sin6_flowinfo = 0;
17771				IN6_IPADDR_TO_V4MAPPED(
17772					eager->tcp_ipha->ipha_src,
17773					    &sin6->sin6_addr);
17774			} else {
17775				ASSERT(eager->tcp_ip6h != NULL);
17776				sin6->sin6_flowinfo =
17777				    eager->tcp_ip6h->ip6_vcf &
17778				    ~IPV6_VERS_AND_FLOW_MASK;
17779				sin6->sin6_addr = eager->tcp_ip6h->ip6_src;
17780			}
17781			sin6->sin6_scope_id = 0;
17782			sin6->__sin6_src_id = 0;
17783		}
17784
17785		putnext(rq, mp);
17786
17787		opt_mp->b_datap->db_type = M_SETOPTS;
17788		opt_mp->b_wptr += sizeof (struct stroptions);
17789
17790		/*
17791		 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
17792		 * from listener to acceptor. The message is chained on the
17793		 * bind_mp which tcp_rput_other will send down to IP.
17794		 */
17795		if (listener->tcp_bound_if != 0) {
17796			/* allocate optmgmt req */
17797			mp = tcp_setsockopt_mp(IPPROTO_IPV6,
17798			    IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
17799			    sizeof (int));
17800			if (mp != NULL)
17801				linkb(opt_mp, mp);
17802		}
17803		if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
17804			uint_t on = 1;
17805
17806			/* allocate optmgmt req */
17807			mp = tcp_setsockopt_mp(IPPROTO_IPV6,
17808			    IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
17809			if (mp != NULL)
17810				linkb(opt_mp, mp);
17811		}
17812
17813
17814		mutex_enter(&listener->tcp_eager_lock);
17815
17816		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
17817
17818			tcp_t *tail;
17819			tcp_t *tcp;
17820			mblk_t *mp1;
17821
17822			tcp = listener->tcp_eager_prev_q0;
17823			/*
17824			 * listener->tcp_eager_prev_q0 points to the TAIL of the
17825			 * deferred T_conn_ind queue. We need to get to the head
17826			 * of the queue in order to send up T_conn_ind the same
17827			 * order as how the 3WHS is completed.
17828			 */
17829			while (tcp != listener) {
17830				if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
17831				    !tcp->tcp_kssl_pending)
17832					break;
17833				else
17834					tcp = tcp->tcp_eager_prev_q0;
17835			}
17836			/* None of the pending eagers can be sent up now */
17837			if (tcp == listener)
17838				goto no_more_eagers;
17839
17840			mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
17841			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
17842			/* Move from q0 to q */
17843			ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
17844			listener->tcp_conn_req_cnt_q0--;
17845			listener->tcp_conn_req_cnt_q++;
17846			tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
17847			    tcp->tcp_eager_prev_q0;
17848			tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
17849			    tcp->tcp_eager_next_q0;
17850			tcp->tcp_eager_prev_q0 = NULL;
17851			tcp->tcp_eager_next_q0 = NULL;
17852			tcp->tcp_conn_def_q0 = B_FALSE;
17853
17854			/*
17855			 * Insert at end of the queue because sockfs sends
17856			 * down T_CONN_RES in chronological order. Leaving
17857			 * the older conn indications at front of the queue
17858			 * helps reducing search time.
17859			 */
17860			tail = listener->tcp_eager_last_q;
17861			if (tail != NULL) {
17862				tail->tcp_eager_next_q = tcp;
17863			} else {
17864				listener->tcp_eager_next_q = tcp;
17865			}
17866			listener->tcp_eager_last_q = tcp;
17867			tcp->tcp_eager_next_q = NULL;
17868
17869			/* Need to get inside the listener perimeter */
17870			CONN_INC_REF(listener->tcp_connp);
17871			squeue_fill(listener->tcp_connp->conn_sqp, mp1,
17872			    tcp_send_pending, listener->tcp_connp,
17873			    SQTAG_TCP_SEND_PENDING);
17874		}
17875no_more_eagers:
17876		tcp_eager_unlink(eager);
17877		mutex_exit(&listener->tcp_eager_lock);
17878
17879		/*
17880		 * At this point, the eager is detached from the listener
17881		 * but we still have an extra refs on eager (apart from the
17882		 * usual tcp references). The ref was placed in tcp_rput_data
17883		 * before sending the conn_ind in tcp_send_conn_ind.
17884		 * The ref will be dropped in tcp_accept_finish().
17885		 */
17886		squeue_enter_nodrain(econnp->conn_sqp, opt_mp,
17887		    tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0);
17888		return;
17889	default:
17890		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
17891		if (mp != NULL)
17892			putnext(rq, mp);
17893		return;
17894	}
17895}
17896
17897void
17898tcp_wput(queue_t *q, mblk_t *mp)
17899{
17900	conn_t	*connp = Q_TO_CONN(q);
17901	tcp_t	*tcp;
17902	void (*output_proc)();
17903	t_scalar_t type;
17904	uchar_t *rptr;
17905	struct iocblk	*iocp;
17906	uint32_t	msize;
17907
17908	ASSERT(connp->conn_ref >= 2);
17909
17910	switch (DB_TYPE(mp)) {
17911	case M_DATA:
17912		tcp = connp->conn_tcp;
17913		ASSERT(tcp != NULL);
17914
17915		msize = msgdsize(mp);
17916
17917		mutex_enter(&connp->conn_lock);
17918		CONN_INC_REF_LOCKED(connp);
17919
17920		tcp->tcp_squeue_bytes += msize;
17921		if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
17922			mutex_exit(&connp->conn_lock);
17923			tcp_setqfull(tcp);
17924		} else
17925			mutex_exit(&connp->conn_lock);
17926
17927		(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
17928		    tcp_output, connp, SQTAG_TCP_OUTPUT);
17929		return;
17930	case M_PROTO:
17931	case M_PCPROTO:
17932		/*
17933		 * if it is a snmp message, don't get behind the squeue
17934		 */
17935		tcp = connp->conn_tcp;
17936		rptr = mp->b_rptr;
17937		if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
17938			type = ((union T_primitives *)rptr)->type;
17939		} else {
17940			if (tcp->tcp_debug) {
17941				(void) strlog(TCP_MOD_ID, 0, 1,
17942				    SL_ERROR|SL_TRACE,
17943				    "tcp_wput_proto, dropping one...");
17944			}
17945			freemsg(mp);
17946			return;
17947		}
17948		if (type == T_SVR4_OPTMGMT_REQ) {
17949			cred_t	*cr = DB_CREDDEF(mp, tcp->tcp_cred);
17950			if (snmpcom_req(q, mp, tcp_snmp_set, tcp_snmp_get,
17951			    cr)) {
17952				/*
17953				 * This was a SNMP request
17954				 */
17955				return;
17956			} else {
17957				output_proc = tcp_wput_proto;
17958			}
17959		} else {
17960			output_proc = tcp_wput_proto;
17961		}
17962		break;
17963	case M_IOCTL:
17964		/*
17965		 * Most ioctls can be processed right away without going via
17966		 * squeues - process them right here. Those that do require
17967		 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK)
17968		 * are processed by tcp_wput_ioctl().
17969		 */
17970		iocp = (struct iocblk *)mp->b_rptr;
17971		tcp = connp->conn_tcp;
17972
17973		switch (iocp->ioc_cmd) {
17974		case TCP_IOC_ABORT_CONN:
17975			tcp_ioctl_abort_conn(q, mp);
17976			return;
17977		case TI_GETPEERNAME:
17978			if (tcp->tcp_state < TCPS_SYN_RCVD) {
17979				iocp->ioc_error = ENOTCONN;
17980				iocp->ioc_count = 0;
17981				mp->b_datap->db_type = M_IOCACK;
17982				qreply(q, mp);
17983				return;
17984			}
17985			/* FALLTHRU */
17986		case TI_GETMYNAME:
17987			mi_copyin(q, mp, NULL,
17988			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
17989			return;
17990		case ND_SET:
17991			/* nd_getset does the necessary checks */
17992		case ND_GET:
17993			if (!nd_getset(q, tcp_g_nd, mp)) {
17994				CALL_IP_WPUT(connp, q, mp);
17995				return;
17996			}
17997			qreply(q, mp);
17998			return;
17999		case TCP_IOC_DEFAULT_Q:
18000			/*
18001			 * Wants to be the default wq. Check the credentials
18002			 * first, the rest is executed via squeue.
18003			 */
18004			if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) {
18005				iocp->ioc_error = EPERM;
18006				iocp->ioc_count = 0;
18007				mp->b_datap->db_type = M_IOCACK;
18008				qreply(q, mp);
18009				return;
18010			}
18011			output_proc = tcp_wput_ioctl;
18012			break;
18013		default:
18014			output_proc = tcp_wput_ioctl;
18015			break;
18016		}
18017		break;
18018	default:
18019		output_proc = tcp_wput_nondata;
18020		break;
18021	}
18022
18023	CONN_INC_REF(connp);
18024	(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
18025	    output_proc, connp, SQTAG_TCP_WPUT_OTHER);
18026}
18027
18028/*
18029 * Initial STREAMS write side put() procedure for sockets. It tries to
18030 * handle the T_CAPABILITY_REQ which sockfs sends down while setting
18031 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
18032 * are handled by tcp_wput() as usual.
18033 *
18034 * All further messages will also be handled by tcp_wput() because we cannot
18035 * be sure that the above short cut is safe later.
18036 */
18037static void
18038tcp_wput_sock(queue_t *wq, mblk_t *mp)
18039{
18040	conn_t			*connp = Q_TO_CONN(wq);
18041	tcp_t			*tcp = connp->conn_tcp;
18042	struct T_capability_req	*car = (struct T_capability_req *)mp->b_rptr;
18043
18044	ASSERT(wq->q_qinfo == &tcp_sock_winit);
18045	wq->q_qinfo = &tcp_winit;
18046
18047	ASSERT(IPCL_IS_TCP(connp));
18048	ASSERT(TCP_IS_SOCKET(tcp));
18049
18050	if (DB_TYPE(mp) == M_PCPROTO &&
18051	    MBLKL(mp) == sizeof (struct T_capability_req) &&
18052	    car->PRIM_type == T_CAPABILITY_REQ) {
18053		tcp_capability_req(tcp, mp);
18054		return;
18055	}
18056
18057	tcp_wput(wq, mp);
18058}
18059
18060static boolean_t
18061tcp_zcopy_check(tcp_t *tcp)
18062{
18063	conn_t	*connp = tcp->tcp_connp;
18064	ire_t	*ire;
18065	boolean_t	zc_enabled = B_FALSE;
18066
18067	if (do_tcpzcopy == 2)
18068		zc_enabled = B_TRUE;
18069	else if (tcp->tcp_ipversion == IPV4_VERSION &&
18070	    IPCL_IS_CONNECTED(connp) &&
18071	    (connp->conn_flags & IPCL_CHECK_POLICY) == 0 &&
18072	    connp->conn_dontroute == 0 &&
18073	    !connp->conn_nexthop_set &&
18074	    connp->conn_xmit_if_ill == NULL &&
18075	    connp->conn_nofailover_ill == NULL &&
18076	    do_tcpzcopy == 1) {
18077		/*
18078		 * the checks above  closely resemble the fast path checks
18079		 * in tcp_send_data().
18080		 */
18081		mutex_enter(&connp->conn_lock);
18082		ire = connp->conn_ire_cache;
18083		ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
18084		if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
18085			IRE_REFHOLD(ire);
18086			if (ire->ire_stq != NULL) {
18087				ill_t	*ill = (ill_t *)ire->ire_stq->q_ptr;
18088
18089				zc_enabled = ill && (ill->ill_capabilities &
18090				    ILL_CAPAB_ZEROCOPY) &&
18091				    (ill->ill_zerocopy_capab->
18092				    ill_zerocopy_flags != 0);
18093			}
18094			IRE_REFRELE(ire);
18095		}
18096		mutex_exit(&connp->conn_lock);
18097	}
18098	tcp->tcp_snd_zcopy_on = zc_enabled;
18099	if (!TCP_IS_DETACHED(tcp)) {
18100		if (zc_enabled) {
18101			(void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE);
18102			TCP_STAT(tcp_zcopy_on);
18103		} else {
18104			(void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE);
18105			TCP_STAT(tcp_zcopy_off);
18106		}
18107	}
18108	return (zc_enabled);
18109}
18110
18111static mblk_t *
18112tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp)
18113{
18114	if (do_tcpzcopy == 2)
18115		return (bp);
18116	else if (tcp->tcp_snd_zcopy_on) {
18117		tcp->tcp_snd_zcopy_on = B_FALSE;
18118		if (!TCP_IS_DETACHED(tcp)) {
18119			(void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE);
18120			TCP_STAT(tcp_zcopy_disable);
18121		}
18122	}
18123	return (tcp_zcopy_backoff(tcp, bp, 0));
18124}
18125
18126/*
18127 * Backoff from a zero-copy mblk by copying data to a new mblk and freeing
18128 * the original desballoca'ed segmapped mblk.
18129 */
18130static mblk_t *
18131tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist)
18132{
18133	mblk_t *head, *tail, *nbp;
18134	if (IS_VMLOANED_MBLK(bp)) {
18135		TCP_STAT(tcp_zcopy_backoff);
18136		if ((head = copyb(bp)) == NULL) {
18137			/* fail to backoff; leave it for the next backoff */
18138			tcp->tcp_xmit_zc_clean = B_FALSE;
18139			return (bp);
18140		}
18141		if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) {
18142			if (fix_xmitlist)
18143				tcp_zcopy_notify(tcp);
18144			else
18145				head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
18146		}
18147		nbp = bp->b_cont;
18148		if (fix_xmitlist) {
18149			head->b_prev = bp->b_prev;
18150			head->b_next = bp->b_next;
18151			if (tcp->tcp_xmit_tail == bp)
18152				tcp->tcp_xmit_tail = head;
18153		}
18154		bp->b_next = NULL;
18155		bp->b_prev = NULL;
18156		freeb(bp);
18157	} else {
18158		head = bp;
18159		nbp = bp->b_cont;
18160	}
18161	tail = head;
18162	while (nbp) {
18163		if (IS_VMLOANED_MBLK(nbp)) {
18164			TCP_STAT(tcp_zcopy_backoff);
18165			if ((tail->b_cont = copyb(nbp)) == NULL) {
18166				tcp->tcp_xmit_zc_clean = B_FALSE;
18167				tail->b_cont = nbp;
18168				return (head);
18169			}
18170			tail = tail->b_cont;
18171			if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) {
18172				if (fix_xmitlist)
18173					tcp_zcopy_notify(tcp);
18174				else
18175					tail->b_datap->db_struioflag |=
18176					    STRUIO_ZCNOTIFY;
18177			}
18178			bp = nbp;
18179			nbp = nbp->b_cont;
18180			if (fix_xmitlist) {
18181				tail->b_prev = bp->b_prev;
18182				tail->b_next = bp->b_next;
18183				if (tcp->tcp_xmit_tail == bp)
18184					tcp->tcp_xmit_tail = tail;
18185			}
18186			bp->b_next = NULL;
18187			bp->b_prev = NULL;
18188			freeb(bp);
18189		} else {
18190			tail->b_cont = nbp;
18191			tail = nbp;
18192			nbp = nbp->b_cont;
18193		}
18194	}
18195	if (fix_xmitlist) {
18196		tcp->tcp_xmit_last = tail;
18197		tcp->tcp_xmit_zc_clean = B_TRUE;
18198	}
18199	return (head);
18200}
18201
18202static void
18203tcp_zcopy_notify(tcp_t *tcp)
18204{
18205	struct stdata	*stp;
18206
18207	if (tcp->tcp_detached)
18208		return;
18209	stp = STREAM(tcp->tcp_rq);
18210	mutex_enter(&stp->sd_lock);
18211	stp->sd_flag |= STZCNOTIFY;
18212	cv_broadcast(&stp->sd_zcopy_wait);
18213	mutex_exit(&stp->sd_lock);
18214}
18215
18216static void
18217tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
18218{
18219	ipha_t		*ipha;
18220	ipaddr_t	src;
18221	ipaddr_t	dst;
18222	uint32_t	cksum;
18223	ire_t		*ire;
18224	uint16_t	*up;
18225	ill_t		*ill;
18226	conn_t		*connp = tcp->tcp_connp;
18227	uint32_t	hcksum_txflags = 0;
18228	mblk_t		*ire_fp_mp;
18229	uint_t		ire_fp_mp_len;
18230
18231	ASSERT(DB_TYPE(mp) == M_DATA);
18232
18233	if (DB_CRED(mp) == NULL)
18234		mblk_setcred(mp, CONN_CRED(connp));
18235
18236	ipha = (ipha_t *)mp->b_rptr;
18237	src = ipha->ipha_src;
18238	dst = ipha->ipha_dst;
18239
18240	/*
18241	 * Drop off fast path for IPv6 and also if options are present or
18242	 * we need to resolve a TS label.
18243	 */
18244	if (tcp->tcp_ipversion != IPV4_VERSION ||
18245	    !IPCL_IS_CONNECTED(connp) ||
18246	    (connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
18247	    connp->conn_dontroute ||
18248	    connp->conn_nexthop_set ||
18249	    connp->conn_xmit_if_ill != NULL ||
18250	    connp->conn_nofailover_ill != NULL ||
18251	    !connp->conn_ulp_labeled ||
18252	    ipha->ipha_ident == IP_HDR_INCLUDED ||
18253	    ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
18254	    IPP_ENABLED(IPP_LOCAL_OUT)) {
18255		if (tcp->tcp_snd_zcopy_aware)
18256			mp = tcp_zcopy_disable(tcp, mp);
18257		TCP_STAT(tcp_ip_send);
18258		CALL_IP_WPUT(connp, q, mp);
18259		return;
18260	}
18261
18262	mutex_enter(&connp->conn_lock);
18263	ire = connp->conn_ire_cache;
18264	ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
18265	if (ire != NULL && ire->ire_addr == dst &&
18266	    !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
18267		IRE_REFHOLD(ire);
18268		mutex_exit(&connp->conn_lock);
18269	} else {
18270		boolean_t cached = B_FALSE;
18271
18272		/* force a recheck later on */
18273		tcp->tcp_ire_ill_check_done = B_FALSE;
18274
18275		TCP_DBGSTAT(tcp_ire_null1);
18276		connp->conn_ire_cache = NULL;
18277		mutex_exit(&connp->conn_lock);
18278		if (ire != NULL)
18279			IRE_REFRELE_NOTR(ire);
18280		ire = ire_cache_lookup(dst, connp->conn_zoneid,
18281		    MBLK_GETLABEL(mp));
18282		if (ire == NULL) {
18283			if (tcp->tcp_snd_zcopy_aware)
18284				mp = tcp_zcopy_backoff(tcp, mp, 0);
18285			TCP_STAT(tcp_ire_null);
18286			CALL_IP_WPUT(connp, q, mp);
18287			return;
18288		}
18289		IRE_REFHOLD_NOTR(ire);
18290		/*
18291		 * Since we are inside the squeue, there cannot be another
18292		 * thread in TCP trying to set the conn_ire_cache now.  The
18293		 * check for IRE_MARK_CONDEMNED ensures that an interface
18294		 * unplumb thread has not yet started cleaning up the conns.
18295		 * Hence we don't need to grab the conn lock.
18296		 */
18297		if (!(connp->conn_state_flags & CONN_CLOSING)) {
18298			rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
18299			if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
18300				connp->conn_ire_cache = ire;
18301				cached = B_TRUE;
18302			}
18303			rw_exit(&ire->ire_bucket->irb_lock);
18304		}
18305
18306		/*
18307		 * We can continue to use the ire but since it was
18308		 * not cached, we should drop the extra reference.
18309		 */
18310		if (!cached)
18311			IRE_REFRELE_NOTR(ire);
18312
18313		/*
18314		 * Rampart note: no need to select a new label here, since
18315		 * labels are not allowed to change during the life of a TCP
18316		 * connection.
18317		 */
18318	}
18319
18320	/*
18321	 * The following if case identifies whether or not
18322	 * we are forced to take the slowpath.
18323	 */
18324	if (ire->ire_flags & RTF_MULTIRT ||
18325	    ire->ire_stq == NULL ||
18326	    ire->ire_max_frag < ntohs(ipha->ipha_length) ||
18327	    (ire->ire_nce != NULL &&
18328	    (ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) ||
18329	    (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) {
18330		if (tcp->tcp_snd_zcopy_aware)
18331			mp = tcp_zcopy_disable(tcp, mp);
18332		TCP_STAT(tcp_ip_ire_send);
18333		IRE_REFRELE(ire);
18334		CALL_IP_WPUT(connp, q, mp);
18335		return;
18336	}
18337
18338	ill = ire_to_ill(ire);
18339	if (connp->conn_outgoing_ill != NULL) {
18340		ill_t *conn_outgoing_ill = NULL;
18341		/*
18342		 * Choose a good ill in the group to send the packets on.
18343		 */
18344		ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill);
18345		ill = ire_to_ill(ire);
18346	}
18347	ASSERT(ill != NULL);
18348
18349	if (!tcp->tcp_ire_ill_check_done) {
18350		tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
18351		tcp->tcp_ire_ill_check_done = B_TRUE;
18352	}
18353
18354	ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
18355	ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
18356#ifndef _BIG_ENDIAN
18357	ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
18358#endif
18359
18360	/*
18361	 * Check to see if we need to re-enable MDT for this connection
18362	 * because it was previously disabled due to changes in the ill;
18363	 * note that by doing it here, this re-enabling only applies when
18364	 * the packet is not dispatched through CALL_IP_WPUT().
18365	 *
18366	 * That means for IPv4, it is worth re-enabling MDT for the fastpath
18367	 * case, since that's how we ended up here.  For IPv6, we do the
18368	 * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue.
18369	 */
18370	if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) {
18371		/*
18372		 * Restore MDT for this connection, so that next time around
18373		 * it is eligible to go through tcp_multisend() path again.
18374		 */
18375		TCP_STAT(tcp_mdt_conn_resumed1);
18376		tcp->tcp_mdt = B_TRUE;
18377		ip1dbg(("tcp_send_data: reenabling MDT for connp %p on "
18378		    "interface %s\n", (void *)connp, ill->ill_name));
18379	}
18380
18381	if (tcp->tcp_snd_zcopy_aware) {
18382		if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 ||
18383		    (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0))
18384			mp = tcp_zcopy_disable(tcp, mp);
18385		/*
18386		 * we shouldn't need to reset ipha as the mp containing
18387		 * ipha should never be a zero-copy mp.
18388		 */
18389	}
18390
18391	if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
18392		ASSERT(ill->ill_hcksum_capab != NULL);
18393		hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
18394	}
18395
18396	/* pseudo-header checksum (do it in parts for IP header checksum) */
18397	cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
18398
18399	ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
18400	up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
18401
18402	IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
18403	    IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
18404
18405	/* Software checksum? */
18406	if (DB_CKSUMFLAGS(mp) == 0) {
18407		TCP_STAT(tcp_out_sw_cksum);
18408		TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
18409		    ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
18410	}
18411
18412	ipha->ipha_fragment_offset_and_flags |=
18413	    (uint32_t)htons(ire->ire_frag_flag);
18414
18415	/* Calculate IP header checksum if hardware isn't capable */
18416	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
18417		IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
18418		    ((uint16_t *)ipha)[4]);
18419	}
18420
18421	ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
18422	mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
18423	bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
18424
18425	UPDATE_OB_PKT_COUNT(ire);
18426	ire->ire_last_used_time = lbolt;
18427	BUMP_MIB(&ip_mib, ipOutRequests);
18428
18429	if (ILL_DLS_CAPABLE(ill)) {
18430		/*
18431		 * Send the packet directly to DLD, where it may be queued
18432		 * depending on the availability of transmit resources at
18433		 * the media layer.
18434		 */
18435		IP_DLS_ILL_TX(ill, mp);
18436	} else {
18437		putnext(ire->ire_stq, mp);
18438	}
18439	IRE_REFRELE(ire);
18440}
18441
18442/*
18443 * This handles the case when the receiver has shrunk its win. Per RFC 1122
18444 * if the receiver shrinks the window, i.e. moves the right window to the
18445 * left, the we should not send new data, but should retransmit normally the
18446 * old unacked data between suna and suna + swnd. We might has sent data
18447 * that is now outside the new window, pretend that we didn't send  it.
18448 */
18449static void
18450tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
18451{
18452	uint32_t	snxt = tcp->tcp_snxt;
18453	mblk_t		*xmit_tail;
18454	int32_t		offset;
18455
18456	ASSERT(shrunk_count > 0);
18457
18458	/* Pretend we didn't send the data outside the window */
18459	snxt -= shrunk_count;
18460
18461	/* Get the mblk and the offset in it per the shrunk window */
18462	xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
18463
18464	ASSERT(xmit_tail != NULL);
18465
18466	/* Reset all the values per the now shrunk window */
18467	tcp->tcp_snxt = snxt;
18468	tcp->tcp_xmit_tail = xmit_tail;
18469	tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - xmit_tail->b_rptr -
18470	    offset;
18471	tcp->tcp_unsent += shrunk_count;
18472
18473	if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
18474		/*
18475		 * Make sure the timer is running so that we will probe a zero
18476		 * window.
18477		 */
18478		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
18479}
18480
18481
18482/*
18483 * The TCP normal data output path.
18484 * NOTE: the logic of the fast path is duplicated from this function.
18485 */
18486static void
18487tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
18488{
18489	int		len;
18490	mblk_t		*local_time;
18491	mblk_t		*mp1;
18492	uint32_t	snxt;
18493	int		tail_unsent;
18494	int		tcpstate;
18495	int		usable = 0;
18496	mblk_t		*xmit_tail;
18497	queue_t		*q = tcp->tcp_wq;
18498	int32_t		mss;
18499	int32_t		num_sack_blk = 0;
18500	int32_t		tcp_hdr_len;
18501	int32_t		tcp_tcp_hdr_len;
18502	int		mdt_thres;
18503	int		rc;
18504
18505	tcpstate = tcp->tcp_state;
18506	if (mp == NULL) {
18507		/*
18508		 * tcp_wput_data() with NULL mp should only be called when
18509		 * there is unsent data.
18510		 */
18511		ASSERT(tcp->tcp_unsent > 0);
18512		/* Really tacky... but we need this for detached closes. */
18513		len = tcp->tcp_unsent;
18514		goto data_null;
18515	}
18516
18517#if CCS_STATS
18518	wrw_stats.tot.count++;
18519	wrw_stats.tot.bytes += msgdsize(mp);
18520#endif
18521	ASSERT(mp->b_datap->db_type == M_DATA);
18522	/*
18523	 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
18524	 * or before a connection attempt has begun.
18525	 */
18526	if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
18527	    (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
18528		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
18529#ifdef DEBUG
18530			cmn_err(CE_WARN,
18531			    "tcp_wput_data: data after ordrel, %s",
18532			    tcp_display(tcp, NULL,
18533			    DISP_ADDR_AND_PORT));
18534#else
18535			if (tcp->tcp_debug) {
18536				(void) strlog(TCP_MOD_ID, 0, 1,
18537				    SL_TRACE|SL_ERROR,
18538				    "tcp_wput_data: data after ordrel, %s\n",
18539				    tcp_display(tcp, NULL,
18540				    DISP_ADDR_AND_PORT));
18541			}
18542#endif /* DEBUG */
18543		}
18544		if (tcp->tcp_snd_zcopy_aware &&
18545		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0)
18546			tcp_zcopy_notify(tcp);
18547		freemsg(mp);
18548		if (tcp->tcp_flow_stopped &&
18549		    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
18550			tcp_clrqfull(tcp);
18551		}
18552		return;
18553	}
18554
18555	/* Strip empties */
18556	for (;;) {
18557		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
18558		    (uintptr_t)INT_MAX);
18559		len = (int)(mp->b_wptr - mp->b_rptr);
18560		if (len > 0)
18561			break;
18562		mp1 = mp;
18563		mp = mp->b_cont;
18564		freeb(mp1);
18565		if (!mp) {
18566			return;
18567		}
18568	}
18569
18570	/* If we are the first on the list ... */
18571	if (tcp->tcp_xmit_head == NULL) {
18572		tcp->tcp_xmit_head = mp;
18573		tcp->tcp_xmit_tail = mp;
18574		tcp->tcp_xmit_tail_unsent = len;
18575	} else {
18576		/* If tiny tx and room in txq tail, pullup to save mblks. */
18577		struct datab *dp;
18578
18579		mp1 = tcp->tcp_xmit_last;
18580		if (len < tcp_tx_pull_len &&
18581		    (dp = mp1->b_datap)->db_ref == 1 &&
18582		    dp->db_lim - mp1->b_wptr >= len) {
18583			ASSERT(len > 0);
18584			ASSERT(!mp1->b_cont);
18585			if (len == 1) {
18586				*mp1->b_wptr++ = *mp->b_rptr;
18587			} else {
18588				bcopy(mp->b_rptr, mp1->b_wptr, len);
18589				mp1->b_wptr += len;
18590			}
18591			if (mp1 == tcp->tcp_xmit_tail)
18592				tcp->tcp_xmit_tail_unsent += len;
18593			mp1->b_cont = mp->b_cont;
18594			if (tcp->tcp_snd_zcopy_aware &&
18595			    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
18596				mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
18597			freeb(mp);
18598			mp = mp1;
18599		} else {
18600			tcp->tcp_xmit_last->b_cont = mp;
18601		}
18602		len += tcp->tcp_unsent;
18603	}
18604
18605	/* Tack on however many more positive length mblks we have */
18606	if ((mp1 = mp->b_cont) != NULL) {
18607		do {
18608			int tlen;
18609			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
18610			    (uintptr_t)INT_MAX);
18611			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
18612			if (tlen <= 0) {
18613				mp->b_cont = mp1->b_cont;
18614				freeb(mp1);
18615			} else {
18616				len += tlen;
18617				mp = mp1;
18618			}
18619		} while ((mp1 = mp->b_cont) != NULL);
18620	}
18621	tcp->tcp_xmit_last = mp;
18622	tcp->tcp_unsent = len;
18623
18624	if (urgent)
18625		usable = 1;
18626
18627data_null:
18628	snxt = tcp->tcp_snxt;
18629	xmit_tail = tcp->tcp_xmit_tail;
18630	tail_unsent = tcp->tcp_xmit_tail_unsent;
18631
18632	/*
18633	 * Note that tcp_mss has been adjusted to take into account the
18634	 * timestamp option if applicable.  Because SACK options do not
18635	 * appear in every TCP segments and they are of variable lengths,
18636	 * they cannot be included in tcp_mss.  Thus we need to calculate
18637	 * the actual segment length when we need to send a segment which
18638	 * includes SACK options.
18639	 */
18640	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
18641		int32_t	opt_len;
18642
18643		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
18644		    tcp->tcp_num_sack_blk);
18645		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
18646		    2 + TCPOPT_HEADER_LEN;
18647		mss = tcp->tcp_mss - opt_len;
18648		tcp_hdr_len = tcp->tcp_hdr_len + opt_len;
18649		tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len;
18650	} else {
18651		mss = tcp->tcp_mss;
18652		tcp_hdr_len = tcp->tcp_hdr_len;
18653		tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len;
18654	}
18655
18656	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
18657	    (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
18658		SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle);
18659	}
18660	if (tcpstate == TCPS_SYN_RCVD) {
18661		/*
18662		 * The three-way connection establishment handshake is not
18663		 * complete yet. We want to queue the data for transmission
18664		 * after entering ESTABLISHED state (RFC793). A jump to
18665		 * "done" label effectively leaves data on the queue.
18666		 */
18667		goto done;
18668	} else {
18669		int usable_r;
18670
18671		/*
18672		 * In the special case when cwnd is zero, which can only
18673		 * happen if the connection is ECN capable, return now.
18674		 * New segments is sent using tcp_timer().  The timer
18675		 * is set in tcp_rput_data().
18676		 */
18677		if (tcp->tcp_cwnd == 0) {
18678			/*
18679			 * Note that tcp_cwnd is 0 before 3-way handshake is
18680			 * finished.
18681			 */
18682			ASSERT(tcp->tcp_ecn_ok ||
18683			    tcp->tcp_state < TCPS_ESTABLISHED);
18684			return;
18685		}
18686
18687		/* NOTE: trouble if xmitting while SYN not acked? */
18688		usable_r = snxt - tcp->tcp_suna;
18689		usable_r = tcp->tcp_swnd - usable_r;
18690
18691		/*
18692		 * Check if the receiver has shrunk the window.  If
18693		 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
18694		 * cannot be set as there is unsent data, so FIN cannot
18695		 * be sent out.  Otherwise, we need to take into account
18696		 * of FIN as it consumes an "invisible" sequence number.
18697		 */
18698		ASSERT(tcp->tcp_fin_sent == 0);
18699		if (usable_r < 0) {
18700			/*
18701			 * The receiver has shrunk the window and we have sent
18702			 * -usable_r date beyond the window, re-adjust.
18703			 *
18704			 * If TCP window scaling is enabled, there can be
18705			 * round down error as the advertised receive window
18706			 * is actually right shifted n bits.  This means that
18707			 * the lower n bits info is wiped out.  It will look
18708			 * like the window is shrunk.  Do a check here to
18709			 * see if the shrunk amount is actually within the
18710			 * error in window calculation.  If it is, just
18711			 * return.  Note that this check is inside the
18712			 * shrunk window check.  This makes sure that even
18713			 * though tcp_process_shrunk_swnd() is not called,
18714			 * we will stop further processing.
18715			 */
18716			if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
18717				tcp_process_shrunk_swnd(tcp, -usable_r);
18718			}
18719			return;
18720		}
18721
18722		/* usable = MIN(swnd, cwnd) - unacked_bytes */
18723		if (tcp->tcp_swnd > tcp->tcp_cwnd)
18724			usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
18725
18726		/* usable = MIN(usable, unsent) */
18727		if (usable_r > len)
18728			usable_r = len;
18729
18730		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
18731		if (usable_r > 0) {
18732			usable = usable_r;
18733		} else {
18734			/* Bypass all other unnecessary processing. */
18735			goto done;
18736		}
18737	}
18738
18739	local_time = (mblk_t *)lbolt;
18740
18741	/*
18742	 * "Our" Nagle Algorithm.  This is not the same as in the old
18743	 * BSD.  This is more in line with the true intent of Nagle.
18744	 *
18745	 * The conditions are:
18746	 * 1. The amount of unsent data (or amount of data which can be
18747	 *    sent, whichever is smaller) is less than Nagle limit.
18748	 * 2. The last sent size is also less than Nagle limit.
18749	 * 3. There is unack'ed data.
18750	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
18751	 *    Nagle algorithm.  This reduces the probability that urgent
18752	 *    bytes get "merged" together.
18753	 * 5. The app has not closed the connection.  This eliminates the
18754	 *    wait time of the receiving side waiting for the last piece of
18755	 *    (small) data.
18756	 *
18757	 * If all are satisified, exit without sending anything.  Note
18758	 * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
18759	 * the smaller of 1 MSS and global tcp_naglim_def (default to be
18760	 * 4095).
18761	 */
18762	if (usable < (int)tcp->tcp_naglim &&
18763	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
18764	    snxt != tcp->tcp_suna &&
18765	    !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
18766	    !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
18767		goto done;
18768	}
18769
18770	if (tcp->tcp_cork) {
18771		/*
18772		 * if the tcp->tcp_cork option is set, then we have to force
18773		 * TCP not to send partial segment (smaller than MSS bytes).
18774		 * We are calculating the usable now based on full mss and
18775		 * will save the rest of remaining data for later.
18776		 */
18777		if (usable < mss)
18778			goto done;
18779		usable = (usable / mss) * mss;
18780	}
18781
18782	/* Update the latest receive window size in TCP header. */
18783	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
18784	    tcp->tcp_tcph->th_win);
18785
18786	/*
18787	 * Determine if it's worthwhile to attempt MDT, based on:
18788	 *
18789	 * 1. Simple TCP/IP{v4,v6} (no options).
18790	 * 2. IPSEC/IPQoS processing is not needed for the TCP connection.
18791	 * 3. If the TCP connection is in ESTABLISHED state.
18792	 * 4. The TCP is not detached.
18793	 *
18794	 * If any of the above conditions have changed during the
18795	 * connection, stop using MDT and restore the stream head
18796	 * parameters accordingly.
18797	 */
18798	if (tcp->tcp_mdt &&
18799	    ((tcp->tcp_ipversion == IPV4_VERSION &&
18800	    tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) ||
18801	    (tcp->tcp_ipversion == IPV6_VERSION &&
18802	    tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) ||
18803	    tcp->tcp_state != TCPS_ESTABLISHED ||
18804	    TCP_IS_DETACHED(tcp) || !CONN_IS_MD_FASTPATH(tcp->tcp_connp) ||
18805	    CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) ||
18806	    IPP_ENABLED(IPP_LOCAL_OUT))) {
18807		tcp->tcp_connp->conn_mdt_ok = B_FALSE;
18808		tcp->tcp_mdt = B_FALSE;
18809
18810		/* Anything other than detached is considered pathological */
18811		if (!TCP_IS_DETACHED(tcp)) {
18812			TCP_STAT(tcp_mdt_conn_halted1);
18813			(void) tcp_maxpsz_set(tcp, B_TRUE);
18814		}
18815	}
18816
18817	/* Use MDT if sendable amount is greater than the threshold */
18818	if (tcp->tcp_mdt &&
18819	    (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) &&
18820	    (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL &&
18821	    MBLKL(xmit_tail->b_cont) > mdt_thres)) &&
18822	    (tcp->tcp_valid_bits == 0 ||
18823	    tcp->tcp_valid_bits == TCP_FSS_VALID)) {
18824		ASSERT(tcp->tcp_connp->conn_mdt_ok);
18825		rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len,
18826		    num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
18827		    local_time, mdt_thres);
18828	} else {
18829		rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len,
18830		    num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
18831		    local_time, INT_MAX);
18832	}
18833
18834	/* Pretend that all we were trying to send really got sent */
18835	if (rc < 0 && tail_unsent < 0) {
18836		do {
18837			xmit_tail = xmit_tail->b_cont;
18838			xmit_tail->b_prev = local_time;
18839			ASSERT((uintptr_t)(xmit_tail->b_wptr -
18840			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
18841			tail_unsent += (int)(xmit_tail->b_wptr -
18842			    xmit_tail->b_rptr);
18843		} while (tail_unsent < 0);
18844	}
18845done:;
18846	tcp->tcp_xmit_tail = xmit_tail;
18847	tcp->tcp_xmit_tail_unsent = tail_unsent;
18848	len = tcp->tcp_snxt - snxt;
18849	if (len) {
18850		/*
18851		 * If new data was sent, need to update the notsack
18852		 * list, which is, afterall, data blocks that have
18853		 * not been sack'ed by the receiver.  New data is
18854		 * not sack'ed.
18855		 */
18856		if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
18857			/* len is a negative value. */
18858			tcp->tcp_pipe -= len;
18859			tcp_notsack_update(&(tcp->tcp_notsack_list),
18860			    tcp->tcp_snxt, snxt,
18861			    &(tcp->tcp_num_notsack_blk),
18862			    &(tcp->tcp_cnt_notsack_list));
18863		}
18864		tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
18865		tcp->tcp_rack = tcp->tcp_rnxt;
18866		tcp->tcp_rack_cnt = 0;
18867		if ((snxt + len) == tcp->tcp_suna) {
18868			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
18869		}
18870	} else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
18871		/*
18872		 * Didn't send anything. Make sure the timer is running
18873		 * so that we will probe a zero window.
18874		 */
18875		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
18876	}
18877	/* Note that len is the amount we just sent but with a negative sign */
18878	tcp->tcp_unsent += len;
18879	if (tcp->tcp_flow_stopped) {
18880		if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
18881			tcp_clrqfull(tcp);
18882		}
18883	} else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
18884		tcp_setqfull(tcp);
18885	}
18886}
18887
18888/*
18889 * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the
18890 * outgoing TCP header with the template header, as well as other
18891 * options such as time-stamp, ECN and/or SACK.
18892 */
18893static void
18894tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
18895{
18896	tcph_t *tcp_tmpl, *tcp_h;
18897	uint32_t *dst, *src;
18898	int hdrlen;
18899
18900	ASSERT(OK_32PTR(rptr));
18901
18902	/* Template header */
18903	tcp_tmpl = tcp->tcp_tcph;
18904
18905	/* Header of outgoing packet */
18906	tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);
18907
18908	/* dst and src are opaque 32-bit fields, used for copying */
18909	dst = (uint32_t *)rptr;
18910	src = (uint32_t *)tcp->tcp_iphc;
18911	hdrlen = tcp->tcp_hdr_len;
18912
18913	/* Fill time-stamp option if needed */
18914	if (tcp->tcp_snd_ts_ok) {
18915		U32_TO_BE32((uint32_t)now,
18916		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
18917		U32_TO_BE32(tcp->tcp_ts_recent,
18918		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
18919	} else {
18920		ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
18921	}
18922
18923	/*
18924	 * Copy the template header; is this really more efficient than
18925	 * calling bcopy()?  For simple IPv4/TCP, it may be the case,
18926	 * but perhaps not for other scenarios.
18927	 */
18928	dst[0] = src[0];
18929	dst[1] = src[1];
18930	dst[2] = src[2];
18931	dst[3] = src[3];
18932	dst[4] = src[4];
18933	dst[5] = src[5];
18934	dst[6] = src[6];
18935	dst[7] = src[7];
18936	dst[8] = src[8];
18937	dst[9] = src[9];
18938	if (hdrlen -= 40) {
18939		hdrlen >>= 2;
18940		dst += 10;
18941		src += 10;
18942		do {
18943			*dst++ = *src++;
18944		} while (--hdrlen);
18945	}
18946
18947	/*
18948	 * Set the ECN info in the TCP header if it is not a zero
18949	 * window probe.  Zero window probe is only sent in
18950	 * tcp_wput_data() and tcp_timer().
18951	 */
18952	if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
18953		SET_ECT(tcp, rptr);
18954
18955		if (tcp->tcp_ecn_echo_on)
18956			tcp_h->th_flags[0] |= TH_ECE;
18957		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
18958			tcp_h->th_flags[0] |= TH_CWR;
18959			tcp->tcp_ecn_cwr_sent = B_TRUE;
18960		}
18961	}
18962
18963	/* Fill in SACK options */
18964	if (num_sack_blk > 0) {
18965		uchar_t *wptr = rptr + tcp->tcp_hdr_len;
18966		sack_blk_t *tmp;
18967		int32_t	i;
18968
18969		wptr[0] = TCPOPT_NOP;
18970		wptr[1] = TCPOPT_NOP;
18971		wptr[2] = TCPOPT_SACK;
18972		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
18973		    sizeof (sack_blk_t);
18974		wptr += TCPOPT_REAL_SACK_LEN;
18975
18976		tmp = tcp->tcp_sack_list;
18977		for (i = 0; i < num_sack_blk; i++) {
18978			U32_TO_BE32(tmp[i].begin, wptr);
18979			wptr += sizeof (tcp_seq);
18980			U32_TO_BE32(tmp[i].end, wptr);
18981			wptr += sizeof (tcp_seq);
18982		}
18983		tcp_h->th_offset_and_rsrvd[0] +=
18984		    ((num_sack_blk * 2 + 1) << 4);
18985	}
18986}
18987
18988/*
18989 * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach
18990 * the destination address and SAP attribute, and if necessary, the
18991 * hardware checksum offload attribute to a Multidata message.
18992 */
18993static int
18994tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum,
18995    const uint32_t start, const uint32_t stuff, const uint32_t end,
18996    const uint32_t flags)
18997{
18998	/* Add global destination address & SAP attribute */
18999	if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) {
19000		ip1dbg(("tcp_mdt_add_attrs: can't add global physical "
19001		    "destination address+SAP\n"));
19002
19003		if (dlmp != NULL)
19004			TCP_STAT(tcp_mdt_allocfail);
19005		return (-1);
19006	}
19007
19008	/* Add global hwcksum attribute */
19009	if (hwcksum &&
19010	    !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) {
19011		ip1dbg(("tcp_mdt_add_attrs: can't add global hardware "
19012		    "checksum attribute\n"));
19013
19014		TCP_STAT(tcp_mdt_allocfail);
19015		return (-1);
19016	}
19017
19018	return (0);
19019}
19020
19021/*
19022 * Smaller and private version of pdescinfo_t used specifically for TCP,
19023 * which allows for only two payload spans per packet.
19024 */
19025typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;
19026
19027/*
19028 * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit
19029 * scheme, and returns one the following:
19030 *
19031 * -1 = failed allocation.
19032 *  0 = success; burst count reached, or usable send window is too small,
19033 *      and that we'd rather wait until later before sending again.
19034 */
19035static int
19036tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
19037    const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable,
19038    uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
19039    const int mdt_thres)
19040{
19041	mblk_t		*md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf;
19042	multidata_t	*mmd;
19043	uint_t		obsegs, obbytes, hdr_frag_sz;
19044	uint_t		cur_hdr_off, cur_pld_off, base_pld_off, first_snxt;
19045	int		num_burst_seg, max_pld;
19046	pdesc_t		*pkt;
19047	tcp_pdescinfo_t	tcp_pkt_info;
19048	pdescinfo_t	*pkt_info;
19049	int		pbuf_idx, pbuf_idx_nxt;
19050	int		seg_len, len, spill, af;
19051	boolean_t	add_buffer, zcopy, clusterwide;
19052	boolean_t	rconfirm = B_FALSE;
19053	boolean_t	done = B_FALSE;
19054	uint32_t	cksum;
19055	uint32_t	hwcksum_flags;
19056	ire_t		*ire;
19057	ill_t		*ill;
19058	ipha_t		*ipha;
19059	ip6_t		*ip6h;
19060	ipaddr_t	src, dst;
19061	ill_zerocopy_capab_t *zc_cap = NULL;
19062	uint16_t	*up;
19063	int		err;
19064	conn_t		*connp;
19065
19066#ifdef	_BIG_ENDIAN
19067#define	IPVER(ip6h)	((((uint32_t *)ip6h)[0] >> 28) & 0x7)
19068#else
19069#define	IPVER(ip6h)	((((uint32_t *)ip6h)[0] >> 4) & 0x7)
19070#endif
19071
19072#define	PREP_NEW_MULTIDATA() {			\
19073	mmd = NULL;				\
19074	md_mp = md_hbuf = NULL;			\
19075	cur_hdr_off = 0;			\
19076	max_pld = tcp->tcp_mdt_max_pld;		\
19077	pbuf_idx = pbuf_idx_nxt = -1;		\
19078	add_buffer = B_TRUE;			\
19079	zcopy = B_FALSE;			\
19080}
19081
19082#define	PREP_NEW_PBUF() {			\
19083	md_pbuf = md_pbuf_nxt = NULL;		\
19084	pbuf_idx = pbuf_idx_nxt = -1;		\
19085	cur_pld_off = 0;			\
19086	first_snxt = *snxt;			\
19087	ASSERT(*tail_unsent > 0);		\
19088	base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \
19089}
19090
19091	ASSERT(mdt_thres >= mss);
19092	ASSERT(*usable > 0 && *usable > mdt_thres);
19093	ASSERT(tcp->tcp_state == TCPS_ESTABLISHED);
19094	ASSERT(!TCP_IS_DETACHED(tcp));
19095	ASSERT(tcp->tcp_valid_bits == 0 ||
19096	    tcp->tcp_valid_bits == TCP_FSS_VALID);
19097	ASSERT((tcp->tcp_ipversion == IPV4_VERSION &&
19098	    tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) ||
19099	    (tcp->tcp_ipversion == IPV6_VERSION &&
19100	    tcp->tcp_ip_hdr_len == IPV6_HDR_LEN));
19101
19102	connp = tcp->tcp_connp;
19103	ASSERT(connp != NULL);
19104	ASSERT(CONN_IS_MD_FASTPATH(connp));
19105	ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp));
19106
19107	/*
19108	 * Note that tcp will only declare at most 2 payload spans per
19109	 * packet, which is much lower than the maximum allowable number
19110	 * of packet spans per Multidata.  For this reason, we use the
19111	 * privately declared and smaller descriptor info structure, in
19112	 * order to save some stack space.
19113	 */
19114	pkt_info = (pdescinfo_t *)&tcp_pkt_info;
19115
19116	af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
19117	if (af == AF_INET) {
19118		dst = tcp->tcp_ipha->ipha_dst;
19119		src = tcp->tcp_ipha->ipha_src;
19120		ASSERT(!CLASSD(dst));
19121	}
19122	ASSERT(af == AF_INET ||
19123	    !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst));
19124
19125	obsegs = obbytes = 0;
19126	num_burst_seg = tcp->tcp_snd_burst;
19127	md_mp_head = NULL;
19128	PREP_NEW_MULTIDATA();
19129
19130	/*
19131	 * Before we go on further, make sure there is an IRE that we can
19132	 * use, and that the ILL supports MDT.  Otherwise, there's no point
19133	 * in proceeding any further, and we should just hand everything
19134	 * off to the legacy path.
19135	 */
19136	mutex_enter(&connp->conn_lock);
19137	ire = connp->conn_ire_cache;
19138	ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
19139	if (ire != NULL && ((af == AF_INET && ire->ire_addr == dst) ||
19140	    (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
19141	    &tcp->tcp_ip6h->ip6_dst))) &&
19142	    !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
19143		IRE_REFHOLD(ire);
19144		mutex_exit(&connp->conn_lock);
19145	} else {
19146		boolean_t cached = B_FALSE;
19147		ts_label_t *tsl;
19148
19149		/* force a recheck later on */
19150		tcp->tcp_ire_ill_check_done = B_FALSE;
19151
19152		TCP_DBGSTAT(tcp_ire_null1);
19153		connp->conn_ire_cache = NULL;
19154		mutex_exit(&connp->conn_lock);
19155
19156		/* Release the old ire */
19157		if (ire != NULL)
19158			IRE_REFRELE_NOTR(ire);
19159
19160		tsl = crgetlabel(CONN_CRED(connp));
19161		ire = (af == AF_INET) ?
19162		    ire_cache_lookup(dst, connp->conn_zoneid, tsl) :
19163		    ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst,
19164		    connp->conn_zoneid, tsl);
19165
19166		if (ire == NULL) {
19167			TCP_STAT(tcp_ire_null);
19168			goto legacy_send_no_md;
19169		}
19170
19171		IRE_REFHOLD_NOTR(ire);
19172		/*
19173		 * Since we are inside the squeue, there cannot be another
19174		 * thread in TCP trying to set the conn_ire_cache now. The
19175		 * check for IRE_MARK_CONDEMNED ensures that an interface
19176		 * unplumb thread has not yet started cleaning up the conns.
19177		 * Hence we don't need to grab the conn lock.
19178		 */
19179		if (!(connp->conn_state_flags & CONN_CLOSING)) {
19180			rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
19181			if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
19182				connp->conn_ire_cache = ire;
19183				cached = B_TRUE;
19184			}
19185			rw_exit(&ire->ire_bucket->irb_lock);
19186		}
19187
19188		/*
19189		 * We can continue to use the ire but since it was not
19190		 * cached, we should drop the extra reference.
19191		 */
19192		if (!cached)
19193			IRE_REFRELE_NOTR(ire);
19194	}
19195
19196	ASSERT(ire != NULL);
19197	ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION);
19198	ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6)));
19199	ASSERT(af == AF_INET || ire->ire_nce != NULL);
19200	ASSERT(!(ire->ire_type & IRE_BROADCAST));
19201	/*
19202	 * If we do support loopback for MDT (which requires modifications
19203	 * to the receiving paths), the following assertions should go away,
19204	 * and we would be sending the Multidata to loopback conn later on.
19205	 */
19206	ASSERT(!IRE_IS_LOCAL(ire));
19207	ASSERT(ire->ire_stq != NULL);
19208
19209	ill = ire_to_ill(ire);
19210	ASSERT(ill != NULL);
19211	ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL);
19212
19213	if (!tcp->tcp_ire_ill_check_done) {
19214		tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
19215		tcp->tcp_ire_ill_check_done = B_TRUE;
19216	}
19217
19218	/*
19219	 * If the underlying interface conditions have changed, or if the
19220	 * new interface does not support MDT, go back to legacy path.
19221	 */
19222	if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) {
19223		/* don't go through this path anymore for this connection */
19224		TCP_STAT(tcp_mdt_conn_halted2);
19225		tcp->tcp_mdt = B_FALSE;
19226		ip1dbg(("tcp_multisend: disabling MDT for connp %p on "
19227		    "interface %s\n", (void *)connp, ill->ill_name));
19228		/* IRE will be released prior to returning */
19229		goto legacy_send_no_md;
19230	}
19231
19232	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)
19233		zc_cap = ill->ill_zerocopy_capab;
19234
19235	/*
19236	 * Check if we can take tcp fast-path. Note that "incomplete"
19237	 * ire's (where the link-layer for next hop is not resolved
19238	 * or where the fast-path header in nce_fp_mp is not available
19239	 * yet) are sent down the legacy (slow) path.
19240	 * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA
19241	 */
19242	if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) {
19243		/* IRE will be released prior to returning */
19244		goto legacy_send_no_md;
19245	}
19246
19247	/* go to legacy path if interface doesn't support zerocopy */
19248	if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 &&
19249	    (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) {
19250		/* IRE will be released prior to returning */
19251		goto legacy_send_no_md;
19252	}
19253
19254	/* does the interface support hardware checksum offload? */
19255	hwcksum_flags = 0;
19256	if (ILL_HCKSUM_CAPABLE(ill) &&
19257	    (ill->ill_hcksum_capab->ill_hcksum_txflags &
19258	    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL |
19259	    HCKSUM_IPHDRCKSUM)) && dohwcksum) {
19260		if (ill->ill_hcksum_capab->ill_hcksum_txflags &
19261		    HCKSUM_IPHDRCKSUM)
19262			hwcksum_flags = HCK_IPV4_HDRCKSUM;
19263
19264		if (ill->ill_hcksum_capab->ill_hcksum_txflags &
19265		    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
19266			hwcksum_flags |= HCK_FULLCKSUM;
19267		else if (ill->ill_hcksum_capab->ill_hcksum_txflags &
19268		    HCKSUM_INET_PARTIAL)
19269			hwcksum_flags |= HCK_PARTIALCKSUM;
19270	}
19271
19272	/*
19273	 * Each header fragment consists of the leading extra space,
19274	 * followed by the TCP/IP header, and the trailing extra space.
19275	 * We make sure that each header fragment begins on a 32-bit
19276	 * aligned memory address (tcp_mdt_hdr_head is already 32-bit
19277	 * aligned in tcp_mdt_update).
19278	 */
19279	hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len +
19280	    tcp->tcp_mdt_hdr_tail), 4);
19281
19282	/* are we starting from the beginning of data block? */
19283	if (*tail_unsent == 0) {
19284		*xmit_tail = (*xmit_tail)->b_cont;
19285		ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX);
19286		*tail_unsent = (int)MBLKL(*xmit_tail);
19287	}
19288
19289	/*
19290	 * Here we create one or more Multidata messages, each made up of
19291	 * one header buffer and up to N payload buffers.  This entire
19292	 * operation is done within two loops:
19293	 *
19294	 * The outer loop mostly deals with creating the Multidata message,
19295	 * as well as the header buffer that gets added to it.  It also
19296	 * links the Multidata messages together such that all of them can
19297	 * be sent down to the lower layer in a single putnext call; this
19298	 * linking behavior depends on the tcp_mdt_chain tunable.
19299	 *
19300	 * The inner loop takes an existing Multidata message, and adds
19301	 * one or more (up to tcp_mdt_max_pld) payload buffers to it.  It
19302	 * packetizes those buffers by filling up the corresponding header
19303	 * buffer fragments with the proper IP and TCP headers, and by
19304	 * describing the layout of each packet in the packet descriptors
19305	 * that get added to the Multidata.
19306	 */
19307	do {
19308		/*
19309		 * If usable send window is too small, or data blocks in
19310		 * transmit list are smaller than our threshold (i.e. app
19311		 * performs large writes followed by small ones), we hand
19312		 * off the control over to the legacy path.  Note that we'll
19313		 * get back the control once it encounters a large block.
19314		 */
19315		if (*usable < mss || (*tail_unsent <= mdt_thres &&
19316		    (*xmit_tail)->b_cont != NULL &&
19317		    MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) {
19318			/* send down what we've got so far */
19319			if (md_mp_head != NULL) {
19320				tcp_multisend_data(tcp, ire, ill, md_mp_head,
19321				    obsegs, obbytes, &rconfirm);
19322			}
19323			/*
19324			 * Pass control over to tcp_send(), but tell it to
19325			 * return to us once a large-size transmission is
19326			 * possible.
19327			 */
19328			TCP_STAT(tcp_mdt_legacy_small);
19329			if ((err = tcp_send(q, tcp, mss, tcp_hdr_len,
19330			    tcp_tcp_hdr_len, num_sack_blk, usable, snxt,
19331			    tail_unsent, xmit_tail, local_time,
19332			    mdt_thres)) <= 0) {
19333				/* burst count reached, or alloc failed */
19334				IRE_REFRELE(ire);
19335				return (err);
19336			}
19337
19338			/* tcp_send() may have sent everything, so check */
19339			if (*usable <= 0) {
19340				IRE_REFRELE(ire);
19341				return (0);
19342			}
19343
19344			TCP_STAT(tcp_mdt_legacy_ret);
19345			/*
19346			 * We may have delivered the Multidata, so make sure
19347			 * to re-initialize before the next round.
19348			 */
19349			md_mp_head = NULL;
19350			obsegs = obbytes = 0;
19351			num_burst_seg = tcp->tcp_snd_burst;
19352			PREP_NEW_MULTIDATA();
19353
19354			/* are we starting from the beginning of data block? */
19355			if (*tail_unsent == 0) {
19356				*xmit_tail = (*xmit_tail)->b_cont;
19357				ASSERT((uintptr_t)MBLKL(*xmit_tail) <=
19358				    (uintptr_t)INT_MAX);
19359				*tail_unsent = (int)MBLKL(*xmit_tail);
19360			}
19361		}
19362
19363		/*
19364		 * max_pld limits the number of mblks in tcp's transmit
19365		 * queue that can be added to a Multidata message.  Once
19366		 * this counter reaches zero, no more additional mblks
19367		 * can be added to it.  What happens afterwards depends
19368		 * on whether or not we are set to chain the Multidata
19369		 * messages.  If we are to link them together, reset
19370		 * max_pld to its original value (tcp_mdt_max_pld) and
19371		 * prepare to create a new Multidata message which will
19372		 * get linked to md_mp_head.  Else, leave it alone and
19373		 * let the inner loop break on its own.
19374		 */
19375		if (tcp_mdt_chain && max_pld == 0)
19376			PREP_NEW_MULTIDATA();
19377
19378		/* adding a payload buffer; re-initialize values */
19379		if (add_buffer)
19380			PREP_NEW_PBUF();
19381
19382		/*
19383		 * If we don't have a Multidata, either because we just
19384		 * (re)entered this outer loop, or after we branched off
19385		 * to tcp_send above, setup the Multidata and header
19386		 * buffer to be used.
19387		 */
19388		if (md_mp == NULL) {
19389			int md_hbuflen;
19390			uint32_t start, stuff;
19391
19392			/*
19393			 * Calculate Multidata header buffer size large enough
19394			 * to hold all of the headers that can possibly be
19395			 * sent at this moment.  We'd rather over-estimate
19396			 * the size than running out of space; this is okay
19397			 * since this buffer is small anyway.
19398			 */
19399			md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz;
19400
19401			/*
19402			 * Start and stuff offset for partial hardware
19403			 * checksum offload; these are currently for IPv4.
19404			 * For full checksum offload, they are set to zero.
19405			 */
19406			if ((hwcksum_flags & HCK_PARTIALCKSUM)) {
19407				if (af == AF_INET) {
19408					start = IP_SIMPLE_HDR_LENGTH;
19409					stuff = IP_SIMPLE_HDR_LENGTH +
19410					    TCP_CHECKSUM_OFFSET;
19411				} else {
19412					start = IPV6_HDR_LEN;
19413					stuff = IPV6_HDR_LEN +
19414					    TCP_CHECKSUM_OFFSET;
19415				}
19416			} else {
19417				start = stuff = 0;
19418			}
19419
19420			/*
19421			 * Create the header buffer, Multidata, as well as
19422			 * any necessary attributes (destination address,
19423			 * SAP and hardware checksum offload) that should
19424			 * be associated with the Multidata message.
19425			 */
19426			ASSERT(cur_hdr_off == 0);
19427			if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL ||
19428			    ((md_hbuf->b_wptr += md_hbuflen),
19429			    (mmd = mmd_alloc(md_hbuf, &md_mp,
19430			    KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd,
19431			    /* fastpath mblk */
19432			    ire->ire_nce->nce_res_mp,
19433			    /* hardware checksum enabled */
19434			    (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)),
19435			    /* hardware checksum offsets */
19436			    start, stuff, 0,
19437			    /* hardware checksum flag */
19438			    hwcksum_flags) != 0)) {
19439legacy_send:
19440				if (md_mp != NULL) {
19441					/* Unlink message from the chain */
19442					if (md_mp_head != NULL) {
19443						err = (intptr_t)rmvb(md_mp_head,
19444						    md_mp);
19445						/*
19446						 * We can't assert that rmvb
19447						 * did not return -1, since we
19448						 * may get here before linkb
19449						 * happens.  We do, however,
19450						 * check if we just removed the
19451						 * only element in the list.
19452						 */
19453						if (err == 0)
19454							md_mp_head = NULL;
19455					}
19456					/* md_hbuf gets freed automatically */
19457					TCP_STAT(tcp_mdt_discarded);
19458					freeb(md_mp);
19459				} else {
19460					/* Either allocb or mmd_alloc failed */
19461					TCP_STAT(tcp_mdt_allocfail);
19462					if (md_hbuf != NULL)
19463						freeb(md_hbuf);
19464				}
19465
19466				/* send down what we've got so far */
19467				if (md_mp_head != NULL) {
19468					tcp_multisend_data(tcp, ire, ill,
19469					    md_mp_head, obsegs, obbytes,
19470					    &rconfirm);
19471				}
19472legacy_send_no_md:
19473				if (ire != NULL)
19474					IRE_REFRELE(ire);
19475				/*
19476				 * Too bad; let the legacy path handle this.
19477				 * We specify INT_MAX for the threshold, since
19478				 * we gave up with the Multidata processings
19479				 * and let the old path have it all.
19480				 */
19481				TCP_STAT(tcp_mdt_legacy_all);
19482				return (tcp_send(q, tcp, mss, tcp_hdr_len,
19483				    tcp_tcp_hdr_len, num_sack_blk, usable,
19484				    snxt, tail_unsent, xmit_tail, local_time,
19485				    INT_MAX));
19486			}
19487
19488			/* link to any existing ones, if applicable */
19489			TCP_STAT(tcp_mdt_allocd);
19490			if (md_mp_head == NULL) {
19491				md_mp_head = md_mp;
19492			} else if (tcp_mdt_chain) {
19493				TCP_STAT(tcp_mdt_linked);
19494				linkb(md_mp_head, md_mp);
19495			}
19496		}
19497
19498		ASSERT(md_mp_head != NULL);
19499		ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL);
19500		ASSERT(md_mp != NULL && mmd != NULL);
19501		ASSERT(md_hbuf != NULL);
19502
19503		/*
19504		 * Packetize the transmittable portion of the data block;
19505		 * each data block is essentially added to the Multidata
19506		 * as a payload buffer.  We also deal with adding more
19507		 * than one payload buffers, which happens when the remaining
19508		 * packetized portion of the current payload buffer is less
19509		 * than MSS, while the next data block in transmit queue
19510		 * has enough data to make up for one.  This "spillover"
19511		 * case essentially creates a split-packet, where portions
19512		 * of the packet's payload fragments may span across two
19513		 * virtually discontiguous address blocks.
19514		 */
19515		seg_len = mss;
19516		do {
19517			len = seg_len;
19518
19519			ASSERT(len > 0);
19520			ASSERT(max_pld >= 0);
19521			ASSERT(!add_buffer || cur_pld_off == 0);
19522
19523			/*
19524			 * First time around for this payload buffer; note
19525			 * in the case of a spillover, the following has
19526			 * been done prior to adding the split-packet
19527			 * descriptor to Multidata, and we don't want to
19528			 * repeat the process.
19529			 */
19530			if (add_buffer) {
19531				ASSERT(mmd != NULL);
19532				ASSERT(md_pbuf == NULL);
19533				ASSERT(md_pbuf_nxt == NULL);
19534				ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1);
19535
19536				/*
19537				 * Have we reached the limit?  We'd get to
19538				 * this case when we're not chaining the
19539				 * Multidata messages together, and since
19540				 * we're done, terminate this loop.
19541				 */
19542				if (max_pld == 0)
19543					break; /* done */
19544
19545				if ((md_pbuf = dupb(*xmit_tail)) == NULL) {
19546					TCP_STAT(tcp_mdt_allocfail);
19547					goto legacy_send; /* out_of_mem */
19548				}
19549
19550				if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy &&
19551				    zc_cap != NULL) {
19552					if (!ip_md_zcopy_attr(mmd, NULL,
19553					    zc_cap->ill_zerocopy_flags)) {
19554						freeb(md_pbuf);
19555						TCP_STAT(tcp_mdt_allocfail);
19556						/* out_of_mem */
19557						goto legacy_send;
19558					}
19559					zcopy = B_TRUE;
19560				}
19561
19562				md_pbuf->b_rptr += base_pld_off;
19563
19564				/*
19565				 * Add a payload buffer to the Multidata; this
19566				 * operation must not fail, or otherwise our
19567				 * logic in this routine is broken.  There
19568				 * is no memory allocation done by the
19569				 * routine, so any returned failure simply
19570				 * tells us that we've done something wrong.
19571				 *
19572				 * A failure tells us that either we're adding
19573				 * the same payload buffer more than once, or
19574				 * we're trying to add more buffers than
19575				 * allowed (max_pld calculation is wrong).
19576				 * None of the above cases should happen, and
19577				 * we panic because either there's horrible
19578				 * heap corruption, and/or programming mistake.
19579				 */
19580				pbuf_idx = mmd_addpldbuf(mmd, md_pbuf);
19581				if (pbuf_idx < 0) {
19582					cmn_err(CE_PANIC, "tcp_multisend: "
19583					    "payload buffer logic error "
19584					    "detected for tcp %p mmd %p "
19585					    "pbuf %p (%d)\n",
19586					    (void *)tcp, (void *)mmd,
19587					    (void *)md_pbuf, pbuf_idx);
19588				}
19589
19590				ASSERT(max_pld > 0);
19591				--max_pld;
19592				add_buffer = B_FALSE;
19593			}
19594
19595			ASSERT(md_mp_head != NULL);
19596			ASSERT(md_pbuf != NULL);
19597			ASSERT(md_pbuf_nxt == NULL);
19598			ASSERT(pbuf_idx != -1);
19599			ASSERT(pbuf_idx_nxt == -1);
19600			ASSERT(*usable > 0);
19601
19602			/*
19603			 * We spillover to the next payload buffer only
19604			 * if all of the following is true:
19605			 *
19606			 *   1. There is not enough data on the current
19607			 *	payload buffer to make up `len',
19608			 *   2. We are allowed to send `len',
19609			 *   3. The next payload buffer length is large
19610			 *	enough to accomodate `spill'.
19611			 */
19612			if ((spill = len - *tail_unsent) > 0 &&
19613			    *usable >= len &&
19614			    MBLKL((*xmit_tail)->b_cont) >= spill &&
19615			    max_pld > 0) {
19616				md_pbuf_nxt = dupb((*xmit_tail)->b_cont);
19617				if (md_pbuf_nxt == NULL) {
19618					TCP_STAT(tcp_mdt_allocfail);
19619					goto legacy_send; /* out_of_mem */
19620				}
19621
19622				if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy &&
19623				    zc_cap != NULL) {
19624					if (!ip_md_zcopy_attr(mmd, NULL,
19625					    zc_cap->ill_zerocopy_flags)) {
19626						freeb(md_pbuf_nxt);
19627						TCP_STAT(tcp_mdt_allocfail);
19628						/* out_of_mem */
19629						goto legacy_send;
19630					}
19631					zcopy = B_TRUE;
19632				}
19633
19634				/*
19635				 * See comments above on the first call to
19636				 * mmd_addpldbuf for explanation on the panic.
19637				 */
19638				pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt);
19639				if (pbuf_idx_nxt < 0) {
19640					panic("tcp_multisend: "
19641					    "next payload buffer logic error "
19642					    "detected for tcp %p mmd %p "
19643					    "pbuf %p (%d)\n",
19644					    (void *)tcp, (void *)mmd,
19645					    (void *)md_pbuf_nxt, pbuf_idx_nxt);
19646				}
19647
19648				ASSERT(max_pld > 0);
19649				--max_pld;
19650			} else if (spill > 0) {
19651				/*
19652				 * If there's a spillover, but the following
19653				 * xmit_tail couldn't give us enough octets
19654				 * to reach "len", then stop the current
19655				 * Multidata creation and let the legacy
19656				 * tcp_send() path take over.  We don't want
19657				 * to send the tiny segment as part of this
19658				 * Multidata for performance reasons; instead,
19659				 * we let the legacy path deal with grouping
19660				 * it with the subsequent small mblks.
19661				 */
19662				if (*usable >= len &&
19663				    MBLKL((*xmit_tail)->b_cont) < spill) {
19664					max_pld = 0;
19665					break;	/* done */
19666				}
19667
19668				/*
19669				 * We can't spillover, and we are near
19670				 * the end of the current payload buffer,
19671				 * so send what's left.
19672				 */
19673				ASSERT(*tail_unsent > 0);
19674				len = *tail_unsent;
19675			}
19676
19677			/* tail_unsent is negated if there is a spillover */
19678			*tail_unsent -= len;
19679			*usable -= len;
19680			ASSERT(*usable >= 0);
19681
19682			if (*usable < mss)
19683				seg_len = *usable;
19684			/*
19685			 * Sender SWS avoidance; see comments in tcp_send();
19686			 * everything else is the same, except that we only
19687			 * do this here if there is no more data to be sent
19688			 * following the current xmit_tail.  We don't check
19689			 * for 1-byte urgent data because we shouldn't get
19690			 * here if TCP_URG_VALID is set.
19691			 */
19692			if (*usable > 0 && *usable < mss &&
19693			    ((md_pbuf_nxt == NULL &&
19694			    (*xmit_tail)->b_cont == NULL) ||
19695			    (md_pbuf_nxt != NULL &&
19696			    (*xmit_tail)->b_cont->b_cont == NULL)) &&
19697			    seg_len < (tcp->tcp_max_swnd >> 1) &&
19698			    (tcp->tcp_unsent -
19699			    ((*snxt + len) - tcp->tcp_snxt)) > seg_len &&
19700			    !tcp->tcp_zero_win_probe) {
19701				if ((*snxt + len) == tcp->tcp_snxt &&
19702				    (*snxt + len) == tcp->tcp_suna) {
19703					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
19704				}
19705				done = B_TRUE;
19706			}
19707
19708			/*
19709			 * Prime pump for IP's checksumming on our behalf;
19710			 * include the adjustment for a source route if any.
19711			 * Do this only for software/partial hardware checksum
19712			 * offload, as this field gets zeroed out later for
19713			 * the full hardware checksum offload case.
19714			 */
19715			if (!(hwcksum_flags & HCK_FULLCKSUM)) {
19716				cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum;
19717				cksum = (cksum >> 16) + (cksum & 0xFFFF);
19718				U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum);
19719			}
19720
19721			U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq);
19722			*snxt += len;
19723
19724			tcp->tcp_tcph->th_flags[0] = TH_ACK;
19725			/*
19726			 * We set the PUSH bit only if TCP has no more buffered
19727			 * data to be transmitted (or if sender SWS avoidance
19728			 * takes place), as opposed to setting it for every
19729			 * last packet in the burst.
19730			 */
19731			if (done ||
19732			    (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0)
19733				tcp->tcp_tcph->th_flags[0] |= TH_PUSH;
19734
19735			/*
19736			 * Set FIN bit if this is our last segment; snxt
19737			 * already includes its length, and it will not
19738			 * be adjusted after this point.
19739			 */
19740			if (tcp->tcp_valid_bits == TCP_FSS_VALID &&
19741			    *snxt == tcp->tcp_fss) {
19742				if (!tcp->tcp_fin_acked) {
19743					tcp->tcp_tcph->th_flags[0] |= TH_FIN;
19744					BUMP_MIB(&tcp_mib, tcpOutControl);
19745				}
19746				if (!tcp->tcp_fin_sent) {
19747					tcp->tcp_fin_sent = B_TRUE;
19748					/*
19749					 * tcp state must be ESTABLISHED
19750					 * in order for us to get here in
19751					 * the first place.
19752					 */
19753					tcp->tcp_state = TCPS_FIN_WAIT_1;
19754
19755					/*
19756					 * Upon returning from this routine,
19757					 * tcp_wput_data() will set tcp_snxt
19758					 * to be equal to snxt + tcp_fin_sent.
19759					 * This is essentially the same as
19760					 * setting it to tcp_fss + 1.
19761					 */
19762				}
19763			}
19764
19765			tcp->tcp_last_sent_len = (ushort_t)len;
19766
19767			len += tcp_hdr_len;
19768			if (tcp->tcp_ipversion == IPV4_VERSION)
19769				tcp->tcp_ipha->ipha_length = htons(len);
19770			else
19771				tcp->tcp_ip6h->ip6_plen = htons(len -
19772				    ((char *)&tcp->tcp_ip6h[1] -
19773				    tcp->tcp_iphc));
19774
19775			pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF);
19776
19777			/* setup header fragment */
19778			PDESC_HDR_ADD(pkt_info,
19779			    md_hbuf->b_rptr + cur_hdr_off,	/* base */
19780			    tcp->tcp_mdt_hdr_head,		/* head room */
19781			    tcp_hdr_len,			/* len */
19782			    tcp->tcp_mdt_hdr_tail);		/* tail room */
19783
19784			ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base ==
19785			    hdr_frag_sz);
19786			ASSERT(MBLKIN(md_hbuf,
19787			    (pkt_info->hdr_base - md_hbuf->b_rptr),
19788			    PDESC_HDRSIZE(pkt_info)));
19789
19790			/* setup first payload fragment */
19791			PDESC_PLD_INIT(pkt_info);
19792			PDESC_PLD_SPAN_ADD(pkt_info,
19793			    pbuf_idx,				/* index */
19794			    md_pbuf->b_rptr + cur_pld_off,	/* start */
19795			    tcp->tcp_last_sent_len);		/* len */
19796
19797			/* create a split-packet in case of a spillover */
19798			if (md_pbuf_nxt != NULL) {
19799				ASSERT(spill > 0);
19800				ASSERT(pbuf_idx_nxt > pbuf_idx);
19801				ASSERT(!add_buffer);
19802
19803				md_pbuf = md_pbuf_nxt;
19804				md_pbuf_nxt = NULL;
19805				pbuf_idx = pbuf_idx_nxt;
19806				pbuf_idx_nxt = -1;
19807				cur_pld_off = spill;
19808
19809				/* trim out first payload fragment */
19810				PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill);
19811
19812				/* setup second payload fragment */
19813				PDESC_PLD_SPAN_ADD(pkt_info,
19814				    pbuf_idx,			/* index */
19815				    md_pbuf->b_rptr,		/* start */
19816				    spill);			/* len */
19817
19818				if ((*xmit_tail)->b_next == NULL) {
19819					/*
19820					 * Store the lbolt used for RTT
19821					 * estimation. We can only record one
19822					 * timestamp per mblk so we do it when
19823					 * we reach the end of the payload
19824					 * buffer.  Also we only take a new
19825					 * timestamp sample when the previous
19826					 * timed data from the same mblk has
19827					 * been ack'ed.
19828					 */
19829					(*xmit_tail)->b_prev = local_time;
19830					(*xmit_tail)->b_next =
19831					    (mblk_t *)(uintptr_t)first_snxt;
19832				}
19833
19834				first_snxt = *snxt - spill;
19835
19836				/*
19837				 * Advance xmit_tail; usable could be 0 by
19838				 * the time we got here, but we made sure
19839				 * above that we would only spillover to
19840				 * the next data block if usable includes
19841				 * the spilled-over amount prior to the
19842				 * subtraction.  Therefore, we are sure
19843				 * that xmit_tail->b_cont can't be NULL.
19844				 */
19845				ASSERT((*xmit_tail)->b_cont != NULL);
19846				*xmit_tail = (*xmit_tail)->b_cont;
19847				ASSERT((uintptr_t)MBLKL(*xmit_tail) <=
19848				    (uintptr_t)INT_MAX);
19849				*tail_unsent = (int)MBLKL(*xmit_tail) - spill;
19850			} else {
19851				cur_pld_off += tcp->tcp_last_sent_len;
19852			}
19853
19854			/*
19855			 * Fill in the header using the template header, and
19856			 * add options such as time-stamp, ECN and/or SACK,
19857			 * as needed.
19858			 */
19859			tcp_fill_header(tcp, pkt_info->hdr_rptr,
19860			    (clock_t)local_time, num_sack_blk);
19861
19862			/* take care of some IP header businesses */
19863			if (af == AF_INET) {
19864				ipha = (ipha_t *)pkt_info->hdr_rptr;
19865
19866				ASSERT(OK_32PTR((uchar_t *)ipha));
19867				ASSERT(PDESC_HDRL(pkt_info) >=
19868				    IP_SIMPLE_HDR_LENGTH);
19869				ASSERT(ipha->ipha_version_and_hdr_length ==
19870				    IP_SIMPLE_HDR_VERSION);
19871
19872				/*
19873				 * Assign ident value for current packet; see
19874				 * related comments in ip_wput_ire() about the
19875				 * contract private interface with clustering
19876				 * group.
19877				 */
19878				clusterwide = B_FALSE;
19879				if (cl_inet_ipident != NULL) {
19880					ASSERT(cl_inet_isclusterwide != NULL);
19881					if ((*cl_inet_isclusterwide)(IPPROTO_IP,
19882					    AF_INET,
19883					    (uint8_t *)(uintptr_t)src)) {
19884						ipha->ipha_ident =
19885						    (*cl_inet_ipident)
19886						    (IPPROTO_IP, AF_INET,
19887						    (uint8_t *)(uintptr_t)src,
19888						    (uint8_t *)(uintptr_t)dst);
19889						clusterwide = B_TRUE;
19890					}
19891				}
19892
19893				if (!clusterwide) {
19894					ipha->ipha_ident = (uint16_t)
19895					    atomic_add_32_nv(
19896						&ire->ire_ident, 1);
19897				}
19898#ifndef _BIG_ENDIAN
19899				ipha->ipha_ident = (ipha->ipha_ident << 8) |
19900				    (ipha->ipha_ident >> 8);
19901#endif
19902			} else {
19903				ip6h = (ip6_t *)pkt_info->hdr_rptr;
19904
19905				ASSERT(OK_32PTR((uchar_t *)ip6h));
19906				ASSERT(IPVER(ip6h) == IPV6_VERSION);
19907				ASSERT(ip6h->ip6_nxt == IPPROTO_TCP);
19908				ASSERT(PDESC_HDRL(pkt_info) >=
19909				    (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET +
19910				    TCP_CHECKSUM_SIZE));
19911				ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
19912
19913				if (tcp->tcp_ip_forward_progress) {
19914					rconfirm = B_TRUE;
19915					tcp->tcp_ip_forward_progress = B_FALSE;
19916				}
19917			}
19918
19919			/* at least one payload span, and at most two */
19920			ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3);
19921
19922			/* add the packet descriptor to Multidata */
19923			if ((pkt = mmd_addpdesc(mmd, pkt_info, &err,
19924			    KM_NOSLEEP)) == NULL) {
19925				/*
19926				 * Any failure other than ENOMEM indicates
19927				 * that we have passed in invalid pkt_info
19928				 * or parameters to mmd_addpdesc, which must
19929				 * not happen.
19930				 *
19931				 * EINVAL is a result of failure on boundary
19932				 * checks against the pkt_info contents.  It
19933				 * should not happen, and we panic because
19934				 * either there's horrible heap corruption,
19935				 * and/or programming mistake.
19936				 */
19937				if (err != ENOMEM) {
19938					cmn_err(CE_PANIC, "tcp_multisend: "
19939					    "pdesc logic error detected for "
19940					    "tcp %p mmd %p pinfo %p (%d)\n",
19941					    (void *)tcp, (void *)mmd,
19942					    (void *)pkt_info, err);
19943				}
19944				TCP_STAT(tcp_mdt_addpdescfail);
19945				goto legacy_send; /* out_of_mem */
19946			}
19947			ASSERT(pkt != NULL);
19948
19949			/* calculate IP header and TCP checksums */
19950			if (af == AF_INET) {
19951				/* calculate pseudo-header checksum */
19952				cksum = (dst >> 16) + (dst & 0xFFFF) +
19953				    (src >> 16) + (src & 0xFFFF);
19954
19955				/* offset for TCP header checksum */
19956				up = IPH_TCPH_CHECKSUMP(ipha,
19957				    IP_SIMPLE_HDR_LENGTH);
19958			} else {
19959				up = (uint16_t *)&ip6h->ip6_src;
19960
19961				/* calculate pseudo-header checksum */
19962				cksum = up[0] + up[1] + up[2] + up[3] +
19963				    up[4] + up[5] + up[6] + up[7] +
19964				    up[8] + up[9] + up[10] + up[11] +
19965				    up[12] + up[13] + up[14] + up[15];
19966
19967				/* Fold the initial sum */
19968				cksum = (cksum & 0xffff) + (cksum >> 16);
19969
19970				up = (uint16_t *)(((uchar_t *)ip6h) +
19971				    IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET);
19972			}
19973
19974			if (hwcksum_flags & HCK_FULLCKSUM) {
19975				/* clear checksum field for hardware */
19976				*up = 0;
19977			} else if (hwcksum_flags & HCK_PARTIALCKSUM) {
19978				uint32_t sum;
19979
19980				/* pseudo-header checksumming */
19981				sum = *up + cksum + IP_TCP_CSUM_COMP;
19982				sum = (sum & 0xFFFF) + (sum >> 16);
19983				*up = (sum & 0xFFFF) + (sum >> 16);
19984			} else {
19985				/* software checksumming */
19986				TCP_STAT(tcp_out_sw_cksum);
19987				TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
19988				    tcp->tcp_hdr_len + tcp->tcp_last_sent_len);
19989				*up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len,
19990				    cksum + IP_TCP_CSUM_COMP);
19991				if (*up == 0)
19992					*up = 0xFFFF;
19993			}
19994
19995			/* IPv4 header checksum */
19996			if (af == AF_INET) {
19997				ipha->ipha_fragment_offset_and_flags |=
19998				    (uint32_t)htons(ire->ire_frag_flag);
19999
20000				if (hwcksum_flags & HCK_IPV4_HDRCKSUM) {
20001					ipha->ipha_hdr_checksum = 0;
20002				} else {
20003					IP_HDR_CKSUM(ipha, cksum,
20004					    ((uint32_t *)ipha)[0],
20005					    ((uint16_t *)ipha)[4]);
20006				}
20007			}
20008
20009			/* advance header offset */
20010			cur_hdr_off += hdr_frag_sz;
20011
20012			obbytes += tcp->tcp_last_sent_len;
20013			++obsegs;
20014		} while (!done && *usable > 0 && --num_burst_seg > 0 &&
20015		    *tail_unsent > 0);
20016
20017		if ((*xmit_tail)->b_next == NULL) {
20018			/*
20019			 * Store the lbolt used for RTT estimation. We can only
20020			 * record one timestamp per mblk so we do it when we
20021			 * reach the end of the payload buffer. Also we only
20022			 * take a new timestamp sample when the previous timed
20023			 * data from the same mblk has been ack'ed.
20024			 */
20025			(*xmit_tail)->b_prev = local_time;
20026			(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt;
20027		}
20028
20029		ASSERT(*tail_unsent >= 0);
20030		if (*tail_unsent > 0) {
20031			/*
20032			 * We got here because we broke out of the above
20033			 * loop due to of one of the following cases:
20034			 *
20035			 *   1. len < adjusted MSS (i.e. small),
20036			 *   2. Sender SWS avoidance,
20037			 *   3. max_pld is zero.
20038			 *
20039			 * We are done for this Multidata, so trim our
20040			 * last payload buffer (if any) accordingly.
20041			 */
20042			if (md_pbuf != NULL)
20043				md_pbuf->b_wptr -= *tail_unsent;
20044		} else if (*usable > 0) {
20045			*xmit_tail = (*xmit_tail)->b_cont;
20046			ASSERT((uintptr_t)MBLKL(*xmit_tail) <=
20047			    (uintptr_t)INT_MAX);
20048			*tail_unsent = (int)MBLKL(*xmit_tail);
20049			add_buffer = B_TRUE;
20050		}
20051	} while (!done && *usable > 0 && num_burst_seg > 0 &&
20052	    (tcp_mdt_chain || max_pld > 0));
20053
20054	/* send everything down */
20055	tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes,
20056	    &rconfirm);
20057
20058#undef PREP_NEW_MULTIDATA
20059#undef PREP_NEW_PBUF
20060#undef IPVER
20061
20062	IRE_REFRELE(ire);
20063	return (0);
20064}
20065
20066/*
20067 * A wrapper function for sending one or more Multidata messages down to
20068 * the module below ip; this routine does not release the reference of the
20069 * IRE (caller does that).  This routine is analogous to tcp_send_data().
20070 */
20071static void
20072tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head,
20073    const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm)
20074{
20075	uint64_t delta;
20076	nce_t *nce;
20077
20078	ASSERT(ire != NULL && ill != NULL);
20079	ASSERT(ire->ire_stq != NULL);
20080	ASSERT(md_mp_head != NULL);
20081	ASSERT(rconfirm != NULL);
20082
20083	/* adjust MIBs and IRE timestamp */
20084	TCP_RECORD_TRACE(tcp, md_mp_head, TCP_TRACE_SEND_PKT);
20085	tcp->tcp_obsegs += obsegs;
20086	UPDATE_MIB(&tcp_mib, tcpOutDataSegs, obsegs);
20087	UPDATE_MIB(&tcp_mib, tcpOutDataBytes, obbytes);
20088	TCP_STAT_UPDATE(tcp_mdt_pkt_out, obsegs);
20089
20090	if (tcp->tcp_ipversion == IPV4_VERSION) {
20091		TCP_STAT_UPDATE(tcp_mdt_pkt_out_v4, obsegs);
20092		UPDATE_MIB(&ip_mib, ipOutRequests, obsegs);
20093	} else {
20094		TCP_STAT_UPDATE(tcp_mdt_pkt_out_v6, obsegs);
20095		UPDATE_MIB(&ip6_mib, ipv6OutRequests, obsegs);
20096	}
20097
20098	ire->ire_ob_pkt_count += obsegs;
20099	if (ire->ire_ipif != NULL)
20100		atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs);
20101	ire->ire_last_used_time = lbolt;
20102
20103	/* send it down */
20104	putnext(ire->ire_stq, md_mp_head);
20105
20106	/* we're done for TCP/IPv4 */
20107	if (tcp->tcp_ipversion == IPV4_VERSION)
20108		return;
20109
20110	nce = ire->ire_nce;
20111
20112	ASSERT(nce != NULL);
20113	ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT)));
20114	ASSERT(nce->nce_state != ND_INCOMPLETE);
20115
20116	/* reachability confirmation? */
20117	if (*rconfirm) {
20118		nce->nce_last = TICK_TO_MSEC(lbolt64);
20119		if (nce->nce_state != ND_REACHABLE) {
20120			mutex_enter(&nce->nce_lock);
20121			nce->nce_state = ND_REACHABLE;
20122			nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
20123			mutex_exit(&nce->nce_lock);
20124			(void) untimeout(nce->nce_timeout_id);
20125			if (ip_debug > 2) {
20126				/* ip1dbg */
20127				pr_addr_dbg("tcp_multisend_data: state "
20128				    "for %s changed to REACHABLE\n",
20129				    AF_INET6, &ire->ire_addr_v6);
20130			}
20131		}
20132		/* reset transport reachability confirmation */
20133		*rconfirm = B_FALSE;
20134	}
20135
20136	delta =  TICK_TO_MSEC(lbolt64) - nce->nce_last;
20137	ip1dbg(("tcp_multisend_data: delta = %" PRId64
20138	    " ill_reachable_time = %d \n", delta, ill->ill_reachable_time));
20139
20140	if (delta > (uint64_t)ill->ill_reachable_time) {
20141		mutex_enter(&nce->nce_lock);
20142		switch (nce->nce_state) {
20143		case ND_REACHABLE:
20144		case ND_STALE:
20145			/*
20146			 * ND_REACHABLE is identical to ND_STALE in this
20147			 * specific case. If reachable time has expired for
20148			 * this neighbor (delta is greater than reachable
20149			 * time), conceptually, the neighbor cache is no
20150			 * longer in REACHABLE state, but already in STALE
20151			 * state.  So the correct transition here is to
20152			 * ND_DELAY.
20153			 */
20154			nce->nce_state = ND_DELAY;
20155			mutex_exit(&nce->nce_lock);
20156			NDP_RESTART_TIMER(nce, delay_first_probe_time);
20157			if (ip_debug > 3) {
20158				/* ip2dbg */
20159				pr_addr_dbg("tcp_multisend_data: state "
20160				    "for %s changed to DELAY\n",
20161				    AF_INET6, &ire->ire_addr_v6);
20162			}
20163			break;
20164		case ND_DELAY:
20165		case ND_PROBE:
20166			mutex_exit(&nce->nce_lock);
20167			/* Timers have already started */
20168			break;
20169		case ND_UNREACHABLE:
20170			/*
20171			 * ndp timer has detected that this nce is
20172			 * unreachable and initiated deleting this nce
20173			 * and all its associated IREs. This is a race
20174			 * where we found the ire before it was deleted
20175			 * and have just sent out a packet using this
20176			 * unreachable nce.
20177			 */
20178			mutex_exit(&nce->nce_lock);
20179			break;
20180		default:
20181			ASSERT(0);
20182		}
20183	}
20184}
20185
20186/*
20187 * tcp_send() is called by tcp_wput_data() for non-Multidata transmission
20188 * scheme, and returns one of the following:
20189 *
20190 * -1 = failed allocation.
20191 *  0 = success; burst count reached, or usable send window is too small,
20192 *      and that we'd rather wait until later before sending again.
20193 *  1 = success; we are called from tcp_multisend(), and both usable send
20194 *      window and tail_unsent are greater than the MDT threshold, and thus
20195 *      Multidata Transmit should be used instead.
20196 */
20197static int
20198tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
20199    const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable,
20200    uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
20201    const int mdt_thres)
20202{
20203	int num_burst_seg = tcp->tcp_snd_burst;
20204
20205	for (;;) {
20206		struct datab	*db;
20207		tcph_t		*tcph;
20208		uint32_t	sum;
20209		mblk_t		*mp, *mp1;
20210		uchar_t		*rptr;
20211		int		len;
20212
20213		/*
20214		 * If we're called by tcp_multisend(), and the amount of
20215		 * sendable data as well as the size of current xmit_tail
20216		 * is beyond the MDT threshold, return to the caller and
20217		 * let the large data transmit be done using MDT.
20218		 */
20219		if (*usable > 0 && *usable > mdt_thres &&
20220		    (*tail_unsent > mdt_thres || (*tail_unsent == 0 &&
20221		    MBLKL((*xmit_tail)->b_cont) > mdt_thres))) {
20222			ASSERT(tcp->tcp_mdt);
20223			return (1);	/* success; do large send */
20224		}
20225
20226		if (num_burst_seg-- == 0)
20227			break;		/* success; burst count reached */
20228
20229		len = mss;
20230		if (len > *usable) {
20231			len = *usable;
20232			if (len <= 0) {
20233				/* Terminate the loop */
20234				break;	/* success; too small */
20235			}
20236			/*
20237			 * Sender silly-window avoidance.
20238			 * Ignore this if we are going to send a
20239			 * zero window probe out.
20240			 *
20241			 * TODO: force data into microscopic window?
20242			 *	==> (!pushed || (unsent > usable))
20243			 */
20244			if (len < (tcp->tcp_max_swnd >> 1) &&
20245			    (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
20246			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
20247			    len == 1) && (! tcp->tcp_zero_win_probe)) {
20248				/*
20249				 * If the retransmit timer is not running
20250				 * we start it so that we will retransmit
20251				 * in the case when the the receiver has
20252				 * decremented the window.
20253				 */
20254				if (*snxt == tcp->tcp_snxt &&
20255				    *snxt == tcp->tcp_suna) {
20256					/*
20257					 * We are not supposed to send
20258					 * anything.  So let's wait a little
20259					 * bit longer before breaking SWS
20260					 * avoidance.
20261					 *
20262					 * What should the value be?
20263					 * Suggestion: MAX(init rexmit time,
20264					 * tcp->tcp_rto)
20265					 */
20266					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
20267				}
20268				break;	/* success; too small */
20269			}
20270		}
20271
20272		tcph = tcp->tcp_tcph;
20273
20274		*usable -= len; /* Approximate - can be adjusted later */
20275		if (*usable > 0)
20276			tcph->th_flags[0] = TH_ACK;
20277		else
20278			tcph->th_flags[0] = (TH_ACK | TH_PUSH);
20279
20280		/*
20281		 * Prime pump for IP's checksumming on our behalf
20282		 * Include the adjustment for a source route if any.
20283		 */
20284		sum = len + tcp_tcp_hdr_len + tcp->tcp_sum;
20285		sum = (sum >> 16) + (sum & 0xFFFF);
20286		U16_TO_ABE16(sum, tcph->th_sum);
20287
20288		U32_TO_ABE32(*snxt, tcph->th_seq);
20289
20290		/*
20291		 * Branch off to tcp_xmit_mp() if any of the VALID bits is
20292		 * set.  For the case when TCP_FSS_VALID is the only valid
20293		 * bit (normal active close), branch off only when we think
20294		 * that the FIN flag needs to be set.  Note for this case,
20295		 * that (snxt + len) may not reflect the actual seg_len,
20296		 * as len may be further reduced in tcp_xmit_mp().  If len
20297		 * gets modified, we will end up here again.
20298		 */
20299		if (tcp->tcp_valid_bits != 0 &&
20300		    (tcp->tcp_valid_bits != TCP_FSS_VALID ||
20301		    ((*snxt + len) == tcp->tcp_fss))) {
20302			uchar_t		*prev_rptr;
20303			uint32_t	prev_snxt = tcp->tcp_snxt;
20304
20305			if (*tail_unsent == 0) {
20306				ASSERT((*xmit_tail)->b_cont != NULL);
20307				*xmit_tail = (*xmit_tail)->b_cont;
20308				prev_rptr = (*xmit_tail)->b_rptr;
20309				*tail_unsent = (int)((*xmit_tail)->b_wptr -
20310				    (*xmit_tail)->b_rptr);
20311			} else {
20312				prev_rptr = (*xmit_tail)->b_rptr;
20313				(*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
20314				    *tail_unsent;
20315			}
20316			mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
20317			    *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
20318			/* Restore tcp_snxt so we get amount sent right. */
20319			tcp->tcp_snxt = prev_snxt;
20320			if (prev_rptr == (*xmit_tail)->b_rptr) {
20321				/*
20322				 * If the previous timestamp is still in use,
20323				 * don't stomp on it.
20324				 */
20325				if ((*xmit_tail)->b_next == NULL) {
20326					(*xmit_tail)->b_prev = local_time;
20327					(*xmit_tail)->b_next =
20328					    (mblk_t *)(uintptr_t)(*snxt);
20329				}
20330			} else
20331				(*xmit_tail)->b_rptr = prev_rptr;
20332
20333			if (mp == NULL)
20334				return (-1);
20335			mp1 = mp->b_cont;
20336
20337			tcp->tcp_last_sent_len = (ushort_t)len;
20338			while (mp1->b_cont) {
20339				*xmit_tail = (*xmit_tail)->b_cont;
20340				(*xmit_tail)->b_prev = local_time;
20341				(*xmit_tail)->b_next =
20342				    (mblk_t *)(uintptr_t)(*snxt);
20343				mp1 = mp1->b_cont;
20344			}
20345			*snxt += len;
20346			*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
20347			BUMP_LOCAL(tcp->tcp_obsegs);
20348			BUMP_MIB(&tcp_mib, tcpOutDataSegs);
20349			UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len);
20350			TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
20351			tcp_send_data(tcp, q, mp);
20352			continue;
20353		}
20354
20355		*snxt += len;	/* Adjust later if we don't send all of len */
20356		BUMP_MIB(&tcp_mib, tcpOutDataSegs);
20357		UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len);
20358
20359		if (*tail_unsent) {
20360			/* Are the bytes above us in flight? */
20361			rptr = (*xmit_tail)->b_wptr - *tail_unsent;
20362			if (rptr != (*xmit_tail)->b_rptr) {
20363				*tail_unsent -= len;
20364				tcp->tcp_last_sent_len = (ushort_t)len;
20365				len += tcp_hdr_len;
20366				if (tcp->tcp_ipversion == IPV4_VERSION)
20367					tcp->tcp_ipha->ipha_length = htons(len);
20368				else
20369					tcp->tcp_ip6h->ip6_plen =
20370					    htons(len -
20371					    ((char *)&tcp->tcp_ip6h[1] -
20372					    tcp->tcp_iphc));
20373				mp = dupb(*xmit_tail);
20374				if (!mp)
20375					return (-1);	/* out_of_mem */
20376				mp->b_rptr = rptr;
20377				/*
20378				 * If the old timestamp is no longer in use,
20379				 * sample a new timestamp now.
20380				 */
20381				if ((*xmit_tail)->b_next == NULL) {
20382					(*xmit_tail)->b_prev = local_time;
20383					(*xmit_tail)->b_next =
20384					    (mblk_t *)(uintptr_t)(*snxt-len);
20385				}
20386				goto must_alloc;
20387			}
20388		} else {
20389			*xmit_tail = (*xmit_tail)->b_cont;
20390			ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
20391			    (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
20392			*tail_unsent = (int)((*xmit_tail)->b_wptr -
20393			    (*xmit_tail)->b_rptr);
20394		}
20395
20396		(*xmit_tail)->b_prev = local_time;
20397		(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);
20398
20399		*tail_unsent -= len;
20400		tcp->tcp_last_sent_len = (ushort_t)len;
20401
20402		len += tcp_hdr_len;
20403		if (tcp->tcp_ipversion == IPV4_VERSION)
20404			tcp->tcp_ipha->ipha_length = htons(len);
20405		else
20406			tcp->tcp_ip6h->ip6_plen = htons(len -
20407			    ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
20408
20409		mp = dupb(*xmit_tail);
20410		if (!mp)
20411			return (-1);	/* out_of_mem */
20412
20413		len = tcp_hdr_len;
20414		/*
20415		 * There are four reasons to allocate a new hdr mblk:
20416		 *  1) The bytes above us are in use by another packet
20417		 *  2) We don't have good alignment
20418		 *  3) The mblk is being shared
20419		 *  4) We don't have enough room for a header
20420		 */
20421		rptr = mp->b_rptr - len;
20422		if (!OK_32PTR(rptr) ||
20423		    ((db = mp->b_datap), db->db_ref != 2) ||
20424		    rptr < db->db_base) {
20425			/* NOTE: we assume allocb returns an OK_32PTR */
20426
20427		must_alloc:;
20428			mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
20429			    tcp_wroff_xtra, BPRI_MED);
20430			if (!mp1) {
20431				freemsg(mp);
20432				return (-1);	/* out_of_mem */
20433			}
20434			mp1->b_cont = mp;
20435			mp = mp1;
20436			/* Leave room for Link Level header */
20437			len = tcp_hdr_len;
20438			rptr = &mp->b_rptr[tcp_wroff_xtra];
20439			mp->b_wptr = &rptr[len];
20440		}
20441
20442		/*
20443		 * Fill in the header using the template header, and add
20444		 * options such as time-stamp, ECN and/or SACK, as needed.
20445		 */
20446		tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
20447
20448		mp->b_rptr = rptr;
20449
20450		if (*tail_unsent) {
20451			int spill = *tail_unsent;
20452
20453			mp1 = mp->b_cont;
20454			if (!mp1)
20455				mp1 = mp;
20456
20457			/*
20458			 * If we're a little short, tack on more mblks until
20459			 * there is no more spillover.
20460			 */
20461			while (spill < 0) {
20462				mblk_t *nmp;
20463				int nmpsz;
20464
20465				nmp = (*xmit_tail)->b_cont;
20466				nmpsz = MBLKL(nmp);
20467
20468				/*
20469				 * Excess data in mblk; can we split it?
20470				 * If MDT is enabled for the connection,
20471				 * keep on splitting as this is a transient
20472				 * send path.
20473				 */
20474				if (!tcp->tcp_mdt && (spill + nmpsz > 0)) {
20475					/*
20476					 * Don't split if stream head was
20477					 * told to break up larger writes
20478					 * into smaller ones.
20479					 */
20480					if (tcp->tcp_maxpsz > 0)
20481						break;
20482
20483					/*
20484					 * Next mblk is less than SMSS/2
20485					 * rounded up to nearest 64-byte;
20486					 * let it get sent as part of the
20487					 * next segment.
20488					 */
20489					if (tcp->tcp_localnet &&
20490					    !tcp->tcp_cork &&
20491					    (nmpsz < roundup((mss >> 1), 64)))
20492						break;
20493				}
20494
20495				*xmit_tail = nmp;
20496				ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX);
20497				/* Stash for rtt use later */
20498				(*xmit_tail)->b_prev = local_time;
20499				(*xmit_tail)->b_next =
20500				    (mblk_t *)(uintptr_t)(*snxt - len);
20501				mp1->b_cont = dupb(*xmit_tail);
20502				mp1 = mp1->b_cont;
20503
20504				spill += nmpsz;
20505				if (mp1 == NULL) {
20506					*tail_unsent = spill;
20507					freemsg(mp);
20508					return (-1);	/* out_of_mem */
20509				}
20510			}
20511
20512			/* Trim back any surplus on the last mblk */
20513			if (spill >= 0) {
20514				mp1->b_wptr -= spill;
20515				*tail_unsent = spill;
20516			} else {
20517				/*
20518				 * We did not send everything we could in
20519				 * order to remain within the b_cont limit.
20520				 */
20521				*usable -= spill;
20522				*snxt += spill;
20523				tcp->tcp_last_sent_len += spill;
20524				UPDATE_MIB(&tcp_mib, tcpOutDataBytes, spill);
20525				/*
20526				 * Adjust the checksum
20527				 */
20528				tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);
20529				sum += spill;
20530				sum = (sum >> 16) + (sum & 0xFFFF);
20531				U16_TO_ABE16(sum, tcph->th_sum);
20532				if (tcp->tcp_ipversion == IPV4_VERSION) {
20533					sum = ntohs(
20534					    ((ipha_t *)rptr)->ipha_length) +
20535					    spill;
20536					((ipha_t *)rptr)->ipha_length =
20537					    htons(sum);
20538				} else {
20539					sum = ntohs(
20540					    ((ip6_t *)rptr)->ip6_plen) +
20541					    spill;
20542					((ip6_t *)rptr)->ip6_plen =
20543					    htons(sum);
20544				}
20545				*tail_unsent = 0;
20546			}
20547		}
20548		if (tcp->tcp_ip_forward_progress) {
20549			ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
20550			*(uint32_t *)mp->b_rptr  |= IP_FORWARD_PROG;
20551			tcp->tcp_ip_forward_progress = B_FALSE;
20552		}
20553
20554		TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
20555		tcp_send_data(tcp, q, mp);
20556		BUMP_LOCAL(tcp->tcp_obsegs);
20557	}
20558
20559	return (0);
20560}
20561
20562/* Unlink and return any mblk that looks like it contains a MDT info */
20563static mblk_t *
20564tcp_mdt_info_mp(mblk_t *mp)
20565{
20566	mblk_t	*prev_mp;
20567
20568	for (;;) {
20569		prev_mp = mp;
20570		/* no more to process? */
20571		if ((mp = mp->b_cont) == NULL)
20572			break;
20573
20574		switch (DB_TYPE(mp)) {
20575		case M_CTL:
20576			if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE)
20577				continue;
20578			ASSERT(prev_mp != NULL);
20579			prev_mp->b_cont = mp->b_cont;
20580			mp->b_cont = NULL;
20581			return (mp);
20582		default:
20583			break;
20584		}
20585	}
20586	return (mp);
20587}
20588
20589/* MDT info update routine, called when IP notifies us about MDT */
20590static void
20591tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first)
20592{
20593	boolean_t prev_state;
20594
20595	/*
20596	 * IP is telling us to abort MDT on this connection?  We know
20597	 * this because the capability is only turned off when IP
20598	 * encounters some pathological cases, e.g. link-layer change
20599	 * where the new driver doesn't support MDT, or in situation
20600	 * where MDT usage on the link-layer has been switched off.
20601	 * IP would not have sent us the initial MDT_IOC_INFO_UPDATE
20602	 * if the link-layer doesn't support MDT, and if it does, it
20603	 * will indicate that the feature is to be turned on.
20604	 */
20605	prev_state = tcp->tcp_mdt;
20606	tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0);
20607	if (!tcp->tcp_mdt && !first) {
20608		TCP_STAT(tcp_mdt_conn_halted3);
20609		ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n",
20610		    (void *)tcp->tcp_connp));
20611	}
20612
20613	/*
20614	 * We currently only support MDT on simple TCP/{IPv4,IPv6},
20615	 * so disable MDT otherwise.  The checks are done here
20616	 * and in tcp_wput_data().
20617	 */
20618	if (tcp->tcp_mdt &&
20619	    (tcp->tcp_ipversion == IPV4_VERSION &&
20620	    tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) ||
20621	    (tcp->tcp_ipversion == IPV6_VERSION &&
20622	    tcp->tcp_ip_hdr_len != IPV6_HDR_LEN))
20623		tcp->tcp_mdt = B_FALSE;
20624
20625	if (tcp->tcp_mdt) {
20626		if (mdt_capab->ill_mdt_version != MDT_VERSION_2) {
20627			cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT "
20628			    "version (%d), expected version is %d",
20629			    mdt_capab->ill_mdt_version, MDT_VERSION_2);
20630			tcp->tcp_mdt = B_FALSE;
20631			return;
20632		}
20633
20634		/*
20635		 * We need the driver to be able to handle at least three
20636		 * spans per packet in order for tcp MDT to be utilized.
20637		 * The first is for the header portion, while the rest are
20638		 * needed to handle a packet that straddles across two
20639		 * virtually non-contiguous buffers; a typical tcp packet
20640		 * therefore consists of only two spans.  Note that we take
20641		 * a zero as "don't care".
20642		 */
20643		if (mdt_capab->ill_mdt_span_limit > 0 &&
20644		    mdt_capab->ill_mdt_span_limit < 3) {
20645			tcp->tcp_mdt = B_FALSE;
20646			return;
20647		}
20648
20649		/* a zero means driver wants default value */
20650		tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld,
20651		    tcp_mdt_max_pbufs);
20652		if (tcp->tcp_mdt_max_pld == 0)
20653			tcp->tcp_mdt_max_pld = tcp_mdt_max_pbufs;
20654
20655		/* ensure 32-bit alignment */
20656		tcp->tcp_mdt_hdr_head = roundup(MAX(tcp_mdt_hdr_head_min,
20657		    mdt_capab->ill_mdt_hdr_head), 4);
20658		tcp->tcp_mdt_hdr_tail = roundup(MAX(tcp_mdt_hdr_tail_min,
20659		    mdt_capab->ill_mdt_hdr_tail), 4);
20660
20661		if (!first && !prev_state) {
20662			TCP_STAT(tcp_mdt_conn_resumed2);
20663			ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n",
20664			    (void *)tcp->tcp_connp));
20665		}
20666	}
20667}
20668
20669static void
20670tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_mdt)
20671{
20672	conn_t *connp = tcp->tcp_connp;
20673
20674	ASSERT(ire != NULL);
20675
20676	/*
20677	 * We may be in the fastpath here, and although we essentially do
20678	 * similar checks as in ip_bind_connected{_v6}/ip_mdinfo_return,
20679	 * we try to keep things as brief as possible.  After all, these
20680	 * are only best-effort checks, and we do more thorough ones prior
20681	 * to calling tcp_multisend().
20682	 */
20683	if (ip_multidata_outbound && check_mdt &&
20684	    !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
20685	    ill != NULL && ILL_MDT_CAPABLE(ill) &&
20686	    !CONN_IPSEC_OUT_ENCAPSULATED(connp) &&
20687	    !(ire->ire_flags & RTF_MULTIRT) &&
20688	    !IPP_ENABLED(IPP_LOCAL_OUT) &&
20689	    CONN_IS_MD_FASTPATH(connp)) {
20690		/* Remember the result */
20691		connp->conn_mdt_ok = B_TRUE;
20692
20693		ASSERT(ill->ill_mdt_capab != NULL);
20694		if (!ill->ill_mdt_capab->ill_mdt_on) {
20695			/*
20696			 * If MDT has been previously turned off in the past,
20697			 * and we currently can do MDT (due to IPQoS policy
20698			 * removal, etc.) then enable it for this interface.
20699			 */
20700			ill->ill_mdt_capab->ill_mdt_on = 1;
20701			ip1dbg(("tcp_ire_ill_check: connp %p enables MDT for "
20702			    "interface %s\n", (void *)connp, ill->ill_name));
20703		}
20704		tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE);
20705	}
20706
20707	/*
20708	 * The goal is to reduce the number of generated tcp segments by
20709	 * setting the maxpsz multiplier to 0; this will have an affect on
20710	 * tcp_maxpsz_set().  With this behavior, tcp will pack more data
20711	 * into each packet, up to SMSS bytes.  Doing this reduces the number
20712	 * of outbound segments and incoming ACKs, thus allowing for better
20713	 * network and system performance.  In contrast the legacy behavior
20714	 * may result in sending less than SMSS size, because the last mblk
20715	 * for some packets may have more data than needed to make up SMSS,
20716	 * and the legacy code refused to "split" it.
20717	 *
20718	 * We apply the new behavior on following situations:
20719	 *
20720	 *   1) Loopback connections,
20721	 *   2) Connections in which the remote peer is not on local subnet,
20722	 *   3) Local subnet connections over the bge interface (see below).
20723	 *
20724	 * Ideally, we would like this behavior to apply for interfaces other
20725	 * than bge.  However, doing so would negatively impact drivers which
20726	 * perform dynamic mapping and unmapping of DMA resources, which are
20727	 * increased by setting the maxpsz multiplier to 0 (more mblks per
20728	 * packet will be generated by tcp).  The bge driver does not suffer
20729	 * from this, as it copies the mblks into pre-mapped buffers, and
20730	 * therefore does not require more I/O resources than before.
20731	 *
20732	 * Otherwise, this behavior is present on all network interfaces when
20733	 * the destination endpoint is non-local, since reducing the number
20734	 * of packets in general is good for the network.
20735	 *
20736	 * TODO We need to remove this hard-coded conditional for bge once
20737	 *	a better "self-tuning" mechanism, or a way to comprehend
20738	 *	the driver transmit strategy is devised.  Until the solution
20739	 *	is found and well understood, we live with this hack.
20740	 */
20741	if (!tcp_static_maxpsz &&
20742	    (tcp->tcp_loopback || !tcp->tcp_localnet ||
20743	    (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) {
20744		/* override the default value */
20745		tcp->tcp_maxpsz = 0;
20746
20747		ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on "
20748		    "interface %s\n", (void *)connp, tcp->tcp_maxpsz,
20749		    ill != NULL ? ill->ill_name : ipif_loopback_name));
20750	}
20751
20752	/* set the stream head parameters accordingly */
20753	(void) tcp_maxpsz_set(tcp, B_TRUE);
20754}
20755
20756/* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
20757static void
20758tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
20759{
20760	uchar_t	fval = *mp->b_rptr;
20761	mblk_t	*tail;
20762	queue_t	*q = tcp->tcp_wq;
20763
20764	/* TODO: How should flush interact with urgent data? */
20765	if ((fval & FLUSHW) && tcp->tcp_xmit_head &&
20766	    !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
20767		/*
20768		 * Flush only data that has not yet been put on the wire.  If
20769		 * we flush data that we have already transmitted, life, as we
20770		 * know it, may come to an end.
20771		 */
20772		tail = tcp->tcp_xmit_tail;
20773		tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
20774		tcp->tcp_xmit_tail_unsent = 0;
20775		tcp->tcp_unsent = 0;
20776		if (tail->b_wptr != tail->b_rptr)
20777			tail = tail->b_cont;
20778		if (tail) {
20779			mblk_t **excess = &tcp->tcp_xmit_head;
20780			for (;;) {
20781				mblk_t *mp1 = *excess;
20782				if (mp1 == tail)
20783					break;
20784				tcp->tcp_xmit_tail = mp1;
20785				tcp->tcp_xmit_last = mp1;
20786				excess = &mp1->b_cont;
20787			}
20788			*excess = NULL;
20789			tcp_close_mpp(&tail);
20790			if (tcp->tcp_snd_zcopy_aware)
20791				tcp_zcopy_notify(tcp);
20792		}
20793		/*
20794		 * We have no unsent data, so unsent must be less than
20795		 * tcp_xmit_lowater, so re-enable flow.
20796		 */
20797		if (tcp->tcp_flow_stopped) {
20798			tcp_clrqfull(tcp);
20799		}
20800	}
20801	/*
20802	 * TODO: you can't just flush these, you have to increase rwnd for one
20803	 * thing.  For another, how should urgent data interact?
20804	 */
20805	if (fval & FLUSHR) {
20806		*mp->b_rptr = fval & ~FLUSHW;
20807		/* XXX */
20808		qreply(q, mp);
20809		return;
20810	}
20811	freemsg(mp);
20812}
20813
20814/*
20815 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
20816 * messages.
20817 */
20818static void
20819tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
20820{
20821	mblk_t	*mp1;
20822	STRUCT_HANDLE(strbuf, sb);
20823	uint16_t port;
20824	queue_t 	*q = tcp->tcp_wq;
20825	in6_addr_t	v6addr;
20826	ipaddr_t	v4addr;
20827	uint32_t	flowinfo = 0;
20828	int		addrlen;
20829
20830	/* Make sure it is one of ours. */
20831	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
20832	case TI_GETMYNAME:
20833	case TI_GETPEERNAME:
20834		break;
20835	default:
20836		CALL_IP_WPUT(tcp->tcp_connp, q, mp);
20837		return;
20838	}
20839	switch (mi_copy_state(q, mp, &mp1)) {
20840	case -1:
20841		return;
20842	case MI_COPY_CASE(MI_COPY_IN, 1):
20843		break;
20844	case MI_COPY_CASE(MI_COPY_OUT, 1):
20845		/* Copy out the strbuf. */
20846		mi_copyout(q, mp);
20847		return;
20848	case MI_COPY_CASE(MI_COPY_OUT, 2):
20849		/* All done. */
20850		mi_copy_done(q, mp, 0);
20851		return;
20852	default:
20853		mi_copy_done(q, mp, EPROTO);
20854		return;
20855	}
20856	/* Check alignment of the strbuf */
20857	if (!OK_32PTR(mp1->b_rptr)) {
20858		mi_copy_done(q, mp, EINVAL);
20859		return;
20860	}
20861
20862	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
20863	    (void *)mp1->b_rptr);
20864	addrlen = tcp->tcp_family == AF_INET ? sizeof (sin_t) : sizeof (sin6_t);
20865
20866	if (STRUCT_FGET(sb, maxlen) < addrlen) {
20867		mi_copy_done(q, mp, EINVAL);
20868		return;
20869	}
20870	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
20871	case TI_GETMYNAME:
20872		if (tcp->tcp_family == AF_INET) {
20873			if (tcp->tcp_ipversion == IPV4_VERSION) {
20874				v4addr = tcp->tcp_ipha->ipha_src;
20875			} else {
20876				/* can't return an address in this case */
20877				v4addr = 0;
20878			}
20879		} else {
20880			/* tcp->tcp_family == AF_INET6 */
20881			if (tcp->tcp_ipversion == IPV4_VERSION) {
20882				IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
20883				    &v6addr);
20884			} else {
20885				v6addr = tcp->tcp_ip6h->ip6_src;
20886			}
20887		}
20888		port = tcp->tcp_lport;
20889		break;
20890	case TI_GETPEERNAME:
20891		if (tcp->tcp_family == AF_INET) {
20892			if (tcp->tcp_ipversion == IPV4_VERSION) {
20893				IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6,
20894				    v4addr);
20895			} else {
20896				/* can't return an address in this case */
20897				v4addr = 0;
20898			}
20899		} else {
20900			/* tcp->tcp_family == AF_INET6) */
20901			v6addr = tcp->tcp_remote_v6;
20902			if (tcp->tcp_ipversion == IPV6_VERSION) {
20903				/*
20904				 * No flowinfo if tcp->tcp_ipversion is v4.
20905				 *
20906				 * flowinfo was already initialized to zero
20907				 * where it was declared above, so only
20908				 * set it if ipversion is v6.
20909				 */
20910				flowinfo = tcp->tcp_ip6h->ip6_vcf &
20911				    ~IPV6_VERS_AND_FLOW_MASK;
20912			}
20913		}
20914		port = tcp->tcp_fport;
20915		break;
20916	default:
20917		mi_copy_done(q, mp, EPROTO);
20918		return;
20919	}
20920	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
20921	if (!mp1)
20922		return;
20923
20924	if (tcp->tcp_family == AF_INET) {
20925		sin_t *sin;
20926
20927		STRUCT_FSET(sb, len, (int)sizeof (sin_t));
20928		sin = (sin_t *)mp1->b_rptr;
20929		mp1->b_wptr = (uchar_t *)&sin[1];
20930		*sin = sin_null;
20931		sin->sin_family = AF_INET;
20932		sin->sin_addr.s_addr = v4addr;
20933		sin->sin_port = port;
20934	} else {
20935		/* tcp->tcp_family == AF_INET6 */
20936		sin6_t *sin6;
20937
20938		STRUCT_FSET(sb, len, (int)sizeof (sin6_t));
20939		sin6 = (sin6_t *)mp1->b_rptr;
20940		mp1->b_wptr = (uchar_t *)&sin6[1];
20941		*sin6 = sin6_null;
20942		sin6->sin6_family = AF_INET6;
20943		sin6->sin6_flowinfo = flowinfo;
20944		sin6->sin6_addr = v6addr;
20945		sin6->sin6_port = port;
20946	}
20947	/* Copy out the address */
20948	mi_copyout(q, mp);
20949}
20950
20951/*
20952 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
20953 * messages.
20954 */
20955/* ARGSUSED */
20956static void
20957tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
20958{
20959	conn_t 	*connp = (conn_t *)arg;
20960	tcp_t	*tcp = connp->conn_tcp;
20961	queue_t	*q = tcp->tcp_wq;
20962	struct iocblk	*iocp;
20963
20964	ASSERT(DB_TYPE(mp) == M_IOCTL);
20965	/*
20966	 * Try and ASSERT the minimum possible references on the
20967	 * conn early enough. Since we are executing on write side,
20968	 * the connection is obviously not detached and that means
20969	 * there is a ref each for TCP and IP. Since we are behind
20970	 * the squeue, the minimum references needed are 3. If the
20971	 * conn is in classifier hash list, there should be an
20972	 * extra ref for that (we check both the possibilities).
20973	 */
20974	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
20975	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
20976
20977	iocp = (struct iocblk *)mp->b_rptr;
20978	switch (iocp->ioc_cmd) {
20979	case TCP_IOC_DEFAULT_Q:
20980		/* Wants to be the default wq. */
20981		if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) {
20982			iocp->ioc_error = EPERM;
20983			iocp->ioc_count = 0;
20984			mp->b_datap->db_type = M_IOCACK;
20985			qreply(q, mp);
20986			return;
20987		}
20988		tcp_def_q_set(tcp, mp);
20989		return;
20990	case _SIOCSOCKFALLBACK:
20991		/*
20992		 * Either sockmod is about to be popped and the socket
20993		 * would now be treated as a plain stream, or a module
20994		 * is about to be pushed so we could no longer use read-
20995		 * side synchronous streams for fused loopback tcp.
20996		 * Drain any queued data and disable direct sockfs
20997		 * interface from now on.
20998		 */
20999		if (!tcp->tcp_issocket) {
21000			DB_TYPE(mp) = M_IOCNAK;
21001			iocp->ioc_error = EINVAL;
21002		} else {
21003#ifdef	_ILP32
21004			tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
21005#else
21006			tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
21007#endif
21008			/*
21009			 * Insert this socket into the acceptor hash.
21010			 * We might need it for T_CONN_RES message
21011			 */
21012			tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
21013
21014			if (tcp->tcp_fused) {
21015				/*
21016				 * This is a fused loopback tcp; disable
21017				 * read-side synchronous streams interface
21018				 * and drain any queued data.  It is okay
21019				 * to do this for non-synchronous streams
21020				 * fused tcp as well.
21021				 */
21022				tcp_fuse_disable_pair(tcp, B_FALSE);
21023			}
21024			tcp->tcp_issocket = B_FALSE;
21025			TCP_STAT(tcp_sock_fallback);
21026
21027			DB_TYPE(mp) = M_IOCACK;
21028			iocp->ioc_error = 0;
21029		}
21030		iocp->ioc_count = 0;
21031		iocp->ioc_rval = 0;
21032		qreply(q, mp);
21033		return;
21034	}
21035	CALL_IP_WPUT(connp, q, mp);
21036}
21037
21038/*
21039 * This routine is called by tcp_wput() to handle all TPI requests.
21040 */
21041/* ARGSUSED */
21042static void
21043tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
21044{
21045	conn_t 	*connp = (conn_t *)arg;
21046	tcp_t	*tcp = connp->conn_tcp;
21047	union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
21048	uchar_t *rptr;
21049	t_scalar_t type;
21050	int len;
21051	cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred);
21052
21053	/*
21054	 * Try and ASSERT the minimum possible references on the
21055	 * conn early enough. Since we are executing on write side,
21056	 * the connection is obviously not detached and that means
21057	 * there is a ref each for TCP and IP. Since we are behind
21058	 * the squeue, the minimum references needed are 3. If the
21059	 * conn is in classifier hash list, there should be an
21060	 * extra ref for that (we check both the possibilities).
21061	 */
21062	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
21063	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
21064
21065	rptr = mp->b_rptr;
21066	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
21067	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
21068		type = ((union T_primitives *)rptr)->type;
21069		if (type == T_EXDATA_REQ) {
21070			uint32_t msize = msgdsize(mp->b_cont);
21071
21072			len = msize - 1;
21073			if (len < 0) {
21074				freemsg(mp);
21075				return;
21076			}
21077			/*
21078			 * Try to force urgent data out on the wire.
21079			 * Even if we have unsent data this will
21080			 * at least send the urgent flag.
21081			 * XXX does not handle more flag correctly.
21082			 */
21083			len += tcp->tcp_unsent;
21084			len += tcp->tcp_snxt;
21085			tcp->tcp_urg = len;
21086			tcp->tcp_valid_bits |= TCP_URG_VALID;
21087
21088			/* Bypass tcp protocol for fused tcp loopback */
21089			if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
21090				return;
21091		} else if (type != T_DATA_REQ) {
21092			goto non_urgent_data;
21093		}
21094		/* TODO: options, flags, ... from user */
21095		/* Set length to zero for reclamation below */
21096		tcp_wput_data(tcp, mp->b_cont, B_TRUE);
21097		freeb(mp);
21098		return;
21099	} else {
21100		if (tcp->tcp_debug) {
21101			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
21102			    "tcp_wput_proto, dropping one...");
21103		}
21104		freemsg(mp);
21105		return;
21106	}
21107
21108non_urgent_data:
21109
21110	switch ((int)tprim->type) {
21111	case T_SSL_PROXY_BIND_REQ:	/* an SSL proxy endpoint bind request */
21112		/*
21113		 * save the kssl_ent_t from the next block, and convert this
21114		 * back to a normal bind_req.
21115		 */
21116		if (mp->b_cont != NULL) {
21117		    ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t));
21118
21119			if (tcp->tcp_kssl_ent != NULL) {
21120				kssl_release_ent(tcp->tcp_kssl_ent, NULL,
21121				    KSSL_NO_PROXY);
21122				tcp->tcp_kssl_ent = NULL;
21123			}
21124			bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent,
21125			    sizeof (kssl_ent_t));
21126			kssl_hold_ent(tcp->tcp_kssl_ent);
21127			freemsg(mp->b_cont);
21128			mp->b_cont = NULL;
21129		}
21130		tprim->type = T_BIND_REQ;
21131
21132	/* FALLTHROUGH */
21133	case O_T_BIND_REQ:	/* bind request */
21134	case T_BIND_REQ:	/* new semantics bind request */
21135		tcp_bind(tcp, mp);
21136		break;
21137	case T_UNBIND_REQ:	/* unbind request */
21138		tcp_unbind(tcp, mp);
21139		break;
21140	case O_T_CONN_RES:	/* old connection response XXX */
21141	case T_CONN_RES:	/* connection response */
21142		tcp_accept(tcp, mp);
21143		break;
21144	case T_CONN_REQ:	/* connection request */
21145		tcp_connect(tcp, mp);
21146		break;
21147	case T_DISCON_REQ:	/* disconnect request */
21148		tcp_disconnect(tcp, mp);
21149		break;
21150	case T_CAPABILITY_REQ:
21151		tcp_capability_req(tcp, mp);	/* capability request */
21152		break;
21153	case T_INFO_REQ:	/* information request */
21154		tcp_info_req(tcp, mp);
21155		break;
21156	case T_SVR4_OPTMGMT_REQ:	/* manage options req */
21157		/* Only IP is allowed to return meaningful value */
21158		(void) svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj);
21159		break;
21160	case T_OPTMGMT_REQ:
21161		/*
21162		 * Note:  no support for snmpcom_req() through new
21163		 * T_OPTMGMT_REQ. See comments in ip.c
21164		 */
21165		/* Only IP is allowed to return meaningful value */
21166		(void) tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj);
21167		break;
21168
21169	case T_UNITDATA_REQ:	/* unitdata request */
21170		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
21171		break;
21172	case T_ORDREL_REQ:	/* orderly release req */
21173		freemsg(mp);
21174
21175		if (tcp->tcp_fused)
21176			tcp_unfuse(tcp);
21177
21178		if (tcp_xmit_end(tcp) != 0) {
21179			/*
21180			 * We were crossing FINs and got a reset from
21181			 * the other side. Just ignore it.
21182			 */
21183			if (tcp->tcp_debug) {
21184				(void) strlog(TCP_MOD_ID, 0, 1,
21185				    SL_ERROR|SL_TRACE,
21186				    "tcp_wput_proto, T_ORDREL_REQ out of "
21187				    "state %s",
21188				    tcp_display(tcp, NULL,
21189				    DISP_ADDR_AND_PORT));
21190			}
21191		}
21192		break;
21193	case T_ADDR_REQ:
21194		tcp_addr_req(tcp, mp);
21195		break;
21196	default:
21197		if (tcp->tcp_debug) {
21198			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
21199			    "tcp_wput_proto, bogus TPI msg, type %d",
21200			    tprim->type);
21201		}
21202		/*
21203		 * We used to M_ERROR.  Sending TNOTSUPPORT gives the user
21204		 * to recover.
21205		 */
21206		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
21207		break;
21208	}
21209}
21210
21211/*
21212 * The TCP write service routine should never be called...
21213 */
21214/* ARGSUSED */
21215static void
21216tcp_wsrv(queue_t *q)
21217{
21218	TCP_STAT(tcp_wsrv_called);
21219}
21220
21221/* Non overlapping byte exchanger */
21222static void
21223tcp_xchg(uchar_t *a, uchar_t *b, int len)
21224{
21225	uchar_t	uch;
21226
21227	while (len-- > 0) {
21228		uch = a[len];
21229		a[len] = b[len];
21230		b[len] = uch;
21231	}
21232}
21233
21234/*
21235 * Send out a control packet on the tcp connection specified.  This routine
21236 * is typically called where we need a simple ACK or RST generated.
21237 */
21238static void
21239tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
21240{
21241	uchar_t		*rptr;
21242	tcph_t		*tcph;
21243	ipha_t		*ipha = NULL;
21244	ip6_t		*ip6h = NULL;
21245	uint32_t	sum;
21246	int		tcp_hdr_len;
21247	int		tcp_ip_hdr_len;
21248	mblk_t		*mp;
21249
21250	/*
21251	 * Save sum for use in source route later.
21252	 */
21253	ASSERT(tcp != NULL);
21254	sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum;
21255	tcp_hdr_len = tcp->tcp_hdr_len;
21256	tcp_ip_hdr_len = tcp->tcp_ip_hdr_len;
21257
21258	/* If a text string is passed in with the request, pass it to strlog. */
21259	if (str != NULL && tcp->tcp_debug) {
21260		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
21261		    "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
21262		    str, seq, ack, ctl);
21263	}
21264	mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra,
21265	    BPRI_MED);
21266	if (mp == NULL) {
21267		return;
21268	}
21269	rptr = &mp->b_rptr[tcp_wroff_xtra];
21270	mp->b_rptr = rptr;
21271	mp->b_wptr = &rptr[tcp_hdr_len];
21272	bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len);
21273
21274	if (tcp->tcp_ipversion == IPV4_VERSION) {
21275		ipha = (ipha_t *)rptr;
21276		ipha->ipha_length = htons(tcp_hdr_len);
21277	} else {
21278		ip6h = (ip6_t *)rptr;
21279		ASSERT(tcp != NULL);
21280		ip6h->ip6_plen = htons(tcp->tcp_hdr_len -
21281		    ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
21282	}
21283	tcph = (tcph_t *)&rptr[tcp_ip_hdr_len];
21284	tcph->th_flags[0] = (uint8_t)ctl;
21285	if (ctl & TH_RST) {
21286		BUMP_MIB(&tcp_mib, tcpOutRsts);
21287		BUMP_MIB(&tcp_mib, tcpOutControl);
21288		/*
21289		 * Don't send TSopt w/ TH_RST packets per RFC 1323.
21290		 */
21291		if (tcp->tcp_snd_ts_ok &&
21292		    tcp->tcp_state > TCPS_SYN_SENT) {
21293			mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN];
21294			*(mp->b_wptr) = TCPOPT_EOL;
21295			if (tcp->tcp_ipversion == IPV4_VERSION) {
21296				ipha->ipha_length = htons(tcp_hdr_len -
21297				    TCPOPT_REAL_TS_LEN);
21298			} else {
21299				ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
21300				    TCPOPT_REAL_TS_LEN);
21301			}
21302			tcph->th_offset_and_rsrvd[0] -= (3 << 4);
21303			sum -= TCPOPT_REAL_TS_LEN;
21304		}
21305	}
21306	if (ctl & TH_ACK) {
21307		if (tcp->tcp_snd_ts_ok) {
21308			U32_TO_BE32(lbolt,
21309			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
21310			U32_TO_BE32(tcp->tcp_ts_recent,
21311			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
21312		}
21313
21314		/* Update the latest receive window size in TCP header. */
21315		U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
21316		    tcph->th_win);
21317		tcp->tcp_rack = ack;
21318		tcp->tcp_rack_cnt = 0;
21319		BUMP_MIB(&tcp_mib, tcpOutAck);
21320	}
21321	BUMP_LOCAL(tcp->tcp_obsegs);
21322	U32_TO_BE32(seq, tcph->th_seq);
21323	U32_TO_BE32(ack, tcph->th_ack);
21324	/*
21325	 * Include the adjustment for a source route if any.
21326	 */
21327	sum = (sum >> 16) + (sum & 0xFFFF);
21328	U16_TO_BE16(sum, tcph->th_sum);
21329	TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
21330	tcp_send_data(tcp, tcp->tcp_wq, mp);
21331}
21332
21333/*
21334 * If this routine returns B_TRUE, TCP can generate a RST in response
21335 * to a segment.  If it returns B_FALSE, TCP should not respond.
21336 */
21337static boolean_t
21338tcp_send_rst_chk(void)
21339{
21340	clock_t	now;
21341
21342	/*
21343	 * TCP needs to protect itself from generating too many RSTs.
21344	 * This can be a DoS attack by sending us random segments
21345	 * soliciting RSTs.
21346	 *
21347	 * What we do here is to have a limit of tcp_rst_sent_rate RSTs
21348	 * in each 1 second interval.  In this way, TCP still generate
21349	 * RSTs in normal cases but when under attack, the impact is
21350	 * limited.
21351	 */
21352	if (tcp_rst_sent_rate_enabled != 0) {
21353		now = lbolt;
21354		/* lbolt can wrap around. */
21355		if ((tcp_last_rst_intrvl > now) ||
21356		    (TICK_TO_MSEC(now - tcp_last_rst_intrvl) > 1*SECONDS)) {
21357			tcp_last_rst_intrvl = now;
21358			tcp_rst_cnt = 1;
21359		} else if (++tcp_rst_cnt > tcp_rst_sent_rate) {
21360			return (B_FALSE);
21361		}
21362	}
21363	return (B_TRUE);
21364}
21365
21366/*
21367 * Send down the advice IP ioctl to tell IP to mark an IRE temporary.
21368 */
21369static void
21370tcp_ip_ire_mark_advice(tcp_t *tcp)
21371{
21372	mblk_t *mp;
21373	ipic_t *ipic;
21374
21375	if (tcp->tcp_ipversion == IPV4_VERSION) {
21376		mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN,
21377		    &ipic);
21378	} else {
21379		mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN,
21380		    &ipic);
21381	}
21382	if (mp == NULL)
21383		return;
21384	ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY;
21385	CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
21386}
21387
21388/*
21389 * Return an IP advice ioctl mblk and set ipic to be the pointer
21390 * to the advice structure.
21391 */
21392static mblk_t *
21393tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic)
21394{
21395	struct iocblk *ioc;
21396	mblk_t *mp, *mp1;
21397
21398	mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI);
21399	if (mp == NULL)
21400		return (NULL);
21401	bzero(mp->b_rptr, sizeof (ipic_t) + addr_len);
21402	*ipic = (ipic_t *)mp->b_rptr;
21403	(*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY;
21404	(*ipic)->ipic_addr_offset = sizeof (ipic_t);
21405
21406	bcopy(addr, *ipic + 1, addr_len);
21407
21408	(*ipic)->ipic_addr_length = addr_len;
21409	mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len];
21410
21411	mp1 = mkiocb(IP_IOCTL);
21412	if (mp1 == NULL) {
21413		freemsg(mp);
21414		return (NULL);
21415	}
21416	mp1->b_cont = mp;
21417	ioc = (struct iocblk *)mp1->b_rptr;
21418	ioc->ioc_count = sizeof (ipic_t) + addr_len;
21419
21420	return (mp1);
21421}
21422
21423/*
21424 * Generate a reset based on an inbound packet for which there is no active
21425 * tcp state that we can find.
21426 *
21427 * IPSEC NOTE : Try to send the reply with the same protection as it came
21428 * in.  We still have the ipsec_mp that the packet was attached to. Thus
21429 * the packet will go out at the same level of protection as it came in by
21430 * converting the IPSEC_IN to IPSEC_OUT.
21431 */
21432static void
21433tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
21434    uint32_t ack, int ctl, uint_t ip_hdr_len)
21435{
21436	ipha_t		*ipha = NULL;
21437	ip6_t		*ip6h = NULL;
21438	ushort_t	len;
21439	tcph_t		*tcph;
21440	int		i;
21441	mblk_t		*ipsec_mp;
21442	boolean_t	mctl_present;
21443	ipic_t		*ipic;
21444	ipaddr_t	v4addr;
21445	in6_addr_t	v6addr;
21446	int		addr_len;
21447	void		*addr;
21448	queue_t		*q = tcp_g_q;
21449	tcp_t		*tcp = Q_TO_TCP(q);
21450	cred_t		*cr;
21451
21452	if (!tcp_send_rst_chk()) {
21453		tcp_rst_unsent++;
21454		freemsg(mp);
21455		return;
21456	}
21457
21458	if (mp->b_datap->db_type == M_CTL) {
21459		ipsec_mp = mp;
21460		mp = mp->b_cont;
21461		mctl_present = B_TRUE;
21462	} else {
21463		ipsec_mp = mp;
21464		mctl_present = B_FALSE;
21465	}
21466
21467	if (str && q && tcp_dbg) {
21468		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
21469		    "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
21470		    "flags 0x%x",
21471		    str, seq, ack, ctl);
21472	}
21473	if (mp->b_datap->db_ref != 1) {
21474		mblk_t *mp1 = copyb(mp);
21475		freemsg(mp);
21476		mp = mp1;
21477		if (!mp) {
21478			if (mctl_present)
21479				freeb(ipsec_mp);
21480			return;
21481		} else {
21482			if (mctl_present) {
21483				ipsec_mp->b_cont = mp;
21484			} else {
21485				ipsec_mp = mp;
21486			}
21487		}
21488	} else if (mp->b_cont) {
21489		freemsg(mp->b_cont);
21490		mp->b_cont = NULL;
21491	}
21492	/*
21493	 * We skip reversing source route here.
21494	 * (for now we replace all IP options with EOL)
21495	 */
21496	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
21497		ipha = (ipha_t *)mp->b_rptr;
21498		for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
21499			mp->b_rptr[i] = IPOPT_EOL;
21500		/*
21501		 * Make sure that src address isn't flagrantly invalid.
21502		 * Not all broadcast address checking for the src address
21503		 * is possible, since we don't know the netmask of the src
21504		 * addr.  No check for destination address is done, since
21505		 * IP will not pass up a packet with a broadcast dest
21506		 * address to TCP.  Similar checks are done below for IPv6.
21507		 */
21508		if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
21509		    CLASSD(ipha->ipha_src)) {
21510			freemsg(ipsec_mp);
21511			BUMP_MIB(&ip_mib, ipInDiscards);
21512			return;
21513		}
21514	} else {
21515		ip6h = (ip6_t *)mp->b_rptr;
21516
21517		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
21518		    IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
21519			freemsg(ipsec_mp);
21520			BUMP_MIB(&ip6_mib, ipv6InDiscards);
21521			return;
21522		}
21523
21524		/* Remove any extension headers assuming partial overlay */
21525		if (ip_hdr_len > IPV6_HDR_LEN) {
21526			uint8_t *to;
21527
21528			to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
21529			ovbcopy(ip6h, to, IPV6_HDR_LEN);
21530			mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
21531			ip_hdr_len = IPV6_HDR_LEN;
21532			ip6h = (ip6_t *)mp->b_rptr;
21533			ip6h->ip6_nxt = IPPROTO_TCP;
21534		}
21535	}
21536	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
21537	if (tcph->th_flags[0] & TH_RST) {
21538		freemsg(ipsec_mp);
21539		return;
21540	}
21541	tcph->th_offset_and_rsrvd[0] = (5 << 4);
21542	len = ip_hdr_len + sizeof (tcph_t);
21543	mp->b_wptr = &mp->b_rptr[len];
21544	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
21545		ipha->ipha_length = htons(len);
21546		/* Swap addresses */
21547		v4addr = ipha->ipha_src;
21548		ipha->ipha_src = ipha->ipha_dst;
21549		ipha->ipha_dst = v4addr;
21550		ipha->ipha_ident = 0;
21551		ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl;
21552		addr_len = IP_ADDR_LEN;
21553		addr = &v4addr;
21554	} else {
21555		/* No ip6i_t in this case */
21556		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
21557		/* Swap addresses */
21558		v6addr = ip6h->ip6_src;
21559		ip6h->ip6_src = ip6h->ip6_dst;
21560		ip6h->ip6_dst = v6addr;
21561		ip6h->ip6_hops = (uchar_t)tcp_ipv6_hoplimit;
21562		addr_len = IPV6_ADDR_LEN;
21563		addr = &v6addr;
21564	}
21565	tcp_xchg(tcph->th_fport, tcph->th_lport, 2);
21566	U32_TO_BE32(ack, tcph->th_ack);
21567	U32_TO_BE32(seq, tcph->th_seq);
21568	U16_TO_BE16(0, tcph->th_win);
21569	U16_TO_BE16(sizeof (tcph_t), tcph->th_sum);
21570	tcph->th_flags[0] = (uint8_t)ctl;
21571	if (ctl & TH_RST) {
21572		BUMP_MIB(&tcp_mib, tcpOutRsts);
21573		BUMP_MIB(&tcp_mib, tcpOutControl);
21574	}
21575
21576	/* IP trusts us to set up labels when required. */
21577	if (is_system_labeled() && (cr = DB_CRED(mp)) != NULL &&
21578	    crgetlabel(cr) != NULL) {
21579		int err, adjust;
21580
21581		if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION)
21582			err = tsol_check_label(cr, &mp, &adjust,
21583			    tcp->tcp_connp->conn_mac_exempt);
21584		else
21585			err = tsol_check_label_v6(cr, &mp, &adjust,
21586			    tcp->tcp_connp->conn_mac_exempt);
21587		if (mctl_present)
21588			ipsec_mp->b_cont = mp;
21589		else
21590			ipsec_mp = mp;
21591		if (err != 0) {
21592			freemsg(ipsec_mp);
21593			return;
21594		}
21595		if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
21596			ipha = (ipha_t *)mp->b_rptr;
21597			adjust += ntohs(ipha->ipha_length);
21598			ipha->ipha_length = htons(adjust);
21599		} else {
21600			ip6h = (ip6_t *)mp->b_rptr;
21601		}
21602	}
21603
21604	if (mctl_present) {
21605		ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
21606
21607		ASSERT(ii->ipsec_in_type == IPSEC_IN);
21608		if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) {
21609			return;
21610		}
21611	}
21612	/*
21613	 * NOTE:  one might consider tracing a TCP packet here, but
21614	 * this function has no active TCP state and no tcp structure
21615	 * that has a trace buffer.  If we traced here, we would have
21616	 * to keep a local trace buffer in tcp_record_trace().
21617	 *
21618	 * TSol note: The mblk that contains the incoming packet was
21619	 * reused by tcp_xmit_listener_reset, so it already contains
21620	 * the right credentials and we don't need to call mblk_setcred.
21621	 * Also the conn's cred is not right since it is associated
21622	 * with tcp_g_q.
21623	 */
21624	CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp);
21625
21626	/*
21627	 * Tell IP to mark the IRE used for this destination temporary.
21628	 * This way, we can limit our exposure to DoS attack because IP
21629	 * creates an IRE for each destination.  If there are too many,
21630	 * the time to do any routing lookup will be extremely long.  And
21631	 * the lookup can be in interrupt context.
21632	 *
21633	 * Note that in normal circumstances, this marking should not
21634	 * affect anything.  It would be nice if only 1 message is
21635	 * needed to inform IP that the IRE created for this RST should
21636	 * not be added to the cache table.  But there is currently
21637	 * not such communication mechanism between TCP and IP.  So
21638	 * the best we can do now is to send the advice ioctl to IP
21639	 * to mark the IRE temporary.
21640	 */
21641	if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) {
21642		ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY;
21643		CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
21644	}
21645}
21646
21647/*
21648 * Initiate closedown sequence on an active connection.  (May be called as
21649 * writer.)  Return value zero for OK return, non-zero for error return.
21650 */
21651static int
21652tcp_xmit_end(tcp_t *tcp)
21653{
21654	ipic_t	*ipic;
21655	mblk_t	*mp;
21656
21657	if (tcp->tcp_state < TCPS_SYN_RCVD ||
21658	    tcp->tcp_state > TCPS_CLOSE_WAIT) {
21659		/*
21660		 * Invalid state, only states TCPS_SYN_RCVD,
21661		 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
21662		 */
21663		return (-1);
21664	}
21665
21666	tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
21667	tcp->tcp_valid_bits |= TCP_FSS_VALID;
21668	/*
21669	 * If there is nothing more unsent, send the FIN now.
21670	 * Otherwise, it will go out with the last segment.
21671	 */
21672	if (tcp->tcp_unsent == 0) {
21673		mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
21674		    tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
21675
21676		if (mp) {
21677			TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
21678			tcp_send_data(tcp, tcp->tcp_wq, mp);
21679		} else {
21680			/*
21681			 * Couldn't allocate msg.  Pretend we got it out.
21682			 * Wait for rexmit timeout.
21683			 */
21684			tcp->tcp_snxt = tcp->tcp_fss + 1;
21685			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
21686		}
21687
21688		/*
21689		 * If needed, update tcp_rexmit_snxt as tcp_snxt is
21690		 * changed.
21691		 */
21692		if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
21693			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
21694		}
21695	} else {
21696		/*
21697		 * If tcp->tcp_cork is set, then the data will not get sent,
21698		 * so we have to check that and unset it first.
21699		 */
21700		if (tcp->tcp_cork)
21701			tcp->tcp_cork = B_FALSE;
21702		tcp_wput_data(tcp, NULL, B_FALSE);
21703	}
21704
21705	/*
21706	 * If TCP does not get enough samples of RTT or tcp_rtt_updates
21707	 * is 0, don't update the cache.
21708	 */
21709	if (tcp_rtt_updates == 0 || tcp->tcp_rtt_update < tcp_rtt_updates)
21710		return (0);
21711
21712	/*
21713	 * NOTE: should not update if source routes i.e. if tcp_remote if
21714	 * different from the destination.
21715	 */
21716	if (tcp->tcp_ipversion == IPV4_VERSION) {
21717		if (tcp->tcp_remote !=  tcp->tcp_ipha->ipha_dst) {
21718			return (0);
21719		}
21720		mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN,
21721		    &ipic);
21722	} else {
21723		if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
21724		    &tcp->tcp_ip6h->ip6_dst))) {
21725			return (0);
21726		}
21727		mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN,
21728		    &ipic);
21729	}
21730
21731	/* Record route attributes in the IRE for use by future connections. */
21732	if (mp == NULL)
21733		return (0);
21734
21735	/*
21736	 * We do not have a good algorithm to update ssthresh at this time.
21737	 * So don't do any update.
21738	 */
21739	ipic->ipic_rtt = tcp->tcp_rtt_sa;
21740	ipic->ipic_rtt_sd = tcp->tcp_rtt_sd;
21741
21742	CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
21743	return (0);
21744}
21745
21746/*
21747 * Generate a "no listener here" RST in response to an "unknown" segment.
21748 * Note that we are reusing the incoming mp to construct the outgoing
21749 * RST.
21750 */
21751void
21752tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len)
21753{
21754	uchar_t		*rptr;
21755	uint32_t	seg_len;
21756	tcph_t		*tcph;
21757	uint32_t	seg_seq;
21758	uint32_t	seg_ack;
21759	uint_t		flags;
21760	mblk_t		*ipsec_mp;
21761	ipha_t 		*ipha;
21762	ip6_t 		*ip6h;
21763	boolean_t	mctl_present = B_FALSE;
21764	boolean_t	check = B_TRUE;
21765	boolean_t	policy_present;
21766
21767	TCP_STAT(tcp_no_listener);
21768
21769	ipsec_mp = mp;
21770
21771	if (mp->b_datap->db_type == M_CTL) {
21772		ipsec_in_t *ii;
21773
21774		mctl_present = B_TRUE;
21775		mp = mp->b_cont;
21776
21777		ii = (ipsec_in_t *)ipsec_mp->b_rptr;
21778		ASSERT(ii->ipsec_in_type == IPSEC_IN);
21779		if (ii->ipsec_in_dont_check) {
21780			check = B_FALSE;
21781			if (!ii->ipsec_in_secure) {
21782				freeb(ipsec_mp);
21783				mctl_present = B_FALSE;
21784				ipsec_mp = mp;
21785			}
21786		}
21787	}
21788
21789	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
21790		policy_present = ipsec_inbound_v4_policy_present;
21791		ipha = (ipha_t *)mp->b_rptr;
21792		ip6h = NULL;
21793	} else {
21794		policy_present = ipsec_inbound_v6_policy_present;
21795		ipha = NULL;
21796		ip6h = (ip6_t *)mp->b_rptr;
21797	}
21798
21799	if (check && policy_present) {
21800		/*
21801		 * The conn_t parameter is NULL because we already know
21802		 * nobody's home.
21803		 */
21804		ipsec_mp = ipsec_check_global_policy(
21805			ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present);
21806		if (ipsec_mp == NULL)
21807			return;
21808	}
21809	if (is_system_labeled() && !tsol_can_reply_error(mp)) {
21810		DTRACE_PROBE2(
21811		    tx__ip__log__error__nolistener__tcp,
21812		    char *, "Could not reply with RST to mp(1)",
21813		    mblk_t *, mp);
21814		ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
21815		freemsg(ipsec_mp);
21816		return;
21817	}
21818
21819	rptr = mp->b_rptr;
21820
21821	tcph = (tcph_t *)&rptr[ip_hdr_len];
21822	seg_seq = BE32_TO_U32(tcph->th_seq);
21823	seg_ack = BE32_TO_U32(tcph->th_ack);
21824	flags = tcph->th_flags[0];
21825
21826	seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
21827	if (flags & TH_RST) {
21828		freemsg(ipsec_mp);
21829	} else if (flags & TH_ACK) {
21830		tcp_xmit_early_reset("no tcp, reset",
21831		    ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len);
21832	} else {
21833		if (flags & TH_SYN) {
21834			seg_len++;
21835		} else {
21836			/*
21837			 * Here we violate the RFC.  Note that a normal
21838			 * TCP will never send a segment without the ACK
21839			 * flag, except for RST or SYN segment.  This
21840			 * segment is neither.  Just drop it on the
21841			 * floor.
21842			 */
21843			freemsg(ipsec_mp);
21844			tcp_rst_unsent++;
21845			return;
21846		}
21847
21848		tcp_xmit_early_reset("no tcp, reset/ack",
21849		    ipsec_mp, 0, seg_seq + seg_len,
21850		    TH_RST | TH_ACK, ip_hdr_len);
21851	}
21852}
21853
21854/*
21855 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
21856 * ip and tcp header ready to pass down to IP.  If the mp passed in is
21857 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
21858 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
21859 * otherwise it will dup partial mblks.)
21860 * Otherwise, an appropriate ACK packet will be generated.  This
21861 * routine is not usually called to send new data for the first time.  It
21862 * is mostly called out of the timer for retransmits, and to generate ACKs.
21863 *
21864 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
21865 * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
21866 * of the original mblk chain will be returned in *offset and *end_mp.
21867 */
21868static mblk_t *
21869tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
21870    mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
21871    boolean_t rexmit)
21872{
21873	int	data_length;
21874	int32_t	off = 0;
21875	uint_t	flags;
21876	mblk_t	*mp1;
21877	mblk_t	*mp2;
21878	uchar_t	*rptr;
21879	tcph_t	*tcph;
21880	int32_t	num_sack_blk = 0;
21881	int32_t	sack_opt_len = 0;
21882
21883	/* Allocate for our maximum TCP header + link-level */
21884	mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra,
21885	    BPRI_MED);
21886	if (!mp1)
21887		return (NULL);
21888	data_length = 0;
21889
21890	/*
21891	 * Note that tcp_mss has been adjusted to take into account the
21892	 * timestamp option if applicable.  Because SACK options do not
21893	 * appear in every TCP segments and they are of variable lengths,
21894	 * they cannot be included in tcp_mss.  Thus we need to calculate
21895	 * the actual segment length when we need to send a segment which
21896	 * includes SACK options.
21897	 */
21898	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
21899		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
21900		    tcp->tcp_num_sack_blk);
21901		sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
21902		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
21903		if (max_to_send + sack_opt_len > tcp->tcp_mss)
21904			max_to_send -= sack_opt_len;
21905	}
21906
21907	if (offset != NULL) {
21908		off = *offset;
21909		/* We use offset as an indicator that end_mp is not NULL. */
21910		*end_mp = NULL;
21911	}
21912	for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
21913		/* This could be faster with cooperation from downstream */
21914		if (mp2 != mp1 && !sendall &&
21915		    data_length + (int)(mp->b_wptr - mp->b_rptr) >
21916		    max_to_send)
21917			/*
21918			 * Don't send the next mblk since the whole mblk
21919			 * does not fit.
21920			 */
21921			break;
21922		mp2->b_cont = dupb(mp);
21923		mp2 = mp2->b_cont;
21924		if (!mp2) {
21925			freemsg(mp1);
21926			return (NULL);
21927		}
21928		mp2->b_rptr += off;
21929		ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
21930		    (uintptr_t)INT_MAX);
21931
21932		data_length += (int)(mp2->b_wptr - mp2->b_rptr);
21933		if (data_length > max_to_send) {
21934			mp2->b_wptr -= data_length - max_to_send;
21935			data_length = max_to_send;
21936			off = mp2->b_wptr - mp->b_rptr;
21937			break;
21938		} else {
21939			off = 0;
21940		}
21941	}
21942	if (offset != NULL) {
21943		*offset = off;
21944		*end_mp = mp;
21945	}
21946	if (seg_len != NULL) {
21947		*seg_len = data_length;
21948	}
21949
21950	/* Update the latest receive window size in TCP header. */
21951	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
21952	    tcp->tcp_tcph->th_win);
21953
21954	rptr = mp1->b_rptr + tcp_wroff_xtra;
21955	mp1->b_rptr = rptr;
21956	mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len;
21957	bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
21958	tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
21959	U32_TO_ABE32(seq, tcph->th_seq);
21960
21961	/*
21962	 * Use tcp_unsent to determine if the PUSH bit should be used assumes
21963	 * that this function was called from tcp_wput_data. Thus, when called
21964	 * to retransmit data the setting of the PUSH bit may appear some
21965	 * what random in that it might get set when it should not. This
21966	 * should not pose any performance issues.
21967	 */
21968	if (data_length != 0 && (tcp->tcp_unsent == 0 ||
21969	    tcp->tcp_unsent == data_length)) {
21970		flags = TH_ACK | TH_PUSH;
21971	} else {
21972		flags = TH_ACK;
21973	}
21974
21975	if (tcp->tcp_ecn_ok) {
21976		if (tcp->tcp_ecn_echo_on)
21977			flags |= TH_ECE;
21978
21979		/*
21980		 * Only set ECT bit and ECN_CWR if a segment contains new data.
21981		 * There is no TCP flow control for non-data segments, and
21982		 * only data segment is transmitted reliably.
21983		 */
21984		if (data_length > 0 && !rexmit) {
21985			SET_ECT(tcp, rptr);
21986			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
21987				flags |= TH_CWR;
21988				tcp->tcp_ecn_cwr_sent = B_TRUE;
21989			}
21990		}
21991	}
21992
21993	if (tcp->tcp_valid_bits) {
21994		uint32_t u1;
21995
21996		if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
21997		    seq == tcp->tcp_iss) {
21998			uchar_t	*wptr;
21999
22000			/*
22001			 * If TCP_ISS_VALID and the seq number is tcp_iss,
22002			 * TCP can only be in SYN-SENT, SYN-RCVD or
22003			 * FIN-WAIT-1 state.  It can be FIN-WAIT-1 if
22004			 * our SYN is not ack'ed but the app closes this
22005			 * TCP connection.
22006			 */
22007			ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
22008			    tcp->tcp_state == TCPS_SYN_RCVD ||
22009			    tcp->tcp_state == TCPS_FIN_WAIT_1);
22010
22011			/*
22012			 * Tack on the MSS option.  It is always needed
22013			 * for both active and passive open.
22014			 *
22015			 * MSS option value should be interface MTU - MIN
22016			 * TCP/IP header according to RFC 793 as it means
22017			 * the maximum segment size TCP can receive.  But
22018			 * to get around some broken middle boxes/end hosts
22019			 * out there, we allow the option value to be the
22020			 * same as the MSS option size on the peer side.
22021			 * In this way, the other side will not send
22022			 * anything larger than they can receive.
22023			 *
22024			 * Note that for SYN_SENT state, the ndd param
22025			 * tcp_use_smss_as_mss_opt has no effect as we
22026			 * don't know the peer's MSS option value. So
22027			 * the only case we need to take care of is in
22028			 * SYN_RCVD state, which is done later.
22029			 */
22030			wptr = mp1->b_wptr;
22031			wptr[0] = TCPOPT_MAXSEG;
22032			wptr[1] = TCPOPT_MAXSEG_LEN;
22033			wptr += 2;
22034			u1 = tcp->tcp_if_mtu -
22035			    (tcp->tcp_ipversion == IPV4_VERSION ?
22036			    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) -
22037			    TCP_MIN_HEADER_LENGTH;
22038			U16_TO_BE16(u1, wptr);
22039			mp1->b_wptr = wptr + 2;
22040			/* Update the offset to cover the additional word */
22041			tcph->th_offset_and_rsrvd[0] += (1 << 4);
22042
22043			/*
22044			 * Note that the following way of filling in
22045			 * TCP options are not optimal.  Some NOPs can
22046			 * be saved.  But there is no need at this time
22047			 * to optimize it.  When it is needed, we will
22048			 * do it.
22049			 */
22050			switch (tcp->tcp_state) {
22051			case TCPS_SYN_SENT:
22052				flags = TH_SYN;
22053
22054				if (tcp->tcp_snd_ts_ok) {
22055					uint32_t llbolt = (uint32_t)lbolt;
22056
22057					wptr = mp1->b_wptr;
22058					wptr[0] = TCPOPT_NOP;
22059					wptr[1] = TCPOPT_NOP;
22060					wptr[2] = TCPOPT_TSTAMP;
22061					wptr[3] = TCPOPT_TSTAMP_LEN;
22062					wptr += 4;
22063					U32_TO_BE32(llbolt, wptr);
22064					wptr += 4;
22065					ASSERT(tcp->tcp_ts_recent == 0);
22066					U32_TO_BE32(0L, wptr);
22067					mp1->b_wptr += TCPOPT_REAL_TS_LEN;
22068					tcph->th_offset_and_rsrvd[0] +=
22069					    (3 << 4);
22070				}
22071
22072				/*
22073				 * Set up all the bits to tell other side
22074				 * we are ECN capable.
22075				 */
22076				if (tcp->tcp_ecn_ok) {
22077					flags |= (TH_ECE | TH_CWR);
22078				}
22079				break;
22080			case TCPS_SYN_RCVD:
22081				flags |= TH_SYN;
22082
22083				/*
22084				 * Reset the MSS option value to be SMSS
22085				 * We should probably add back the bytes
22086				 * for timestamp option and IPsec.  We
22087				 * don't do that as this is a workaround
22088				 * for broken middle boxes/end hosts, it
22089				 * is better for us to be more cautious.
22090				 * They may not take these things into
22091				 * account in their SMSS calculation.  Thus
22092				 * the peer's calculated SMSS may be smaller
22093				 * than what it can be.  This should be OK.
22094				 */
22095				if (tcp_use_smss_as_mss_opt) {
22096					u1 = tcp->tcp_mss;
22097					U16_TO_BE16(u1, wptr);
22098				}
22099
22100				/*
22101				 * If the other side is ECN capable, reply
22102				 * that we are also ECN capable.
22103				 */
22104				if (tcp->tcp_ecn_ok)
22105					flags |= TH_ECE;
22106				break;
22107			default:
22108				/*
22109				 * The above ASSERT() makes sure that this
22110				 * must be FIN-WAIT-1 state.  Our SYN has
22111				 * not been ack'ed so retransmit it.
22112				 */
22113				flags |= TH_SYN;
22114				break;
22115			}
22116
22117			if (tcp->tcp_snd_ws_ok) {
22118				wptr = mp1->b_wptr;
22119				wptr[0] =  TCPOPT_NOP;
22120				wptr[1] =  TCPOPT_WSCALE;
22121				wptr[2] =  TCPOPT_WS_LEN;
22122				wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
22123				mp1->b_wptr += TCPOPT_REAL_WS_LEN;
22124				tcph->th_offset_and_rsrvd[0] += (1 << 4);
22125			}
22126
22127			if (tcp->tcp_snd_sack_ok) {
22128				wptr = mp1->b_wptr;
22129				wptr[0] = TCPOPT_NOP;
22130				wptr[1] = TCPOPT_NOP;
22131				wptr[2] = TCPOPT_SACK_PERMITTED;
22132				wptr[3] = TCPOPT_SACK_OK_LEN;
22133				mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN;
22134				tcph->th_offset_and_rsrvd[0] += (1 << 4);
22135			}
22136
22137			/* allocb() of adequate mblk assures space */
22138			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
22139			    (uintptr_t)INT_MAX);
22140			u1 = (int)(mp1->b_wptr - mp1->b_rptr);
22141			/*
22142			 * Get IP set to checksum on our behalf
22143			 * Include the adjustment for a source route if any.
22144			 */
22145			u1 += tcp->tcp_sum;
22146			u1 = (u1 >> 16) + (u1 & 0xFFFF);
22147			U16_TO_BE16(u1, tcph->th_sum);
22148			BUMP_MIB(&tcp_mib, tcpOutControl);
22149		}
22150		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
22151		    (seq + data_length) == tcp->tcp_fss) {
22152			if (!tcp->tcp_fin_acked) {
22153				flags |= TH_FIN;
22154				BUMP_MIB(&tcp_mib, tcpOutControl);
22155			}
22156			if (!tcp->tcp_fin_sent) {
22157				tcp->tcp_fin_sent = B_TRUE;
22158				switch (tcp->tcp_state) {
22159				case TCPS_SYN_RCVD:
22160				case TCPS_ESTABLISHED:
22161					tcp->tcp_state = TCPS_FIN_WAIT_1;
22162					break;
22163				case TCPS_CLOSE_WAIT:
22164					tcp->tcp_state = TCPS_LAST_ACK;
22165					break;
22166				}
22167				if (tcp->tcp_suna == tcp->tcp_snxt)
22168					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
22169				tcp->tcp_snxt = tcp->tcp_fss + 1;
22170			}
22171		}
22172		/*
22173		 * Note the trick here.  u1 is unsigned.  When tcp_urg
22174		 * is smaller than seq, u1 will become a very huge value.
22175		 * So the comparison will fail.  Also note that tcp_urp
22176		 * should be positive, see RFC 793 page 17.
22177		 */
22178		u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
22179		if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
22180		    u1 < (uint32_t)(64 * 1024)) {
22181			flags |= TH_URG;
22182			BUMP_MIB(&tcp_mib, tcpOutUrg);
22183			U32_TO_ABE16(u1, tcph->th_urp);
22184		}
22185	}
22186	tcph->th_flags[0] = (uchar_t)flags;
22187	tcp->tcp_rack = tcp->tcp_rnxt;
22188	tcp->tcp_rack_cnt = 0;
22189
22190	if (tcp->tcp_snd_ts_ok) {
22191		if (tcp->tcp_state != TCPS_SYN_SENT) {
22192			uint32_t llbolt = (uint32_t)lbolt;
22193
22194			U32_TO_BE32(llbolt,
22195			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
22196			U32_TO_BE32(tcp->tcp_ts_recent,
22197			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
22198		}
22199	}
22200
22201	if (num_sack_blk > 0) {
22202		uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
22203		sack_blk_t *tmp;
22204		int32_t	i;
22205
22206		wptr[0] = TCPOPT_NOP;
22207		wptr[1] = TCPOPT_NOP;
22208		wptr[2] = TCPOPT_SACK;
22209		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
22210		    sizeof (sack_blk_t);
22211		wptr += TCPOPT_REAL_SACK_LEN;
22212
22213		tmp = tcp->tcp_sack_list;
22214		for (i = 0; i < num_sack_blk; i++) {
22215			U32_TO_BE32(tmp[i].begin, wptr);
22216			wptr += sizeof (tcp_seq);
22217			U32_TO_BE32(tmp[i].end, wptr);
22218			wptr += sizeof (tcp_seq);
22219		}
22220		tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4);
22221	}
22222	ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
22223	data_length += (int)(mp1->b_wptr - rptr);
22224	if (tcp->tcp_ipversion == IPV4_VERSION) {
22225		((ipha_t *)rptr)->ipha_length = htons(data_length);
22226	} else {
22227		ip6_t *ip6 = (ip6_t *)(rptr +
22228		    (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ?
22229		    sizeof (ip6i_t) : 0));
22230
22231		ip6->ip6_plen = htons(data_length -
22232		    ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
22233	}
22234
22235	/*
22236	 * Prime pump for IP
22237	 * Include the adjustment for a source route if any.
22238	 */
22239	data_length -= tcp->tcp_ip_hdr_len;
22240	data_length += tcp->tcp_sum;
22241	data_length = (data_length >> 16) + (data_length & 0xFFFF);
22242	U16_TO_ABE16(data_length, tcph->th_sum);
22243	if (tcp->tcp_ip_forward_progress) {
22244		ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
22245		*(uint32_t *)mp1->b_rptr  |= IP_FORWARD_PROG;
22246		tcp->tcp_ip_forward_progress = B_FALSE;
22247	}
22248	return (mp1);
22249}
22250
22251/* This function handles the push timeout. */
22252void
22253tcp_push_timer(void *arg)
22254{
22255	conn_t	*connp = (conn_t *)arg;
22256	tcp_t *tcp = connp->conn_tcp;
22257
22258	TCP_DBGSTAT(tcp_push_timer_cnt);
22259
22260	ASSERT(tcp->tcp_listener == NULL);
22261
22262	/*
22263	 * We need to stop synchronous streams temporarily to prevent a race
22264	 * with tcp_fuse_rrw() or tcp_fusion rinfop().  It is safe to access
22265	 * tcp_rcv_list here because those entry points will return right
22266	 * away when synchronous streams is stopped.
22267	 */
22268	TCP_FUSE_SYNCSTR_STOP(tcp);
22269	tcp->tcp_push_tid = 0;
22270	if ((tcp->tcp_rcv_list != NULL) &&
22271	    (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED))
22272		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
22273	TCP_FUSE_SYNCSTR_RESUME(tcp);
22274}
22275
22276/*
22277 * This function handles delayed ACK timeout.
22278 */
22279static void
22280tcp_ack_timer(void *arg)
22281{
22282	conn_t	*connp = (conn_t *)arg;
22283	tcp_t *tcp = connp->conn_tcp;
22284	mblk_t *mp;
22285
22286	TCP_DBGSTAT(tcp_ack_timer_cnt);
22287
22288	tcp->tcp_ack_tid = 0;
22289
22290	if (tcp->tcp_fused)
22291		return;
22292
22293	/*
22294	 * Do not send ACK if there is no outstanding unack'ed data.
22295	 */
22296	if (tcp->tcp_rnxt == tcp->tcp_rack) {
22297		return;
22298	}
22299
22300	if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
22301		/*
22302		 * Make sure we don't allow deferred ACKs to result in
22303		 * timer-based ACKing.  If we have held off an ACK
22304		 * when there was more than an mss here, and the timer
22305		 * goes off, we have to worry about the possibility
22306		 * that the sender isn't doing slow-start, or is out
22307		 * of step with us for some other reason.  We fall
22308		 * permanently back in the direction of
22309		 * ACK-every-other-packet as suggested in RFC 1122.
22310		 */
22311		if (tcp->tcp_rack_abs_max > 2)
22312			tcp->tcp_rack_abs_max--;
22313		tcp->tcp_rack_cur_max = 2;
22314	}
22315	mp = tcp_ack_mp(tcp);
22316
22317	if (mp != NULL) {
22318		TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
22319		BUMP_LOCAL(tcp->tcp_obsegs);
22320		BUMP_MIB(&tcp_mib, tcpOutAck);
22321		BUMP_MIB(&tcp_mib, tcpOutAckDelayed);
22322		tcp_send_data(tcp, tcp->tcp_wq, mp);
22323	}
22324}
22325
22326
22327/* Generate an ACK-only (no data) segment for a TCP endpoint */
22328static mblk_t *
22329tcp_ack_mp(tcp_t *tcp)
22330{
22331	uint32_t	seq_no;
22332
22333	/*
22334	 * There are a few cases to be considered while setting the sequence no.
22335	 * Essentially, we can come here while processing an unacceptable pkt
22336	 * in the TCPS_SYN_RCVD state, in which case we set the sequence number
22337	 * to snxt (per RFC 793), note the swnd wouldn't have been set yet.
22338	 * If we are here for a zero window probe, stick with suna. In all
22339	 * other cases, we check if suna + swnd encompasses snxt and set
22340	 * the sequence number to snxt, if so. If snxt falls outside the
22341	 * window (the receiver probably shrunk its window), we will go with
22342	 * suna + swnd, otherwise the sequence no will be unacceptable to the
22343	 * receiver.
22344	 */
22345	if (tcp->tcp_zero_win_probe) {
22346		seq_no = tcp->tcp_suna;
22347	} else if (tcp->tcp_state == TCPS_SYN_RCVD) {
22348		ASSERT(tcp->tcp_swnd == 0);
22349		seq_no = tcp->tcp_snxt;
22350	} else {
22351		seq_no = SEQ_GT(tcp->tcp_snxt,
22352		    (tcp->tcp_suna + tcp->tcp_swnd)) ?
22353		    (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt;
22354	}
22355
22356	if (tcp->tcp_valid_bits) {
22357		/*
22358		 * For the complex case where we have to send some
22359		 * controls (FIN or SYN), let tcp_xmit_mp do it.
22360		 */
22361		return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE,
22362		    NULL, B_FALSE));
22363	} else {
22364		/* Generate a simple ACK */
22365		int	data_length;
22366		uchar_t	*rptr;
22367		tcph_t	*tcph;
22368		mblk_t	*mp1;
22369		int32_t	tcp_hdr_len;
22370		int32_t	tcp_tcp_hdr_len;
22371		int32_t	num_sack_blk = 0;
22372		int32_t sack_opt_len;
22373
22374		/*
22375		 * Allocate space for TCP + IP headers
22376		 * and link-level header
22377		 */
22378		if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
22379			num_sack_blk = MIN(tcp->tcp_max_sack_blk,
22380			    tcp->tcp_num_sack_blk);
22381			sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
22382			    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
22383			tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len;
22384			tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len;
22385		} else {
22386			tcp_hdr_len = tcp->tcp_hdr_len;
22387			tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len;
22388		}
22389		mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, BPRI_MED);
22390		if (!mp1)
22391			return (NULL);
22392
22393		/* Update the latest receive window size in TCP header. */
22394		U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
22395		    tcp->tcp_tcph->th_win);
22396		/* copy in prototype TCP + IP header */
22397		rptr = mp1->b_rptr + tcp_wroff_xtra;
22398		mp1->b_rptr = rptr;
22399		mp1->b_wptr = rptr + tcp_hdr_len;
22400		bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
22401
22402		tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
22403
22404		/* Set the TCP sequence number. */
22405		U32_TO_ABE32(seq_no, tcph->th_seq);
22406
22407		/* Set up the TCP flag field. */
22408		tcph->th_flags[0] = (uchar_t)TH_ACK;
22409		if (tcp->tcp_ecn_echo_on)
22410			tcph->th_flags[0] |= TH_ECE;
22411
22412		tcp->tcp_rack = tcp->tcp_rnxt;
22413		tcp->tcp_rack_cnt = 0;
22414
22415		/* fill in timestamp option if in use */
22416		if (tcp->tcp_snd_ts_ok) {
22417			uint32_t llbolt = (uint32_t)lbolt;
22418
22419			U32_TO_BE32(llbolt,
22420			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
22421			U32_TO_BE32(tcp->tcp_ts_recent,
22422			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
22423		}
22424
22425		/* Fill in SACK options */
22426		if (num_sack_blk > 0) {
22427			uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
22428			sack_blk_t *tmp;
22429			int32_t	i;
22430
22431			wptr[0] = TCPOPT_NOP;
22432			wptr[1] = TCPOPT_NOP;
22433			wptr[2] = TCPOPT_SACK;
22434			wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
22435			    sizeof (sack_blk_t);
22436			wptr += TCPOPT_REAL_SACK_LEN;
22437
22438			tmp = tcp->tcp_sack_list;
22439			for (i = 0; i < num_sack_blk; i++) {
22440				U32_TO_BE32(tmp[i].begin, wptr);
22441				wptr += sizeof (tcp_seq);
22442				U32_TO_BE32(tmp[i].end, wptr);
22443				wptr += sizeof (tcp_seq);
22444			}
22445			tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1)
22446			    << 4);
22447		}
22448
22449		if (tcp->tcp_ipversion == IPV4_VERSION) {
22450			((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len);
22451		} else {
22452			/* Check for ip6i_t header in sticky hdrs */
22453			ip6_t *ip6 = (ip6_t *)(rptr +
22454			    (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ?
22455			    sizeof (ip6i_t) : 0));
22456
22457			ip6->ip6_plen = htons(tcp_hdr_len -
22458			    ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
22459		}
22460
22461		/*
22462		 * Prime pump for checksum calculation in IP.  Include the
22463		 * adjustment for a source route if any.
22464		 */
22465		data_length = tcp_tcp_hdr_len + tcp->tcp_sum;
22466		data_length = (data_length >> 16) + (data_length & 0xFFFF);
22467		U16_TO_ABE16(data_length, tcph->th_sum);
22468
22469		if (tcp->tcp_ip_forward_progress) {
22470			ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
22471			*(uint32_t *)mp1->b_rptr  |= IP_FORWARD_PROG;
22472			tcp->tcp_ip_forward_progress = B_FALSE;
22473		}
22474		return (mp1);
22475	}
22476}
22477
22478/*
22479 * To create a temporary tcp structure for inserting into bind hash list.
22480 * The parameter is assumed to be in network byte order, ready for use.
22481 */
22482/* ARGSUSED */
22483static tcp_t *
22484tcp_alloc_temp_tcp(in_port_t port)
22485{
22486	conn_t	*connp;
22487	tcp_t	*tcp;
22488
22489	connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP);
22490	if (connp == NULL)
22491		return (NULL);
22492
22493	tcp = connp->conn_tcp;
22494
22495	/*
22496	 * Only initialize the necessary info in those structures.  Note
22497	 * that since INADDR_ANY is all 0, we do not need to set
22498	 * tcp_bound_source to INADDR_ANY here.
22499	 */
22500	tcp->tcp_state = TCPS_BOUND;
22501	tcp->tcp_lport = port;
22502	tcp->tcp_exclbind = 1;
22503	tcp->tcp_reserved_port = 1;
22504
22505	/* Just for place holding... */
22506	tcp->tcp_ipversion = IPV4_VERSION;
22507
22508	return (tcp);
22509}
22510
22511/*
22512 * To remove a port range specified by lo_port and hi_port from the
22513 * reserved port ranges.  This is one of the three public functions of
22514 * the reserved port interface.  Note that a port range has to be removed
22515 * as a whole.  Ports in a range cannot be removed individually.
22516 *
22517 * Params:
22518 *	in_port_t lo_port: the beginning port of the reserved port range to
22519 *		be deleted.
22520 *	in_port_t hi_port: the ending port of the reserved port range to
22521 *		be deleted.
22522 *
22523 * Return:
22524 *	B_TRUE if the deletion is successful, B_FALSE otherwise.
22525 */
22526boolean_t
22527tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port)
22528{
22529	int	i, j;
22530	int	size;
22531	tcp_t	**temp_tcp_array;
22532	tcp_t	*tcp;
22533
22534	rw_enter(&tcp_reserved_port_lock, RW_WRITER);
22535
22536	/* First make sure that the port ranage is indeed reserved. */
22537	for (i = 0; i < tcp_reserved_port_array_size; i++) {
22538		if (tcp_reserved_port[i].lo_port == lo_port) {
22539			hi_port = tcp_reserved_port[i].hi_port;
22540			temp_tcp_array = tcp_reserved_port[i].temp_tcp_array;
22541			break;
22542		}
22543	}
22544	if (i == tcp_reserved_port_array_size) {
22545		rw_exit(&tcp_reserved_port_lock);
22546		return (B_FALSE);
22547	}
22548
22549	/*
22550	 * Remove the range from the array.  This simple loop is possible
22551	 * because port ranges are inserted in ascending order.
22552	 */
22553	for (j = i; j < tcp_reserved_port_array_size - 1; j++) {
22554		tcp_reserved_port[j].lo_port = tcp_reserved_port[j+1].lo_port;
22555		tcp_reserved_port[j].hi_port = tcp_reserved_port[j+1].hi_port;
22556		tcp_reserved_port[j].temp_tcp_array =
22557		    tcp_reserved_port[j+1].temp_tcp_array;
22558	}
22559
22560	/* Remove all the temporary tcp structures. */
22561	size = hi_port - lo_port + 1;
22562	while (size > 0) {
22563		tcp = temp_tcp_array[size - 1];
22564		ASSERT(tcp != NULL);
22565		tcp_bind_hash_remove(tcp);
22566		CONN_DEC_REF(tcp->tcp_connp);
22567		size--;
22568	}
22569	kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *));
22570	tcp_reserved_port_array_size--;
22571	rw_exit(&tcp_reserved_port_lock);
22572	return (B_TRUE);
22573}
22574
22575/*
22576 * Macro to remove temporary tcp structure from the bind hash list.  The
22577 * first parameter is the list of tcp to be removed.  The second parameter
22578 * is the number of tcps in the array.
22579 */
22580#define	TCP_TMP_TCP_REMOVE(tcp_array, num) \
22581{ \
22582	while ((num) > 0) { \
22583		tcp_t *tcp = (tcp_array)[(num) - 1]; \
22584		tf_t *tbf; \
22585		tcp_t *tcpnext; \
22586		tbf = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \
22587		mutex_enter(&tbf->tf_lock); \
22588		tcpnext = tcp->tcp_bind_hash; \
22589		if (tcpnext) { \
22590			tcpnext->tcp_ptpbhn = \
22591				tcp->tcp_ptpbhn; \
22592		} \
22593		*tcp->tcp_ptpbhn = tcpnext; \
22594		mutex_exit(&tbf->tf_lock); \
22595		kmem_free(tcp, sizeof (tcp_t)); \
22596		(tcp_array)[(num) - 1] = NULL; \
22597		(num)--; \
22598	} \
22599}
22600
22601/*
22602 * The public interface for other modules to call to reserve a port range
22603 * in TCP.  The caller passes in how large a port range it wants.  TCP
22604 * will try to find a range and return it via lo_port and hi_port.  This is
22605 * used by NCA's nca_conn_init.
22606 * NCA can only be used in the global zone so this only affects the global
22607 * zone's ports.
22608 *
22609 * Params:
22610 *	int size: the size of the port range to be reserved.
22611 *	in_port_t *lo_port (referenced): returns the beginning port of the
22612 *		reserved port range added.
22613 *	in_port_t *hi_port (referenced): returns the ending port of the
22614 *		reserved port range added.
22615 *
22616 * Return:
22617 *	B_TRUE if the port reservation is successful, B_FALSE otherwise.
22618 */
22619boolean_t
22620tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port)
22621{
22622	tcp_t		*tcp;
22623	tcp_t		*tmp_tcp;
22624	tcp_t		**temp_tcp_array;
22625	tf_t		*tbf;
22626	in_port_t	net_port;
22627	in_port_t	port;
22628	int32_t		cur_size;
22629	int		i, j;
22630	boolean_t	used;
22631	tcp_rport_t 	tmp_ports[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE];
22632	zoneid_t	zoneid = GLOBAL_ZONEID;
22633
22634	/* Sanity check. */
22635	if (size <= 0 || size > TCP_RESERVED_PORTS_RANGE_MAX) {
22636		return (B_FALSE);
22637	}
22638
22639	rw_enter(&tcp_reserved_port_lock, RW_WRITER);
22640	if (tcp_reserved_port_array_size == TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) {
22641		rw_exit(&tcp_reserved_port_lock);
22642		return (B_FALSE);
22643	}
22644
22645	/*
22646	 * Find the starting port to try.  Since the port ranges are ordered
22647	 * in the reserved port array, we can do a simple search here.
22648	 */
22649	*lo_port = TCP_SMALLEST_RESERVED_PORT;
22650	*hi_port = TCP_LARGEST_RESERVED_PORT;
22651	for (i = 0; i < tcp_reserved_port_array_size;
22652	    *lo_port = tcp_reserved_port[i].hi_port + 1, i++) {
22653		if (tcp_reserved_port[i].lo_port - *lo_port >= size) {
22654			*hi_port = tcp_reserved_port[i].lo_port - 1;
22655			break;
22656		}
22657	}
22658	/* No available port range. */
22659	if (i == tcp_reserved_port_array_size && *hi_port - *lo_port < size) {
22660		rw_exit(&tcp_reserved_port_lock);
22661		return (B_FALSE);
22662	}
22663
22664	temp_tcp_array = kmem_zalloc(size * sizeof (tcp_t *), KM_NOSLEEP);
22665	if (temp_tcp_array == NULL) {
22666		rw_exit(&tcp_reserved_port_lock);
22667		return (B_FALSE);
22668	}
22669
22670	/* Go thru the port range to see if some ports are already bound. */
22671	for (port = *lo_port, cur_size = 0;
22672	    cur_size < size && port <= *hi_port;
22673	    cur_size++, port++) {
22674		used = B_FALSE;
22675		net_port = htons(port);
22676		tbf = &tcp_bind_fanout[TCP_BIND_HASH(net_port)];
22677		mutex_enter(&tbf->tf_lock);
22678		for (tcp = tbf->tf_tcp; tcp != NULL;
22679		    tcp = tcp->tcp_bind_hash) {
22680			if (IPCL_ZONE_MATCH(tcp->tcp_connp, zoneid) &&
22681			    net_port == tcp->tcp_lport) {
22682				/*
22683				 * A port is already bound.  Search again
22684				 * starting from port + 1.  Release all
22685				 * temporary tcps.
22686				 */
22687				mutex_exit(&tbf->tf_lock);
22688				TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size);
22689				*lo_port = port + 1;
22690				cur_size = -1;
22691				used = B_TRUE;
22692				break;
22693			}
22694		}
22695		if (!used) {
22696			if ((tmp_tcp = tcp_alloc_temp_tcp(net_port)) == NULL) {
22697				/*
22698				 * Allocation failure.  Just fail the request.
22699				 * Need to remove all those temporary tcp
22700				 * structures.
22701				 */
22702				mutex_exit(&tbf->tf_lock);
22703				TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size);
22704				rw_exit(&tcp_reserved_port_lock);
22705				kmem_free(temp_tcp_array,
22706				    (hi_port - lo_port + 1) *
22707				    sizeof (tcp_t *));
22708				return (B_FALSE);
22709			}
22710			temp_tcp_array[cur_size] = tmp_tcp;
22711			tcp_bind_hash_insert(tbf, tmp_tcp, B_TRUE);
22712			mutex_exit(&tbf->tf_lock);
22713		}
22714	}
22715
22716	/*
22717	 * The current range is not large enough.  We can actually do another
22718	 * search if this search is done between 2 reserved port ranges.  But
22719	 * for first release, we just stop here and return saying that no port
22720	 * range is available.
22721	 */
22722	if (cur_size < size) {
22723		TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size);
22724		rw_exit(&tcp_reserved_port_lock);
22725		kmem_free(temp_tcp_array, size * sizeof (tcp_t *));
22726		return (B_FALSE);
22727	}
22728	*hi_port = port - 1;
22729
22730	/*
22731	 * Insert range into array in ascending order.  Since this function
22732	 * must not be called often, we choose to use the simplest method.
22733	 * The above array should not consume excessive stack space as
22734	 * the size must be very small.  If in future releases, we find
22735	 * that we should provide more reserved port ranges, this function
22736	 * has to be modified to be more efficient.
22737	 */
22738	if (tcp_reserved_port_array_size == 0) {
22739		tcp_reserved_port[0].lo_port = *lo_port;
22740		tcp_reserved_port[0].hi_port = *hi_port;
22741		tcp_reserved_port[0].temp_tcp_array = temp_tcp_array;
22742	} else {
22743		for (i = 0, j = 0; i < tcp_reserved_port_array_size; i++, j++) {
22744			if (*lo_port < tcp_reserved_port[i].lo_port && i == j) {
22745				tmp_ports[j].lo_port = *lo_port;
22746				tmp_ports[j].hi_port = *hi_port;
22747				tmp_ports[j].temp_tcp_array = temp_tcp_array;
22748				j++;
22749			}
22750			tmp_ports[j].lo_port = tcp_reserved_port[i].lo_port;
22751			tmp_ports[j].hi_port = tcp_reserved_port[i].hi_port;
22752			tmp_ports[j].temp_tcp_array =
22753			    tcp_reserved_port[i].temp_tcp_array;
22754		}
22755		if (j == i) {
22756			tmp_ports[j].lo_port = *lo_port;
22757			tmp_ports[j].hi_port = *hi_port;
22758			tmp_ports[j].temp_tcp_array = temp_tcp_array;
22759		}
22760		bcopy(tmp_ports, tcp_reserved_port, sizeof (tmp_ports));
22761	}
22762	tcp_reserved_port_array_size++;
22763	rw_exit(&tcp_reserved_port_lock);
22764	return (B_TRUE);
22765}
22766
22767/*
22768 * Check to see if a port is in any reserved port range.
22769 *
22770 * Params:
22771 *	in_port_t port: the port to be verified.
22772 *
22773 * Return:
22774 *	B_TRUE is the port is inside a reserved port range, B_FALSE otherwise.
22775 */
22776boolean_t
22777tcp_reserved_port_check(in_port_t port)
22778{
22779	int i;
22780
22781	rw_enter(&tcp_reserved_port_lock, RW_READER);
22782	for (i = 0; i < tcp_reserved_port_array_size; i++) {
22783		if (port >= tcp_reserved_port[i].lo_port ||
22784		    port <= tcp_reserved_port[i].hi_port) {
22785			rw_exit(&tcp_reserved_port_lock);
22786			return (B_TRUE);
22787		}
22788	}
22789	rw_exit(&tcp_reserved_port_lock);
22790	return (B_FALSE);
22791}
22792
22793/*
22794 * To list all reserved port ranges.  This is the function to handle
22795 * ndd tcp_reserved_port_list.
22796 */
22797/* ARGSUSED */
22798static int
22799tcp_reserved_port_list(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
22800{
22801	int i;
22802
22803	rw_enter(&tcp_reserved_port_lock, RW_READER);
22804	if (tcp_reserved_port_array_size > 0)
22805		(void) mi_mpprintf(mp, "The following ports are reserved:");
22806	else
22807		(void) mi_mpprintf(mp, "No port is reserved.");
22808	for (i = 0; i < tcp_reserved_port_array_size; i++) {
22809		(void) mi_mpprintf(mp, "%d-%d",
22810		    tcp_reserved_port[i].lo_port, tcp_reserved_port[i].hi_port);
22811	}
22812	rw_exit(&tcp_reserved_port_lock);
22813	return (0);
22814}
22815
22816/*
22817 * Hash list insertion routine for tcp_t structures.
22818 * Inserts entries with the ones bound to a specific IP address first
22819 * followed by those bound to INADDR_ANY.
22820 */
22821static void
22822tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
22823{
22824	tcp_t	**tcpp;
22825	tcp_t	*tcpnext;
22826
22827	if (tcp->tcp_ptpbhn != NULL) {
22828		ASSERT(!caller_holds_lock);
22829		tcp_bind_hash_remove(tcp);
22830	}
22831	tcpp = &tbf->tf_tcp;
22832	if (!caller_holds_lock) {
22833		mutex_enter(&tbf->tf_lock);
22834	} else {
22835		ASSERT(MUTEX_HELD(&tbf->tf_lock));
22836	}
22837	tcpnext = tcpp[0];
22838	if (tcpnext) {
22839		/*
22840		 * If the new tcp bound to the INADDR_ANY address
22841		 * and the first one in the list is not bound to
22842		 * INADDR_ANY we skip all entries until we find the
22843		 * first one bound to INADDR_ANY.
22844		 * This makes sure that applications binding to a
22845		 * specific address get preference over those binding to
22846		 * INADDR_ANY.
22847		 */
22848		if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) &&
22849		    !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) {
22850			while ((tcpnext = tcpp[0]) != NULL &&
22851			    !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6))
22852				tcpp = &(tcpnext->tcp_bind_hash);
22853			if (tcpnext)
22854				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash;
22855		} else
22856			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash;
22857	}
22858	tcp->tcp_bind_hash = tcpnext;
22859	tcp->tcp_ptpbhn = tcpp;
22860	tcpp[0] = tcp;
22861	if (!caller_holds_lock)
22862		mutex_exit(&tbf->tf_lock);
22863}
22864
22865/*
22866 * Hash list removal routine for tcp_t structures.
22867 */
22868static void
22869tcp_bind_hash_remove(tcp_t *tcp)
22870{
22871	tcp_t	*tcpnext;
22872	kmutex_t *lockp;
22873
22874	if (tcp->tcp_ptpbhn == NULL)
22875		return;
22876
22877	/*
22878	 * Extract the lock pointer in case there are concurrent
22879	 * hash_remove's for this instance.
22880	 */
22881	ASSERT(tcp->tcp_lport != 0);
22882	lockp = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock;
22883
22884	ASSERT(lockp != NULL);
22885	mutex_enter(lockp);
22886	if (tcp->tcp_ptpbhn) {
22887		tcpnext = tcp->tcp_bind_hash;
22888		if (tcpnext) {
22889			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
22890			tcp->tcp_bind_hash = NULL;
22891		}
22892		*tcp->tcp_ptpbhn = tcpnext;
22893		tcp->tcp_ptpbhn = NULL;
22894	}
22895	mutex_exit(lockp);
22896}
22897
22898
22899/*
22900 * Hash list lookup routine for tcp_t structures.
22901 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF.
22902 */
22903static tcp_t *
22904tcp_acceptor_hash_lookup(t_uscalar_t id)
22905{
22906	tf_t	*tf;
22907	tcp_t	*tcp;
22908
22909	tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)];
22910	mutex_enter(&tf->tf_lock);
22911	for (tcp = tf->tf_tcp; tcp != NULL;
22912	    tcp = tcp->tcp_acceptor_hash) {
22913		if (tcp->tcp_acceptor_id == id) {
22914			CONN_INC_REF(tcp->tcp_connp);
22915			mutex_exit(&tf->tf_lock);
22916			return (tcp);
22917		}
22918	}
22919	mutex_exit(&tf->tf_lock);
22920	return (NULL);
22921}
22922
22923
22924/*
22925 * Hash list insertion routine for tcp_t structures.
22926 */
22927void
22928tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp)
22929{
22930	tf_t	*tf;
22931	tcp_t	**tcpp;
22932	tcp_t	*tcpnext;
22933
22934	tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)];
22935
22936	if (tcp->tcp_ptpahn != NULL)
22937		tcp_acceptor_hash_remove(tcp);
22938	tcpp = &tf->tf_tcp;
22939	mutex_enter(&tf->tf_lock);
22940	tcpnext = tcpp[0];
22941	if (tcpnext)
22942		tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash;
22943	tcp->tcp_acceptor_hash = tcpnext;
22944	tcp->tcp_ptpahn = tcpp;
22945	tcpp[0] = tcp;
22946	tcp->tcp_acceptor_lockp = &tf->tf_lock;	/* For tcp_*_hash_remove */
22947	mutex_exit(&tf->tf_lock);
22948}
22949
22950/*
22951 * Hash list removal routine for tcp_t structures.
22952 */
22953static void
22954tcp_acceptor_hash_remove(tcp_t *tcp)
22955{
22956	tcp_t	*tcpnext;
22957	kmutex_t *lockp;
22958
22959	/*
22960	 * Extract the lock pointer in case there are concurrent
22961	 * hash_remove's for this instance.
22962	 */
22963	lockp = tcp->tcp_acceptor_lockp;
22964
22965	if (tcp->tcp_ptpahn == NULL)
22966		return;
22967
22968	ASSERT(lockp != NULL);
22969	mutex_enter(lockp);
22970	if (tcp->tcp_ptpahn) {
22971		tcpnext = tcp->tcp_acceptor_hash;
22972		if (tcpnext) {
22973			tcpnext->tcp_ptpahn = tcp->tcp_ptpahn;
22974			tcp->tcp_acceptor_hash = NULL;
22975		}
22976		*tcp->tcp_ptpahn = tcpnext;
22977		tcp->tcp_ptpahn = NULL;
22978	}
22979	mutex_exit(lockp);
22980	tcp->tcp_acceptor_lockp = NULL;
22981}
22982
22983/* ARGSUSED */
22984static int
22985tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af)
22986{
22987	int error = 0;
22988	int retval;
22989	char *end;
22990
22991	tcp_hsp_t *hsp;
22992	tcp_hsp_t *hspprev;
22993
22994	ipaddr_t addr = 0;		/* Address we're looking for */
22995	in6_addr_t v6addr;		/* Address we're looking for */
22996	uint32_t hash;			/* Hash of that address */
22997
22998	/*
22999	 * If the following variables are still zero after parsing the input
23000	 * string, the user didn't specify them and we don't change them in
23001	 * the HSP.
23002	 */
23003
23004	ipaddr_t mask = 0;		/* Subnet mask */
23005	in6_addr_t v6mask;
23006	long sendspace = 0;		/* Send buffer size */
23007	long recvspace = 0;		/* Receive buffer size */
23008	long timestamp = 0;	/* Originate TCP TSTAMP option, 1 = yes */
23009	boolean_t delete = B_FALSE;	/* User asked to delete this HSP */
23010
23011	rw_enter(&tcp_hsp_lock, RW_WRITER);
23012
23013	/* Parse and validate address */
23014	if (af == AF_INET) {
23015		retval = inet_pton(af, value, &addr);
23016		if (retval == 1)
23017			IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
23018	} else if (af == AF_INET6) {
23019		retval = inet_pton(af, value, &v6addr);
23020	} else {
23021		error = EINVAL;
23022		goto done;
23023	}
23024	if (retval == 0) {
23025		error = EINVAL;
23026		goto done;
23027	}
23028
23029	while ((*value) && *value != ' ')
23030		value++;
23031
23032	/* Parse individual keywords, set variables if found */
23033	while (*value) {
23034		/* Skip leading blanks */
23035
23036		while (*value == ' ' || *value == '\t')
23037			value++;
23038
23039		/* If at end of string, we're done */
23040
23041		if (!*value)
23042			break;
23043
23044		/* We have a word, figure out what it is */
23045
23046		if (strncmp("mask", value, 4) == 0) {
23047			value += 4;
23048			while (*value == ' ' || *value == '\t')
23049				value++;
23050			/* Parse subnet mask */
23051			if (af == AF_INET) {
23052				retval = inet_pton(af, value, &mask);
23053				if (retval == 1) {
23054					V4MASK_TO_V6(mask, v6mask);
23055				}
23056			} else if (af == AF_INET6) {
23057				retval = inet_pton(af, value, &v6mask);
23058			}
23059			if (retval != 1) {
23060				error = EINVAL;
23061				goto done;
23062			}
23063			while ((*value) && *value != ' ')
23064				value++;
23065		} else if (strncmp("sendspace", value, 9) == 0) {
23066			value += 9;
23067
23068			if (ddi_strtol(value, &end, 0, &sendspace) != 0 ||
23069			    sendspace < TCP_XMIT_HIWATER ||
23070			    sendspace >= (1L<<30)) {
23071				error = EINVAL;
23072				goto done;
23073			}
23074			value = end;
23075		} else if (strncmp("recvspace", value, 9) == 0) {
23076			value += 9;
23077
23078			if (ddi_strtol(value, &end, 0, &recvspace) != 0 ||
23079			    recvspace < TCP_RECV_HIWATER ||
23080			    recvspace >= (1L<<30)) {
23081				error = EINVAL;
23082				goto done;
23083			}
23084			value = end;
23085		} else if (strncmp("timestamp", value, 9) == 0) {
23086			value += 9;
23087
23088			if (ddi_strtol(value, &end, 0, &timestamp) != 0 ||
23089			    timestamp < 0 || timestamp > 1) {
23090				error = EINVAL;
23091				goto done;
23092			}
23093
23094			/*
23095			 * We increment timestamp so we know it's been set;
23096			 * this is undone when we put it in the HSP
23097			 */
23098			timestamp++;
23099			value = end;
23100		} else if (strncmp("delete", value, 6) == 0) {
23101			value += 6;
23102			delete = B_TRUE;
23103		} else {
23104			error = EINVAL;
23105			goto done;
23106		}
23107	}
23108
23109	/* Hash address for lookup */
23110
23111	hash = TCP_HSP_HASH(addr);
23112
23113	if (delete) {
23114		/*
23115		 * Note that deletes don't return an error if the thing
23116		 * we're trying to delete isn't there.
23117		 */
23118		if (tcp_hsp_hash == NULL)
23119			goto done;
23120		hsp = tcp_hsp_hash[hash];
23121
23122		if (hsp) {
23123			if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6,
23124			    &v6addr)) {
23125				tcp_hsp_hash[hash] = hsp->tcp_hsp_next;
23126				mi_free((char *)hsp);
23127			} else {
23128				hspprev = hsp;
23129				while ((hsp = hsp->tcp_hsp_next) != NULL) {
23130					if (IN6_ARE_ADDR_EQUAL(
23131					    &hsp->tcp_hsp_addr_v6, &v6addr)) {
23132						hspprev->tcp_hsp_next =
23133						    hsp->tcp_hsp_next;
23134						mi_free((char *)hsp);
23135						break;
23136					}
23137					hspprev = hsp;
23138				}
23139			}
23140		}
23141	} else {
23142		/*
23143		 * We're adding/modifying an HSP.  If we haven't already done
23144		 * so, allocate the hash table.
23145		 */
23146
23147		if (!tcp_hsp_hash) {
23148			tcp_hsp_hash = (tcp_hsp_t **)
23149			    mi_zalloc(sizeof (tcp_hsp_t *) * TCP_HSP_HASH_SIZE);
23150			if (!tcp_hsp_hash) {
23151				error = EINVAL;
23152				goto done;
23153			}
23154		}
23155
23156		/* Get head of hash chain */
23157
23158		hsp = tcp_hsp_hash[hash];
23159
23160		/* Try to find pre-existing hsp on hash chain */
23161		/* Doesn't handle CIDR prefixes. */
23162		while (hsp) {
23163			if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, &v6addr))
23164				break;
23165			hsp = hsp->tcp_hsp_next;
23166		}
23167
23168		/*
23169		 * If we didn't, create one with default values and put it
23170		 * at head of hash chain
23171		 */
23172
23173		if (!hsp) {
23174			hsp = (tcp_hsp_t *)mi_zalloc(sizeof (tcp_hsp_t));
23175			if (!hsp) {
23176				error = EINVAL;
23177				goto done;
23178			}
23179			hsp->tcp_hsp_next = tcp_hsp_hash[hash];
23180			tcp_hsp_hash[hash] = hsp;
23181		}
23182
23183		/* Set values that the user asked us to change */
23184
23185		hsp->tcp_hsp_addr_v6 = v6addr;
23186		if (IN6_IS_ADDR_V4MAPPED(&v6addr))
23187			hsp->tcp_hsp_vers = IPV4_VERSION;
23188		else
23189			hsp->tcp_hsp_vers = IPV6_VERSION;
23190		hsp->tcp_hsp_subnet_v6 = v6mask;
23191		if (sendspace > 0)
23192			hsp->tcp_hsp_sendspace = sendspace;
23193		if (recvspace > 0)
23194			hsp->tcp_hsp_recvspace = recvspace;
23195		if (timestamp > 0)
23196			hsp->tcp_hsp_tstamp = timestamp - 1;
23197	}
23198
23199done:
23200	rw_exit(&tcp_hsp_lock);
23201	return (error);
23202}
23203
23204/* Set callback routine passed to nd_load by tcp_param_register. */
23205/* ARGSUSED */
23206static int
23207tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
23208{
23209	return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET));
23210}
23211/* ARGSUSED */
23212static int
23213tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
23214    cred_t *cr)
23215{
23216	return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET6));
23217}
23218
23219/* TCP host parameters report triggered via the Named Dispatch mechanism. */
23220/* ARGSUSED */
23221static int
23222tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
23223{
23224	tcp_hsp_t *hsp;
23225	int i;
23226	char addrbuf[INET6_ADDRSTRLEN], subnetbuf[INET6_ADDRSTRLEN];
23227
23228	rw_enter(&tcp_hsp_lock, RW_READER);
23229	(void) mi_mpprintf(mp,
23230	    "Hash HSP     " MI_COL_HDRPAD_STR
23231	    "Address         Subnet Mask     Send       Receive    TStamp");
23232	if (tcp_hsp_hash) {
23233		for (i = 0; i < TCP_HSP_HASH_SIZE; i++) {
23234			hsp = tcp_hsp_hash[i];
23235			while (hsp) {
23236				if (hsp->tcp_hsp_vers == IPV4_VERSION) {
23237					(void) inet_ntop(AF_INET,
23238					    &hsp->tcp_hsp_addr,
23239					    addrbuf, sizeof (addrbuf));
23240					(void) inet_ntop(AF_INET,
23241					    &hsp->tcp_hsp_subnet,
23242					    subnetbuf, sizeof (subnetbuf));
23243				} else {
23244					(void) inet_ntop(AF_INET6,
23245					    &hsp->tcp_hsp_addr_v6,
23246					    addrbuf, sizeof (addrbuf));
23247					(void) inet_ntop(AF_INET6,
23248					    &hsp->tcp_hsp_subnet_v6,
23249					    subnetbuf, sizeof (subnetbuf));
23250				}
23251				(void) mi_mpprintf(mp,
23252				    " %03d " MI_COL_PTRFMT_STR
23253				    "%s %s %010d %010d      %d",
23254				    i,
23255				    (void *)hsp,
23256				    addrbuf,
23257				    subnetbuf,
23258				    hsp->tcp_hsp_sendspace,
23259				    hsp->tcp_hsp_recvspace,
23260				    hsp->tcp_hsp_tstamp);
23261
23262				hsp = hsp->tcp_hsp_next;
23263			}
23264		}
23265	}
23266	rw_exit(&tcp_hsp_lock);
23267	return (0);
23268}
23269
23270
23271/* Data for fast netmask macro used by tcp_hsp_lookup */
23272
23273static ipaddr_t netmasks[] = {
23274	IN_CLASSA_NET, IN_CLASSA_NET, IN_CLASSB_NET,
23275	IN_CLASSC_NET | IN_CLASSD_NET  /* Class C,D,E */
23276};
23277
23278#define	netmask(addr) (netmasks[(ipaddr_t)(addr) >> 30])
23279
23280/*
23281 * XXX This routine should go away and instead we should use the metrics
23282 * associated with the routes to determine the default sndspace and rcvspace.
23283 */
23284static tcp_hsp_t *
23285tcp_hsp_lookup(ipaddr_t addr)
23286{
23287	tcp_hsp_t *hsp = NULL;
23288
23289	/* Quick check without acquiring the lock. */
23290	if (tcp_hsp_hash == NULL)
23291		return (NULL);
23292
23293	rw_enter(&tcp_hsp_lock, RW_READER);
23294
23295	/* This routine finds the best-matching HSP for address addr. */
23296
23297	if (tcp_hsp_hash) {
23298		int i;
23299		ipaddr_t srchaddr;
23300		tcp_hsp_t *hsp_net;
23301
23302		/* We do three passes: host, network, and subnet. */
23303
23304		srchaddr = addr;
23305
23306		for (i = 1; i <= 3; i++) {
23307			/* Look for exact match on srchaddr */
23308
23309			hsp = tcp_hsp_hash[TCP_HSP_HASH(srchaddr)];
23310			while (hsp) {
23311				if (hsp->tcp_hsp_vers == IPV4_VERSION &&
23312				    hsp->tcp_hsp_addr == srchaddr)
23313					break;
23314				hsp = hsp->tcp_hsp_next;
23315			}
23316			ASSERT(hsp == NULL ||
23317			    hsp->tcp_hsp_vers == IPV4_VERSION);
23318
23319			/*
23320			 * If this is the first pass:
23321			 *   If we found a match, great, return it.
23322			 *   If not, search for the network on the second pass.
23323			 */
23324
23325			if (i == 1)
23326				if (hsp)
23327					break;
23328				else
23329				{
23330					srchaddr = addr & netmask(addr);
23331					continue;
23332				}
23333
23334			/*
23335			 * If this is the second pass:
23336			 *   If we found a match, but there's a subnet mask,
23337			 *    save the match but try again using the subnet
23338			 *    mask on the third pass.
23339			 *   Otherwise, return whatever we found.
23340			 */
23341
23342			if (i == 2) {
23343				if (hsp && hsp->tcp_hsp_subnet) {
23344					hsp_net = hsp;
23345					srchaddr = addr & hsp->tcp_hsp_subnet;
23346					continue;
23347				} else {
23348					break;
23349				}
23350			}
23351
23352			/*
23353			 * This must be the third pass.  If we didn't find
23354			 * anything, return the saved network HSP instead.
23355			 */
23356
23357			if (!hsp)
23358				hsp = hsp_net;
23359		}
23360	}
23361
23362	rw_exit(&tcp_hsp_lock);
23363	return (hsp);
23364}
23365
23366/*
23367 * XXX Equally broken as the IPv4 routine. Doesn't handle longest
23368 * match lookup.
23369 */
23370static tcp_hsp_t *
23371tcp_hsp_lookup_ipv6(in6_addr_t *v6addr)
23372{
23373	tcp_hsp_t *hsp = NULL;
23374
23375	/* Quick check without acquiring the lock. */
23376	if (tcp_hsp_hash == NULL)
23377		return (NULL);
23378
23379	rw_enter(&tcp_hsp_lock, RW_READER);
23380
23381	/* This routine finds the best-matching HSP for address addr. */
23382
23383	if (tcp_hsp_hash) {
23384		int i;
23385		in6_addr_t v6srchaddr;
23386		tcp_hsp_t *hsp_net;
23387
23388		/* We do three passes: host, network, and subnet. */
23389
23390		v6srchaddr = *v6addr;
23391
23392		for (i = 1; i <= 3; i++) {
23393			/* Look for exact match on srchaddr */
23394
23395			hsp = tcp_hsp_hash[TCP_HSP_HASH(
23396			    V4_PART_OF_V6(v6srchaddr))];
23397			while (hsp) {
23398				if (hsp->tcp_hsp_vers == IPV6_VERSION &&
23399				    IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6,
23400				    &v6srchaddr))
23401					break;
23402				hsp = hsp->tcp_hsp_next;
23403			}
23404
23405			/*
23406			 * If this is the first pass:
23407			 *   If we found a match, great, return it.
23408			 *   If not, search for the network on the second pass.
23409			 */
23410
23411			if (i == 1)
23412				if (hsp)
23413					break;
23414				else {
23415					/* Assume a 64 bit mask */
23416					v6srchaddr.s6_addr32[0] =
23417					    v6addr->s6_addr32[0];
23418					v6srchaddr.s6_addr32[1] =
23419					    v6addr->s6_addr32[1];
23420					v6srchaddr.s6_addr32[2] = 0;
23421					v6srchaddr.s6_addr32[3] = 0;
23422					continue;
23423				}
23424
23425			/*
23426			 * If this is the second pass:
23427			 *   If we found a match, but there's a subnet mask,
23428			 *    save the match but try again using the subnet
23429			 *    mask on the third pass.
23430			 *   Otherwise, return whatever we found.
23431			 */
23432
23433			if (i == 2) {
23434				ASSERT(hsp == NULL ||
23435				    hsp->tcp_hsp_vers == IPV6_VERSION);
23436				if (hsp &&
23437				    !IN6_IS_ADDR_UNSPECIFIED(
23438				    &hsp->tcp_hsp_subnet_v6)) {
23439					hsp_net = hsp;
23440					V6_MASK_COPY(*v6addr,
23441					    hsp->tcp_hsp_subnet_v6, v6srchaddr);
23442					continue;
23443				} else {
23444					break;
23445				}
23446			}
23447
23448			/*
23449			 * This must be the third pass.  If we didn't find
23450			 * anything, return the saved network HSP instead.
23451			 */
23452
23453			if (!hsp)
23454				hsp = hsp_net;
23455		}
23456	}
23457
23458	rw_exit(&tcp_hsp_lock);
23459	return (hsp);
23460}
23461
23462/*
23463 * Type three generator adapted from the random() function in 4.4 BSD:
23464 */
23465
23466/*
23467 * Copyright (c) 1983, 1993
23468 *	The Regents of the University of California.  All rights reserved.
23469 *
23470 * Redistribution and use in source and binary forms, with or without
23471 * modification, are permitted provided that the following conditions
23472 * are met:
23473 * 1. Redistributions of source code must retain the above copyright
23474 *    notice, this list of conditions and the following disclaimer.
23475 * 2. Redistributions in binary form must reproduce the above copyright
23476 *    notice, this list of conditions and the following disclaimer in the
23477 *    documentation and/or other materials provided with the distribution.
23478 * 3. All advertising materials mentioning features or use of this software
23479 *    must display the following acknowledgement:
23480 *	This product includes software developed by the University of
23481 *	California, Berkeley and its contributors.
23482 * 4. Neither the name of the University nor the names of its contributors
23483 *    may be used to endorse or promote products derived from this software
23484 *    without specific prior written permission.
23485 *
23486 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23487 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23488 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23489 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23490 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23491 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23492 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23493 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23494 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23495 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23496 * SUCH DAMAGE.
23497 */
23498
23499/* Type 3 -- x**31 + x**3 + 1 */
23500#define	DEG_3		31
23501#define	SEP_3		3
23502
23503
23504/* Protected by tcp_random_lock */
23505static int tcp_randtbl[DEG_3 + 1];
23506
23507static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1];
23508static int *tcp_random_rptr = &tcp_randtbl[1];
23509
23510static int *tcp_random_state = &tcp_randtbl[1];
23511static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1];
23512
23513kmutex_t tcp_random_lock;
23514
23515void
23516tcp_random_init(void)
23517{
23518	int i;
23519	hrtime_t hrt;
23520	time_t wallclock;
23521	uint64_t result;
23522
23523	/*
23524	 * Use high-res timer and current time for seed.  Gethrtime() returns
23525	 * a longlong, which may contain resolution down to nanoseconds.
23526	 * The current time will either be a 32-bit or a 64-bit quantity.
23527	 * XOR the two together in a 64-bit result variable.
23528	 * Convert the result to a 32-bit value by multiplying the high-order
23529	 * 32-bits by the low-order 32-bits.
23530	 */
23531
23532	hrt = gethrtime();
23533	(void) drv_getparm(TIME, &wallclock);
23534	result = (uint64_t)wallclock ^ (uint64_t)hrt;
23535	mutex_enter(&tcp_random_lock);
23536	tcp_random_state[0] = ((result >> 32) & 0xffffffff) *
23537	    (result & 0xffffffff);
23538
23539	for (i = 1; i < DEG_3; i++)
23540		tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1]
23541			+ 12345;
23542	tcp_random_fptr = &tcp_random_state[SEP_3];
23543	tcp_random_rptr = &tcp_random_state[0];
23544	mutex_exit(&tcp_random_lock);
23545	for (i = 0; i < 10 * DEG_3; i++)
23546		(void) tcp_random();
23547}
23548
23549/*
23550 * tcp_random: Return a random number in the range [1 - (128K + 1)].
23551 * This range is selected to be approximately centered on TCP_ISS / 2,
23552 * and easy to compute. We get this value by generating a 32-bit random
23553 * number, selecting out the high-order 17 bits, and then adding one so
23554 * that we never return zero.
23555 */
23556int
23557tcp_random(void)
23558{
23559	int i;
23560
23561	mutex_enter(&tcp_random_lock);
23562	*tcp_random_fptr += *tcp_random_rptr;
23563
23564	/*
23565	 * The high-order bits are more random than the low-order bits,
23566	 * so we select out the high-order 17 bits and add one so that
23567	 * we never return zero.
23568	 */
23569	i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1;
23570	if (++tcp_random_fptr >= tcp_random_end_ptr) {
23571		tcp_random_fptr = tcp_random_state;
23572		++tcp_random_rptr;
23573	} else if (++tcp_random_rptr >= tcp_random_end_ptr)
23574		tcp_random_rptr = tcp_random_state;
23575
23576	mutex_exit(&tcp_random_lock);
23577	return (i);
23578}
23579
23580/*
23581 * XXX This will go away when TPI is extended to send
23582 * info reqs to sockfs/timod .....
23583 * Given a queue, set the max packet size for the write
23584 * side of the queue below stream head.  This value is
23585 * cached on the stream head.
23586 * Returns 1 on success, 0 otherwise.
23587 */
23588static int
23589setmaxps(queue_t *q, int maxpsz)
23590{
23591	struct stdata	*stp;
23592	queue_t		*wq;
23593	stp = STREAM(q);
23594
23595	/*
23596	 * At this point change of a queue parameter is not allowed
23597	 * when a multiplexor is sitting on top.
23598	 */
23599	if (stp->sd_flag & STPLEX)
23600		return (0);
23601
23602	claimstr(stp->sd_wrq);
23603	wq = stp->sd_wrq->q_next;
23604	ASSERT(wq != NULL);
23605	(void) strqset(wq, QMAXPSZ, 0, maxpsz);
23606	releasestr(stp->sd_wrq);
23607	return (1);
23608}
23609
23610static int
23611tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
23612    int *t_errorp, int *sys_errorp)
23613{
23614	int error;
23615	int is_absreq_failure;
23616	t_scalar_t *opt_lenp;
23617	t_scalar_t opt_offset;
23618	int prim_type;
23619	struct T_conn_req *tcreqp;
23620	struct T_conn_res *tcresp;
23621	cred_t *cr;
23622
23623	cr = DB_CREDDEF(mp, tcp->tcp_cred);
23624
23625	prim_type = ((union T_primitives *)mp->b_rptr)->type;
23626	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
23627	    prim_type == T_CONN_RES);
23628
23629	switch (prim_type) {
23630	case T_CONN_REQ:
23631		tcreqp = (struct T_conn_req *)mp->b_rptr;
23632		opt_offset = tcreqp->OPT_offset;
23633		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
23634		break;
23635	case O_T_CONN_RES:
23636	case T_CONN_RES:
23637		tcresp = (struct T_conn_res *)mp->b_rptr;
23638		opt_offset = tcresp->OPT_offset;
23639		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
23640		break;
23641	}
23642
23643	*t_errorp = 0;
23644	*sys_errorp = 0;
23645	*do_disconnectp = 0;
23646
23647	error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp,
23648	    opt_offset, cr, &tcp_opt_obj,
23649	    NULL, &is_absreq_failure);
23650
23651	switch (error) {
23652	case  0:		/* no error */
23653		ASSERT(is_absreq_failure == 0);
23654		return (0);
23655	case ENOPROTOOPT:
23656		*t_errorp = TBADOPT;
23657		break;
23658	case EACCES:
23659		*t_errorp = TACCES;
23660		break;
23661	default:
23662		*t_errorp = TSYSERR; *sys_errorp = error;
23663		break;
23664	}
23665	if (is_absreq_failure != 0) {
23666		/*
23667		 * The connection request should get the local ack
23668		 * T_OK_ACK and then a T_DISCON_IND.
23669		 */
23670		*do_disconnectp = 1;
23671	}
23672	return (-1);
23673}
23674
23675/*
23676 * Split this function out so that if the secret changes, I'm okay.
23677 *
23678 * Initialize the tcp_iss_cookie and tcp_iss_key.
23679 */
23680
23681#define	PASSWD_SIZE 16  /* MUST be multiple of 4 */
23682
23683static void
23684tcp_iss_key_init(uint8_t *phrase, int len)
23685{
23686	struct {
23687		int32_t current_time;
23688		uint32_t randnum;
23689		uint16_t pad;
23690		uint8_t ether[6];
23691		uint8_t passwd[PASSWD_SIZE];
23692	} tcp_iss_cookie;
23693	time_t t;
23694
23695	/*
23696	 * Start with the current absolute time.
23697	 */
23698	(void) drv_getparm(TIME, &t);
23699	tcp_iss_cookie.current_time = t;
23700
23701	/*
23702	 * XXX - Need a more random number per RFC 1750, not this crap.
23703	 * OTOH, if what follows is pretty random, then I'm in better shape.
23704	 */
23705	tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random());
23706	tcp_iss_cookie.pad = 0x365c;  /* Picked from HMAC pad values. */
23707
23708	/*
23709	 * The cpu_type_info is pretty non-random.  Ugggh.  It does serve
23710	 * as a good template.
23711	 */
23712	bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd,
23713	    min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info)));
23714
23715	/*
23716	 * The pass-phrase.  Normally this is supplied by user-called NDD.
23717	 */
23718	bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len));
23719
23720	/*
23721	 * See 4010593 if this section becomes a problem again,
23722	 * but the local ethernet address is useful here.
23723	 */
23724	(void) localetheraddr(NULL,
23725	    (struct ether_addr *)&tcp_iss_cookie.ether);
23726
23727	/*
23728	 * Hash 'em all together.  The MD5Final is called per-connection.
23729	 */
23730	mutex_enter(&tcp_iss_key_lock);
23731	MD5Init(&tcp_iss_key);
23732	MD5Update(&tcp_iss_key, (uchar_t *)&tcp_iss_cookie,
23733	    sizeof (tcp_iss_cookie));
23734	mutex_exit(&tcp_iss_key_lock);
23735}
23736
23737/*
23738 * Set the RFC 1948 pass phrase
23739 */
23740/* ARGSUSED */
23741static int
23742tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
23743    cred_t *cr)
23744{
23745	/*
23746	 * Basically, value contains a new pass phrase.  Pass it along!
23747	 */
23748	tcp_iss_key_init((uint8_t *)value, strlen(value));
23749	return (0);
23750}
23751
23752/* ARGSUSED */
23753static int
23754tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags)
23755{
23756	bzero(buf, sizeof (tcp_sack_info_t));
23757	return (0);
23758}
23759
23760/* ARGSUSED */
23761static int
23762tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags)
23763{
23764	bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH);
23765	return (0);
23766}
23767
23768void
23769tcp_ddi_init(void)
23770{
23771	int i;
23772
23773	/* Initialize locks */
23774	rw_init(&tcp_hsp_lock, NULL, RW_DEFAULT, NULL);
23775	mutex_init(&tcp_g_q_lock, NULL, MUTEX_DEFAULT, NULL);
23776	mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL);
23777	mutex_init(&tcp_iss_key_lock, NULL, MUTEX_DEFAULT, NULL);
23778	mutex_init(&tcp_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL);
23779	rw_init(&tcp_reserved_port_lock, NULL, RW_DEFAULT, NULL);
23780
23781	for (i = 0; i < A_CNT(tcp_bind_fanout); i++) {
23782		mutex_init(&tcp_bind_fanout[i].tf_lock, NULL,
23783		    MUTEX_DEFAULT, NULL);
23784	}
23785
23786	for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) {
23787		mutex_init(&tcp_acceptor_fanout[i].tf_lock, NULL,
23788		    MUTEX_DEFAULT, NULL);
23789	}
23790
23791	/* TCP's IPsec code calls the packet dropper. */
23792	ip_drop_register(&tcp_dropper, "TCP IPsec policy enforcement");
23793
23794	if (!tcp_g_nd) {
23795		if (!tcp_param_register(tcp_param_arr, A_CNT(tcp_param_arr))) {
23796			nd_free(&tcp_g_nd);
23797		}
23798	}
23799
23800	/*
23801	 * Note: To really walk the device tree you need the devinfo
23802	 * pointer to your device which is only available after probe/attach.
23803	 * The following is safe only because it uses ddi_root_node()
23804	 */
23805	tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr,
23806	    tcp_opt_obj.odb_opt_arr_cnt);
23807
23808	tcp_timercache = kmem_cache_create("tcp_timercache",
23809	    sizeof (tcp_timer_t) + sizeof (mblk_t), 0,
23810	    NULL, NULL, NULL, NULL, NULL, 0);
23811
23812	tcp_sack_info_cache = kmem_cache_create("tcp_sack_info_cache",
23813	    sizeof (tcp_sack_info_t), 0,
23814	    tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0);
23815
23816	tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache",
23817	    TCP_MAX_COMBINED_HEADER_LENGTH, 0,
23818	    tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0);
23819
23820	tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput);
23821	tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close);
23822
23823	ip_squeue_init(tcp_squeue_add);
23824
23825	/* Initialize the random number generator */
23826	tcp_random_init();
23827
23828	/*
23829	 * Initialize RFC 1948 secret values.  This will probably be reset once
23830	 * by the boot scripts.
23831	 *
23832	 * Use NULL name, as the name is caught by the new lockstats.
23833	 *
23834	 * Initialize with some random, non-guessable string, like the global
23835	 * T_INFO_ACK.
23836	 */
23837
23838	tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack,
23839	    sizeof (tcp_g_t_info_ack));
23840
23841	if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat",
23842		"net", KSTAT_TYPE_NAMED,
23843		sizeof (tcp_statistics) / sizeof (kstat_named_t),
23844		KSTAT_FLAG_VIRTUAL)) != NULL) {
23845		tcp_kstat->ks_data = &tcp_statistics;
23846		kstat_install(tcp_kstat);
23847	}
23848
23849	tcp_kstat_init();
23850}
23851
23852void
23853tcp_ddi_destroy(void)
23854{
23855	int i;
23856
23857	nd_free(&tcp_g_nd);
23858
23859	for (i = 0; i < A_CNT(tcp_bind_fanout); i++) {
23860		mutex_destroy(&tcp_bind_fanout[i].tf_lock);
23861	}
23862
23863	for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) {
23864		mutex_destroy(&tcp_acceptor_fanout[i].tf_lock);
23865	}
23866
23867	mutex_destroy(&tcp_iss_key_lock);
23868	rw_destroy(&tcp_hsp_lock);
23869	mutex_destroy(&tcp_g_q_lock);
23870	mutex_destroy(&tcp_random_lock);
23871	mutex_destroy(&tcp_epriv_port_lock);
23872	rw_destroy(&tcp_reserved_port_lock);
23873
23874	ip_drop_unregister(&tcp_dropper);
23875
23876	kmem_cache_destroy(tcp_timercache);
23877	kmem_cache_destroy(tcp_sack_info_cache);
23878	kmem_cache_destroy(tcp_iphc_cache);
23879
23880	tcp_kstat_fini();
23881}
23882
23883/*
23884 * Generate ISS, taking into account NDD changes may happen halfway through.
23885 * (If the iss is not zero, set it.)
23886 */
23887
23888static void
23889tcp_iss_init(tcp_t *tcp)
23890{
23891	MD5_CTX context;
23892	struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg;
23893	uint32_t answer[4];
23894
23895	tcp_iss_incr_extra += (ISS_INCR >> 1);
23896	tcp->tcp_iss = tcp_iss_incr_extra;
23897	switch (tcp_strong_iss) {
23898	case 2:
23899		mutex_enter(&tcp_iss_key_lock);
23900		context = tcp_iss_key;
23901		mutex_exit(&tcp_iss_key_lock);
23902		arg.ports = tcp->tcp_ports;
23903		if (tcp->tcp_ipversion == IPV4_VERSION) {
23904			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
23905			    &arg.src);
23906			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst,
23907			    &arg.dst);
23908		} else {
23909			arg.src = tcp->tcp_ip6h->ip6_src;
23910			arg.dst = tcp->tcp_ip6h->ip6_dst;
23911		}
23912		MD5Update(&context, (uchar_t *)&arg, sizeof (arg));
23913		MD5Final((uchar_t *)answer, &context);
23914		tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3];
23915		/*
23916		 * Now that we've hashed into a unique per-connection sequence
23917		 * space, add a random increment per strong_iss == 1.  So I
23918		 * guess we'll have to...
23919		 */
23920		/* FALLTHRU */
23921	case 1:
23922		tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random();
23923		break;
23924	default:
23925		tcp->tcp_iss += (uint32_t)gethrestime_sec() * ISS_INCR;
23926		break;
23927	}
23928	tcp->tcp_valid_bits = TCP_ISS_VALID;
23929	tcp->tcp_fss = tcp->tcp_iss - 1;
23930	tcp->tcp_suna = tcp->tcp_iss;
23931	tcp->tcp_snxt = tcp->tcp_iss + 1;
23932	tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
23933	tcp->tcp_csuna = tcp->tcp_snxt;
23934}
23935
23936/*
23937 * Exported routine for extracting active tcp connection status.
23938 *
23939 * This is used by the Solaris Cluster Networking software to
23940 * gather a list of connections that need to be forwarded to
23941 * specific nodes in the cluster when configuration changes occur.
23942 *
23943 * The callback is invoked for each tcp_t structure. Returning
23944 * non-zero from the callback routine terminates the search.
23945 */
23946int
23947cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg)
23948{
23949	tcp_t *tcp;
23950	cl_tcp_info_t	cl_tcpi;
23951	connf_t	*connfp;
23952	conn_t	*connp;
23953	int	i;
23954
23955	ASSERT(callback != NULL);
23956
23957	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
23958
23959		connfp = &ipcl_globalhash_fanout[i];
23960		connp = NULL;
23961
23962		while ((connp =
23963		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
23964
23965			tcp = connp->conn_tcp;
23966			cl_tcpi.cl_tcpi_version = CL_TCPI_V1;
23967			cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion;
23968			cl_tcpi.cl_tcpi_state = tcp->tcp_state;
23969			cl_tcpi.cl_tcpi_lport = tcp->tcp_lport;
23970			cl_tcpi.cl_tcpi_fport = tcp->tcp_fport;
23971			/*
23972			 * The macros tcp_laddr and tcp_faddr give the IPv4
23973			 * addresses. They are copied implicitly below as
23974			 * mapped addresses.
23975			 */
23976			cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6;
23977			if (tcp->tcp_ipversion == IPV4_VERSION) {
23978				cl_tcpi.cl_tcpi_faddr =
23979				    tcp->tcp_ipha->ipha_dst;
23980			} else {
23981				cl_tcpi.cl_tcpi_faddr_v6 =
23982				    tcp->tcp_ip6h->ip6_dst;
23983			}
23984
23985			/*
23986			 * If the callback returns non-zero
23987			 * we terminate the traversal.
23988			 */
23989			if ((*callback)(&cl_tcpi, arg) != 0) {
23990				CONN_DEC_REF(tcp->tcp_connp);
23991				return (1);
23992			}
23993		}
23994	}
23995
23996	return (0);
23997}
23998
23999/*
24000 * Macros used for accessing the different types of sockaddr
24001 * structures inside a tcp_ioc_abort_conn_t.
24002 */
24003#define	TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local)
24004#define	TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote)
24005#define	TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
24006#define	TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
24007#define	TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port)
24008#define	TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port)
24009#define	TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local)
24010#define	TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote)
24011#define	TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr)
24012#define	TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr)
24013#define	TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port)
24014#define	TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port)
24015
24016/*
24017 * Return the correct error code to mimic the behavior
24018 * of a connection reset.
24019 */
24020#define	TCP_AC_GET_ERRCODE(state, err) {	\
24021		switch ((state)) {		\
24022		case TCPS_SYN_SENT:		\
24023		case TCPS_SYN_RCVD:		\
24024			(err) = ECONNREFUSED;	\
24025			break;			\
24026		case TCPS_ESTABLISHED:		\
24027		case TCPS_FIN_WAIT_1:		\
24028		case TCPS_FIN_WAIT_2:		\
24029		case TCPS_CLOSE_WAIT:		\
24030			(err) = ECONNRESET;	\
24031			break;			\
24032		case TCPS_CLOSING:		\
24033		case TCPS_LAST_ACK:		\
24034		case TCPS_TIME_WAIT:		\
24035			(err) = 0;		\
24036			break;			\
24037		default:			\
24038			(err) = ENXIO;		\
24039		}				\
24040	}
24041
24042/*
24043 * Check if a tcp structure matches the info in acp.
24044 */
24045#define	TCP_AC_ADDR_MATCH(acp, tcp)					\
24046	(((acp)->ac_local.ss_family == AF_INET) ?		\
24047	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||		\
24048	TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) &&	\
24049	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||		\
24050	TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) &&	\
24051	(TCP_AC_V4LPORT((acp)) == 0 ||				\
24052	TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) &&		\
24053	(TCP_AC_V4RPORT((acp)) == 0 ||				\
24054	TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) &&		\
24055	(acp)->ac_start <= (tcp)->tcp_state &&	\
24056	(acp)->ac_end >= (tcp)->tcp_state) :		\
24057	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||	\
24058	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),		\
24059	&(tcp)->tcp_ip_src_v6)) &&				\
24060	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||	\
24061	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),		\
24062	&(tcp)->tcp_remote_v6)) &&				\
24063	(TCP_AC_V6LPORT((acp)) == 0 ||				\
24064	TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) &&		\
24065	(TCP_AC_V6RPORT((acp)) == 0 ||				\
24066	TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) &&		\
24067	(acp)->ac_start <= (tcp)->tcp_state &&	\
24068	(acp)->ac_end >= (tcp)->tcp_state))
24069
24070#define	TCP_AC_MATCH(acp, tcp)					\
24071	(((acp)->ac_zoneid == ALL_ZONES ||			\
24072	(acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ?	\
24073	TCP_AC_ADDR_MATCH(acp, tcp) : 0)
24074
24075/*
24076 * Build a message containing a tcp_ioc_abort_conn_t structure
24077 * which is filled in with information from acp and tp.
24078 */
24079static mblk_t *
24080tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
24081{
24082	mblk_t *mp;
24083	tcp_ioc_abort_conn_t *tacp;
24084
24085	mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
24086	if (mp == NULL)
24087		return (NULL);
24088
24089	mp->b_datap->db_type = M_CTL;
24090
24091	*((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
24092	tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
24093		sizeof (uint32_t));
24094
24095	tacp->ac_start = acp->ac_start;
24096	tacp->ac_end = acp->ac_end;
24097	tacp->ac_zoneid = acp->ac_zoneid;
24098
24099	if (acp->ac_local.ss_family == AF_INET) {
24100		tacp->ac_local.ss_family = AF_INET;
24101		tacp->ac_remote.ss_family = AF_INET;
24102		TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src;
24103		TCP_AC_V4REMOTE(tacp) = tp->tcp_remote;
24104		TCP_AC_V4LPORT(tacp) = tp->tcp_lport;
24105		TCP_AC_V4RPORT(tacp) = tp->tcp_fport;
24106	} else {
24107		tacp->ac_local.ss_family = AF_INET6;
24108		tacp->ac_remote.ss_family = AF_INET6;
24109		TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6;
24110		TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6;
24111		TCP_AC_V6LPORT(tacp) = tp->tcp_lport;
24112		TCP_AC_V6RPORT(tacp) = tp->tcp_fport;
24113	}
24114	mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
24115	return (mp);
24116}
24117
24118/*
24119 * Print a tcp_ioc_abort_conn_t structure.
24120 */
24121static void
24122tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
24123{
24124	char lbuf[128];
24125	char rbuf[128];
24126	sa_family_t af;
24127	in_port_t lport, rport;
24128	ushort_t logflags;
24129
24130	af = acp->ac_local.ss_family;
24131
24132	if (af == AF_INET) {
24133		(void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
24134				lbuf, 128);
24135		(void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
24136				rbuf, 128);
24137		lport = ntohs(TCP_AC_V4LPORT(acp));
24138		rport = ntohs(TCP_AC_V4RPORT(acp));
24139	} else {
24140		(void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
24141				lbuf, 128);
24142		(void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
24143				rbuf, 128);
24144		lport = ntohs(TCP_AC_V6LPORT(acp));
24145		rport = ntohs(TCP_AC_V6RPORT(acp));
24146	}
24147
24148	logflags = SL_TRACE | SL_NOTE;
24149	/*
24150	 * Don't print this message to the console if the operation was done
24151	 * to a non-global zone.
24152	 */
24153	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
24154		logflags |= SL_CONSOLE;
24155	(void) strlog(TCP_MOD_ID, 0, 1, logflags,
24156		"TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
24157		"start = %d, end = %d\n", lbuf, lport, rbuf, rport,
24158		acp->ac_start, acp->ac_end);
24159}
24160
24161/*
24162 * Called inside tcp_rput when a message built using
24163 * tcp_ioctl_abort_build_msg is put into a queue.
24164 * Note that when we get here there is no wildcard in acp any more.
24165 */
24166static void
24167tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp)
24168{
24169	tcp_ioc_abort_conn_t *acp;
24170
24171	acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
24172	if (tcp->tcp_state <= acp->ac_end) {
24173		/*
24174		 * If we get here, we are already on the correct
24175		 * squeue. This ioctl follows the following path
24176		 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
24177		 * ->tcp_ioctl_abort->squeue_fill (if on a
24178		 * different squeue)
24179		 */
24180		int errcode;
24181
24182		TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
24183		(void) tcp_clean_death(tcp, errcode, 26);
24184	}
24185	freemsg(mp);
24186}
24187
24188/*
24189 * Abort all matching connections on a hash chain.
24190 */
24191static int
24192tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
24193    boolean_t exact)
24194{
24195	int nmatch, err = 0;
24196	tcp_t *tcp;
24197	MBLKP mp, last, listhead = NULL;
24198	conn_t	*tconnp;
24199	connf_t	*connfp = &ipcl_conn_fanout[index];
24200
24201startover:
24202	nmatch = 0;
24203
24204	mutex_enter(&connfp->connf_lock);
24205	for (tconnp = connfp->connf_head; tconnp != NULL;
24206	    tconnp = tconnp->conn_next) {
24207		tcp = tconnp->conn_tcp;
24208		if (TCP_AC_MATCH(acp, tcp)) {
24209			CONN_INC_REF(tcp->tcp_connp);
24210			mp = tcp_ioctl_abort_build_msg(acp, tcp);
24211			if (mp == NULL) {
24212				err = ENOMEM;
24213				CONN_DEC_REF(tcp->tcp_connp);
24214				break;
24215			}
24216			mp->b_prev = (mblk_t *)tcp;
24217
24218			if (listhead == NULL) {
24219				listhead = mp;
24220				last = mp;
24221			} else {
24222				last->b_next = mp;
24223				last = mp;
24224			}
24225			nmatch++;
24226			if (exact)
24227				break;
24228		}
24229
24230		/* Avoid holding lock for too long. */
24231		if (nmatch >= 500)
24232			break;
24233	}
24234	mutex_exit(&connfp->connf_lock);
24235
24236	/* Pass mp into the correct tcp */
24237	while ((mp = listhead) != NULL) {
24238		listhead = listhead->b_next;
24239		tcp = (tcp_t *)mp->b_prev;
24240		mp->b_next = mp->b_prev = NULL;
24241		squeue_fill(tcp->tcp_connp->conn_sqp, mp,
24242		    tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET);
24243	}
24244
24245	*count += nmatch;
24246	if (nmatch >= 500 && err == 0)
24247		goto startover;
24248	return (err);
24249}
24250
24251/*
24252 * Abort all connections that matches the attributes specified in acp.
24253 */
24254static int
24255tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp)
24256{
24257	sa_family_t af;
24258	uint32_t  ports;
24259	uint16_t *pports;
24260	int err = 0, count = 0;
24261	boolean_t exact = B_FALSE; /* set when there is no wildcard */
24262	int index = -1;
24263	ushort_t logflags;
24264
24265	af = acp->ac_local.ss_family;
24266
24267	if (af == AF_INET) {
24268		if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
24269		    TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
24270			pports = (uint16_t *)&ports;
24271			pports[1] = TCP_AC_V4LPORT(acp);
24272			pports[0] = TCP_AC_V4RPORT(acp);
24273			exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
24274		}
24275	} else {
24276		if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
24277		    TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
24278			pports = (uint16_t *)&ports;
24279			pports[1] = TCP_AC_V6LPORT(acp);
24280			pports[0] = TCP_AC_V6RPORT(acp);
24281			exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
24282		}
24283	}
24284
24285	/*
24286	 * For cases where remote addr, local port, and remote port are non-
24287	 * wildcards, tcp_ioctl_abort_bucket will only be called once.
24288	 */
24289	if (index != -1) {
24290		err = tcp_ioctl_abort_bucket(acp, index,
24291			    &count, exact);
24292	} else {
24293		/*
24294		 * loop through all entries for wildcard case
24295		 */
24296		for (index = 0; index < ipcl_conn_fanout_size; index++) {
24297			err = tcp_ioctl_abort_bucket(acp, index,
24298			    &count, exact);
24299			if (err != 0)
24300				break;
24301		}
24302	}
24303
24304	logflags = SL_TRACE | SL_NOTE;
24305	/*
24306	 * Don't print this message to the console if the operation was done
24307	 * to a non-global zone.
24308	 */
24309	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
24310		logflags |= SL_CONSOLE;
24311	(void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
24312	    "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
24313	if (err == 0 && count == 0)
24314		err = ENOENT;
24315	return (err);
24316}
24317
24318/*
24319 * Process the TCP_IOC_ABORT_CONN ioctl request.
24320 */
24321static void
24322tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
24323{
24324	int	err;
24325	IOCP    iocp;
24326	MBLKP   mp1;
24327	sa_family_t laf, raf;
24328	tcp_ioc_abort_conn_t *acp;
24329	zone_t *zptr;
24330	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
24331
24332	iocp = (IOCP)mp->b_rptr;
24333
24334	if ((mp1 = mp->b_cont) == NULL ||
24335	    iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
24336		err = EINVAL;
24337		goto out;
24338	}
24339
24340	/* check permissions */
24341	if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) {
24342		err = EPERM;
24343		goto out;
24344	}
24345
24346	if (mp1->b_cont != NULL) {
24347		freemsg(mp1->b_cont);
24348		mp1->b_cont = NULL;
24349	}
24350
24351	acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
24352	laf = acp->ac_local.ss_family;
24353	raf = acp->ac_remote.ss_family;
24354
24355	/* check that a zone with the supplied zoneid exists */
24356	if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
24357		zptr = zone_find_by_id(zoneid);
24358		if (zptr != NULL) {
24359			zone_rele(zptr);
24360		} else {
24361			err = EINVAL;
24362			goto out;
24363		}
24364	}
24365
24366	if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
24367	    acp->ac_start > acp->ac_end || laf != raf ||
24368	    (laf != AF_INET && laf != AF_INET6)) {
24369		err = EINVAL;
24370		goto out;
24371	}
24372
24373	tcp_ioctl_abort_dump(acp);
24374	err = tcp_ioctl_abort(acp);
24375
24376out:
24377	if (mp1 != NULL) {
24378		freemsg(mp1);
24379		mp->b_cont = NULL;
24380	}
24381
24382	if (err != 0)
24383		miocnak(q, mp, 0, err);
24384	else
24385		miocack(q, mp, 0, 0);
24386}
24387
24388/*
24389 * tcp_time_wait_processing() handles processing of incoming packets when
24390 * the tcp is in the TIME_WAIT state.
24391 * A TIME_WAIT tcp that has an associated open TCP stream is never put
24392 * on the time wait list.
24393 */
24394void
24395tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
24396    uint32_t seg_ack, int seg_len, tcph_t *tcph)
24397{
24398	int32_t		bytes_acked;
24399	int32_t		gap;
24400	int32_t		rgap;
24401	tcp_opt_t	tcpopt;
24402	uint_t		flags;
24403	uint32_t	new_swnd = 0;
24404	conn_t		*connp;
24405
24406	BUMP_LOCAL(tcp->tcp_ibsegs);
24407	TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT);
24408
24409	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
24410	new_swnd = BE16_TO_U16(tcph->th_win) <<
24411	    ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws);
24412	if (tcp->tcp_snd_ts_ok) {
24413		if (!tcp_paws_check(tcp, tcph, &tcpopt)) {
24414			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
24415			    tcp->tcp_rnxt, TH_ACK);
24416			goto done;
24417		}
24418	}
24419	gap = seg_seq - tcp->tcp_rnxt;
24420	rgap = tcp->tcp_rwnd - (gap + seg_len);
24421	if (gap < 0) {
24422		BUMP_MIB(&tcp_mib, tcpInDataDupSegs);
24423		UPDATE_MIB(&tcp_mib, tcpInDataDupBytes,
24424		    (seg_len > -gap ? -gap : seg_len));
24425		seg_len += gap;
24426		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
24427			if (flags & TH_RST) {
24428				goto done;
24429			}
24430			if ((flags & TH_FIN) && seg_len == -1) {
24431				/*
24432				 * When TCP receives a duplicate FIN in
24433				 * TIME_WAIT state, restart the 2 MSL timer.
24434				 * See page 73 in RFC 793. Make sure this TCP
24435				 * is already on the TIME_WAIT list. If not,
24436				 * just restart the timer.
24437				 */
24438				if (TCP_IS_DETACHED(tcp)) {
24439					tcp_time_wait_remove(tcp, NULL);
24440					tcp_time_wait_append(tcp);
24441					TCP_DBGSTAT(tcp_rput_time_wait);
24442				} else {
24443					ASSERT(tcp != NULL);
24444					TCP_TIMER_RESTART(tcp,
24445					    tcp_time_wait_interval);
24446				}
24447				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
24448				    tcp->tcp_rnxt, TH_ACK);
24449				goto done;
24450			}
24451			flags |=  TH_ACK_NEEDED;
24452			seg_len = 0;
24453			goto process_ack;
24454		}
24455
24456		/* Fix seg_seq, and chew the gap off the front. */
24457		seg_seq = tcp->tcp_rnxt;
24458	}
24459
24460	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
24461		/*
24462		 * Make sure that when we accept the connection, pick
24463		 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the
24464		 * old connection.
24465		 *
24466		 * The next ISS generated is equal to tcp_iss_incr_extra
24467		 * + ISS_INCR/2 + other components depending on the
24468		 * value of tcp_strong_iss.  We pre-calculate the new
24469		 * ISS here and compare with tcp_snxt to determine if
24470		 * we need to make adjustment to tcp_iss_incr_extra.
24471		 *
24472		 * The above calculation is ugly and is a
24473		 * waste of CPU cycles...
24474		 */
24475		uint32_t new_iss = tcp_iss_incr_extra;
24476		int32_t adj;
24477
24478		switch (tcp_strong_iss) {
24479		case 2: {
24480			/* Add time and MD5 components. */
24481			uint32_t answer[4];
24482			struct {
24483				uint32_t ports;
24484				in6_addr_t src;
24485				in6_addr_t dst;
24486			} arg;
24487			MD5_CTX context;
24488
24489			mutex_enter(&tcp_iss_key_lock);
24490			context = tcp_iss_key;
24491			mutex_exit(&tcp_iss_key_lock);
24492			arg.ports = tcp->tcp_ports;
24493			/* We use MAPPED addresses in tcp_iss_init */
24494			arg.src = tcp->tcp_ip_src_v6;
24495			if (tcp->tcp_ipversion == IPV4_VERSION) {
24496				IN6_IPADDR_TO_V4MAPPED(
24497					tcp->tcp_ipha->ipha_dst,
24498					    &arg.dst);
24499			} else {
24500				arg.dst =
24501				    tcp->tcp_ip6h->ip6_dst;
24502			}
24503			MD5Update(&context, (uchar_t *)&arg,
24504			    sizeof (arg));
24505			MD5Final((uchar_t *)answer, &context);
24506			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
24507			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
24508			break;
24509		}
24510		case 1:
24511			/* Add time component and min random (i.e. 1). */
24512			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
24513			break;
24514		default:
24515			/* Add only time component. */
24516			new_iss += (uint32_t)gethrestime_sec() * ISS_INCR;
24517			break;
24518		}
24519		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
24520			/*
24521			 * New ISS not guaranteed to be ISS_INCR/2
24522			 * ahead of the current tcp_snxt, so add the
24523			 * difference to tcp_iss_incr_extra.
24524			 */
24525			tcp_iss_incr_extra += adj;
24526		}
24527		/*
24528		 * If tcp_clean_death() can not perform the task now,
24529		 * drop the SYN packet and let the other side re-xmit.
24530		 * Otherwise pass the SYN packet back in, since the
24531		 * old tcp state has been cleaned up or freed.
24532		 */
24533		if (tcp_clean_death(tcp, 0, 27) == -1)
24534			goto done;
24535		/*
24536		 * We will come back to tcp_rput_data
24537		 * on the global queue. Packets destined
24538		 * for the global queue will be checked
24539		 * with global policy. But the policy for
24540		 * this packet has already been checked as
24541		 * this was destined for the detached
24542		 * connection. We need to bypass policy
24543		 * check this time by attaching a dummy
24544		 * ipsec_in with ipsec_in_dont_check set.
24545		 */
24546		if ((connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid)) !=
24547		    NULL) {
24548			TCP_STAT(tcp_time_wait_syn_success);
24549			tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp);
24550			return;
24551		}
24552		goto done;
24553	}
24554
24555	/*
24556	 * rgap is the amount of stuff received out of window.  A negative
24557	 * value is the amount out of window.
24558	 */
24559	if (rgap < 0) {
24560		BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs);
24561		UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap);
24562		/* Fix seg_len and make sure there is something left. */
24563		seg_len += rgap;
24564		if (seg_len <= 0) {
24565			if (flags & TH_RST) {
24566				goto done;
24567			}
24568			flags |=  TH_ACK_NEEDED;
24569			seg_len = 0;
24570			goto process_ack;
24571		}
24572	}
24573	/*
24574	 * Check whether we can update tcp_ts_recent.  This test is
24575	 * NOT the one in RFC 1323 3.4.  It is from Braden, 1993, "TCP
24576	 * Extensions for High Performance: An Update", Internet Draft.
24577	 */
24578	if (tcp->tcp_snd_ts_ok &&
24579	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
24580	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
24581		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
24582		tcp->tcp_last_rcv_lbolt = lbolt64;
24583	}
24584
24585	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
24586		/* Always ack out of order packets */
24587		flags |= TH_ACK_NEEDED;
24588		seg_len = 0;
24589	} else if (seg_len > 0) {
24590		BUMP_MIB(&tcp_mib, tcpInClosed);
24591		BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
24592		UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len);
24593	}
24594	if (flags & TH_RST) {
24595		(void) tcp_clean_death(tcp, 0, 28);
24596		goto done;
24597	}
24598	if (flags & TH_SYN) {
24599		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
24600		    TH_RST|TH_ACK);
24601		/*
24602		 * Do not delete the TCP structure if it is in
24603		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
24604		 */
24605		goto done;
24606	}
24607process_ack:
24608	if (flags & TH_ACK) {
24609		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
24610		if (bytes_acked <= 0) {
24611			if (bytes_acked == 0 && seg_len == 0 &&
24612			    new_swnd == tcp->tcp_swnd)
24613				BUMP_MIB(&tcp_mib, tcpInDupAck);
24614		} else {
24615			/* Acks something not sent */
24616			flags |= TH_ACK_NEEDED;
24617		}
24618	}
24619	if (flags & TH_ACK_NEEDED) {
24620		/*
24621		 * Time to send an ack for some reason.
24622		 */
24623		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
24624		    tcp->tcp_rnxt, TH_ACK);
24625	}
24626done:
24627	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
24628		DB_CKSUMSTART(mp) = 0;
24629		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
24630		TCP_STAT(tcp_time_wait_syn_fail);
24631	}
24632	freemsg(mp);
24633}
24634
24635/*
24636 * Allocate a T_SVR4_OPTMGMT_REQ.
24637 * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so
24638 * that tcp_rput_other can drop the acks.
24639 */
24640static mblk_t *
24641tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen)
24642{
24643	mblk_t *mp;
24644	struct T_optmgmt_req *tor;
24645	struct opthdr *oh;
24646	uint_t size;
24647	char *optptr;
24648
24649	size = sizeof (*tor) + sizeof (*oh) + optlen;
24650	mp = allocb(size, BPRI_MED);
24651	if (mp == NULL)
24652		return (NULL);
24653
24654	mp->b_wptr += size;
24655	mp->b_datap->db_type = M_PROTO;
24656	tor = (struct T_optmgmt_req *)mp->b_rptr;
24657	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
24658	tor->MGMT_flags = T_NEGOTIATE;
24659	tor->OPT_length = sizeof (*oh) + optlen;
24660	tor->OPT_offset = (t_scalar_t)sizeof (*tor);
24661
24662	oh = (struct opthdr *)&tor[1];
24663	oh->level = level;
24664	oh->name = cmd;
24665	oh->len = optlen;
24666	if (optlen != 0) {
24667		optptr = (char *)&oh[1];
24668		bcopy(opt, optptr, optlen);
24669	}
24670	return (mp);
24671}
24672
24673/*
24674 * TCP Timers Implementation.
24675 */
24676timeout_id_t
24677tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim)
24678{
24679	mblk_t *mp;
24680	tcp_timer_t *tcpt;
24681	tcp_t *tcp = connp->conn_tcp;
24682
24683	ASSERT(connp->conn_sqp != NULL);
24684
24685	TCP_DBGSTAT(tcp_timeout_calls);
24686
24687	if (tcp->tcp_timercache == NULL) {
24688		mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
24689	} else {
24690		TCP_DBGSTAT(tcp_timeout_cached_alloc);
24691		mp = tcp->tcp_timercache;
24692		tcp->tcp_timercache = mp->b_next;
24693		mp->b_next = NULL;
24694		ASSERT(mp->b_wptr == NULL);
24695	}
24696
24697	CONN_INC_REF(connp);
24698	tcpt = (tcp_timer_t *)mp->b_rptr;
24699	tcpt->connp = connp;
24700	tcpt->tcpt_proc = f;
24701	tcpt->tcpt_tid = timeout(tcp_timer_callback, mp, tim);
24702	return ((timeout_id_t)mp);
24703}
24704
24705static void
24706tcp_timer_callback(void *arg)
24707{
24708	mblk_t *mp = (mblk_t *)arg;
24709	tcp_timer_t *tcpt;
24710	conn_t	*connp;
24711
24712	tcpt = (tcp_timer_t *)mp->b_rptr;
24713	connp = tcpt->connp;
24714	squeue_fill(connp->conn_sqp, mp,
24715	    tcp_timer_handler, connp, SQTAG_TCP_TIMER);
24716}
24717
24718static void
24719tcp_timer_handler(void *arg, mblk_t *mp, void *arg2)
24720{
24721	tcp_timer_t *tcpt;
24722	conn_t *connp = (conn_t *)arg;
24723	tcp_t *tcp = connp->conn_tcp;
24724
24725	tcpt = (tcp_timer_t *)mp->b_rptr;
24726	ASSERT(connp == tcpt->connp);
24727	ASSERT((squeue_t *)arg2 == connp->conn_sqp);
24728
24729	/*
24730	 * If the TCP has reached the closed state, don't proceed any
24731	 * further. This TCP logically does not exist on the system.
24732	 * tcpt_proc could for example access queues, that have already
24733	 * been qprocoff'ed off. Also see comments at the start of tcp_input
24734	 */
24735	if (tcp->tcp_state != TCPS_CLOSED) {
24736		(*tcpt->tcpt_proc)(connp);
24737	} else {
24738		tcp->tcp_timer_tid = 0;
24739	}
24740	tcp_timer_free(connp->conn_tcp, mp);
24741}
24742
24743/*
24744 * There is potential race with untimeout and the handler firing at the same
24745 * time. The mblock may be freed by the handler while we are trying to use
24746 * it. But since both should execute on the same squeue, this race should not
24747 * occur.
24748 */
24749clock_t
24750tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
24751{
24752	mblk_t	*mp = (mblk_t *)id;
24753	tcp_timer_t *tcpt;
24754	clock_t delta;
24755
24756	TCP_DBGSTAT(tcp_timeout_cancel_reqs);
24757
24758	if (mp == NULL)
24759		return (-1);
24760
24761	tcpt = (tcp_timer_t *)mp->b_rptr;
24762	ASSERT(tcpt->connp == connp);
24763
24764	delta = untimeout(tcpt->tcpt_tid);
24765
24766	if (delta >= 0) {
24767		TCP_DBGSTAT(tcp_timeout_canceled);
24768		tcp_timer_free(connp->conn_tcp, mp);
24769		CONN_DEC_REF(connp);
24770	}
24771
24772	return (delta);
24773}
24774
24775/*
24776 * Allocate space for the timer event. The allocation looks like mblk, but it is
24777 * not a proper mblk. To avoid confusion we set b_wptr to NULL.
24778 *
24779 * Dealing with failures: If we can't allocate from the timer cache we try
24780 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
24781 * points to b_rptr.
24782 * If we can't allocate anything using allocb_tryhard(), we perform a last
24783 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
24784 * save the actual allocation size in b_datap.
24785 */
24786mblk_t *
24787tcp_timermp_alloc(int kmflags)
24788{
24789	mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
24790	    kmflags & ~KM_PANIC);
24791
24792	if (mp != NULL) {
24793		mp->b_next = mp->b_prev = NULL;
24794		mp->b_rptr = (uchar_t *)(&mp[1]);
24795		mp->b_wptr = NULL;
24796		mp->b_datap = NULL;
24797		mp->b_queue = NULL;
24798	} else if (kmflags & KM_PANIC) {
24799		/*
24800		 * Failed to allocate memory for the timer. Try allocating from
24801		 * dblock caches.
24802		 */
24803		TCP_STAT(tcp_timermp_allocfail);
24804		mp = allocb_tryhard(sizeof (tcp_timer_t));
24805		if (mp == NULL) {
24806			size_t size = 0;
24807			/*
24808			 * Memory is really low. Try tryhard allocation.
24809			 */
24810			TCP_STAT(tcp_timermp_allocdblfail);
24811			mp = kmem_alloc_tryhard(sizeof (mblk_t) +
24812			    sizeof (tcp_timer_t), &size, kmflags);
24813			mp->b_rptr = (uchar_t *)(&mp[1]);
24814			mp->b_next = mp->b_prev = NULL;
24815			mp->b_wptr = (uchar_t *)-1;
24816			mp->b_datap = (dblk_t *)size;
24817			mp->b_queue = NULL;
24818		}
24819		ASSERT(mp->b_wptr != NULL);
24820	}
24821	TCP_DBGSTAT(tcp_timermp_alloced);
24822
24823	return (mp);
24824}
24825
24826/*
24827 * Free per-tcp timer cache.
24828 * It can only contain entries from tcp_timercache.
24829 */
24830void
24831tcp_timermp_free(tcp_t *tcp)
24832{
24833	mblk_t *mp;
24834
24835	while ((mp = tcp->tcp_timercache) != NULL) {
24836		ASSERT(mp->b_wptr == NULL);
24837		tcp->tcp_timercache = tcp->tcp_timercache->b_next;
24838		kmem_cache_free(tcp_timercache, mp);
24839	}
24840}
24841
24842/*
24843 * Free timer event. Put it on the per-tcp timer cache if there is not too many
24844 * events there already (currently at most two events are cached).
24845 * If the event is not allocated from the timer cache, free it right away.
24846 */
24847static void
24848tcp_timer_free(tcp_t *tcp, mblk_t *mp)
24849{
24850	mblk_t *mp1 = tcp->tcp_timercache;
24851
24852	if (mp->b_wptr != NULL) {
24853		/*
24854		 * This allocation is not from a timer cache, free it right
24855		 * away.
24856		 */
24857		if (mp->b_wptr != (uchar_t *)-1)
24858			freeb(mp);
24859		else
24860			kmem_free(mp, (size_t)mp->b_datap);
24861	} else if (mp1 == NULL || mp1->b_next == NULL) {
24862		/* Cache this timer block for future allocations */
24863		mp->b_rptr = (uchar_t *)(&mp[1]);
24864		mp->b_next = mp1;
24865		tcp->tcp_timercache = mp;
24866	} else {
24867		kmem_cache_free(tcp_timercache, mp);
24868		TCP_DBGSTAT(tcp_timermp_freed);
24869	}
24870}
24871
24872/*
24873 * End of TCP Timers implementation.
24874 */
24875
24876/*
24877 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL
24878 * on the specified backing STREAMS q. Note, the caller may make the
24879 * decision to call based on the tcp_t.tcp_flow_stopped value which
24880 * when check outside the q's lock is only an advisory check ...
24881 */
24882
24883void
24884tcp_setqfull(tcp_t *tcp)
24885{
24886	queue_t *q = tcp->tcp_wq;
24887
24888	if (!(q->q_flag & QFULL)) {
24889		mutex_enter(QLOCK(q));
24890		if (!(q->q_flag & QFULL)) {
24891			/* still need to set QFULL */
24892			q->q_flag |= QFULL;
24893			tcp->tcp_flow_stopped = B_TRUE;
24894			mutex_exit(QLOCK(q));
24895			TCP_STAT(tcp_flwctl_on);
24896		} else {
24897			mutex_exit(QLOCK(q));
24898		}
24899	}
24900}
24901
24902void
24903tcp_clrqfull(tcp_t *tcp)
24904{
24905	queue_t *q = tcp->tcp_wq;
24906
24907	if (q->q_flag & QFULL) {
24908		mutex_enter(QLOCK(q));
24909		if (q->q_flag & QFULL) {
24910			q->q_flag &= ~QFULL;
24911			tcp->tcp_flow_stopped = B_FALSE;
24912			mutex_exit(QLOCK(q));
24913			if (q->q_flag & QWANTW)
24914				qbackenable(q, 0);
24915		} else {
24916			mutex_exit(QLOCK(q));
24917		}
24918	}
24919}
24920
24921/*
24922 * TCP Kstats implementation
24923 */
24924static void
24925tcp_kstat_init(void)
24926{
24927	tcp_named_kstat_t template = {
24928		{ "rtoAlgorithm",	KSTAT_DATA_INT32, 0 },
24929		{ "rtoMin",		KSTAT_DATA_INT32, 0 },
24930		{ "rtoMax",		KSTAT_DATA_INT32, 0 },
24931		{ "maxConn",		KSTAT_DATA_INT32, 0 },
24932		{ "activeOpens",	KSTAT_DATA_UINT32, 0 },
24933		{ "passiveOpens",	KSTAT_DATA_UINT32, 0 },
24934		{ "attemptFails",	KSTAT_DATA_UINT32, 0 },
24935		{ "estabResets",	KSTAT_DATA_UINT32, 0 },
24936		{ "currEstab",		KSTAT_DATA_UINT32, 0 },
24937		{ "inSegs",		KSTAT_DATA_UINT32, 0 },
24938		{ "outSegs",		KSTAT_DATA_UINT32, 0 },
24939		{ "retransSegs",	KSTAT_DATA_UINT32, 0 },
24940		{ "connTableSize",	KSTAT_DATA_INT32, 0 },
24941		{ "outRsts",		KSTAT_DATA_UINT32, 0 },
24942		{ "outDataSegs",	KSTAT_DATA_UINT32, 0 },
24943		{ "outDataBytes",	KSTAT_DATA_UINT32, 0 },
24944		{ "retransBytes",	KSTAT_DATA_UINT32, 0 },
24945		{ "outAck",		KSTAT_DATA_UINT32, 0 },
24946		{ "outAckDelayed",	KSTAT_DATA_UINT32, 0 },
24947		{ "outUrg",		KSTAT_DATA_UINT32, 0 },
24948		{ "outWinUpdate",	KSTAT_DATA_UINT32, 0 },
24949		{ "outWinProbe",	KSTAT_DATA_UINT32, 0 },
24950		{ "outControl",		KSTAT_DATA_UINT32, 0 },
24951		{ "outFastRetrans",	KSTAT_DATA_UINT32, 0 },
24952		{ "inAckSegs",		KSTAT_DATA_UINT32, 0 },
24953		{ "inAckBytes",		KSTAT_DATA_UINT32, 0 },
24954		{ "inDupAck",		KSTAT_DATA_UINT32, 0 },
24955		{ "inAckUnsent",	KSTAT_DATA_UINT32, 0 },
24956		{ "inDataInorderSegs",	KSTAT_DATA_UINT32, 0 },
24957		{ "inDataInorderBytes",	KSTAT_DATA_UINT32, 0 },
24958		{ "inDataUnorderSegs",	KSTAT_DATA_UINT32, 0 },
24959		{ "inDataUnorderBytes",	KSTAT_DATA_UINT32, 0 },
24960		{ "inDataDupSegs",	KSTAT_DATA_UINT32, 0 },
24961		{ "inDataDupBytes",	KSTAT_DATA_UINT32, 0 },
24962		{ "inDataPartDupSegs",	KSTAT_DATA_UINT32, 0 },
24963		{ "inDataPartDupBytes",	KSTAT_DATA_UINT32, 0 },
24964		{ "inDataPastWinSegs",	KSTAT_DATA_UINT32, 0 },
24965		{ "inDataPastWinBytes",	KSTAT_DATA_UINT32, 0 },
24966		{ "inWinProbe",		KSTAT_DATA_UINT32, 0 },
24967		{ "inWinUpdate",	KSTAT_DATA_UINT32, 0 },
24968		{ "inClosed",		KSTAT_DATA_UINT32, 0 },
24969		{ "rttUpdate",		KSTAT_DATA_UINT32, 0 },
24970		{ "rttNoUpdate",	KSTAT_DATA_UINT32, 0 },
24971		{ "timRetrans",		KSTAT_DATA_UINT32, 0 },
24972		{ "timRetransDrop",	KSTAT_DATA_UINT32, 0 },
24973		{ "timKeepalive",	KSTAT_DATA_UINT32, 0 },
24974		{ "timKeepaliveProbe",	KSTAT_DATA_UINT32, 0 },
24975		{ "timKeepaliveDrop",	KSTAT_DATA_UINT32, 0 },
24976		{ "listenDrop",		KSTAT_DATA_UINT32, 0 },
24977		{ "listenDropQ0",	KSTAT_DATA_UINT32, 0 },
24978		{ "halfOpenDrop",	KSTAT_DATA_UINT32, 0 },
24979		{ "outSackRetransSegs",	KSTAT_DATA_UINT32, 0 },
24980		{ "connTableSize6",	KSTAT_DATA_INT32, 0 }
24981	};
24982
24983	tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME,
24984	    "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0);
24985
24986	if (tcp_mibkp == NULL)
24987		return;
24988
24989	template.rtoAlgorithm.value.ui32 = 4;
24990	template.rtoMin.value.ui32 = tcp_rexmit_interval_min;
24991	template.rtoMax.value.ui32 = tcp_rexmit_interval_max;
24992	template.maxConn.value.i32 = -1;
24993
24994	bcopy(&template, tcp_mibkp->ks_data, sizeof (template));
24995
24996	tcp_mibkp->ks_update = tcp_kstat_update;
24997
24998	kstat_install(tcp_mibkp);
24999}
25000
25001static void
25002tcp_kstat_fini(void)
25003{
25004
25005	if (tcp_mibkp != NULL) {
25006		kstat_delete(tcp_mibkp);
25007		tcp_mibkp = NULL;
25008	}
25009}
25010
25011static int
25012tcp_kstat_update(kstat_t *kp, int rw)
25013{
25014	tcp_named_kstat_t	*tcpkp;
25015	tcp_t			*tcp;
25016	connf_t			*connfp;
25017	conn_t			*connp;
25018	int 			i;
25019
25020	if (!kp || !kp->ks_data)
25021		return (EIO);
25022
25023	if (rw == KSTAT_WRITE)
25024		return (EACCES);
25025
25026	tcpkp = (tcp_named_kstat_t *)kp->ks_data;
25027
25028	tcpkp->currEstab.value.ui32 = 0;
25029
25030	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
25031		connfp = &ipcl_globalhash_fanout[i];
25032		connp = NULL;
25033		while ((connp =
25034		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
25035			tcp = connp->conn_tcp;
25036			switch (tcp_snmp_state(tcp)) {
25037			case MIB2_TCP_established:
25038			case MIB2_TCP_closeWait:
25039				tcpkp->currEstab.value.ui32++;
25040				break;
25041			}
25042		}
25043	}
25044
25045	tcpkp->activeOpens.value.ui32 = tcp_mib.tcpActiveOpens;
25046	tcpkp->passiveOpens.value.ui32 = tcp_mib.tcpPassiveOpens;
25047	tcpkp->attemptFails.value.ui32 = tcp_mib.tcpAttemptFails;
25048	tcpkp->estabResets.value.ui32 = tcp_mib.tcpEstabResets;
25049	tcpkp->inSegs.value.ui32 = tcp_mib.tcpInSegs;
25050	tcpkp->outSegs.value.ui32 = tcp_mib.tcpOutSegs;
25051	tcpkp->retransSegs.value.ui32 =	tcp_mib.tcpRetransSegs;
25052	tcpkp->connTableSize.value.i32 = tcp_mib.tcpConnTableSize;
25053	tcpkp->outRsts.value.ui32 = tcp_mib.tcpOutRsts;
25054	tcpkp->outDataSegs.value.ui32 = tcp_mib.tcpOutDataSegs;
25055	tcpkp->outDataBytes.value.ui32 = tcp_mib.tcpOutDataBytes;
25056	tcpkp->retransBytes.value.ui32 = tcp_mib.tcpRetransBytes;
25057	tcpkp->outAck.value.ui32 = tcp_mib.tcpOutAck;
25058	tcpkp->outAckDelayed.value.ui32 = tcp_mib.tcpOutAckDelayed;
25059	tcpkp->outUrg.value.ui32 = tcp_mib.tcpOutUrg;
25060	tcpkp->outWinUpdate.value.ui32 = tcp_mib.tcpOutWinUpdate;
25061	tcpkp->outWinProbe.value.ui32 = tcp_mib.tcpOutWinProbe;
25062	tcpkp->outControl.value.ui32 = tcp_mib.tcpOutControl;
25063	tcpkp->outFastRetrans.value.ui32 = tcp_mib.tcpOutFastRetrans;
25064	tcpkp->inAckSegs.value.ui32 = tcp_mib.tcpInAckSegs;
25065	tcpkp->inAckBytes.value.ui32 = tcp_mib.tcpInAckBytes;
25066	tcpkp->inDupAck.value.ui32 = tcp_mib.tcpInDupAck;
25067	tcpkp->inAckUnsent.value.ui32 = tcp_mib.tcpInAckUnsent;
25068	tcpkp->inDataInorderSegs.value.ui32 = tcp_mib.tcpInDataInorderSegs;
25069	tcpkp->inDataInorderBytes.value.ui32 = tcp_mib.tcpInDataInorderBytes;
25070	tcpkp->inDataUnorderSegs.value.ui32 = tcp_mib.tcpInDataUnorderSegs;
25071	tcpkp->inDataUnorderBytes.value.ui32 = tcp_mib.tcpInDataUnorderBytes;
25072	tcpkp->inDataDupSegs.value.ui32 = tcp_mib.tcpInDataDupSegs;
25073	tcpkp->inDataDupBytes.value.ui32 = tcp_mib.tcpInDataDupBytes;
25074	tcpkp->inDataPartDupSegs.value.ui32 = tcp_mib.tcpInDataPartDupSegs;
25075	tcpkp->inDataPartDupBytes.value.ui32 = tcp_mib.tcpInDataPartDupBytes;
25076	tcpkp->inDataPastWinSegs.value.ui32 = tcp_mib.tcpInDataPastWinSegs;
25077	tcpkp->inDataPastWinBytes.value.ui32 = tcp_mib.tcpInDataPastWinBytes;
25078	tcpkp->inWinProbe.value.ui32 = tcp_mib.tcpInWinProbe;
25079	tcpkp->inWinUpdate.value.ui32 = tcp_mib.tcpInWinUpdate;
25080	tcpkp->inClosed.value.ui32 = tcp_mib.tcpInClosed;
25081	tcpkp->rttNoUpdate.value.ui32 = tcp_mib.tcpRttNoUpdate;
25082	tcpkp->rttUpdate.value.ui32 = tcp_mib.tcpRttUpdate;
25083	tcpkp->timRetrans.value.ui32 = tcp_mib.tcpTimRetrans;
25084	tcpkp->timRetransDrop.value.ui32 = tcp_mib.tcpTimRetransDrop;
25085	tcpkp->timKeepalive.value.ui32 = tcp_mib.tcpTimKeepalive;
25086	tcpkp->timKeepaliveProbe.value.ui32 = tcp_mib.tcpTimKeepaliveProbe;
25087	tcpkp->timKeepaliveDrop.value.ui32 = tcp_mib.tcpTimKeepaliveDrop;
25088	tcpkp->listenDrop.value.ui32 = tcp_mib.tcpListenDrop;
25089	tcpkp->listenDropQ0.value.ui32 = tcp_mib.tcpListenDropQ0;
25090	tcpkp->halfOpenDrop.value.ui32 = tcp_mib.tcpHalfOpenDrop;
25091	tcpkp->outSackRetransSegs.value.ui32 = tcp_mib.tcpOutSackRetransSegs;
25092	tcpkp->connTableSize6.value.i32 = tcp_mib.tcp6ConnTableSize;
25093
25094	return (0);
25095}
25096
25097void
25098tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp)
25099{
25100	uint16_t	hdr_len;
25101	ipha_t		*ipha;
25102	uint8_t		*nexthdrp;
25103	tcph_t		*tcph;
25104
25105	/* Already has an eager */
25106	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
25107		TCP_STAT(tcp_reinput_syn);
25108		squeue_enter(connp->conn_sqp, mp, connp->conn_recv,
25109		    connp, SQTAG_TCP_REINPUT_EAGER);
25110		return;
25111	}
25112
25113	switch (IPH_HDR_VERSION(mp->b_rptr)) {
25114	case IPV4_VERSION:
25115		ipha = (ipha_t *)mp->b_rptr;
25116		hdr_len = IPH_HDR_LENGTH(ipha);
25117		break;
25118	case IPV6_VERSION:
25119		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
25120		    &hdr_len, &nexthdrp)) {
25121			CONN_DEC_REF(connp);
25122			freemsg(mp);
25123			return;
25124		}
25125		break;
25126	}
25127
25128	tcph = (tcph_t *)&mp->b_rptr[hdr_len];
25129	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
25130		mp->b_datap->db_struioflag |= STRUIO_EAGER;
25131		DB_CKSUMSTART(mp) = (intptr_t)sqp;
25132	}
25133
25134	squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
25135	    SQTAG_TCP_REINPUT);
25136}
25137
25138static squeue_func_t
25139tcp_squeue_switch(int val)
25140{
25141	squeue_func_t rval = squeue_fill;
25142
25143	switch (val) {
25144	case 1:
25145		rval = squeue_enter_nodrain;
25146		break;
25147	case 2:
25148		rval = squeue_enter;
25149		break;
25150	default:
25151		break;
25152	}
25153	return (rval);
25154}
25155
25156static void
25157tcp_squeue_add(squeue_t *sqp)
25158{
25159	tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc(
25160		sizeof (tcp_squeue_priv_t), KM_SLEEP);
25161
25162	*squeue_getprivate(sqp, SQPRIVATE_TCP) = (intptr_t)tcp_time_wait;
25163	tcp_time_wait->tcp_time_wait_tid = timeout(tcp_time_wait_collector,
25164	    sqp, TCP_TIME_WAIT_DELAY);
25165	if (tcp_free_list_max_cnt == 0) {
25166		int tcp_ncpus = ((boot_max_ncpus == -1) ?
25167			max_ncpus : boot_max_ncpus);
25168
25169		/*
25170		 * Limit number of entries to 1% of availble memory / tcp_ncpus
25171		 */
25172		tcp_free_list_max_cnt = (freemem * PAGESIZE) /
25173			(tcp_ncpus * sizeof (tcp_t) * 100);
25174	}
25175	tcp_time_wait->tcp_free_list_cnt = 0;
25176}
25177