tcp_opt_data.c revision 12644:4f9a0cd40c5f
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/types.h>
26#include <sys/stream.h>
27#define	_SUN_TPI_VERSION 2
28#include <sys/tihdr.h>
29#include <sys/socket.h>
30#include <sys/xti_xtiopt.h>
31#include <sys/xti_inet.h>
32#include <sys/policy.h>
33
34#include <inet/common.h>
35#include <netinet/ip6.h>
36#include <inet/ip.h>
37
38#include <netinet/in.h>
39#include <netinet/tcp.h>
40#include <inet/optcom.h>
41#include <inet/proto_set.h>
42#include <inet/tcp_impl.h>
43
44/*
45 * Table of all known options handled on a TCP protocol stack.
46 *
47 * Note: This table contains options processed by both TCP and IP levels
48 *       and is the superset of options that can be performed on a TCP over IP
49 *       stack.
50 */
51opdes_t	tcp_opt_arr[] = {
52
53{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
54	sizeof (struct linger), 0 },
55
56{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
57{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
58{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
59{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
60	},
61{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
65{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
68	sizeof (struct timeval), 0 },
69{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
70	sizeof (struct timeval), 0 },
71{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
72	},
73{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
74{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
75	0 },
76{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
77	0 },
78{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
79	0 },
80{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
81	0 },
82{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
83
84{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
85
86{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
87
88{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
89	},
90{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
91	536 },
92
93{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
94	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
95
96{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
97	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
98
99{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
101
102{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
103	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
104
105{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
106	0 },
107
108{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
109	sizeof (int), 0 },
110
111{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
112	},
113
114{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
115	sizeof (int), 0 },
116
117{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
118	sizeof (int), 0	},
119
120{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
121	sizeof (int), 0	},
122
123{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
124
125{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
126
127{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
128
129{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
130
131{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
132
133{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
134	(OP_VARLEN|OP_NODEFAULT),
135	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
136{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
137	(OP_VARLEN|OP_NODEFAULT),
138	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
139
140{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
141{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
142{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
143	sizeof (int), -1 /* not initialized */ },
144
145{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
146	sizeof (ipsec_req_t), -1 /* not initialized */ },
147
148{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
149	sizeof (int),	0 /* no ifindex */ },
150
151{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
152	sizeof (int), 0 },
153
154{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
155	sizeof (int), -1 /* not initialized */ },
156
157{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
158	sizeof (int),	0 /* no ifindex */ },
159
160{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
161
162{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
163	sizeof (in_addr_t),	-1 /* not initialized  */ },
164
165{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
166	sizeof (int), 0 },
167
168{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
169	(OP_NODEFAULT|OP_VARLEN),
170	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
171{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
172	OP_NODEFAULT,
173	sizeof (sin6_t), -1 /* not initialized */ },
174{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
175	(OP_VARLEN|OP_NODEFAULT), 255*8,
176	-1 /* not initialized */ },
177{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
178	(OP_VARLEN|OP_NODEFAULT), 255*8,
179	-1 /* not initialized */ },
180{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
181	(OP_VARLEN|OP_NODEFAULT), 255*8,
182	-1 /* not initialized */ },
183{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
184	(OP_VARLEN|OP_NODEFAULT), 255*8,
185	-1 /* not initialized */ },
186{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
187	OP_NODEFAULT,
188	sizeof (int), -1 /* not initialized */ },
189{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
190	OP_NODEFAULT,
191	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
192{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
193	sizeof (int), 0 },
194{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
195	sizeof (int), 0 },
196{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
197	sizeof (int), 0 },
198
199/* Enable receipt of ancillary data */
200{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
201	sizeof (int), 0 },
202{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
203	sizeof (int), 0 },
204{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
205	sizeof (int), 0 },
206{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
207	sizeof (int), 0 },
208{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
209	sizeof (int), 0 },
210{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
211	sizeof (int), 0 },
212{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
213	sizeof (int), 0 },
214{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
215	sizeof (int), 0 },
216
217{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
218	sizeof (ipsec_req_t), -1 /* not initialized */ },
219{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
221};
222
223/*
224 * Table of all supported levels
225 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
226 * any supported options so we need this info separately.
227 *
228 * This is needed only for topmost tpi providers and is used only by
229 * XTI interfaces.
230 */
231optlevel_t	tcp_valid_levels_arr[] = {
232	XTI_GENERIC,
233	SOL_SOCKET,
234	IPPROTO_TCP,
235	IPPROTO_IP,
236	IPPROTO_IPV6
237};
238
239
240#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
241#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
242
243uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
244
245/*
246 * Initialize option database object for TCP
247 *
248 * This object represents database of options to search passed to
249 * {sock,tpi}optcom_req() interface routine to take care of option
250 * management and associated methods.
251 */
252
253optdb_obj_t tcp_opt_obj = {
254	tcp_opt_default,	/* TCP default value function pointer */
255	tcp_tpi_opt_get,	/* TCP get function pointer */
256	tcp_tpi_opt_set,	/* TCP set function pointer */
257	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
258	tcp_opt_arr,		/* TCP option database */
259	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
260	tcp_valid_levels_arr	/* TCP valid level array */
261};
262
/*
 * Upper bound on the TCP initial congestion window (start/restart) that
 * tcp_opt_set() accepts for TCP_INIT_CWND.  Kept in a variable rather
 * than used as a bare constant.
 */
#define	TCP_MAX_INIT_CWND	16

static int	tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
267
268/*
269 * Some TCP options can be "set" by requesting them in the option
270 * buffer. This is needed for XTI feature test though we do not
271 * allow it in general. We interpret that this mechanism is more
272 * applicable to OSI protocols and need not be allowed in general.
273 * This routine filters out options for which it is not allowed (most)
274 * and lets through those (few) for which it is. [ The XTI interface
275 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
276 * ever implemented will have to be allowed here ].
277 */
278static boolean_t
279tcp_allow_connopt_set(int level, int name)
280{
281
282	switch (level) {
283	case IPPROTO_TCP:
284		switch (name) {
285		case TCP_NODELAY:
286			return (B_TRUE);
287		default:
288			return (B_FALSE);
289		}
290		/*NOTREACHED*/
291	default:
292		return (B_FALSE);
293	}
294	/*NOTREACHED*/
295}
296
297/*
298 * This routine gets default values of certain options whose default
299 * values are maintained by protocol specific code
300 */
301/* ARGSUSED */
302int
303tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
304{
305	int32_t	*i1 = (int32_t *)ptr;
306	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
307
308	switch (level) {
309	case IPPROTO_TCP:
310		switch (name) {
311		case TCP_NOTIFY_THRESHOLD:
312			*i1 = tcps->tcps_ip_notify_interval;
313			break;
314		case TCP_ABORT_THRESHOLD:
315			*i1 = tcps->tcps_ip_abort_interval;
316			break;
317		case TCP_CONN_NOTIFY_THRESHOLD:
318			*i1 = tcps->tcps_ip_notify_cinterval;
319			break;
320		case TCP_CONN_ABORT_THRESHOLD:
321			*i1 = tcps->tcps_ip_abort_cinterval;
322			break;
323		default:
324			return (-1);
325		}
326		break;
327	case IPPROTO_IP:
328		switch (name) {
329		case IP_TTL:
330			*i1 = tcps->tcps_ipv4_ttl;
331			break;
332		default:
333			return (-1);
334		}
335		break;
336	case IPPROTO_IPV6:
337		switch (name) {
338		case IPV6_UNICAST_HOPS:
339			*i1 = tcps->tcps_ipv6_hoplimit;
340			break;
341		default:
342			return (-1);
343		}
344		break;
345	default:
346		return (-1);
347	}
348	return (sizeof (int));
349}
350
351/*
352 * TCP routine to get the values of options.
353 */
354int
355tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
356{
357	int		*i1 = (int *)ptr;
358	tcp_t		*tcp = connp->conn_tcp;
359	conn_opt_arg_t	coas;
360	int		retval;
361
362	coas.coa_connp = connp;
363	coas.coa_ixa = connp->conn_ixa;
364	coas.coa_ipp = &connp->conn_xmit_ipp;
365	coas.coa_ancillary = B_FALSE;
366	coas.coa_changed = 0;
367
368	switch (level) {
369	case SOL_SOCKET:
370		switch (name) {
371		case SO_SND_COPYAVOID:
372			*i1 = tcp->tcp_snd_zcopy_on ?
373			    SO_SND_COPYAVOID : 0;
374			return (sizeof (int));
375		case SO_ACCEPTCONN:
376			*i1 = (tcp->tcp_state == TCPS_LISTEN);
377			return (sizeof (int));
378		}
379		break;
380	case IPPROTO_TCP:
381		switch (name) {
382		case TCP_NODELAY:
383			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
384			return (sizeof (int));
385		case TCP_MAXSEG:
386			*i1 = tcp->tcp_mss;
387			return (sizeof (int));
388		case TCP_NOTIFY_THRESHOLD:
389			*i1 = (int)tcp->tcp_first_timer_threshold;
390			return (sizeof (int));
391		case TCP_ABORT_THRESHOLD:
392			*i1 = tcp->tcp_second_timer_threshold;
393			return (sizeof (int));
394		case TCP_CONN_NOTIFY_THRESHOLD:
395			*i1 = tcp->tcp_first_ctimer_threshold;
396			return (sizeof (int));
397		case TCP_CONN_ABORT_THRESHOLD:
398			*i1 = tcp->tcp_second_ctimer_threshold;
399			return (sizeof (int));
400		case TCP_INIT_CWND:
401			*i1 = tcp->tcp_init_cwnd;
402			return (sizeof (int));
403		case TCP_KEEPALIVE_THRESHOLD:
404			*i1 = tcp->tcp_ka_interval;
405			return (sizeof (int));
406		case TCP_KEEPALIVE_ABORT_THRESHOLD:
407			*i1 = tcp->tcp_ka_abort_thres;
408			return (sizeof (int));
409		case TCP_CORK:
410			*i1 = tcp->tcp_cork;
411			return (sizeof (int));
412		case TCP_RTO_INITIAL:
413			*i1 = tcp->tcp_rto_initial;
414			return (sizeof (uint32_t));
415		case TCP_RTO_MIN:
416			*i1 = tcp->tcp_rto_min;
417			return (sizeof (uint32_t));
418		case TCP_RTO_MAX:
419			*i1 = tcp->tcp_rto_max;
420			return (sizeof (uint32_t));
421		case TCP_LINGER2:
422			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
423			return (sizeof (int));
424		}
425		break;
426	case IPPROTO_IP:
427		if (connp->conn_family != AF_INET)
428			return (-1);
429		switch (name) {
430		case IP_OPTIONS:
431		case T_IP_OPTIONS:
432			/* Caller ensures enough space */
433			return (ip_opt_get_user(connp, ptr));
434		default:
435			break;
436		}
437		break;
438
439	case IPPROTO_IPV6:
440		/*
441		 * IPPROTO_IPV6 options are only supported for sockets
442		 * that are using IPv6 on the wire.
443		 */
444		if (connp->conn_ipversion != IPV6_VERSION) {
445			return (-1);
446		}
447		switch (name) {
448		case IPV6_PATHMTU:
449			if (tcp->tcp_state < TCPS_ESTABLISHED)
450				return (-1);
451			break;
452		}
453		break;
454	}
455	mutex_enter(&connp->conn_lock);
456	retval = conn_opt_get(&coas, level, name, ptr);
457	mutex_exit(&connp->conn_lock);
458	return (retval);
459}
460
461/*
462 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
463 * Parameters are assumed to be verified by the caller.
464 */
465/* ARGSUSED */
466int
467tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
468    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
469    void *thisdg_attrs, cred_t *cr)
470{
471	tcp_t	*tcp = connp->conn_tcp;
472	int	*i1 = (int *)invalp;
473	boolean_t onoff = (*i1 == 0) ? 0 : 1;
474	boolean_t checkonly;
475	int	reterr;
476	tcp_stack_t	*tcps = tcp->tcp_tcps;
477	conn_opt_arg_t	coas;
478	uint32_t	val = *((uint32_t *)invalp);
479
480	coas.coa_connp = connp;
481	coas.coa_ixa = connp->conn_ixa;
482	coas.coa_ipp = &connp->conn_xmit_ipp;
483	coas.coa_ancillary = B_FALSE;
484	coas.coa_changed = 0;
485
486	switch (optset_context) {
487	case SETFN_OPTCOM_CHECKONLY:
488		checkonly = B_TRUE;
489		/*
490		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
491		 * inlen != 0 implies value supplied and
492		 * 	we have to "pretend" to set it.
493		 * inlen == 0 implies that there is no
494		 * 	value part in T_CHECK request and just validation
495		 * done elsewhere should be enough, we just return here.
496		 */
497		if (inlen == 0) {
498			*outlenp = 0;
499			return (0);
500		}
501		break;
502	case SETFN_OPTCOM_NEGOTIATE:
503		checkonly = B_FALSE;
504		break;
505	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
506	case SETFN_CONN_NEGOTIATE:
507		checkonly = B_FALSE;
508		/*
509		 * Negotiating local and "association-related" options
510		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
511		 * primitives is allowed by XTI, but we choose
512		 * to not implement this style negotiation for Internet
513		 * protocols (We interpret it is a must for OSI world but
514		 * optional for Internet protocols) for all options.
515		 * [ Will do only for the few options that enable test
516		 * suites that our XTI implementation of this feature
517		 * works for transports that do allow it ]
518		 */
519		if (!tcp_allow_connopt_set(level, name)) {
520			*outlenp = 0;
521			return (EINVAL);
522		}
523		break;
524	default:
525		/*
526		 * We should never get here
527		 */
528		*outlenp = 0;
529		return (EINVAL);
530	}
531
532	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
533	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
534
535	/*
536	 * For TCP, we should have no ancillary data sent down
537	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
538	 * has to be zero.
539	 */
540	ASSERT(thisdg_attrs == NULL);
541
542	/*
543	 * For fixed length options, no sanity check
544	 * of passed in length is done. It is assumed *_optcom_req()
545	 * routines do the right thing.
546	 */
547	switch (level) {
548	case SOL_SOCKET:
549		switch (name) {
550		case SO_KEEPALIVE:
551			if (checkonly) {
552				/* check only case */
553				break;
554			}
555
556			if (!onoff) {
557				if (connp->conn_keepalive) {
558					if (tcp->tcp_ka_tid != 0) {
559						(void) TCP_TIMER_CANCEL(tcp,
560						    tcp->tcp_ka_tid);
561						tcp->tcp_ka_tid = 0;
562					}
563					connp->conn_keepalive = 0;
564				}
565				break;
566			}
567			if (!connp->conn_keepalive) {
568				/* Crank up the keepalive timer */
569				tcp->tcp_ka_last_intrvl = 0;
570				tcp->tcp_ka_tid = TCP_TIMER(tcp,
571				    tcp_keepalive_timer, tcp->tcp_ka_interval);
572				connp->conn_keepalive = 1;
573			}
574			break;
575		case SO_SNDBUF: {
576			if (*i1 > tcps->tcps_max_buf) {
577				*outlenp = 0;
578				return (ENOBUFS);
579			}
580			if (checkonly)
581				break;
582
583			connp->conn_sndbuf = *i1;
584			if (tcps->tcps_snd_lowat_fraction != 0) {
585				connp->conn_sndlowat = connp->conn_sndbuf /
586				    tcps->tcps_snd_lowat_fraction;
587			}
588			(void) tcp_maxpsz_set(tcp, B_TRUE);
589			/*
590			 * If we are flow-controlled, recheck the condition.
591			 * There are apps that increase SO_SNDBUF size when
592			 * flow-controlled (EWOULDBLOCK), and expect the flow
593			 * control condition to be lifted right away.
594			 */
595			mutex_enter(&tcp->tcp_non_sq_lock);
596			if (tcp->tcp_flow_stopped &&
597			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
598				tcp_clrqfull(tcp);
599			}
600			mutex_exit(&tcp->tcp_non_sq_lock);
601			*outlenp = inlen;
602			return (0);
603		}
604		case SO_RCVBUF:
605			if (*i1 > tcps->tcps_max_buf) {
606				*outlenp = 0;
607				return (ENOBUFS);
608			}
609			/* Silently ignore zero */
610			if (!checkonly && *i1 != 0) {
611				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
612				(void) tcp_rwnd_set(tcp, *i1);
613			}
614			/*
615			 * XXX should we return the rwnd here
616			 * and tcp_opt_get ?
617			 */
618			*outlenp = inlen;
619			return (0);
620		case SO_SND_COPYAVOID:
621			if (!checkonly) {
622				if (tcp->tcp_loopback ||
623				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
624					*outlenp = 0;
625					return (EOPNOTSUPP);
626				}
627				tcp->tcp_snd_zcopy_aware = 1;
628			}
629			*outlenp = inlen;
630			return (0);
631		}
632		break;
633	case IPPROTO_TCP:
634		switch (name) {
635		case TCP_NODELAY:
636			if (!checkonly)
637				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
638			break;
639		case TCP_NOTIFY_THRESHOLD:
640			if (!checkonly)
641				tcp->tcp_first_timer_threshold = *i1;
642			break;
643		case TCP_ABORT_THRESHOLD:
644			if (!checkonly)
645				tcp->tcp_second_timer_threshold = *i1;
646			break;
647		case TCP_CONN_NOTIFY_THRESHOLD:
648			if (!checkonly)
649				tcp->tcp_first_ctimer_threshold = *i1;
650			break;
651		case TCP_CONN_ABORT_THRESHOLD:
652			if (!checkonly)
653				tcp->tcp_second_ctimer_threshold = *i1;
654			break;
655		case TCP_RECVDSTADDR:
656			if (tcp->tcp_state > TCPS_LISTEN) {
657				*outlenp = 0;
658				return (EOPNOTSUPP);
659			}
660			/* Setting done in conn_opt_set */
661			break;
662		case TCP_INIT_CWND:
663			if (checkonly)
664				break;
665
666			/*
667			 * Only allow socket with network configuration
668			 * privilege to set the initial cwnd to be larger
669			 * than allowed by RFC 3390.
670			 */
671			if (val <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
672				tcp->tcp_init_cwnd = val;
673				break;
674			}
675			if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
676				*outlenp = 0;
677				return (reterr);
678			}
679			if (val > tcp_max_init_cwnd) {
680				*outlenp = 0;
681				return (EINVAL);
682			}
683			tcp->tcp_init_cwnd = val;
684			break;
685		case TCP_KEEPALIVE_THRESHOLD:
686			if (checkonly)
687				break;
688
689			if (*i1 < tcps->tcps_keepalive_interval_low ||
690			    *i1 > tcps->tcps_keepalive_interval_high) {
691				*outlenp = 0;
692				return (EINVAL);
693			}
694			if (*i1 != tcp->tcp_ka_interval) {
695				tcp->tcp_ka_interval = *i1;
696				/*
697				 * Check if we need to restart the
698				 * keepalive timer.
699				 */
700				if (tcp->tcp_ka_tid != 0) {
701					ASSERT(connp->conn_keepalive);
702					(void) TCP_TIMER_CANCEL(tcp,
703					    tcp->tcp_ka_tid);
704					tcp->tcp_ka_last_intrvl = 0;
705					tcp->tcp_ka_tid = TCP_TIMER(tcp,
706					    tcp_keepalive_timer,
707					    tcp->tcp_ka_interval);
708				}
709			}
710			break;
711		case TCP_KEEPALIVE_ABORT_THRESHOLD:
712			if (!checkonly) {
713				if (*i1 <
714				    tcps->tcps_keepalive_abort_interval_low ||
715				    *i1 >
716				    tcps->tcps_keepalive_abort_interval_high) {
717					*outlenp = 0;
718					return (EINVAL);
719				}
720				tcp->tcp_ka_abort_thres = *i1;
721			}
722			break;
723		case TCP_CORK:
724			if (!checkonly) {
725				/*
726				 * if tcp->tcp_cork was set and is now
727				 * being unset, we have to make sure that
728				 * the remaining data gets sent out. Also
729				 * unset tcp->tcp_cork so that tcp_wput_data()
730				 * can send data even if it is less than mss
731				 */
732				if (tcp->tcp_cork && onoff == 0 &&
733				    tcp->tcp_unsent > 0) {
734					tcp->tcp_cork = B_FALSE;
735					tcp_wput_data(tcp, NULL, B_FALSE);
736				}
737				tcp->tcp_cork = onoff;
738			}
739			break;
740		case TCP_RTO_INITIAL: {
741			clock_t rto;
742
743			if (checkonly || val == 0)
744				break;
745
746			/*
747			 * Sanity checks
748			 *
749			 * The initial RTO should be bounded by the minimum
750			 * and maximum RTO.  And it should also be smaller
751			 * than the connect attempt abort timeout.  Otherwise,
752			 * the connection won't be aborted in a period
753			 * reasonably close to that timeout.
754			 */
755			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
756			    val > tcp->tcp_second_ctimer_threshold ||
757			    val < tcps->tcps_rexmit_interval_initial_low ||
758			    val > tcps->tcps_rexmit_interval_initial_high) {
759				*outlenp = 0;
760				return (EINVAL);
761			}
762			tcp->tcp_rto_initial = val;
763
764			/*
765			 * If TCP has not sent anything, need to re-calculate
766			 * tcp_rto.  Otherwise, this option change does not
767			 * really affect anything.
768			 */
769			if (tcp->tcp_state >= TCPS_SYN_SENT)
770				break;
771
772			tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
773			tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
774			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
775			    tcps->tcps_rexmit_interval_extra +
776			    (tcp->tcp_rtt_sa >> 5) +
777			    tcps->tcps_conn_grace_period;
778			TCP_SET_RTO(tcp, rto);
779			break;
780		}
781		case TCP_RTO_MIN:
782			if (checkonly || val == 0)
783				break;
784
785			if (val < tcps->tcps_rexmit_interval_min_low ||
786			    val > tcps->tcps_rexmit_interval_min_high ||
787			    val > tcp->tcp_rto_max) {
788				*outlenp = 0;
789				return (EINVAL);
790			}
791			tcp->tcp_rto_min = val;
792			if (tcp->tcp_rto < val)
793				tcp->tcp_rto = val;
794			break;
795		case TCP_RTO_MAX:
796			if (checkonly || val == 0)
797				break;
798
799			/*
800			 * Sanity checks
801			 *
802			 * The maximum RTO should not be larger than the
803			 * connection abort timeout.  Otherwise, the
804			 * connection won't be aborted in a period reasonably
805			 * close to that timeout.
806			 */
807			if (val < tcps->tcps_rexmit_interval_max_low ||
808			    val > tcps->tcps_rexmit_interval_max_high ||
809			    val < tcp->tcp_rto_min ||
810			    val > tcp->tcp_second_timer_threshold) {
811				*outlenp = 0;
812				return (EINVAL);
813			}
814			tcp->tcp_rto_max = val;
815			if (tcp->tcp_rto > val)
816				tcp->tcp_rto = val;
817			break;
818		case TCP_LINGER2:
819			if (checkonly || *i1 == 0)
820				break;
821
822			/*
823			 * Note that the option value's unit is second.  And
824			 * the value should be bigger than the private
825			 * parameter tcp_fin_wait_2_flush_interval's lower
826			 * bound and smaller than the current value of that
827			 * parameter.  It should be smaller than the current
828			 * value to avoid an app setting TCP_LINGER2 to a big
829			 * value, causing resource to be held up too long in
830			 * FIN-WAIT-2 state.
831			 */
832			if (*i1 < 0 ||
833			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
834			    *i1 ||
835			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
836			    *i1) {
837				*outlenp = 0;
838				return (EINVAL);
839			}
840			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
841			break;
842		default:
843			break;
844		}
845		break;
846	case IPPROTO_IP:
847		if (connp->conn_family != AF_INET) {
848			*outlenp = 0;
849			return (EINVAL);
850		}
851		switch (name) {
852		case IP_SEC_OPT:
853			/*
854			 * We should not allow policy setting after
855			 * we start listening for connections.
856			 */
857			if (tcp->tcp_state == TCPS_LISTEN) {
858				return (EINVAL);
859			}
860			break;
861		}
862		break;
863	case IPPROTO_IPV6:
864		/*
865		 * IPPROTO_IPV6 options are only supported for sockets
866		 * that are using IPv6 on the wire.
867		 */
868		if (connp->conn_ipversion != IPV6_VERSION) {
869			*outlenp = 0;
870			return (EINVAL);
871		}
872
873		switch (name) {
874		case IPV6_RECVPKTINFO:
875			if (!checkonly) {
876				/* Force it to be sent up with the next msg */
877				tcp->tcp_recvifindex = 0;
878			}
879			break;
880		case IPV6_RECVTCLASS:
881			if (!checkonly) {
882				/* Force it to be sent up with the next msg */
883				tcp->tcp_recvtclass = 0xffffffffU;
884			}
885			break;
886		case IPV6_RECVHOPLIMIT:
887			if (!checkonly) {
888				/* Force it to be sent up with the next msg */
889				tcp->tcp_recvhops = 0xffffffffU;
890			}
891			break;
892		case IPV6_PKTINFO:
893			/* This is an extra check for TCP */
894			if (inlen == sizeof (struct in6_pktinfo)) {
895				struct in6_pktinfo *pkti;
896
897				pkti = (struct in6_pktinfo *)invalp;
898				/*
899				 * RFC 3542 states that ipi6_addr must be
900				 * the unspecified address when setting the
901				 * IPV6_PKTINFO sticky socket option on a
902				 * TCP socket.
903				 */
904				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
905					return (EINVAL);
906			}
907			break;
908		case IPV6_SEC_OPT:
909			/*
910			 * We should not allow policy setting after
911			 * we start listening for connections.
912			 */
913			if (tcp->tcp_state == TCPS_LISTEN) {
914				return (EINVAL);
915			}
916			break;
917		}
918		break;
919	}
920	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
921	    checkonly, cr);
922	if (reterr != 0) {
923		*outlenp = 0;
924		return (reterr);
925	}
926
927	/*
928	 * Common case of OK return with outval same as inval
929	 */
930	if (invalp != outvalp) {
931		/* don't trust bcopy for identical src/dst */
932		(void) bcopy(invalp, outvalp, inlen);
933	}
934	*outlenp = inlen;
935
936	if (coas.coa_changed & COA_HEADER_CHANGED) {
937		/* If we are connected we rebuilt the headers */
938		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
939		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
940			reterr = tcp_build_hdrs(tcp);
941			if (reterr != 0)
942				return (reterr);
943		}
944	}
945	if (coas.coa_changed & COA_ROUTE_CHANGED) {
946		in6_addr_t nexthop;
947
948		/*
949		 * If we are connected we re-cache the information.
950		 * We ignore errors to preserve BSD behavior.
951		 * Note that we don't redo IPsec policy lookup here
952		 * since the final destination (or source) didn't change.
953		 */
954		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
955		    &connp->conn_faddr_v6, &nexthop);
956
957		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
958		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
959			(void) ip_attr_connect(connp, connp->conn_ixa,
960			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
961			    &nexthop, connp->conn_fport, NULL, NULL,
962			    IPDF_VERIFY_DST);
963		}
964	}
965	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
966		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
967	}
968	if (coas.coa_changed & COA_WROFF_CHANGED) {
969		connp->conn_wroff = connp->conn_ht_iphc_allocated +
970		    tcps->tcps_wroff_xtra;
971		(void) proto_set_tx_wroff(connp->conn_rq, connp,
972		    connp->conn_wroff);
973	}
974	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
975		if (IPCL_IS_NONSTR(connp))
976			proto_set_rx_oob_opt(connp, onoff);
977	}
978	return (0);
979}
980