sctp_output.c revision 8348:4137e18bfaf0
187866Ssheldonh/*
287866Ssheldonh * CDDL HEADER START
387866Ssheldonh *
487866Ssheldonh * The contents of this file are subject to the terms of the
587866Ssheldonh * Common Development and Distribution License (the "License").
687866Ssheldonh * You may not use this file except in compliance with the License.
787866Ssheldonh *
887866Ssheldonh * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
987866Ssheldonh * or http://www.opensolaris.org/os/licensing.
1087866Ssheldonh * See the License for the specific language governing permissions
1187866Ssheldonh * and limitations under the License.
1287866Ssheldonh *
1387866Ssheldonh * When distributing Covered Code, include this CDDL HEADER in each
1487866Ssheldonh * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1587866Ssheldonh * If applicable, add the following below this CDDL HEADER, with the
1687866Ssheldonh * fields enclosed by brackets "[]" replaced with your own identifying
1787866Ssheldonh * information: Portions Copyright [yyyy] [name of copyright owner]
1887866Ssheldonh *
1987866Ssheldonh * CDDL HEADER END
2087866Ssheldonh */
2187866Ssheldonh
2287866Ssheldonh/*
2387866Ssheldonh * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
2487866Ssheldonh * Use is subject to license terms.
2587866Ssheldonh */
2687866Ssheldonh
2787866Ssheldonh#include <sys/types.h>
2887866Ssheldonh#include <sys/systm.h>
2987866Ssheldonh#include <sys/stream.h>
3087866Ssheldonh#include <sys/cmn_err.h>
3187866Ssheldonh#define	_SUN_TPI_VERSION 2
3287866Ssheldonh#include <sys/tihdr.h>
3387866Ssheldonh#include <sys/socket.h>
3487866Ssheldonh#include <sys/stropts.h>
3587866Ssheldonh#include <sys/strsun.h>
3687866Ssheldonh#include <sys/strsubr.h>
3787866Ssheldonh#include <sys/socketvar.h>
3887866Ssheldonh#include <inet/common.h>
3987866Ssheldonh#include <inet/mi.h>
4087866Ssheldonh#include <inet/ip.h>
4187866Ssheldonh#include <inet/ip6.h>
4287866Ssheldonh#include <inet/sctp_ip.h>
4387866Ssheldonh#include <inet/ipclassifier.h>
4487866Ssheldonh
4587866Ssheldonh/*
4687866Ssheldonh * PR-SCTP comments.
4787866Ssheldonh *
4887866Ssheldonh * A message can expire before it gets to the transmit list (i.e. it is still
4987866Ssheldonh * in the unsent list - unchunked), after it gets to the transmit list, but
5087866Ssheldonh * before transmission has actually started, or after transmission has begun.
5187866Ssheldonh * Accordingly, we check for the status of a message in sctp_chunkify() when
5287866Ssheldonh * the message is being transferred from the unsent list to the transmit list;
5387866Ssheldonh * in sctp_get_msg_to_send(), when we get the next chunk from the transmit
5487866Ssheldonh * list and in sctp_rexmit() when we get the next chunk to be (re)transmitted.
5587866Ssheldonh * When we nuke a message in sctp_chunkify(), all we need to do is take it
5687866Ssheldonh * out of the unsent list and update sctp_unsent; when a message is deemed
5787866Ssheldonh * timed-out in sctp_get_msg_to_send() we can just take it out of the transmit
5887866Ssheldonh * list, update sctp_unsent IFF transmission for the message has not yet begun
5987866Ssheldonh * (i.e. !SCTP_CHUNK_ISSENT(meta->b_cont)). However, if transmission for the
6087866Ssheldonh * message has started, then we cannot just take it out of the list, we need
6187866Ssheldonh * to send Forward TSN chunk to the peer so that the peer can clear its
6287866Ssheldonh * fragment list for this message. However, we cannot just send the Forward
6387866Ssheldonh * TSN in sctp_get_msg_to_send() because there might be unacked chunks for
6487866Ssheldonh * messages preceding this abandoned message. So, we send a Forward TSN
6587866Ssheldonh * IFF all messages prior to this abandoned message have been SACKd, if not
6687866Ssheldonh * we defer sending the Forward TSN to sctp_cumack(), which will check for
6787866Ssheldonh * this condition and send the Forward TSN via sctp_check_abandoned_msg(). In
6887866Ssheldonh * sctp_rexmit() when we check for retransmissions, we need to determine if
6987866Ssheldonh * the advanced peer ack point can be moved ahead, and if so, send a Forward
7087866Ssheldonh * TSN to the peer instead of retransmitting the chunk. Note that when
7187866Ssheldonh * we send a Forward TSN for a message, there may be yet unsent chunks for
7287866Ssheldonh * this message; we need to mark all such chunks as abandoned, so that
7387866Ssheldonh * sctp_cumack() can take the message out of the transmit list, additionally
7487866Ssheldonh * sctp_unsent need to be adjusted. Whenever sctp_unsent is updated (i.e.
7587866Ssheldonh * decremented when a message/chunk is deemed abandoned), sockfs needs to
7687866Ssheldonh * be notified so that it can adjust its idea of the queued message.
7787866Ssheldonh */
7887866Ssheldonh
7987866Ssheldonh#include "sctp_impl.h"
8087866Ssheldonh
8187866Ssheldonhstatic struct kmem_cache	*sctp_kmem_ftsn_set_cache;
8287866Ssheldonh
8387866Ssheldonh#ifdef	DEBUG
8487866Ssheldonhstatic boolean_t	sctp_verify_chain(mblk_t *, mblk_t *);
8587866Ssheldonh#endif
8687866Ssheldonh
8787866Ssheldonh/*
88 * Called to allocate a header mblk when sending data to SCTP.
89 * Data will follow in b_cont of this mblk.
90 */
91mblk_t *
92sctp_alloc_hdr(const char *name, int nlen, const char *control, int clen,
93    int flags)
94{
95	mblk_t *mp;
96	struct T_unitdata_req *tudr;
97	size_t size;
98	int error;
99
100	size = sizeof (*tudr) + _TPI_ALIGN_TOPT(nlen) + clen;
101	size = MAX(size, sizeof (sctp_msg_hdr_t));
102	if (flags & SCTP_CAN_BLOCK) {
103		mp = allocb_wait(size, BPRI_MED, 0, &error);
104	} else {
105		mp = allocb(size, BPRI_MED);
106	}
107	if (mp) {
108		tudr = (struct T_unitdata_req *)mp->b_rptr;
109		tudr->PRIM_type = T_UNITDATA_REQ;
110		tudr->DEST_length = nlen;
111		tudr->DEST_offset = sizeof (*tudr);
112		tudr->OPT_length = clen;
113		tudr->OPT_offset = (t_scalar_t)(sizeof (*tudr) +
114		    _TPI_ALIGN_TOPT(nlen));
115		if (nlen > 0)
116			bcopy(name, tudr + 1, nlen);
117		if (clen > 0)
118			bcopy(control, (char *)tudr + tudr->OPT_offset, clen);
119		mp->b_wptr += (tudr ->OPT_offset + clen);
120		mp->b_datap->db_type = M_PROTO;
121	}
122	return (mp);
123}
124
/*
 * Accept a message from the ULP for transmission.  mp is an M_PROTO
 * mblk containing a T_unitdata_req (optional destination address and
 * optional SCTP_SNDRCV ancillary data), with the user data chained
 * off b_cont.  The M_PROTO mblk is reused in place as the message
 * header (sctp_msg_hdr_t) and the message is appended to the unsent
 * list; if the association is established, transmission is kicked
 * off immediately.  Returns 0 on success or an errno value.
 */
125/*ARGSUSED2*/
126int
127sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags)
128{
129	sctp_faddr_t	*fp = NULL;
130	struct T_unitdata_req	*tudr;
131	int		error = 0;
132	mblk_t		*mproto = mp;
133	in6_addr_t	*addr;
134	in6_addr_t	tmpaddr;
	/* Per-association defaults; may be overridden by SCTP_SNDRCV below. */
135	uint16_t	sid = sctp->sctp_def_stream;
136	uint32_t	ppid = sctp->sctp_def_ppid;
137	uint32_t	context = sctp->sctp_def_context;
138	uint16_t	msg_flags = sctp->sctp_def_flags;
139	sctp_msg_hdr_t	*sctp_msg_hdr;
140	uint32_t	msg_len = 0;
141	uint32_t	timetolive = sctp->sctp_def_timetolive;
142
143	ASSERT(DB_TYPE(mproto) == M_PROTO);
144
145	mp = mp->b_cont;
146	ASSERT(mp == NULL || DB_TYPE(mp) == M_DATA);
147
148	tudr = (struct T_unitdata_req *)mproto->b_rptr;
149	ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
150
151	/* Get destination address, if specified */
152	if (tudr->DEST_length > 0) {
153		sin_t *sin;
154		sin6_t *sin6;
155
156		sin = (struct sockaddr_in *)
157		    (mproto->b_rptr + tudr->DEST_offset);
158		switch (sin->sin_family) {
159		case AF_INET:
160			if (tudr->DEST_length < sizeof (*sin)) {
161				return (EINVAL);
162			}
163			IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &tmpaddr);
164			addr = &tmpaddr;
165			break;
166		case AF_INET6:
167			if (tudr->DEST_length < sizeof (*sin6)) {
168				return (EINVAL);
169			}
170			sin6 = (struct sockaddr_in6 *)
171			    (mproto->b_rptr + tudr->DEST_offset);
172			addr = &sin6->sin6_addr;
173			break;
174		default:
175			return (EAFNOSUPPORT);
176		}
		/* The destination must be a known peer address. */
177		fp = sctp_lookup_faddr(sctp, addr);
178		if (fp == NULL) {
179			return (EINVAL);
180		}
181	}
182	/* Ancillary Data? */
183	if (tudr->OPT_length > 0) {
184		struct cmsghdr		*cmsg;
185		char			*cend;
186		struct sctp_sndrcvinfo	*sndrcv;
187
188		cmsg = (struct cmsghdr *)(mproto->b_rptr + tudr->OPT_offset);
189		cend = ((char *)cmsg + tudr->OPT_length);
190		ASSERT(cend <= (char *)mproto->b_wptr);
191
		/* Walk the cmsg list looking for SCTP_SNDRCV send params. */
192		for (;;) {
193			if ((char *)(cmsg + 1) > cend ||
194			    ((char *)cmsg + cmsg->cmsg_len) > cend) {
195				break;
196			}
197			if ((cmsg->cmsg_level == IPPROTO_SCTP) &&
198			    (cmsg->cmsg_type == SCTP_SNDRCV)) {
199				if (cmsg->cmsg_len <
200				    (sizeof (*sndrcv) + sizeof (*cmsg))) {
201					return (EINVAL);
202				}
203				sndrcv = (struct sctp_sndrcvinfo *)(cmsg + 1);
204				sid = sndrcv->sinfo_stream;
205				msg_flags = sndrcv->sinfo_flags;
206				ppid = sndrcv->sinfo_ppid;
207				context = sndrcv->sinfo_context;
208				timetolive = sndrcv->sinfo_timetolive;
209				break;
210			}
211			if (cmsg->cmsg_len > 0)
212				cmsg = CMSG_NEXT(cmsg);
213			else
214				break;
215		}
216	}
217	if (msg_flags & MSG_ABORT) {
		/*
		 * User-initiated abort.  Any user data becomes the cause
		 * in the ABORT chunk; pull it up into a single mblk first.
		 */
218		if (mp && mp->b_cont) {
219			mblk_t *pump = msgpullup(mp, -1);
220			if (!pump) {
221				return (ENOMEM);
222			}
223			freemsg(mp);
224			mp = pump;
225			mproto->b_cont = mp;
226		}
227		RUN_SCTP(sctp);
228		sctp_user_abort(sctp, mp);
229		freemsg(mproto);
230		goto process_sendq;
231	}
232	if (mp == NULL)
233		goto done;
234
235	RUN_SCTP(sctp);
236
237	/* Reject any new data requests if we are shutting down */
238	if (sctp->sctp_state > SCTPS_ESTABLISHED ||
239	    (sctp->sctp_connp->conn_state_flags & CONN_CLOSING)) {
240		error = EPIPE;
241		goto unlock_done;
242	}
243
244	/* Re-use the mproto to store relevant info. */
245	ASSERT(MBLKSIZE(mproto) >= sizeof (*sctp_msg_hdr));
246
247	mproto->b_rptr = mproto->b_datap->db_base;
248	mproto->b_wptr = mproto->b_rptr + sizeof (*sctp_msg_hdr);
249
250	sctp_msg_hdr = (sctp_msg_hdr_t *)mproto->b_rptr;
251	bzero(sctp_msg_hdr, sizeof (*sctp_msg_hdr));
252	sctp_msg_hdr->smh_context = context;
253	sctp_msg_hdr->smh_sid = sid;
254	sctp_msg_hdr->smh_ppid = ppid;
255	sctp_msg_hdr->smh_flags = msg_flags;
256	sctp_msg_hdr->smh_ttl = MSEC_TO_TICK(timetolive);
257	sctp_msg_hdr->smh_tob = lbolt64;
258	for (; mp != NULL; mp = mp->b_cont)
259		msg_len += MBLKL(mp);
260	sctp_msg_hdr->smh_msglen = msg_len;
261
262	/* User requested specific destination */
263	SCTP_SET_CHUNK_DEST(mproto, fp);
264
265	if (sctp->sctp_state >= SCTPS_COOKIE_ECHOED &&
266	    sid >= sctp->sctp_num_ostr) {
267		/* Send sendfail event */
268		sctp_sendfail_event(sctp, dupmsg(mproto), SCTP_ERR_BAD_SID,
269		    B_FALSE);
270		error = EINVAL;
271		goto unlock_done;
272	}
273
274	/* no data */
275	if (msg_len == 0) {
276		sctp_sendfail_event(sctp, dupmsg(mproto),
277		    SCTP_ERR_NO_USR_DATA, B_FALSE);
278		error = EINVAL;
279		goto unlock_done;
280	}
281
282	/* Add it to the unsent list */
283	if (sctp->sctp_xmit_unsent == NULL) {
284		sctp->sctp_xmit_unsent = sctp->sctp_xmit_unsent_tail = mproto;
285	} else {
286		sctp->sctp_xmit_unsent_tail->b_next = mproto;
287		sctp->sctp_xmit_unsent_tail = mproto;
288	}
289	sctp->sctp_unsent += msg_len;
290	BUMP_LOCAL(sctp->sctp_msgcount);
291	/*
292	 * Notify sockfs if the tx queue is full.
293	 */
294	if (SCTP_TXQ_LEN(sctp) >= sctp->sctp_xmit_hiwater) {
295		sctp->sctp_txq_full = 1;
296		sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE);
297	}
298	if (sctp->sctp_state == SCTPS_ESTABLISHED)
299		sctp_output(sctp, UINT_MAX);
300process_sendq:
301	WAKE_SCTP(sctp);
302	sctp_process_sendq(sctp);
303	return (0);
304unlock_done:
305	WAKE_SCTP(sctp);
306done:
307	return (error);
308}
309
/*
 * Move message(s) from the unsent list to the transmit list, breaking
 * them up into data chunks no larger than the path MTU of their
 * destination.  first_len is the space available in the first packet;
 * bytes_to_send bounds the total amount chunkified in this call.
 * PR-SCTP messages found to be expired here are simply dropped from
 * the unsent list (see the PR-SCTP comments at the top of this file).
 */
310void
311sctp_chunkify(sctp_t *sctp, int first_len, int bytes_to_send)
312{
313	mblk_t			*mp;
314	mblk_t			*chunk_mp;
315	mblk_t			*chunk_head;
316	mblk_t			*chunk_hdr;
317	mblk_t			*chunk_tail = NULL;
318	int			count;
319	int			chunksize;
320	sctp_data_hdr_t		*sdc;
321	mblk_t			*mdblk = sctp->sctp_xmit_unsent;
322	sctp_faddr_t		*fp;
323	sctp_faddr_t		*fp1;
324	size_t			xtralen;
325	sctp_msg_hdr_t		*msg_hdr;
326	sctp_stack_t	*sctps = sctp->sctp_sctps;
327
328	fp = SCTP_CHUNK_DEST(mdblk);
329	if (fp == NULL)
330		fp = sctp->sctp_current;
	/* Headroom for IP + SCTP headers plus one data chunk header. */
331	if (fp->isv4)
332		xtralen = sctp->sctp_hdr_len + sctps->sctps_wroff_xtra +
333		    sizeof (*sdc);
334	else
335		xtralen = sctp->sctp_hdr6_len + sctps->sctps_wroff_xtra +
336		    sizeof (*sdc);
337	count = chunksize = first_len - sizeof (*sdc);
338nextmsg:
339	chunk_mp = mdblk->b_cont;
340
341	/*
342	 * If this is partially chunked, we ignore the first_len for now
343	 * and use the one already present. For the unchunked bits, we
344	 * use the length of the last chunk.
345	 */
346	if (SCTP_IS_MSG_CHUNKED(mdblk)) {
347		int	chunk_len;
348
349		ASSERT(chunk_mp->b_next != NULL);
350		mdblk->b_cont = chunk_mp->b_next;
351		chunk_mp->b_next = NULL;
352		SCTP_MSG_CLEAR_CHUNKED(mdblk);
353		mp = mdblk->b_cont;
354		while (mp->b_next != NULL)
355			mp = mp->b_next;
356		chunk_len = ntohs(((sctp_data_hdr_t *)mp->b_rptr)->sdh_len);
357		if (fp->sfa_pmss - chunk_len > sizeof (*sdc))
358			count = chunksize = fp->sfa_pmss - chunk_len;
359		else
360			count = chunksize = fp->sfa_pmss;
361		count = chunksize = count - sizeof (*sdc);
362	} else {
363		msg_hdr = (sctp_msg_hdr_t *)mdblk->b_rptr;
		/* Drop an expired PR-SCTP message before chunkifying it. */
364		if (SCTP_MSG_TO_BE_ABANDONED(mdblk, msg_hdr, sctp)) {
365			sctp->sctp_xmit_unsent = mdblk->b_next;
366			if (sctp->sctp_xmit_unsent == NULL)
367				sctp->sctp_xmit_unsent_tail = NULL;
368			ASSERT(sctp->sctp_unsent >= msg_hdr->smh_msglen);
369			sctp->sctp_unsent -= msg_hdr->smh_msglen;
370			mdblk->b_next = NULL;
371			BUMP_LOCAL(sctp->sctp_prsctpdrop);
372			/*
373			 * Update ULP the amount of queued data, which is
374			 * sent-unack'ed + unsent.
375			 */
376			if (!SCTP_IS_DETACHED(sctp))
377				SCTP_TXQ_UPDATE(sctp);
378			sctp_sendfail_event(sctp, mdblk, 0, B_FALSE);
379			goto try_next;
380		}
381		mdblk->b_cont = NULL;
382	}
383	msg_hdr = (sctp_msg_hdr_t *)mdblk->b_rptr;
384nextchunk:
385	chunk_head = chunk_mp;
386	chunk_tail = NULL;
387
388	/* Skip as many mblk's as we need */
389	while (chunk_mp != NULL && ((count - MBLKL(chunk_mp)) >= 0)) {
390		count -= MBLKL(chunk_mp);
391		chunk_tail = chunk_mp;
392		chunk_mp = chunk_mp->b_cont;
393	}
394	/* Split the chain, if needed */
395	if (chunk_mp != NULL) {
396		if (count > 0) {
397			mblk_t	*split_mp = dupb(chunk_mp);
398
			/*
			 * On allocation failure put the message (back) on
			 * the unsent list, marked partially chunked, and
			 * bail; a later call will resume where we stopped.
			 */
399			if (split_mp == NULL) {
400				if (mdblk->b_cont == NULL) {
401					mdblk->b_cont = chunk_head;
402				} else  {
403					SCTP_MSG_SET_CHUNKED(mdblk);
404					ASSERT(chunk_head->b_next == NULL);
405					chunk_head->b_next = mdblk->b_cont;
406					mdblk->b_cont = chunk_head;
407				}
408				return;
409			}
410			if (chunk_tail != NULL) {
411				chunk_tail->b_cont = split_mp;
412				chunk_tail = chunk_tail->b_cont;
413			} else {
414				chunk_head = chunk_tail = split_mp;
415			}
416			chunk_tail->b_wptr = chunk_tail->b_rptr + count;
417			chunk_mp->b_rptr = chunk_tail->b_wptr;
418			count = 0;
419		} else if (chunk_tail == NULL) {
420			goto next;
421		} else {
422			chunk_tail->b_cont = NULL;
423		}
424	}
425	/* Alloc chunk hdr, if needed */
426	if (DB_REF(chunk_head) > 1 ||
427	    ((intptr_t)chunk_head->b_rptr) & (SCTP_ALIGN - 1) ||
428	    MBLKHEAD(chunk_head) < sizeof (*sdc)) {
429		if ((chunk_hdr = allocb(xtralen, BPRI_MED)) == NULL) {
430			if (mdblk->b_cont == NULL) {
431				if (chunk_mp != NULL)
432					linkb(chunk_head, chunk_mp);
433				mdblk->b_cont = chunk_head;
434			} else {
435				SCTP_MSG_SET_CHUNKED(mdblk);
436				if (chunk_mp != NULL)
437					linkb(chunk_head, chunk_mp);
438				ASSERT(chunk_head->b_next == NULL);
439				chunk_head->b_next = mdblk->b_cont;
440				mdblk->b_cont = chunk_head;
441			}
442			return;
443		}
444		chunk_hdr->b_rptr += xtralen - sizeof (*sdc);
445		chunk_hdr->b_wptr = chunk_hdr->b_rptr + sizeof (*sdc);
446		chunk_hdr->b_cont = chunk_head;
447	} else {
		/* Enough aligned headroom: prepend the header in place. */
448		chunk_hdr = chunk_head;
449		chunk_hdr->b_rptr -= sizeof (*sdc);
450	}
451	ASSERT(chunk_hdr->b_datap->db_ref == 1);
452	sdc = (sctp_data_hdr_t *)chunk_hdr->b_rptr;
453	sdc->sdh_id = CHUNK_DATA;
454	sdc->sdh_flags = 0;
455	sdc->sdh_len = htons(sizeof (*sdc) + chunksize - count);
456	ASSERT(sdc->sdh_len);
457	sdc->sdh_sid = htons(msg_hdr->smh_sid);
458	/*
459	 * We defer assigning the SSN just before sending the chunk, else
460	 * if we drop the chunk in sctp_get_msg_to_send(), we would need
461	 * to send a Forward TSN to let the peer know. Some more comments
462	 * about this in sctp_impl.h for SCTP_CHUNK_SENT.
463	 */
464	sdc->sdh_payload_id = msg_hdr->smh_ppid;
465
466	if (mdblk->b_cont == NULL) {
467		mdblk->b_cont = chunk_hdr;
468		SCTP_DATA_SET_BBIT(sdc);
469	} else {
470		mp = mdblk->b_cont;
471		while (mp->b_next != NULL)
472			mp = mp->b_next;
473		mp->b_next = chunk_hdr;
474	}
475
476	bytes_to_send -= (chunksize - count);
477	if (chunk_mp != NULL) {
478next:
479		count = chunksize = fp->sfa_pmss - sizeof (*sdc);
480		goto nextchunk;
481	}
	/* Last chunk of the message: set the E-bit and move it over. */
482	SCTP_DATA_SET_EBIT(sdc);
483	sctp->sctp_xmit_unsent = mdblk->b_next;
484	if (mdblk->b_next == NULL) {
485		sctp->sctp_xmit_unsent_tail = NULL;
486	}
487	mdblk->b_next = NULL;
488
489	if (sctp->sctp_xmit_tail == NULL) {
490		sctp->sctp_xmit_head = sctp->sctp_xmit_tail = mdblk;
491	} else {
492		mp = sctp->sctp_xmit_tail;
493		while (mp->b_next != NULL)
494			mp = mp->b_next;
495		mp->b_next = mdblk;
496		mdblk->b_prev = mp;
497	}
498try_next:
499	if (bytes_to_send > 0 && sctp->sctp_xmit_unsent != NULL) {
500		mdblk = sctp->sctp_xmit_unsent;
501		fp1 = SCTP_CHUNK_DEST(mdblk);
502		if (fp1 == NULL)
503			fp1 = sctp->sctp_current;
504		if (fp == fp1) {
505			size_t len = MBLKL(mdblk->b_cont);
			/*
			 * Try to bundle the next message into the leftover
			 * space of the current packet if it can fit there;
			 * otherwise start a fresh MTU-sized chunk.
			 */
506			if ((count > 0) &&
507			    ((len > fp->sfa_pmss - sizeof (*sdc)) ||
508			    (len <= count))) {
509				count -= sizeof (*sdc);
510				count = chunksize = count - (count & 0x3);
511			} else {
512				count = chunksize = fp->sfa_pmss -
513				    sizeof (*sdc);
514			}
515		} else {
516			if (fp1->isv4)
517				xtralen = sctp->sctp_hdr_len;
518			else
519				xtralen = sctp->sctp_hdr6_len;
520			xtralen += sctps->sctps_wroff_xtra + sizeof (*sdc);
521			count = chunksize = fp1->sfa_pmss - sizeof (*sdc);
522			fp = fp1;
523		}
524		goto nextmsg;
525	}
526}
527
528void
529sctp_free_msg(mblk_t *ump)
530{
531	mblk_t *mp, *nmp;
532
533	for (mp = ump->b_cont; mp; mp = nmp) {
534		nmp = mp->b_next;
535		mp->b_next = mp->b_prev = NULL;
536		freemsg(mp);
537	}
538	ASSERT(!ump->b_prev);
539	ump->b_next = NULL;
540	freeb(ump);
541}
542
/*
 * Prepend the cached IP + SCTP common header template for peer
 * address fp to packet mp, optionally followed by a SACK chunk of
 * sacklen bytes.  A new leading mblk is allocated if mp lacks the
 * required (aligned) headroom.  Returns the possibly-new packet
 * head, or NULL on failure with *error set when error is non-NULL.
 */
543mblk_t *
544sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
545    int *error)
546{
547	int hdrlen;
548	char *hdr;
549	int isv4 = fp->isv4;
550	sctp_stack_t	*sctps = sctp->sctp_sctps;
551
552	if (error != NULL)
553		*error = 0;
554
	/* Pick the v4 or v6 header template cached on the sctp_t. */
555	if (isv4) {
556		hdrlen = sctp->sctp_hdr_len;
557		hdr = sctp->sctp_iphc;
558	} else {
559		hdrlen = sctp->sctp_hdr6_len;
560		hdr = sctp->sctp_iphc6;
561	}
562	/*
563	 * A null fp->ire could mean that the address is 'down'. Similarly,
564	 * it is possible that the address went down, we tried to send an
565	 * heartbeat and ended up setting fp->saddr as unspec because we
566	 * didn't have any usable source address.  In either case
567	 * sctp_get_ire() will try find an IRE, if available, and set
568	 * the source address, if needed.  If we still don't have any
569	 * usable source address, fp->state will be SCTP_FADDRS_UNREACH and
570	 * we return EHOSTUNREACH.
571	 */
572	if (fp->ire == NULL || SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) {
573		sctp_get_ire(sctp, fp);
574		if (fp->state == SCTP_FADDRS_UNREACH) {
575			if (error != NULL)
576				*error = EHOSTUNREACH;
577			return (NULL);
578		}
579	}
580	/* Copy in IP header. */
581	if ((mp->b_rptr - mp->b_datap->db_base) <
582	    (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2 ||
583	    !IS_P2ALIGNED(DB_BASE(mp), sizeof (ire_t *))) {
584		mblk_t *nmp;
585
586		/*
587		 * This can happen if IP headers are adjusted after
588		 * data was moved into chunks, or during retransmission,
589		 * or things like snoop is running.
590		 */
591		nmp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + sacklen,
592		    CONN_CRED(sctp->sctp_connp));
593		if (nmp == NULL) {
594			if (error !=  NULL)
595				*error = ENOMEM;
596			return (NULL);
597		}
598		nmp->b_rptr += sctps->sctps_wroff_xtra;
599		nmp->b_wptr = nmp->b_rptr + hdrlen + sacklen;
600		nmp->b_cont = mp;
601		mp = nmp;
602	} else {
		/* Enough headroom: back the read pointer up in place. */
603		mp->b_rptr -= (hdrlen + sacklen);
604		mblk_setcred(mp, CONN_CRED(sctp->sctp_connp));
605	}
606	bcopy(hdr, mp->b_rptr, hdrlen);
607	if (sacklen) {
608		sctp_fill_sack(sctp, mp->b_rptr + hdrlen, sacklen);
609	}
	/*
	 * The template holds sctp_current's addresses; rewrite them
	 * when sending to a different peer address.
	 */
610	if (fp != sctp->sctp_current) {
611		/* change addresses in header */
612		if (isv4) {
613			ipha_t *iph = (ipha_t *)mp->b_rptr;
614
615			IN6_V4MAPPED_TO_IPADDR(&fp->faddr, iph->ipha_dst);
616			if (!IN6_IS_ADDR_V4MAPPED_ANY(&fp->saddr)) {
617				IN6_V4MAPPED_TO_IPADDR(&fp->saddr,
618				    iph->ipha_src);
619			} else if (sctp->sctp_bound_to_all) {
620				iph->ipha_src = INADDR_ANY;
621			}
622		} else {
623			((ip6_t *)(mp->b_rptr))->ip6_dst = fp->faddr;
624			if (!IN6_IS_ADDR_UNSPECIFIED(&fp->saddr)) {
625				((ip6_t *)(mp->b_rptr))->ip6_src = fp->saddr;
626			} else if (sctp->sctp_bound_to_all) {
627				V6_SET_ZERO(((ip6_t *)(mp->b_rptr))->ip6_src);
628			}
629		}
630	}
631	/*
632	 * IP will not free this IRE if it is condemned.  SCTP needs to
633	 * free it.
634	 */
635	if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) {
636		IRE_REFRELE_NOTR(fp->ire);
637		fp->ire = NULL;
638	}
639
640	/* Stash the conn and ire ptr info for IP */
641	SCTP_STASH_IPINFO(mp, fp->ire);
642
643	return (mp);
644}
645
646/*
647 * SCTP requires every chunk to be padded so that the total length
648 * is a multiple of SCTP_ALIGN.  This function returns a mblk with
649 * the specified pad length.
650 */
651static mblk_t *
652sctp_get_padding(sctp_t *sctp, int pad)
653{
654	mblk_t *fill;
655
656	ASSERT(pad < SCTP_ALIGN);
657	ASSERT(sctp->sctp_pad_mp != NULL);
658	if ((fill = dupb(sctp->sctp_pad_mp)) != NULL) {
659		fill->b_wptr += pad;
660		return (fill);
661	}
662
663	/*
664	 * The memory saving path of reusing the sctp_pad_mp
665	 * fails may be because it has been dupb() too
666	 * many times (DBLK_REFMAX).  Use the memory consuming
667	 * path of allocating the pad mblk.
668	 */
669	if ((fill = allocb(SCTP_ALIGN, BPRI_MED)) != NULL) {
670		/* Zero it out.  SCTP_ALIGN is sizeof (int32_t) */
671		*(int32_t *)fill->b_rptr = 0;
672		fill->b_wptr += pad;
673	}
674	return (fill);
675}
676
/*
 * Walk the transmit list and collect duplicates of the chunks that
 * are marked for fast retransmission, bundling as many as fit in the
 * destination's PMTU (but always at least the first one).  On return
 * *total has been incremented by the bytes collected (chunks plus
 * padding) and *fp points to the address to retransmit to.  Returns
 * the head of the duplicated chunk chain, or NULL if none found.
 */
677static mblk_t *
678sctp_find_fast_rexmit_mblks(sctp_t *sctp, int *total, sctp_faddr_t **fp)
679{
680	mblk_t		*meta;
681	mblk_t		*start_mp = NULL;
682	mblk_t		*end_mp = NULL;
683	mblk_t		*mp, *nmp;
684	mblk_t		*fill;
685	sctp_data_hdr_t	*sdh;
686	int		msglen;
687	int		extra;
688	sctp_msg_hdr_t	*msg_hdr;
689	sctp_faddr_t	*old_fp = NULL;
690	sctp_faddr_t	*chunk_fp;
691	sctp_stack_t	*sctps = sctp->sctp_sctps;
692
693	for (meta = sctp->sctp_xmit_head; meta != NULL; meta = meta->b_next) {
694		msg_hdr = (sctp_msg_hdr_t *)meta->b_rptr;
		/* Abandoned (PR-SCTP) messages are not retransmitted. */
695		if (SCTP_IS_MSG_ABANDONED(meta) ||
696		    SCTP_MSG_TO_BE_ABANDONED(meta, msg_hdr, sctp)) {
697			continue;
698		}
699		for (mp = meta->b_cont; mp != NULL; mp = mp->b_next) {
700			if (SCTP_CHUNK_WANT_REXMIT(mp)) {
701				/*
702				 * Use the same peer address to do fast
703				 * retransmission.  If the original peer
704				 * address is dead, switch to the current
705				 * one.  Record the old one so that we
706				 * will pick the chunks sent to the old
707				 * one for fast retransmission.
708				 */
709				chunk_fp = SCTP_CHUNK_DEST(mp);
710				if (*fp == NULL) {
711					*fp = chunk_fp;
712					if ((*fp)->state != SCTP_FADDRS_ALIVE) {
713						old_fp = *fp;
714						*fp = sctp->sctp_current;
715					}
716				} else if (old_fp == NULL && *fp != chunk_fp) {
717					continue;
718				} else if (old_fp != NULL &&
719				    old_fp != chunk_fp) {
720					continue;
721				}
722
723				sdh = (sctp_data_hdr_t *)mp->b_rptr;
724				msglen = ntohs(sdh->sdh_len);
				/* Pad each chunk to a SCTP_ALIGN boundary. */
725				if ((extra = msglen & (SCTP_ALIGN - 1)) != 0) {
726					extra = SCTP_ALIGN - extra;
727				}
728
729				/*
730				 * We still return at least the first message
731				 * even if that message cannot fit in as
732				 * PMTU may have changed.
733				 */
734				if (*total + msglen + extra >
735				    (*fp)->sfa_pmss && start_mp != NULL) {
736					return (start_mp);
737				}
				/*
				 * Allocation failures below just mean we
				 * return what has been bundled so far.
				 */
738				if ((nmp = dupmsg(mp)) == NULL)
739					return (start_mp);
740				if (extra > 0) {
741					fill = sctp_get_padding(sctp, extra);
742					if (fill != NULL) {
743						linkb(nmp, fill);
744					} else {
745						return (start_mp);
746					}
747				}
748				BUMP_MIB(&sctps->sctps_mib, sctpOutFastRetrans);
749				BUMP_LOCAL(sctp->sctp_rxtchunks);
750				SCTP_CHUNK_CLEAR_REXMIT(mp);
751				if (start_mp == NULL) {
752					start_mp = nmp;
753				} else {
754					linkb(end_mp, nmp);
755				}
756				end_mp = nmp;
757				*total += msglen + extra;
758				dprint(2, ("sctp_find_fast_rexmit_mblks: "
759				    "tsn %x\n", sdh->sdh_tsn));
760			}
761		}
762	}
763	/* Clear the flag as there is no more message to be fast rexmitted. */
764	sctp->sctp_chk_fast_rexmit = B_FALSE;
765	return (start_mp);
766}
767
/* A debug function just to make sure that a mblk chain is not broken */
#ifdef	DEBUG
static boolean_t
sctp_verify_chain(mblk_t *head, mblk_t *tail)
{
	mblk_t	*cur;

	/* An empty chain is trivially intact. */
	if (head == NULL || tail == NULL)
		return (B_TRUE);

	/* Walk the b_next links; tail must be reachable from head. */
	for (cur = head; cur != NULL; cur = cur->b_next) {
		if (cur == tail)
			return (B_TRUE);
	}
	return (B_FALSE);
}
#endif
785
786/*
787 * Gets the next unsent chunk to transmit. Messages that are abandoned are
788 * skipped. A message can be abandoned if it has a non-zero timetolive and
789 * transmission has not yet started or if it is a partially reliable
790 * message and its time is up (assuming we are PR-SCTP aware).
791 * 'cansend' is used to determine if need to try and chunkify messages from
792 * the unsent list, if any, and also as an input to sctp_chunkify() if so.
793 *
794 * firstseg indicates the space already used, cansend represents remaining
795 * space in the window, ((sfa_pmss - firstseg) can therefore reasonably
796 * be used to compute the cansend arg).
 *
 * Returns the meta mblk of the message to send, with *mp set to the
 * chunk within it, or NULL if there is nothing sendable (*error is
 * set non-zero on failure).
797 */
798mblk_t *
799sctp_get_msg_to_send(sctp_t *sctp, mblk_t **mp, mblk_t *meta, int  *error,
800    int32_t firstseg, uint32_t cansend, sctp_faddr_t *fp)
801{
802	mblk_t		*mp1;
803	sctp_msg_hdr_t	*msg_hdr;
804	mblk_t		*tmp_meta;
805	sctp_faddr_t	*fp1;
806
807	ASSERT(error != NULL && mp != NULL);
808	*error = 0;
809
810	ASSERT(sctp->sctp_current != NULL);
811
812chunkified:
813	while (meta != NULL) {
814		tmp_meta = meta->b_next;
815		msg_hdr = (sctp_msg_hdr_t *)meta->b_rptr;
816		mp1 = meta->b_cont;
817		if (SCTP_IS_MSG_ABANDONED(meta))
818			goto next_msg;
819		if (!SCTP_MSG_TO_BE_ABANDONED(meta, msg_hdr, sctp)) {
			/* Live message: return its first sendable chunk. */
820			while (mp1 != NULL) {
821				if (SCTP_CHUNK_CANSEND(mp1)) {
822					*mp = mp1;
823#ifdef	DEBUG
824					ASSERT(sctp_verify_chain(
825					    sctp->sctp_xmit_head, meta));
826#endif
827					return (meta);
828				}
829				mp1 = mp1->b_next;
830			}
831			goto next_msg;
832		}
833		/*
834		 * If we come here and the first chunk is sent, then we
835		 * we are PR-SCTP aware, in which case if the cumulative
836		 * TSN has moved upto or beyond the first chunk (which
837		 * means all the previous messages have been cumulative
838		 * SACK'd), then we send a Forward TSN with the last
839		 * chunk that was sent in this message. If we can't send
840		 * a Forward TSN because previous non-abandoned messages
841		 * have not been acked then we will defer the Forward TSN
842		 * to sctp_rexmit() or sctp_cumack().
843		 */
844		if (SCTP_CHUNK_ISSENT(mp1)) {
845			*error = sctp_check_abandoned_msg(sctp, meta);
846			if (*error != 0) {
847#ifdef	DEBUG
848				ASSERT(sctp_verify_chain(sctp->sctp_xmit_head,
849				    sctp->sctp_xmit_tail));
850#endif
851				return (NULL);
852			}
853			goto next_msg;
854		}
855		BUMP_LOCAL(sctp->sctp_prsctpdrop);
856		ASSERT(sctp->sctp_unsent >= msg_hdr->smh_msglen);
		/*
		 * Unlink the abandoned (never-sent) message from the
		 * doubly-linked transmit list, keeping head/tail current.
		 */
857		if (meta->b_prev == NULL) {
858			ASSERT(sctp->sctp_xmit_head == meta);
859			sctp->sctp_xmit_head = tmp_meta;
860			if (sctp->sctp_xmit_tail == meta)
861				sctp->sctp_xmit_tail = tmp_meta;
862			meta->b_next = NULL;
863			if (tmp_meta != NULL)
864				tmp_meta->b_prev = NULL;
865		} else if (meta->b_next == NULL) {
866			if (sctp->sctp_xmit_tail == meta)
867				sctp->sctp_xmit_tail = meta->b_prev;
868			meta->b_prev->b_next = NULL;
869			meta->b_prev = NULL;
870		} else {
871			meta->b_prev->b_next = tmp_meta;
872			tmp_meta->b_prev = meta->b_prev;
873			if (sctp->sctp_xmit_tail == meta)
874				sctp->sctp_xmit_tail = tmp_meta;
875			meta->b_prev = NULL;
876			meta->b_next = NULL;
877		}
878		sctp->sctp_unsent -= msg_hdr->smh_msglen;
879		/*
880		 * Update ULP the amount of queued data, which is
881		 * sent-unack'ed + unsent.
882		 */
883		if (!SCTP_IS_DETACHED(sctp))
884			SCTP_TXQ_UPDATE(sctp);
885		sctp_sendfail_event(sctp, meta, 0, B_TRUE);
886next_msg:
887		meta = tmp_meta;
888	}
889	/* chunkify, if needed */
890	if (cansend > 0 && sctp->sctp_xmit_unsent != NULL) {
891		ASSERT(sctp->sctp_unsent > 0);
892		if (fp == NULL) {
893			fp = SCTP_CHUNK_DEST(sctp->sctp_xmit_unsent);
894			if (fp == NULL || fp->state != SCTP_FADDRS_ALIVE)
895				fp = sctp->sctp_current;
896		} else {
897			/*
898			 * If user specified destination, try to honor that.
899			 */
900			fp1 = SCTP_CHUNK_DEST(sctp->sctp_xmit_unsent);
901			if (fp1 != NULL && fp1->state == SCTP_FADDRS_ALIVE &&
902			    fp1 != fp) {
903				goto chunk_done;
904			}
905		}
906		sctp_chunkify(sctp, fp->sfa_pmss - firstseg, cansend);
907		if ((meta = sctp->sctp_xmit_tail) == NULL)
908			goto chunk_done;
909		/*
910		 * sctp_chunkify() won't advance sctp_xmit_tail if it adds
911		 * new chunk(s) to the tail, so we need to skip the
912		 * sctp_xmit_tail, which would have already been processed.
913		 * This could happen when there is unacked chunks, but
914		 * nothing new to send.
915		 * When sctp_chunkify() is called when the transmit queue
916		 * is empty then we need to start from sctp_xmit_tail.
917		 */
918		if (SCTP_CHUNK_ISSENT(sctp->sctp_xmit_tail->b_cont)) {
919#ifdef	DEBUG
920			mp1 = sctp->sctp_xmit_tail->b_cont;
921			while (mp1 != NULL) {
922				ASSERT(!SCTP_CHUNK_CANSEND(mp1));
923				mp1 = mp1->b_next;
924			}
925#endif
926			if ((meta = sctp->sctp_xmit_tail->b_next) == NULL)
927				goto chunk_done;
928		}
929		goto chunkified;
930	}
931chunk_done:
932#ifdef	DEBUG
933	ASSERT(sctp_verify_chain(sctp->sctp_xmit_head, sctp->sctp_xmit_tail));
934#endif
935	return (NULL);
936}
937
938void
939sctp_fast_rexmit(sctp_t *sctp)
940{
941	mblk_t		*mp, *head;
942	int		pktlen = 0;
943	sctp_faddr_t	*fp = NULL;
944	sctp_stack_t	*sctps = sctp->sctp_sctps;
945
946	ASSERT(sctp->sctp_xmit_head != NULL);
947	mp = sctp_find_fast_rexmit_mblks(sctp, &pktlen, &fp);
948	if (mp == NULL) {
949		SCTP_KSTAT(sctps, sctp_fr_not_found);
950		return;
951	}
952	if ((head = sctp_add_proto_hdr(sctp, fp, mp, 0, NULL)) == NULL) {
953		freemsg(mp);
954		SCTP_KSTAT(sctps, sctp_fr_add_hdr);
955		return;
956	}
957	if ((pktlen > fp->sfa_pmss) && fp->isv4) {
958		ipha_t *iph = (ipha_t *)head->b_rptr;
959
960		iph->ipha_fragment_offset_and_flags = 0;
961	}
962
963	sctp_set_iplen(sctp, head);
964	sctp_add_sendq(sctp, head);
965	sctp->sctp_active = fp->lastactive = lbolt64;
966}
967
/*
 * Main data transmit routine.  Sends up to num_pkt packets worth of
 * eligible data chunks, bundling as many chunks per packet as the peer
 * rwnd, the destination's cwnd and the path MSS allow.  A pending SACK
 * is piggybacked on the first packet when it fits and can be sent to
 * sctp_lastdata.  On any condition that prevents sending (zero window,
 * cwnd exhausted, allocation failure) we jump to unsent_data, which
 * arms the persist/RTO timer so the attempt is retried.
 */
void
sctp_output(sctp_t *sctp, uint_t num_pkt)
{
	mblk_t			*mp = NULL;
	mblk_t			*nmp;
	mblk_t			*head;
	mblk_t			*meta = sctp->sctp_xmit_tail;
	mblk_t			*fill = NULL;
	uint16_t 		chunklen;
	uint32_t 		cansend;
	int32_t			seglen;
	int32_t			xtralen;
	int32_t			sacklen;
	int32_t			pad = 0;
	int32_t			pathmax;
	int			extra;
	int64_t			now = lbolt64;
	sctp_faddr_t		*fp;
	sctp_faddr_t		*lfp;
	sctp_data_hdr_t		*sdc;
	int			error;
	boolean_t		notsent = B_TRUE;
	sctp_stack_t		*sctps = sctp->sctp_sctps;

	if (sctp->sctp_ftsn == sctp->sctp_lastacked + 1) {
		sacklen = 0;
	} else {
		/* send a SACK chunk */
		sacklen = sizeof (sctp_chunk_hdr_t) +
		    sizeof (sctp_sack_chunk_t) +
		    (sizeof (sctp_sack_frag_t) * sctp->sctp_sack_gaps);
		/* SACKs go back to the address we last received data from. */
		lfp = sctp->sctp_lastdata;
		ASSERT(lfp != NULL);
		if (lfp->state != SCTP_FADDRS_ALIVE)
			lfp = sctp->sctp_current;
	}

	/* cansend = min(peer rwnd, unsent bytes). */
	cansend = sctp->sctp_frwnd;
	if (sctp->sctp_unsent < cansend)
		cansend = sctp->sctp_unsent;

	/*
	 * Start persist timer if unable to send or when
	 * trying to send into a zero window. This timer
	 * ensures the blocked send attempt is retried.
	 *
	 * NOTE(review): the mixed &&/|| below relies on && binding
	 * tighter than || (no parentheses around the first clause);
	 * this grouping appears intentional — confirm before changing.
	 */
	if ((cansend < sctp->sctp_current->sfa_pmss / 2) &&
	    (sctp->sctp_unacked != 0) &&
	    (sctp->sctp_unacked < sctp->sctp_current->sfa_pmss) &&
	    !sctp->sctp_ndelay ||
	    (cansend == 0 && sctp->sctp_unacked == 0 &&
	    sctp->sctp_unsent != 0)) {
		head = NULL;
		fp = sctp->sctp_current;
		goto unsent_data;
	}
	if (meta != NULL)
		mp = meta->b_cont;
	while (cansend > 0 && num_pkt-- != 0) {
		pad = 0;

		/*
		 * Find first segment eligible for transmit.
		 */
		while (mp != NULL) {
			if (SCTP_CHUNK_CANSEND(mp))
				break;
			mp = mp->b_next;
		}
		if (mp == NULL) {
			/* Nothing chunkified yet; pull the next message. */
			meta = sctp_get_msg_to_send(sctp, &mp,
			    meta == NULL ? NULL : meta->b_next, &error, sacklen,
			    cansend, NULL);
			if (error != 0 || meta == NULL) {
				head = NULL;
				fp = sctp->sctp_current;
				goto unsent_data;
			}
			sctp->sctp_xmit_tail =  meta;
		}

		sdc = (sctp_data_hdr_t *)mp->b_rptr;
		seglen = ntohs(sdc->sdh_len);
		xtralen = sizeof (*sdc);
		chunklen = seglen - xtralen;

		/*
		 * Check rwnd.
		 */
		if (chunklen > cansend) {
			head = NULL;
			fp = SCTP_CHUNK_DEST(meta);
			if (fp == NULL || fp->state != SCTP_FADDRS_ALIVE)
				fp = sctp->sctp_current;
			goto unsent_data;
		}
		/* Bytes of padding needed to keep the chunk 4-aligned. */
		if ((extra = seglen & (SCTP_ALIGN - 1)) != 0)
			extra = SCTP_ALIGN - extra;

		/*
		 * Pick destination address, and check cwnd.
		 */
		if (sacklen > 0 && (seglen + extra <= lfp->cwnd - lfp->suna) &&
		    (seglen + sacklen + extra <= lfp->sfa_pmss)) {
			/*
			 * Only include SACK chunk if it can be bundled
			 * with a data chunk, and sent to sctp_lastdata.
			 */
			pathmax = lfp->cwnd - lfp->suna;

			fp = lfp;
			if ((nmp = dupmsg(mp)) == NULL) {
				head = NULL;
				goto unsent_data;
			}
			SCTP_CHUNK_CLEAR_FLAGS(nmp);
			head = sctp_add_proto_hdr(sctp, fp, nmp, sacklen,
			    &error);
			if (head == NULL) {
				/*
				 * If none of the source addresses are
				 * available (i.e error == EHOSTUNREACH),
				 * pretend we have sent the data. We will
				 * eventually time out trying to retransmit
				 * the data if the interface never comes up.
				 * If we have already sent some stuff (i.e.,
				 * notsent is B_FALSE) then we are fine, else
				 * just mark this packet as sent.
				 */
				if (notsent && error == EHOSTUNREACH) {
					SCTP_CHUNK_SENT(sctp, mp, sdc,
					    fp, chunklen, meta);
				}
				freemsg(nmp);
				SCTP_KSTAT(sctps, sctp_output_failed);
				goto unsent_data;
			}
			/* SACK is now consumed; don't piggyback it again. */
			seglen += sacklen;
			xtralen += sacklen;
			sacklen = 0;
		} else {
			fp = SCTP_CHUNK_DEST(meta);
			if (fp == NULL || fp->state != SCTP_FADDRS_ALIVE)
				fp = sctp->sctp_current;
			/*
			 * If we haven't sent data to this destination for
			 * a while, do slow start again.
			 */
			if (now - fp->lastactive > fp->rto) {
				SET_CWND(fp, fp->sfa_pmss,
				    sctps->sctps_slow_start_after_idle);
			}

			pathmax = fp->cwnd - fp->suna;
			if (seglen + extra > pathmax) {
				head = NULL;
				goto unsent_data;
			}
			if ((nmp = dupmsg(mp)) == NULL) {
				head = NULL;
				goto unsent_data;
			}
			SCTP_CHUNK_CLEAR_FLAGS(nmp);
			head = sctp_add_proto_hdr(sctp, fp, nmp, 0, &error);
			if (head == NULL) {
				/*
				 * If none of the source addresses are
				 * available (i.e error == EHOSTUNREACH),
				 * pretend we have sent the data. We will
				 * eventually time out trying to retransmit
				 * the data if the interface never comes up.
				 * If we have already sent some stuff (i.e.,
				 * notsent is B_FALSE) then we are fine, else
				 * just mark this packet as sent.
				 */
				if (notsent && error == EHOSTUNREACH) {
					SCTP_CHUNK_SENT(sctp, mp, sdc,
					    fp, chunklen, meta);
				}
				freemsg(nmp);
				SCTP_KSTAT(sctps, sctp_output_failed);
				goto unsent_data;
			}
		}
		fp->lastactive = now;
		if (pathmax > fp->sfa_pmss)
			pathmax = fp->sfa_pmss;
		SCTP_CHUNK_SENT(sctp, mp, sdc, fp, chunklen, meta);
		mp = mp->b_next;

		/* Use this chunk to measure RTT? */
		if (sctp->sctp_out_time == 0) {
			sctp->sctp_out_time = now;
			/* sdh_tsn of the chunk just marked sent. */
			sctp->sctp_rtt_tsn = sctp->sctp_ltsn - 1;
			ASSERT(sctp->sctp_rtt_tsn == ntohl(sdc->sdh_tsn));
		}
		if (extra > 0) {
			fill = sctp_get_padding(sctp, extra);
			if (fill != NULL) {
				linkb(head, fill);
				pad = extra;
				seglen += extra;
			} else {
				goto unsent_data;
			}
		}
		/* See if we can bundle more. */
		while (seglen < pathmax) {
			int32_t		new_len;
			int32_t		new_xtralen;

			while (mp != NULL) {
				if (SCTP_CHUNK_CANSEND(mp))
					break;
				mp = mp->b_next;
			}
			if (mp == NULL) {
				meta = sctp_get_msg_to_send(sctp, &mp,
				    meta->b_next, &error, seglen,
				    (seglen - xtralen) >= cansend ? 0 :
				    cansend - seglen, fp);
				if (error != 0 || meta == NULL)
					break;
				sctp->sctp_xmit_tail =  meta;
			}
			ASSERT(mp != NULL);
			/*
			 * An unsent chunk bound to a different destination
			 * cannot be bundled into this packet.
			 */
			if (!SCTP_CHUNK_ISSENT(mp) && SCTP_CHUNK_DEST(meta) &&
			    fp != SCTP_CHUNK_DEST(meta)) {
				break;
			}
			sdc = (sctp_data_hdr_t *)mp->b_rptr;
			chunklen = ntohs(sdc->sdh_len);
			if ((extra = chunklen  & (SCTP_ALIGN - 1)) != 0)
				extra = SCTP_ALIGN - extra;

			new_len = seglen + chunklen;
			new_xtralen = xtralen + sizeof (*sdc);
			chunklen -= sizeof (*sdc);

			/* Stop bundling if rwnd or path limit is exceeded. */
			if (new_len - new_xtralen > cansend ||
			    new_len + extra > pathmax) {
				break;
			}
			if ((nmp = dupmsg(mp)) == NULL)
				break;
			if (extra > 0) {
				fill = sctp_get_padding(sctp, extra);
				if (fill != NULL) {
					pad += extra;
					new_len += extra;
					linkb(nmp, fill);
				} else {
					freemsg(nmp);
					break;
				}
			}
			seglen = new_len;
			xtralen = new_xtralen;
			SCTP_CHUNK_CLEAR_FLAGS(nmp);
			SCTP_CHUNK_SENT(sctp, mp, sdc, fp, chunklen, meta);
			linkb(head, nmp);
			mp = mp->b_next;
		}
		if ((seglen > fp->sfa_pmss) && fp->isv4) {
			ipha_t *iph = (ipha_t *)head->b_rptr;

			/*
			 * Path MTU is different from what we thought it would
			 * be when we created chunks, or IP headers have grown.
			 * Need to clear the DF bit.
			 */
			iph->ipha_fragment_offset_and_flags = 0;
		}
		/* xmit segment */
		ASSERT(cansend >= seglen - pad - xtralen);
		cansend -= (seglen - pad - xtralen);
		dprint(2, ("sctp_output: Sending packet %d bytes, tsn %x "
		    "ssn %d to %p (rwnd %d, cansend %d, lastack_rxd %x)\n",
		    seglen - xtralen, ntohl(sdc->sdh_tsn),
		    ntohs(sdc->sdh_ssn), (void *)fp, sctp->sctp_frwnd,
		    cansend, sctp->sctp_lastack_rxd));
		sctp_set_iplen(sctp, head);
		sctp_add_sendq(sctp, head);
		/* arm rto timer (if not set) */
		if (!fp->timer_running)
			SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
		notsent = B_FALSE;
	}
	sctp->sctp_active = now;
	return;
unsent_data:
	/* arm persist timer (if rto timer not set) */
	if (!fp->timer_running)
		SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
	if (head != NULL)
		freemsg(head);
}
1265
1266/*
1267 * The following two functions initialize and destroy the cache
1268 * associated with the sets used for PR-SCTP.
1269 */
/*
 * Create the kmem cache backing PR-SCTP Forward-TSN set allocations
 * (one sctp_ftsn_set_t per stream entry; see sctp_add_ftsn_set()).
 */
void
sctp_ftsn_sets_init(void)
{
	sctp_kmem_ftsn_set_cache = kmem_cache_create("sctp_ftsn_set_cache",
	    sizeof (sctp_ftsn_set_t), 0, NULL, NULL, NULL, NULL,
	    NULL, 0);
}
1277
/* Destroy the Forward-TSN set kmem cache created above. */
void
sctp_ftsn_sets_fini(void)
{
	kmem_cache_destroy(sctp_kmem_ftsn_set_cache);
}
1283
1284
1285/* Free PR-SCTP sets */
1286void
1287sctp_free_ftsn_set(sctp_ftsn_set_t *s)
1288{
1289	sctp_ftsn_set_t *p;
1290
1291	while (s != NULL) {
1292		p = s->next;
1293		s->next = NULL;
1294		kmem_cache_free(sctp_kmem_ftsn_set_cache, s);
1295		s = p;
1296	}
1297}
1298
1299/*
1300 * Given a message meta block, meta, this routine creates or modifies
1301 * the set that will be used to generate a Forward TSN chunk. If the
1302 * entry for stream id, sid, for this message already exists, the
1303 * sequence number, ssn, is updated if it is greater than the existing
1304 * one. If an entry for this sid does not exist, one is created if
 * the size does not exceed fp->sfa_pmss. We return B_FALSE in case
 * of an error.
1307 */
1308boolean_t
1309sctp_add_ftsn_set(sctp_ftsn_set_t **s, sctp_faddr_t *fp, mblk_t *meta,
1310    uint_t *nsets, uint32_t *slen)
1311{
1312	sctp_ftsn_set_t		*p;
1313	sctp_msg_hdr_t		*msg_hdr = (sctp_msg_hdr_t *)meta->b_rptr;
1314	uint16_t		sid = htons(msg_hdr->smh_sid);
1315	/* msg_hdr->smh_ssn is already in NBO */
1316	uint16_t		ssn = msg_hdr->smh_ssn;
1317
1318	ASSERT(s != NULL && nsets != NULL);
1319	ASSERT((*nsets == 0 && *s == NULL) || (*nsets > 0 && *s != NULL));
1320
1321	if (*s == NULL) {
1322		ASSERT((*slen + sizeof (uint32_t)) <= fp->sfa_pmss);
1323		*s = kmem_cache_alloc(sctp_kmem_ftsn_set_cache, KM_NOSLEEP);
1324		if (*s == NULL)
1325			return (B_FALSE);
1326		(*s)->ftsn_entries.ftsn_sid = sid;
1327		(*s)->ftsn_entries.ftsn_ssn = ssn;
1328		(*s)->next = NULL;
1329		*nsets = 1;
1330		*slen += sizeof (uint32_t);
1331		return (B_TRUE);
1332	}
1333	for (p = *s; p->next != NULL; p = p->next) {
1334		if (p->ftsn_entries.ftsn_sid == sid) {
1335			if (SSN_GT(ssn, p->ftsn_entries.ftsn_ssn))
1336				p->ftsn_entries.ftsn_ssn = ssn;
1337			return (B_TRUE);
1338		}
1339	}
1340	/* the last one */
1341	if (p->ftsn_entries.ftsn_sid == sid) {
1342		if (SSN_GT(ssn, p->ftsn_entries.ftsn_ssn))
1343			p->ftsn_entries.ftsn_ssn = ssn;
1344	} else {
1345		if ((*slen + sizeof (uint32_t)) > fp->sfa_pmss)
1346			return (B_FALSE);
1347		p->next = kmem_cache_alloc(sctp_kmem_ftsn_set_cache,
1348		    KM_NOSLEEP);
1349		if (p->next == NULL)
1350			return (B_FALSE);
1351		p = p->next;
1352		p->ftsn_entries.ftsn_sid = sid;
1353		p->ftsn_entries.ftsn_ssn = ssn;
1354		p->next = NULL;
1355		(*nsets)++;
1356		*slen += sizeof (uint32_t);
1357	}
1358	return (B_TRUE);
1359}
1360
1361/*
 * Given a set of stream id - sequence number pairs, this routine creates
1363 * a Forward TSN chunk. The cumulative TSN (advanced peer ack point)
1364 * for the chunk is obtained from sctp->sctp_adv_pap. The caller
1365 * will add the IP/SCTP header.
1366 */
1367mblk_t *
1368sctp_make_ftsn_chunk(sctp_t *sctp, sctp_faddr_t *fp, sctp_ftsn_set_t *sets,
1369    uint_t nsets, uint32_t seglen)
1370{
1371	mblk_t			*ftsn_mp;
1372	sctp_chunk_hdr_t	*ch_hdr;
1373	uint32_t		*advtsn;
1374	uint16_t		schlen;
1375	size_t			xtralen;
1376	ftsn_entry_t		*ftsn_entry;
1377	sctp_stack_t	*sctps = sctp->sctp_sctps;
1378
1379	seglen += sizeof (sctp_chunk_hdr_t);
1380	if (fp->isv4)
1381		xtralen = sctp->sctp_hdr_len + sctps->sctps_wroff_xtra;
1382	else
1383		xtralen = sctp->sctp_hdr6_len + sctps->sctps_wroff_xtra;
1384	ftsn_mp = allocb_cred(xtralen + seglen, CONN_CRED(sctp->sctp_connp));
1385	if (ftsn_mp == NULL)
1386		return (NULL);
1387	ftsn_mp->b_rptr += xtralen;
1388	ftsn_mp->b_wptr = ftsn_mp->b_rptr + seglen;
1389
1390	ch_hdr = (sctp_chunk_hdr_t *)ftsn_mp->b_rptr;
1391	ch_hdr->sch_id = CHUNK_FORWARD_TSN;
1392	ch_hdr->sch_flags = 0;
1393	/*
1394	 * The cast here should not be an issue since seglen is
1395	 * the length of the Forward TSN chunk.
1396	 */
1397	schlen = (uint16_t)seglen;
1398	U16_TO_ABE16(schlen, &(ch_hdr->sch_len));
1399
1400	advtsn = (uint32_t *)(ch_hdr + 1);
1401	U32_TO_ABE32(sctp->sctp_adv_pap, advtsn);
1402	ftsn_entry = (ftsn_entry_t *)(advtsn + 1);
1403	while (nsets > 0) {
1404		ASSERT((uchar_t *)&ftsn_entry[1] <= ftsn_mp->b_wptr);
1405		ftsn_entry->ftsn_sid = sets->ftsn_entries.ftsn_sid;
1406		ftsn_entry->ftsn_ssn = sets->ftsn_entries.ftsn_ssn;
1407		ftsn_entry++;
1408		sets = sets->next;
1409		nsets--;
1410	}
1411	return (ftsn_mp);
1412}
1413
1414/*
1415 * Given a starting message, the routine steps through all the
1416 * messages whose TSN is less than sctp->sctp_adv_pap and creates
 * ftsn sets. The ftsn sets are then used to create a Forward TSN
 * chunk. All the messages, that have chunks that are included in the
 * ftsn sets, are flagged abandoned. If a message is partially sent
1420 * and is deemed abandoned, all remaining unsent chunks are marked
1421 * abandoned and are deducted from sctp_unsent.
1422 */
void
sctp_make_ftsns(sctp_t *sctp, mblk_t *meta, mblk_t *mp, mblk_t **nmp,
    sctp_faddr_t *fp, uint32_t *seglen)
{
	mblk_t		*mp1 = mp;
	mblk_t		*mp_head = mp;
	mblk_t		*meta_head = meta;
	mblk_t		*head;
	sctp_ftsn_set_t	*sets = NULL;
	uint_t		nsets = 0;
	uint16_t	clen;
	sctp_data_hdr_t	*sdc;
	uint32_t	sacklen;
	uint32_t	adv_pap = sctp->sctp_adv_pap;
	uint32_t	unsent = 0;
	boolean_t	ubit;
	sctp_stack_t	*sctps = sctp->sctp_sctps;

	/* Chunk payload starts with the 4-byte cumulative TSN. */
	*seglen = sizeof (uint32_t);

	/*
	 * Pass 1: walk messages covered by the advanced peer ack point and
	 * accumulate the sid/ssn sets for the Forward TSN chunk.
	 */
	sdc  = (sctp_data_hdr_t *)mp1->b_rptr;
	while (meta != NULL &&
	    SEQ_GEQ(sctp->sctp_adv_pap, ntohl(sdc->sdh_tsn))) {
		/*
		 * Skip adding FTSN sets for un-ordered messages as they do
		 * not have SSNs.
		 */
		ubit = SCTP_DATA_GET_UBIT(sdc);
		if (!ubit &&
		    !sctp_add_ftsn_set(&sets, fp, meta, &nsets, seglen)) {
			/* Chunk full (or alloc failed): back out adv_pap. */
			meta = NULL;
			sctp->sctp_adv_pap = adv_pap;
			goto ftsn_done;
		}
		while (mp1 != NULL && SCTP_CHUNK_ISSENT(mp1)) {
			sdc = (sctp_data_hdr_t *)mp1->b_rptr;
			adv_pap = ntohl(sdc->sdh_tsn);
			mp1 = mp1->b_next;
		}
		meta = meta->b_next;
		if (meta != NULL) {
			mp1 = meta->b_cont;
			if (!SCTP_CHUNK_ISSENT(mp1))
				break;
			sdc  = (sctp_data_hdr_t *)mp1->b_rptr;
		}
	}
ftsn_done:
	/*
	 * Can't compare with sets == NULL, since we don't add any
	 * sets for un-ordered messages.
	 */
	if (meta == meta_head)
		return;
	*nmp = sctp_make_ftsn_chunk(sctp, fp, sets, nsets, *seglen);
	sctp_free_ftsn_set(sets);
	if (*nmp == NULL)
		return;
	/* Decide whether a SACK can be piggybacked on this packet. */
	if (sctp->sctp_ftsn == sctp->sctp_lastacked + 1) {
		sacklen = 0;
	} else {
		sacklen = sizeof (sctp_chunk_hdr_t) +
		    sizeof (sctp_sack_chunk_t) +
		    (sizeof (sctp_sack_frag_t) * sctp->sctp_sack_gaps);
		if (*seglen + sacklen > sctp->sctp_lastdata->sfa_pmss) {
			/* piggybacked SACK doesn't fit */
			sacklen = 0;
		} else {
			/* SACKs go to the address we last received from. */
			fp = sctp->sctp_lastdata;
		}
	}
	head = sctp_add_proto_hdr(sctp, fp, *nmp, sacklen, NULL);
	if (head == NULL) {
		freemsg(*nmp);
		*nmp = NULL;
		SCTP_KSTAT(sctps, sctp_send_ftsn_failed);
		return;
	}
	*seglen += sacklen;
	*nmp = head;

	/*
	 * Pass 2: mark everything covered by the (possibly updated)
	 * adv_pap as abandoned, and deduct never-sent chunks from
	 * sctp_unsent.
	 *
	 * XXX Need to optimise this, the reason it is done here is so
	 * that we don't have to undo in case of failure.
	 */
	mp1 = mp_head;
	sdc  = (sctp_data_hdr_t *)mp1->b_rptr;
	while (meta_head != NULL &&
	    SEQ_GEQ(sctp->sctp_adv_pap, ntohl(sdc->sdh_tsn))) {
		if (!SCTP_IS_MSG_ABANDONED(meta_head))
			SCTP_MSG_SET_ABANDONED(meta_head);
		while (mp1 != NULL && SCTP_CHUNK_ISSENT(mp1)) {
			sdc = (sctp_data_hdr_t *)mp1->b_rptr;
			if (!SCTP_CHUNK_ISACKED(mp1)) {
				clen = ntohs(sdc->sdh_len) - sizeof (*sdc);
				SCTP_CHUNK_SENT(sctp, mp1, sdc, fp, clen,
				    meta_head);
			}
			mp1 = mp1->b_next;
		}
		/* Remaining unsent chunks of this message are abandoned. */
		while (mp1 != NULL) {
			sdc = (sctp_data_hdr_t *)mp1->b_rptr;
			if (!SCTP_CHUNK_ABANDONED(mp1)) {
				ASSERT(!SCTP_CHUNK_ISSENT(mp1));
				unsent += ntohs(sdc->sdh_len) - sizeof (*sdc);
				SCTP_ABANDON_CHUNK(mp1);
			}
			mp1 = mp1->b_next;
		}
		meta_head = meta_head->b_next;
		if (meta_head != NULL) {
			mp1 = meta_head->b_cont;
			if (!SCTP_CHUNK_ISSENT(mp1))
				break;
			sdc  = (sctp_data_hdr_t *)mp1->b_rptr;
		}
	}
	if (unsent > 0) {
		ASSERT(sctp->sctp_unsent >= unsent);
		sctp->sctp_unsent -= unsent;
		/*
		 * Update ULP the amount of queued data, which is
		 * sent-unack'ed + unsent.
		 */
		if (!SCTP_IS_DETACHED(sctp))
			SCTP_TXQ_UPDATE(sctp);
	}
}
1551
1552/*
1553 * This function steps through messages starting at meta and checks if
1554 * the message is abandoned. It stops when it hits an unsent chunk or
1555 * a message that has all its chunk acked. This is the only place
1556 * where the sctp_adv_pap is moved forward to indicated abandoned
1557 * messages.
1558 */
void
sctp_check_adv_ack_pt(sctp_t *sctp, mblk_t *meta, mblk_t *mp)
{
	uint32_t	tsn = sctp->sctp_adv_pap;
	sctp_data_hdr_t	*sdc;
	sctp_msg_hdr_t	*msg_hdr;

	ASSERT(mp != NULL);
	sdc = (sctp_data_hdr_t *)mp->b_rptr;
	ASSERT(SEQ_GT(ntohl(sdc->sdh_tsn), sctp->sctp_lastack_rxd));
	msg_hdr = (sctp_msg_hdr_t *)meta->b_rptr;
	/* If the starting message isn't abandoned, nothing to advance. */
	if (!SCTP_IS_MSG_ABANDONED(meta) &&
	    !SCTP_MSG_TO_BE_ABANDONED(meta, msg_hdr, sctp)) {
		return;
	}
	while (meta != NULL) {
		/* Advance tsn over every already-sent chunk in order. */
		while (mp != NULL && SCTP_CHUNK_ISSENT(mp)) {
			sdc = (sctp_data_hdr_t *)mp->b_rptr;
			tsn = ntohl(sdc->sdh_tsn);
			mp = mp->b_next;
		}
		/* Stop at the first unsent chunk. */
		if (mp != NULL)
			break;
		/*
		 * We continue checking for successive messages only if there
		 * is a chunk marked for retransmission. Else, we might
		 * end up sending FTSN prematurely for chunks that have been
		 * sent, but not yet acked.
		 */
		if ((meta = meta->b_next) != NULL) {
			msg_hdr = (sctp_msg_hdr_t *)meta->b_rptr;
			if (!SCTP_IS_MSG_ABANDONED(meta) &&
			    !SCTP_MSG_TO_BE_ABANDONED(meta, msg_hdr, sctp)) {
				break;
			}
			for (mp = meta->b_cont; mp != NULL; mp = mp->b_next) {
				if (!SCTP_CHUNK_ISSENT(mp)) {
					sctp->sctp_adv_pap = tsn;
					return;
				}
				if (SCTP_CHUNK_WANT_REXMIT(mp))
					break;
			}
			if (mp == NULL)
				break;
		}
	}
	sctp->sctp_adv_pap = tsn;
}
1608
1609
/*
 * Determine if we should bundle a data chunk with the chunk being
 * retransmitted.  Abandoned chunks are never bundled.  We bundle if
 *
 * - the chunk is sent to the same destination and unack'ed.
 *
 * OR
 *
 * - the chunk is unsent, i.e. new data (its SENT flag is clear, or
 *   its REXMIT flag is set).
 */
#define	SCTP_CHUNK_RX_CANBUNDLE(mp, fp)					\
	(!SCTP_CHUNK_ABANDONED((mp)) && 				\
	((SCTP_CHUNK_ISSENT((mp)) && (SCTP_CHUNK_DEST(mp) == (fp) &&	\
	!SCTP_CHUNK_ISACKED(mp))) ||					\
	(((mp)->b_flag & (SCTP_CHUNK_FLAG_REXMIT|SCTP_CHUNK_FLAG_SENT)) != \
	SCTP_CHUNK_FLAG_SENT)))
1626
1627/*
1628 * Retransmit first segment which hasn't been acked with cumtsn or send
1629 * a Forward TSN chunk, if appropriate.
1630 */
void
sctp_rexmit(sctp_t *sctp, sctp_faddr_t *oldfp)
{
	mblk_t		*mp;
	mblk_t		*nmp = NULL;
	mblk_t		*head;
	mblk_t		*meta = sctp->sctp_xmit_head;
	mblk_t		*fill;
	uint32_t	seglen = 0;
	uint32_t	sacklen;
	uint16_t	chunklen;
	int		extra;
	sctp_data_hdr_t	*sdc;
	sctp_faddr_t	*fp;
	uint32_t	adv_pap = sctp->sctp_adv_pap;
	boolean_t	do_ftsn = B_FALSE;
	boolean_t	ftsn_check = B_TRUE;
	uint32_t	first_ua_tsn;
	sctp_msg_hdr_t	*mhdr;
	sctp_stack_t	*sctps = sctp->sctp_sctps;
	int		error;

	/* Find the first chunk eligible for retransmission to oldfp. */
	while (meta != NULL) {
		for (mp = meta->b_cont; mp != NULL; mp = mp->b_next) {
			uint32_t	tsn;

			if (!SCTP_CHUNK_ISSENT(mp))
				goto window_probe;
			/*
			 * We break in the following cases -
			 *
			 *	if the advanced peer ack point includes the next
			 *	chunk to be retransmitted - possibly the Forward
			 * 	TSN was lost.
			 *
			 *	if we are PRSCTP aware and the next chunk to be
			 *	retransmitted is now abandoned
			 *
			 *	if the next chunk to be retransmitted is for
			 *	the dest on which the timer went off. (this
			 *	message is not abandoned).
			 *
			 * We check for Forward TSN only for the first
			 * eligible chunk to be retransmitted. The reason
			 * being if the first eligible chunk is skipped (say
			 * it was sent to a destination other than oldfp)
			 * then we cannot advance the cum TSN via Forward
			 * TSN chunk.
			 *
			 * Also, ftsn_check is B_TRUE only for the first
			 * eligible chunk, it  will be B_FALSE for all
			 * subsequent candidate messages for retransmission.
			 */
			sdc = (sctp_data_hdr_t *)mp->b_rptr;
			tsn = ntohl(sdc->sdh_tsn);
			if (SEQ_GT(tsn, sctp->sctp_lastack_rxd)) {
				if (sctp->sctp_prsctp_aware && ftsn_check) {
					if (SEQ_GEQ(sctp->sctp_adv_pap, tsn)) {
						ASSERT(sctp->sctp_prsctp_aware);
						do_ftsn = B_TRUE;
						goto out;
					} else {
						sctp_check_adv_ack_pt(sctp,
						    meta, mp);
						if (SEQ_GT(sctp->sctp_adv_pap,
						    adv_pap)) {
							do_ftsn = B_TRUE;
							goto out;
						}
					}
					ftsn_check = B_FALSE;
				}
				if (SCTP_CHUNK_DEST(mp) == oldfp)
					goto out;
			}
		}
		meta = meta->b_next;
		if (meta != NULL && sctp->sctp_prsctp_aware) {
			mhdr = (sctp_msg_hdr_t *)meta->b_rptr;

			/*
			 * NOTE(review): mhdr is computed once before this
			 * loop but meta advances inside it, so subsequent
			 * iterations test a stale header pointer — confirm
			 * whether SCTP_MSG_TO_BE_ABANDONED tolerates this.
			 */
			while (meta != NULL && (SCTP_IS_MSG_ABANDONED(meta) ||
			    SCTP_MSG_TO_BE_ABANDONED(meta, mhdr, sctp))) {
				meta = meta->b_next;
			}
		}
	}
window_probe:
	/*
	 * Retransmit fired for a destination which didn't have
	 * any unacked data pending.
	 */
	if (sctp->sctp_unacked == 0 && sctp->sctp_unsent != 0) {
		/*
		 * Send a window probe. Inflate frwnd to allow
		 * sending one segment.
		 */
		if (sctp->sctp_frwnd < (oldfp->sfa_pmss - sizeof (*sdc)))
			sctp->sctp_frwnd = oldfp->sfa_pmss - sizeof (*sdc);

		/* next TSN to send */
		sctp->sctp_rxt_nxttsn = sctp->sctp_ltsn;

		/*
		 * The above sctp_frwnd adjustment is coarse.  The "changed"
		 * sctp_frwnd may allow us to send more than 1 packet.  So
		 * tell sctp_output() to send only 1 packet.
		 */
		sctp_output(sctp, 1);

		/* Last sent TSN */
		sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn - 1;
		ASSERT(sctp->sctp_rxt_maxtsn >= sctp->sctp_rxt_nxttsn);
		sctp->sctp_zero_win_probe = B_TRUE;
		BUMP_MIB(&sctps->sctps_mib, sctpOutWinProbe);
	}
	return;
out:
	/*
	 * After a time out, assume that everything has left the network.  So
	 * we can clear rxt_unacked for the original peer address.
	 */
	oldfp->rxt_unacked = 0;

	/*
	 * If we were probing for zero window, don't adjust retransmission
	 * variables, but the timer is still backed off.
	 */
	if (sctp->sctp_zero_win_probe) {
		mblk_t	*pkt;
		uint_t	pkt_len;

		/*
		 * Get the Zero Win Probe for retransmission, sctp_rxt_nxttsn
		 * and sctp_rxt_maxtsn will specify the ZWP packet.
		 */
		fp = oldfp;
		if (oldfp->state != SCTP_FADDRS_ALIVE)
			fp = sctp_rotate_faddr(sctp, oldfp);
		pkt = sctp_rexmit_packet(sctp, &meta, &mp, fp, &pkt_len);
		if (pkt != NULL) {
			ASSERT(pkt_len <= fp->sfa_pmss);
			sctp_set_iplen(sctp, pkt);
			sctp_add_sendq(sctp, pkt);
		} else {
			SCTP_KSTAT(sctps, sctp_ss_rexmit_failed);
		}

		/*
		 * The strikes will be clear by sctp_faddr_alive() when the
		 * other side sends us an ack.
		 */
		oldfp->strikes++;
		sctp->sctp_strikes++;

		SCTP_CALC_RXT(oldfp, sctp->sctp_rto_max);
		/*
		 * NOTE(review): this restarts oldfp's timer with fp->rto,
		 * while the restart_timer path below uses oldfp->rto —
		 * confirm whether the difference is intentional.
		 */
		if (oldfp != fp && oldfp->suna != 0)
			SCTP_FADDR_TIMER_RESTART(sctp, oldfp, fp->rto);
		SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
		BUMP_MIB(&sctps->sctps_mib, sctpOutWinProbe);
		return;
	}

	/*
	 * Enter slowstart for this destination
	 */
	oldfp->ssthresh = oldfp->cwnd / 2;
	if (oldfp->ssthresh < 2 * oldfp->sfa_pmss)
		oldfp->ssthresh = 2 * oldfp->sfa_pmss;
	oldfp->cwnd = oldfp->sfa_pmss;
	oldfp->pba = 0;
	fp = sctp_rotate_faddr(sctp, oldfp);
	ASSERT(fp != NULL);
	sdc = (sctp_data_hdr_t *)mp->b_rptr;

	first_ua_tsn = ntohl(sdc->sdh_tsn);
	if (do_ftsn) {
		sctp_make_ftsns(sctp, meta, mp, &nmp, fp, &seglen);
		if (nmp == NULL) {
			sctp->sctp_adv_pap = adv_pap;
			goto restart_timer;
		}
		head = nmp;
		/*
		 * Move to the next unabandoned chunk. XXXCheck if meta will
		 * always be marked abandoned.
		 */
		while (meta != NULL && SCTP_IS_MSG_ABANDONED(meta))
			meta = meta->b_next;
		if (meta != NULL)
			mp = mp->b_cont;
		else
			mp = NULL;
		goto try_bundle;
	}
	seglen = ntohs(sdc->sdh_len);
	chunklen = seglen - sizeof (*sdc);
	if ((extra = seglen & (SCTP_ALIGN - 1)) != 0)
		extra = SCTP_ALIGN - extra;

	/* Find out if we need to piggyback SACK. */
	if (sctp->sctp_ftsn == sctp->sctp_lastacked + 1) {
		sacklen = 0;
	} else {
		sacklen = sizeof (sctp_chunk_hdr_t) +
		    sizeof (sctp_sack_chunk_t) +
		    (sizeof (sctp_sack_frag_t) * sctp->sctp_sack_gaps);
		if (seglen + sacklen > sctp->sctp_lastdata->sfa_pmss) {
			/* piggybacked SACK doesn't fit */
			sacklen = 0;
		} else {
			/*
			 * OK, we have room to send SACK back.  But we
			 * should send it back to the last fp where we
			 * receive data from, unless sctp_lastdata equals
			 * oldfp, then we should probably not send it
			 * back to that fp.  Also we should check that
			 * the fp is alive.
			 */
			if (sctp->sctp_lastdata != oldfp &&
			    sctp->sctp_lastdata->state == SCTP_FADDRS_ALIVE) {
				fp = sctp->sctp_lastdata;
			}
		}
	}

	/*
	 * Cancel RTT measurement if the retransmitted TSN is before the
	 * TSN used for timing.
	 */
	if (sctp->sctp_out_time != 0 &&
	    SEQ_GEQ(sctp->sctp_rtt_tsn, sdc->sdh_tsn)) {
		sctp->sctp_out_time = 0;
	}
	/* Clear the counter as the RTT calculation may be off. */
	fp->rtt_updates = 0;
	oldfp->rtt_updates = 0;

	/*
	 * After a timeout, we should change the current faddr so that
	 * new chunks will be sent to the alternate address.
	 */
	sctp_set_faddr_current(sctp, fp);

	nmp = dupmsg(mp);
	if (nmp == NULL)
		goto restart_timer;
	if (extra > 0) {
		fill = sctp_get_padding(sctp, extra);
		if (fill != NULL) {
			linkb(nmp, fill);
			seglen += extra;
		} else {
			freemsg(nmp);
			goto restart_timer;
		}
	}
	SCTP_CHUNK_CLEAR_FLAGS(nmp);
	head = sctp_add_proto_hdr(sctp, fp, nmp, sacklen, NULL);
	if (head == NULL) {
		freemsg(nmp);
		SCTP_KSTAT(sctps, sctp_rexmit_failed);
		goto restart_timer;
	}
	seglen += sacklen;

	SCTP_CHUNK_SENT(sctp, mp, sdc, fp, chunklen, meta);

	mp = mp->b_next;

try_bundle:
	/* We can at least and at most send 1 packet at timeout. */
	while (seglen < fp->sfa_pmss) {
		int32_t new_len;

		/* Go through the list to find more chunks to be bundled. */
		while (mp != NULL) {
			/* Check if the chunk can be bundled. */
			if (SCTP_CHUNK_RX_CANBUNDLE(mp, oldfp))
				break;
			mp = mp->b_next;
		}
		/* Go to the next message. */
		if (mp == NULL) {
			for (meta = meta->b_next; meta != NULL;
			    meta = meta->b_next) {
				mhdr = (sctp_msg_hdr_t *)meta->b_rptr;

				if (SCTP_IS_MSG_ABANDONED(meta) ||
				    SCTP_MSG_TO_BE_ABANDONED(meta, mhdr,
				    sctp)) {
					continue;
				}

				mp = meta->b_cont;
				goto try_bundle;
			}
			/*
			 * Check if there is a new message which potentially
			 * could be bundled with this retransmission.
			 */
			meta = sctp_get_msg_to_send(sctp, &mp, NULL, &error,
			    seglen, fp->sfa_pmss - seglen, NULL);
			if (error != 0 || meta == NULL) {
				/* No more chunk to be bundled. */
				break;
			} else {
				goto try_bundle;
			}
		}

		sdc = (sctp_data_hdr_t *)mp->b_rptr;
		new_len = ntohs(sdc->sdh_len);
		chunklen = new_len - sizeof (*sdc);

		if ((extra = new_len & (SCTP_ALIGN - 1)) != 0)
			extra = SCTP_ALIGN - extra;
		if ((new_len = seglen + new_len + extra) > fp->sfa_pmss)
			break;
		if ((nmp = dupmsg(mp)) == NULL)
			break;

		if (extra > 0) {
			fill = sctp_get_padding(sctp, extra);
			if (fill != NULL) {
				linkb(nmp, fill);
			} else {
				freemsg(nmp);
				break;
			}
		}
		linkb(head, nmp);

		SCTP_CHUNK_CLEAR_FLAGS(nmp);
		SCTP_CHUNK_SENT(sctp, mp, sdc, fp, chunklen, meta);

		seglen = new_len;
		mp = mp->b_next;
	}
done_bundle:
	if ((seglen > fp->sfa_pmss) && fp->isv4) {
		ipha_t *iph = (ipha_t *)head->b_rptr;

		/*
		 * Path MTU is different from what we thought it would
		 * be when we created chunks, or IP headers have grown.
		 * Need to clear the DF bit.
		 */
		iph->ipha_fragment_offset_and_flags = 0;
	}
	fp->rxt_unacked += seglen;

	dprint(2, ("sctp_rexmit: Sending packet %d bytes, tsn %x "
	    "ssn %d to %p (rwnd %d, lastack_rxd %x)\n",
	    seglen, ntohl(sdc->sdh_tsn), ntohs(sdc->sdh_ssn),
	    (void *)fp, sctp->sctp_frwnd, sctp->sctp_lastack_rxd));

	sctp->sctp_rexmitting = B_TRUE;
	sctp->sctp_rxt_nxttsn = first_ua_tsn;
	sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn - 1;
	sctp_set_iplen(sctp, head);
	sctp_add_sendq(sctp, head);

	/*
	 * Restart the oldfp timer with exponential backoff and
	 * the new fp timer for the retransmitted chunks.
	 */
restart_timer:
	oldfp->strikes++;
	sctp->sctp_strikes++;
	SCTP_CALC_RXT(oldfp, sctp->sctp_rto_max);
	/*
	 * If there is still some data in the oldfp, restart the
	 * retransmission timer.  If there is no data, the heartbeat will
	 * continue to run so it will do its job in checking the reachability
	 * of the oldfp.
	 */
	if (oldfp != fp && oldfp->suna != 0)
		SCTP_FADDR_TIMER_RESTART(sctp, oldfp, oldfp->rto);

	/*
	 * Should we restart the timer of the new fp?  If there is
	 * outstanding data to the new fp, the timer should be
	 * running already.  So restarting it means that the timer
	 * will fire later for those outstanding data.  But if
	 * we don't restart it, the timer will fire too early for the
	 * just retransmitted chunks to the new fp.  The reason is that we
	 * don't keep a timestamp on when a chunk is retransmitted.
	 * So when the timer fires, it will just search for the
	 * chunk with the earliest TSN sent to new fp.  This probably
	 * is the chunk we just retransmitted.  So for now, let's
	 * be conservative and restart the timer of the new fp.
	 */
	SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);

	sctp->sctp_active = lbolt64;
}
2027
2028/*
2029 * This function is called by sctp_ss_rexmit() to create a packet
2030 * to be retransmitted to the given fp.  The given meta and mp
2031 * parameters are respectively the sctp_msg_hdr_t and the mblk of the
2032 * first chunk to be retransmitted.  This is also called when we want
2033 * to retransmit a zero window probe from sctp_rexmit() or when we
2034 * want to retransmit the zero window probe after the window has
2035 * opened from sctp_got_sack().
2036 */
2037mblk_t *
2038sctp_rexmit_packet(sctp_t *sctp, mblk_t **meta, mblk_t **mp, sctp_faddr_t *fp,
2039    uint_t *packet_len)
2040{
2041	uint32_t	seglen = 0;
2042	uint16_t	chunklen;
2043	int		extra;
2044	mblk_t		*nmp;
2045	mblk_t		*head;
2046	mblk_t		*fill;
2047	sctp_data_hdr_t	*sdc;
2048	sctp_msg_hdr_t	*mhdr;
2049
2050	sdc = (sctp_data_hdr_t *)(*mp)->b_rptr;
2051	seglen = ntohs(sdc->sdh_len);
2052	chunklen = seglen - sizeof (*sdc);
2053	if ((extra = seglen & (SCTP_ALIGN - 1)) != 0)
2054		extra = SCTP_ALIGN - extra;
2055
2056	nmp = dupmsg(*mp);
2057	if (nmp == NULL)
2058		return (NULL);
2059	if (extra > 0) {
2060		fill = sctp_get_padding(sctp, extra);
2061		if (fill != NULL) {
2062			linkb(nmp, fill);
2063			seglen += extra;
2064		} else {
2065			freemsg(nmp);
2066			return (NULL);
2067		}
2068	}
2069	SCTP_CHUNK_CLEAR_FLAGS(nmp);
2070	head = sctp_add_proto_hdr(sctp, fp, nmp, 0, NULL);
2071	if (head == NULL) {
2072		freemsg(nmp);
2073		return (NULL);
2074	}
2075	SCTP_CHUNK_SENT(sctp, *mp, sdc, fp, chunklen, *meta);
2076	/*
2077	 * Don't update the TSN if we are doing a Zero Win Probe.
2078	 */
2079	if (!sctp->sctp_zero_win_probe)
2080		sctp->sctp_rxt_nxttsn = ntohl(sdc->sdh_tsn);
2081	*mp = (*mp)->b_next;
2082
2083try_bundle:
2084	while (seglen < fp->sfa_pmss) {
2085		int32_t new_len;
2086
2087		/*
2088		 * Go through the list to find more chunks to be bundled.
2089		 * We should only retransmit sent by unack'ed chunks.  Since
2090		 * they were sent before, the peer's receive window should
2091		 * be able to receive them.
2092		 */
2093		while (*mp != NULL) {
2094			/* Check if the chunk can be bundled. */
2095			if (SCTP_CHUNK_ISSENT(*mp) && !SCTP_CHUNK_ISACKED(*mp))
2096				break;
2097			*mp = (*mp)->b_next;
2098		}
2099		/* Go to the next message. */
2100		if (*mp == NULL) {
2101			for (*meta = (*meta)->b_next; *meta != NULL;
2102			    *meta = (*meta)->b_next) {
2103				mhdr = (sctp_msg_hdr_t *)(*meta)->b_rptr;
2104
2105				if (SCTP_IS_MSG_ABANDONED(*meta) ||
2106				    SCTP_MSG_TO_BE_ABANDONED(*meta, mhdr,
2107				    sctp)) {
2108					continue;
2109				}
2110
2111				*mp = (*meta)->b_cont;
2112				goto try_bundle;
2113			}
2114			/* No more chunk to be bundled. */
2115			break;
2116		}
2117
2118		sdc = (sctp_data_hdr_t *)(*mp)->b_rptr;
2119		/* Don't bundle chunks beyond sctp_rxt_maxtsn. */
2120		if (SEQ_GT(ntohl(sdc->sdh_tsn), sctp->sctp_rxt_maxtsn))
2121			break;
2122		new_len = ntohs(sdc->sdh_len);
2123		chunklen = new_len - sizeof (*sdc);
2124
2125		if ((extra = new_len & (SCTP_ALIGN - 1)) != 0)
2126			extra = SCTP_ALIGN - extra;
2127		if ((new_len = seglen + new_len + extra) > fp->sfa_pmss)
2128			break;
2129		if ((nmp = dupmsg(*mp)) == NULL)
2130			break;
2131
2132		if (extra > 0) {
2133			fill = sctp_get_padding(sctp, extra);
2134			if (fill != NULL) {
2135				linkb(nmp, fill);
2136			} else {
2137				freemsg(nmp);
2138				break;
2139			}
2140		}
2141		linkb(head, nmp);
2142
2143		SCTP_CHUNK_CLEAR_FLAGS(nmp);
2144		SCTP_CHUNK_SENT(sctp, *mp, sdc, fp, chunklen, *meta);
2145		/*
2146		 * Don't update the TSN if we are doing a Zero Win Probe.
2147		 */
2148		if (!sctp->sctp_zero_win_probe)
2149			sctp->sctp_rxt_nxttsn = ntohl(sdc->sdh_tsn);
2150
2151		seglen = new_len;
2152		*mp = (*mp)->b_next;
2153	}
2154	*packet_len = seglen;
2155	fp->rxt_unacked += seglen;
2156	return (head);
2157}
2158
2159/*
2160 * sctp_ss_rexmit() is called when we get a SACK after a timeout which
2161 * advances the cum_tsn but the cum_tsn is still less than what we have sent
2162 * (sctp_rxt_maxtsn) at the time of the timeout.  This SACK is a "partial"
2163 * SACK.  We retransmit unacked chunks without having to wait for another
2164 * timeout.  The rationale is that the SACK should not be "partial" if all the
2165 * lost chunks have been retransmitted.  Since the SACK is "partial,"
2166 * the chunks between the cum_tsn and the sctp_rxt_maxtsn should still
2167 * be missing.  It is better for us to retransmit them now instead
2168 * of waiting for a timeout.
2169 */
2170void
2171sctp_ss_rexmit(sctp_t *sctp)
2172{
2173	mblk_t		*meta;
2174	mblk_t		*mp;
2175	mblk_t		*pkt;
2176	sctp_faddr_t	*fp;
2177	uint_t		pkt_len;
2178	uint32_t	tot_wnd;
2179	sctp_data_hdr_t	*sdc;
2180	int		burst;
2181	sctp_stack_t	*sctps = sctp->sctp_sctps;
2182
2183	ASSERT(!sctp->sctp_zero_win_probe);
2184
2185	/*
2186	 * If the last cum ack is smaller than what we have just
2187	 * retransmitted, simply return.
2188	 */
2189	if (SEQ_GEQ(sctp->sctp_lastack_rxd, sctp->sctp_rxt_nxttsn))
2190		sctp->sctp_rxt_nxttsn = sctp->sctp_lastack_rxd + 1;
2191	else
2192		return;
2193	ASSERT(SEQ_LEQ(sctp->sctp_rxt_nxttsn, sctp->sctp_rxt_maxtsn));
2194
2195	/*
2196	 * After a timer fires, sctp_current should be set to the new
2197	 * fp where the retransmitted chunks are sent.
2198	 */
2199	fp = sctp->sctp_current;
2200
2201	/*
2202	 * Since we are retransmitting, we only need to use cwnd to determine
2203	 * how much we can send as we were allowed (by peer's receive window)
2204	 * to send those retransmitted chunks previously when they are first
2205	 * sent.  If we record how much we have retransmitted but
2206	 * unacknowledged using rxt_unacked, then the amount we can now send
2207	 * is equal to cwnd minus rxt_unacked.
2208	 *
2209	 * The field rxt_unacked is incremented when we retransmit a packet
2210	 * and decremented when we got a SACK acknowledging something.  And
2211	 * it is reset when the retransmission timer fires as we assume that
2212	 * all packets have left the network after a timeout.  If this
2213	 * assumption is not true, it means that after a timeout, we can
2214	 * get a SACK acknowledging more than rxt_unacked (its value only
2215	 * contains what is retransmitted when the timer fires).  So
2216	 * rxt_unacked will become very big (it is an unsiged int so going
2217	 * negative means that the value is huge).  This is the reason we
2218	 * always send at least 1 MSS bytes.
2219	 *
2220	 * The reason why we do not have an accurate count is that we
2221	 * only know how many packets are outstanding (using the TSN numbers).
2222	 * But we do not know how many bytes those packets contain.  To
2223	 * have an accurate count, we need to walk through the send list.
2224	 * As it is not really important to have an accurate count during
2225	 * retransmission, we skip this walk to save some time.  This should
2226	 * not make the retransmission too aggressive to cause congestion.
2227	 */
2228	if (fp->cwnd <= fp->rxt_unacked)
2229		tot_wnd = fp->sfa_pmss;
2230	else
2231		tot_wnd = fp->cwnd - fp->rxt_unacked;
2232
2233	/* Find the first unack'ed chunk */
2234	for (meta = sctp->sctp_xmit_head; meta != NULL; meta = meta->b_next) {
2235		sctp_msg_hdr_t	*mhdr = (sctp_msg_hdr_t *)meta->b_rptr;
2236
2237		if (SCTP_IS_MSG_ABANDONED(meta) ||
2238		    SCTP_MSG_TO_BE_ABANDONED(meta, mhdr, sctp)) {
2239			continue;
2240		}
2241
2242		for (mp = meta->b_cont; mp != NULL; mp = mp->b_next) {
2243			/* Again, this may not be possible */
2244			if (!SCTP_CHUNK_ISSENT(mp))
2245				return;
2246			sdc = (sctp_data_hdr_t *)mp->b_rptr;
2247			if (ntohl(sdc->sdh_tsn) == sctp->sctp_rxt_nxttsn)
2248				goto found_msg;
2249		}
2250	}
2251
2252	/* Everything is abandoned... */
2253	return;
2254
2255found_msg:
2256	if (!fp->timer_running)
2257		SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
2258	pkt = sctp_rexmit_packet(sctp, &meta, &mp, fp, &pkt_len);
2259	if (pkt == NULL) {
2260		SCTP_KSTAT(sctps, sctp_ss_rexmit_failed);
2261		return;
2262	}
2263	if ((pkt_len > fp->sfa_pmss) && fp->isv4) {
2264		ipha_t	*iph = (ipha_t *)pkt->b_rptr;
2265
2266		/*
2267		 * Path MTU is different from path we thought it would
2268		 * be when we created chunks, or IP headers have grown.
2269		 *  Need to clear the DF bit.
2270		 */
2271		iph->ipha_fragment_offset_and_flags = 0;
2272	}
2273	sctp_set_iplen(sctp, pkt);
2274	sctp_add_sendq(sctp, pkt);
2275
2276	/* Check and see if there is more chunk to be retransmitted. */
2277	if (tot_wnd <= pkt_len || tot_wnd - pkt_len < fp->sfa_pmss ||
2278	    meta == NULL)
2279		return;
2280	if (mp == NULL)
2281		meta = meta->b_next;
2282	if (meta == NULL)
2283		return;
2284
2285	/* Retransmit another packet if the window allows. */
2286	for (tot_wnd -= pkt_len, burst = sctps->sctps_maxburst - 1;
2287	    meta != NULL && burst > 0; meta = meta->b_next, burst--) {
2288		if (mp == NULL)
2289			mp = meta->b_cont;
2290		for (; mp != NULL; mp = mp->b_next) {
2291			/* Again, this may not be possible */
2292			if (!SCTP_CHUNK_ISSENT(mp))
2293				return;
2294			if (!SCTP_CHUNK_ISACKED(mp))
2295				goto found_msg;
2296		}
2297	}
2298}
2299