sctp_timer.c revision 171477
1/*-
2 * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * a) Redistributions of source code must retain the above copyright notice,
8 *   this list of conditions and the following disclaimer.
9 *
10 * b) Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in
12 *   the documentation and/or other materials provided with the distribution.
13 *
14 * c) Neither the name of Cisco Systems, Inc. nor the names of its
15 *    contributors may be used to endorse or promote products derived
16 *    from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
20 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
28 * THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31/* $KAME: sctp_timer.c,v 1.29 2005/03/06 16:04:18 itojun Exp $	 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: head/sys/netinet/sctp_timer.c 171477 2007-07-17 20:58:26Z rrs $");
35
36#define _IP_VHL
37#include <netinet/sctp_os.h>
38#include <netinet/sctp_pcb.h>
39#ifdef INET6
40#include <netinet6/sctp6_var.h>
41#endif
42#include <netinet/sctp_var.h>
43#include <netinet/sctp_sysctl.h>
44#include <netinet/sctp_timer.h>
45#include <netinet/sctputil.h>
46#include <netinet/sctp_output.h>
47#include <netinet/sctp_header.h>
48#include <netinet/sctp_indata.h>
49#include <netinet/sctp_asconf.h>
50#include <netinet/sctp_input.h>
51#include <netinet/sctp.h>
52#include <netinet/sctp_uio.h>
53
54
55
/*
 * Early Fast Retransmit timer handler for destination 'net'.
 *
 * Walks the sent queue from newest to oldest looking for chunks to this
 * destination that have been outstanding longer than roughly half an RTT
 * (bounded below by sctp_early_fr_msec) and marks them for retransmission,
 * then restarts the timer if there is still cwnd room on the path.
 */
void
sctp_early_fr_timer(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	struct sctp_tmit_chunk *chk, *tp2;
	struct timeval now, min_wait, tv;
	unsigned int cur_rtt, cnt = 0, cnt_resend = 0;

	/* an early FR is occurring. */
	(void)SCTP_GETTIME_TIMEVAL(&now);
	/* get cur rto in micro-seconds */
	if (net->lastsa == 0) {
		/* Hmm no rtt estimate yet? Use 1/4 of the initial RTO. */
		cur_rtt = stcb->asoc.initial_rto >> 2;
	} else {
		/* (lastsa/4 + lastsv) / 2 -- about half the RTO, in msec. */
		cur_rtt = ((net->lastsa >> 2) + net->lastsv) >> 1;
	}
	if (cur_rtt < sctp_early_fr_msec) {
		/* Never use a window smaller than the configured floor. */
		cur_rtt = sctp_early_fr_msec;
	}
	cur_rtt *= 1000;	/* msec -> usec */
	tv.tv_sec = cur_rtt / 1000000;
	tv.tv_usec = cur_rtt % 1000000;
	/* min_wait = now - cur_rtt: the newest send time we may still mark. */
	min_wait = now;
	timevalsub(&min_wait, &tv);
	if (min_wait.tv_sec < 0 || min_wait.tv_usec < 0) {
		/*
		 * if we hit here, we don't have enough seconds on the clock
		 * to account for the RTO. We just let the lower seconds be
		 * the bounds and don't worry about it. This may mean we
		 * will mark a lot more than we should.
		 */
		min_wait.tv_sec = min_wait.tv_usec = 0;
	}
	/* Scan from the tail (most recently sent) backwards toward older. */
	chk = TAILQ_LAST(&stcb->asoc.sent_queue, sctpchunk_listhead);
	for (; chk != NULL; chk = tp2) {
		tp2 = TAILQ_PREV(chk, sctpchunk_listhead, sctp_next);
		if (chk->whoTo != net) {
			continue;
		}
		if (chk->sent == SCTP_DATAGRAM_RESEND)
			cnt_resend++;
		else if ((chk->sent > SCTP_DATAGRAM_UNSENT) &&
		    (chk->sent < SCTP_DATAGRAM_RESEND)) {
			/* pending, may need retran */
			if (chk->sent_rcv_time.tv_sec > min_wait.tv_sec) {
				/*
				 * Sent within the last cur_rtt (by whole
				 * seconds): too recent to mark; keep
				 * walking toward older chunks.
				 */
				continue;
			} else if (chk->sent_rcv_time.tv_sec == min_wait.tv_sec) {
				/*
				 * we must look at the micro seconds to
				 * know.
				 */
				if (chk->sent_rcv_time.tv_usec >= min_wait.tv_usec) {
					/*
					 * ok it was sent after our boundary
					 * time.
					 */
					continue;
				}
			}
			if (sctp_logging_level & SCTP_EARLYFR_LOGGING_ENABLE) {
				sctp_log_fr(chk->rec.data.TSN_seq, chk->snd_count,
				    4, SCTP_FR_MARKED_EARLY);
			}
			SCTP_STAT_INCR(sctps_earlyfrmrkretrans);
			chk->sent = SCTP_DATAGRAM_RESEND;
			sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
			/* double book size since we are doing an early FR */
			chk->book_size_scale++;
			cnt += chk->send_size;
			if ((cnt + net->flight_size) > net->cwnd) {
				/* Mark all we could possibly resend */
				break;
			}
		}
	}
	if (cnt) {
		/*
		 * JRS - Use the congestion control given in the congestion
		 * control module
		 */
		stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer(inp, stcb, net);
	} else if (cnt_resend) {
		/* Nothing newly marked, but resends pending: push them out. */
		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR);
	}
	/* Restart it? */
	if (net->flight_size < net->cwnd) {
		SCTP_STAT_INCR(sctps_earlyfrstrtmr);
		sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net);
	}
}
154
155void
156sctp_audit_retranmission_queue(struct sctp_association *asoc)
157{
158	struct sctp_tmit_chunk *chk;
159
160	SCTPDBG(SCTP_DEBUG_TIMER4, "Audit invoked on send queue cnt:%d onqueue:%d\n",
161	    asoc->sent_queue_retran_cnt,
162	    asoc->sent_queue_cnt);
163	asoc->sent_queue_retran_cnt = 0;
164	asoc->sent_queue_cnt = 0;
165	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
166		if (chk->sent == SCTP_DATAGRAM_RESEND) {
167			sctp_ucount_incr(asoc->sent_queue_retran_cnt);
168		}
169		asoc->sent_queue_cnt++;
170	}
171	TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
172		if (chk->sent == SCTP_DATAGRAM_RESEND) {
173			sctp_ucount_incr(asoc->sent_queue_retran_cnt);
174		}
175	}
176	SCTPDBG(SCTP_DEBUG_TIMER4, "Audit completes retran:%d onqueue:%d\n",
177	    asoc->sent_queue_retran_cnt,
178	    asoc->sent_queue_cnt);
179}
180
/*
 * Error-threshold accounting after a failure event (e.g. a timeout) on
 * destination 'net'.
 *
 * Increments the destination's error count and, once it exceeds the
 * per-path failure threshold, marks the destination unreachable (clearing
 * any PF state when CMT PF is enabled) and notifies the ULP.  Also bumps
 * the association-wide error count (skipped for unconfirmed destinations)
 * and, once that exceeds 'threshold', aborts the association with a
 * PROTOCOL_VIOLATION operational error.
 *
 * Returns 1 if the association was aborted/destroyed, 0 otherwise.
 */
int
sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
    struct sctp_nets *net, uint16_t threshold)
{
	if (net) {
		net->error_count++;
		SCTPDBG(SCTP_DEBUG_TIMER4, "Error count for %p now %d thresh:%d\n",
		    net, net->error_count,
		    net->failure_threshold);
		if (net->error_count > net->failure_threshold) {
			/* We had a threshold failure */
			if (net->dest_state & SCTP_ADDR_REACHABLE) {
				net->dest_state &= ~SCTP_ADDR_REACHABLE;
				net->dest_state |= SCTP_ADDR_NOT_REACHABLE;
				net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY;
				if (net == stcb->asoc.primary_destination) {
					/* Remember it was primary so it can be restored later. */
					net->dest_state |= SCTP_ADDR_WAS_PRIMARY;
				}
				/*
				 * JRS 5/14/07 - If a destination is
				 * unreachable, the PF bit is turned off.
				 * This allows an unambiguous use of the PF
				 * bit for destinations that are reachable
				 * but potentially failed. If the
				 * destination is set to the unreachable
				 * state, also set the destination to the PF
				 * state.
				 */
				/*
				 * Add debug message here if destination is
				 * not in PF state.
				 */
				/* Stop any running T3 timers here? */
				if (sctp_cmt_on_off && sctp_cmt_pf) {
					net->dest_state &= ~SCTP_ADDR_PF;
					SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to unreachable.\n",
					    net);
				}
				/* Tell the ULP this interface went down. */
				sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
				    stcb,
				    SCTP_FAILED_THRESHOLD,
				    (void *)net);
			}
		}
		/*********HOLD THIS COMMENT FOR PATCH OF ALTERNATE
		 *********ROUTING CODE
		 */
		/*********HOLD THIS COMMENT FOR END OF PATCH OF ALTERNATE
		 *********ROUTING CODE
		 */
	}
	if (stcb == NULL)
		return (0);

	if (net) {
		/* Unconfirmed addresses do not count against the assoc. */
		if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) {
			stcb->asoc.overall_error_count++;
		}
	} else {
		stcb->asoc.overall_error_count++;
	}
	SCTPDBG(SCTP_DEBUG_TIMER4, "Overall error count for %p now %d thresh:%u state:%x\n",
	    &stcb->asoc, stcb->asoc.overall_error_count,
	    (uint32_t) threshold,
	    ((net == NULL) ? (uint32_t) 0 : (uint32_t) net->dest_state));
	/*
	 * We specifically do not do >= to give the assoc one more chance
	 * before we fail it.
	 */
	if (stcb->asoc.overall_error_count > threshold) {
		/* Abort notification sends a ULP notify */
		struct mbuf *oper;

		/* Build a PROTOCOL_VIOLATION cause carrying our location code. */
		oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
		    0, M_DONTWAIT, 1, MT_DATA);
		if (oper) {
			struct sctp_paramhdr *ph;
			uint32_t *ippp;

			SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
			    sizeof(uint32_t);
			ph = mtod(oper, struct sctp_paramhdr *);
			ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
			ph->param_length = htons(SCTP_BUF_LEN(oper));
			ippp = (uint32_t *) (ph + 1);
			*ippp = htonl(SCTP_FROM_SCTP_TIMER + SCTP_LOC_1);
		}
		/* abort proceeds even if the mbuf allocation failed (oper == NULL). */
		inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_1;
		sctp_abort_an_association(inp, stcb, SCTP_FAILED_THRESHOLD, oper);
		return (1);
	}
	return (0);
}
274
275struct sctp_nets *
276sctp_find_alternate_net(struct sctp_tcb *stcb,
277    struct sctp_nets *net,
278    int mode)
279{
280	/* Find and return an alternate network if possible */
281	struct sctp_nets *alt, *mnet, *min_errors_net = NULL, *max_cwnd_net = NULL;
282	int once;
283
284	/* JRS 5/14/07 - Initialize min_errors to an impossible value. */
285	int min_errors = -1;
286	uint32_t max_cwnd = 0;
287
288	if (stcb->asoc.numnets == 1) {
289		/* No others but net */
290		return (TAILQ_FIRST(&stcb->asoc.nets));
291	}
292	/*
293	 * JRS 5/14/07 - If mode is set to 2, use the CMT PF find alternate
294	 * net algorithm. This algorithm chooses the active destination (not
295	 * in PF state) with the largest cwnd value. If all destinations are
296	 * in PF state, unreachable, or unconfirmed, choose the desination
297	 * that is in PF state with the lowest error count. In case of a
298	 * tie, choose the destination that was most recently active.
299	 */
300	if (mode == 2) {
301		TAILQ_FOREACH(mnet, &stcb->asoc.nets, sctp_next) {
302			/*
303			 * JRS 5/14/07 - If the destination is unreachable
304			 * or unconfirmed, skip it.
305			 */
306			if (((mnet->dest_state & SCTP_ADDR_REACHABLE) != SCTP_ADDR_REACHABLE) ||
307			    (mnet->dest_state & SCTP_ADDR_UNCONFIRMED)) {
308				continue;
309			}
310			/*
311			 * JRS 5/14/07 -  If the destination is reachable
312			 * but in PF state, compare the error count of the
313			 * destination to the minimum error count seen thus
314			 * far. Store the destination with the lower error
315			 * count.  If the error counts are equal, store the
316			 * destination that was most recently active.
317			 */
318			if (mnet->dest_state & SCTP_ADDR_PF) {
319				/*
320				 * JRS 5/14/07 - If the destination under
321				 * consideration is the current destination,
322				 * work as if the error count is one higher.
323				 * The actual error count will not be
324				 * incremented until later in the t3
325				 * handler.
326				 */
327				if (mnet == net) {
328					if (min_errors == -1) {
329						min_errors = mnet->error_count + 1;
330						min_errors_net = mnet;
331					} else if (mnet->error_count + 1 < min_errors) {
332						min_errors = mnet->error_count + 1;
333						min_errors_net = mnet;
334					} else if (mnet->error_count + 1 == min_errors
335					    && mnet->last_active > min_errors_net->last_active) {
336						min_errors_net = mnet;
337						min_errors = mnet->error_count + 1;
338					}
339					continue;
340				} else {
341					if (min_errors == -1) {
342						min_errors = mnet->error_count;
343						min_errors_net = mnet;
344					} else if (mnet->error_count < min_errors) {
345						min_errors = mnet->error_count;
346						min_errors_net = mnet;
347					} else if (mnet->error_count == min_errors
348					    && mnet->last_active > min_errors_net->last_active) {
349						min_errors_net = mnet;
350						min_errors = mnet->error_count;
351					}
352					continue;
353				}
354			}
355			/*
356			 * JRS 5/14/07 - If the destination is reachable and
357			 * not in PF state, compare the cwnd of the
358			 * destination to the highest cwnd seen thus far.
359			 * Store the destination with the higher cwnd value.
360			 * If the cwnd values are equal, randomly choose one
361			 * of the two destinations.
362			 */
363			if (max_cwnd < mnet->cwnd) {
364				max_cwnd_net = mnet;
365				max_cwnd = mnet->cwnd;
366			} else if (max_cwnd == mnet->cwnd) {
367				uint32_t rndval;
368				uint8_t this_random;
369
370				if (stcb->asoc.hb_random_idx > 3) {
371					rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
372					memcpy(stcb->asoc.hb_random_values, &rndval, sizeof(stcb->asoc.hb_random_values));
373					this_random = stcb->asoc.hb_random_values[0];
374					stcb->asoc.hb_random_idx++;
375					stcb->asoc.hb_ect_randombit = 0;
376				} else {
377					this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
378					stcb->asoc.hb_random_idx++;
379					stcb->asoc.hb_ect_randombit = 0;
380				}
381				if (this_random % 2 == 1) {
382					max_cwnd_net = mnet;
383					max_cwnd = mnet->cwnd;
384					//Useless ?
385				}
386			}
387		}
388		/*
389		 * JRS 5/14/07 - After all destination have been considered
390		 * as alternates, check to see if there was some active
391		 * destination (not in PF state).  If not, check to see if
392		 * there was some PF destination with the minimum number of
393		 * errors.  If not, return the original destination.  If
394		 * there is a min_errors_net, remove the PF flag from that
395		 * destination, set the cwnd to one or two MTUs, and return
396		 * the destination as an alt. If there was some active
397		 * destination with a highest cwnd, return the destination
398		 * as an alt.
399		 */
400		if (max_cwnd_net == NULL) {
401			if (min_errors_net == NULL) {
402				return (net);
403			}
404			min_errors_net->dest_state &= ~SCTP_ADDR_PF;
405			min_errors_net->cwnd = min_errors_net->mtu * sctp_cmt_pf;
406			if (SCTP_OS_TIMER_PENDING(&min_errors_net->rxt_timer.timer)) {
407				sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
408				    stcb, min_errors_net,
409				    SCTP_FROM_SCTP_TIMER + SCTP_LOC_2);
410			}
411			SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to active with %d errors.\n",
412			    min_errors_net, min_errors_net->error_count);
413			return (min_errors_net);
414		} else {
415			return (max_cwnd_net);
416		}
417	}
418	/*
419	 * JRS 5/14/07 - If mode is set to 1, use the CMT policy for
420	 * choosing an alternate net.
421	 */
422	else if (mode == 1) {
423		TAILQ_FOREACH(mnet, &stcb->asoc.nets, sctp_next) {
424			if (((mnet->dest_state & SCTP_ADDR_REACHABLE) != SCTP_ADDR_REACHABLE) ||
425			    (mnet->dest_state & SCTP_ADDR_UNCONFIRMED)
426			    ) {
427				/*
428				 * will skip ones that are not-reachable or
429				 * unconfirmed
430				 */
431				continue;
432			}
433			if (max_cwnd < mnet->cwnd) {
434				max_cwnd_net = mnet;
435				max_cwnd = mnet->cwnd;
436			} else if (max_cwnd == mnet->cwnd) {
437				uint32_t rndval;
438				uint8_t this_random;
439
440				if (stcb->asoc.hb_random_idx > 3) {
441					rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
442					memcpy(stcb->asoc.hb_random_values, &rndval,
443					    sizeof(stcb->asoc.hb_random_values));
444					this_random = stcb->asoc.hb_random_values[0];
445					stcb->asoc.hb_random_idx = 0;
446					stcb->asoc.hb_ect_randombit = 0;
447				} else {
448					this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
449					stcb->asoc.hb_random_idx++;
450					stcb->asoc.hb_ect_randombit = 0;
451				}
452				if (this_random % 2) {
453					max_cwnd_net = mnet;
454					max_cwnd = mnet->cwnd;
455				}
456			}
457		}
458		if (max_cwnd_net) {
459			return (max_cwnd_net);
460		}
461	}
462	mnet = net;
463	once = 0;
464
465	if (mnet == NULL) {
466		mnet = TAILQ_FIRST(&stcb->asoc.nets);
467	}
468	do {
469		alt = TAILQ_NEXT(mnet, sctp_next);
470		if (alt == NULL) {
471			once++;
472			if (once > 1) {
473				break;
474			}
475			alt = TAILQ_FIRST(&stcb->asoc.nets);
476		}
477		if (alt->ro.ro_rt == NULL) {
478			if (alt->ro._s_addr) {
479				sctp_free_ifa(alt->ro._s_addr);
480				alt->ro._s_addr = NULL;
481
482			}
483			alt->src_addr_selected = 0;
484		}
485		if (
486		    ((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) &&
487		    (alt->ro.ro_rt != NULL) &&
488		/* sa_ignore NO_NULL_CHK */
489		    (!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))
490		    ) {
491			/* Found a reachable address */
492			break;
493		}
494		mnet = alt;
495	} while (alt != NULL);
496
497	if (alt == NULL) {
498		/* Case where NO insv network exists (dormant state) */
499		/* we rotate destinations */
500		once = 0;
501		mnet = net;
502		do {
503			alt = TAILQ_NEXT(mnet, sctp_next);
504			if (alt == NULL) {
505				once++;
506				if (once > 1) {
507					break;
508				}
509				alt = TAILQ_FIRST(&stcb->asoc.nets);
510			}
511			/* sa_ignore NO_NULL_CHK */
512			if ((!(alt->dest_state & SCTP_ADDR_UNCONFIRMED)) &&
513			    (alt != net)) {
514				/* Found an alternate address */
515				break;
516			}
517			mnet = alt;
518		} while (alt != NULL);
519	}
520	if (alt == NULL) {
521		return (net);
522	}
523	return (alt);
524}
525
526
527
528static void
529sctp_backoff_on_timeout(struct sctp_tcb *stcb,
530    struct sctp_nets *net,
531    int win_probe,
532    int num_marked)
533{
534	if (net->RTO == 0) {
535		net->RTO = stcb->asoc.minrto;
536	}
537	net->RTO <<= 1;
538	if (net->RTO > stcb->asoc.maxrto) {
539		net->RTO = stcb->asoc.maxrto;
540	}
541	if ((win_probe == 0) && num_marked) {
542		/* We don't apply penalty to window probe scenarios */
543		/* JRS - Use the congestion control given in the CC module */
544		stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout(stcb, net);
545	}
546}
547
/*
 * T3 timeout processing: mark eligible chunks sent to 'net' for
 * retransmission, moving them to destination 'alt' where appropriate.
 * '*num_marked' is set to the number of chunks newly marked.
 */
static int
sctp_mark_all_for_resend(struct sctp_tcb *stcb,
    struct sctp_nets *net,
    struct sctp_nets *alt,
    int window_probe,
    int *num_marked)
{

	/*
	 * Mark all chunks (well not all) that were sent to *net for
	 * retransmission. Move them to alt for their destination as well...
	 * We only mark chunks that have been outstanding long enough to
	 * have received feed-back.
	 */
	struct sctp_tmit_chunk *chk, *tp2, *could_be_sent = NULL;
	struct sctp_nets *lnets;
	struct timeval now, min_wait, tv;
	int cur_rtt;
	int audit_tf, num_mk, fir;
	unsigned int cnt_mk;
	uint32_t orig_flight, orig_tf;
	uint32_t tsnlast, tsnfirst;


	/* none in flight now */
	audit_tf = 0;
	fir = 0;
	/*
	 * figure out how long a data chunk must be pending before we can
	 * mark it ..
	 */
	(void)SCTP_GETTIME_TIMEVAL(&now);
	/* get cur rto in micro-seconds */
	cur_rtt = (((net->lastsa >> 2) + net->lastsv) >> 1);
	cur_rtt *= 1000;
	if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
		sctp_log_fr(cur_rtt,
		    stcb->asoc.peers_rwnd,
		    window_probe,
		    SCTP_FR_T3_MARK_TIME);
		sctp_log_fr(net->flight_size,
		    SCTP_OS_TIMER_PENDING(&net->fr_timer.timer),
		    SCTP_OS_TIMER_ACTIVE(&net->fr_timer.timer),
		    SCTP_FR_CWND_REPORT);
		sctp_log_fr(net->flight_size, net->cwnd, stcb->asoc.total_flight, SCTP_FR_CWND_REPORT);
	}
	/* min_wait = now - cur_rtt: newest send time eligible for marking. */
	tv.tv_sec = cur_rtt / 1000000;
	tv.tv_usec = cur_rtt % 1000000;
	min_wait = now;
	timevalsub(&min_wait, &tv);
	if (min_wait.tv_sec < 0 || min_wait.tv_usec < 0) {
		/*
		 * if we hit here, we don't have enough seconds on the clock
		 * to account for the RTO. We just let the lower seconds be
		 * the bounds and don't worry about it. This may mean we
		 * will mark a lot more than we should.
		 */
		min_wait.tv_sec = min_wait.tv_usec = 0;
	}
	if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
		sctp_log_fr(cur_rtt, now.tv_sec, now.tv_usec, SCTP_FR_T3_MARK_TIME);
		sctp_log_fr(0, min_wait.tv_sec, min_wait.tv_usec, SCTP_FR_T3_MARK_TIME);
	}
	/*
	 * Our rwnd will be incorrect here since we are not adding back the
	 * cnt * mbuf but we will fix that down below.
	 */
	orig_flight = net->flight_size;
	orig_tf = stcb->asoc.total_flight;

	net->fast_retran_ip = 0;
	/* Now on to each chunk */
	num_mk = cnt_mk = 0;
	tsnfirst = tsnlast = 0;
	chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
	for (; chk != NULL; chk = tp2) {
		tp2 = TAILQ_NEXT(chk, sctp_next);
		/* Sanity: nothing at or below the cumulative ack should remain. */
		if ((compare_with_wrap(stcb->asoc.last_acked_seq,
		    chk->rec.data.TSN_seq,
		    MAX_TSN)) ||
		    (stcb->asoc.last_acked_seq == chk->rec.data.TSN_seq)) {
			/* Strange case our list got out of order? */
			SCTP_PRINTF("Our list is out of order?\n");
			panic("Out of order list");
		}
		if ((chk->whoTo == net) && (chk->sent < SCTP_DATAGRAM_ACKED)) {
			/*
			 * found one to mark: If it is less than
			 * DATAGRAM_ACKED it MUST not be a skipped or marked
			 * TSN but instead one that is either already set
			 * for retransmission OR one that needs
			 * retransmission.
			 */

			/* validate its been outstanding long enough */
			if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
				sctp_log_fr(chk->rec.data.TSN_seq,
				    chk->sent_rcv_time.tv_sec,
				    chk->sent_rcv_time.tv_usec,
				    SCTP_FR_T3_MARK_TIME);
			}
			/* Window probes skip the age check and mark everything. */
			if ((chk->sent_rcv_time.tv_sec > min_wait.tv_sec) && (window_probe == 0)) {
				/*
				 * we have reached a chunk that was sent
				 * some seconds past our min.. forget it we
				 * will find no more to send.
				 */
				if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
					sctp_log_fr(0,
					    chk->sent_rcv_time.tv_sec,
					    chk->sent_rcv_time.tv_usec,
					    SCTP_FR_T3_STOPPED);
				}
				continue;
			} else if ((chk->sent_rcv_time.tv_sec == min_wait.tv_sec) &&
			    (window_probe == 0)) {
				/*
				 * we must look at the micro seconds to
				 * know.
				 */
				if (chk->sent_rcv_time.tv_usec >= min_wait.tv_usec) {
					/*
					 * ok it was sent after our boundary
					 * time.
					 */
					if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
						sctp_log_fr(0,
						    chk->sent_rcv_time.tv_sec,
						    chk->sent_rcv_time.tv_usec,
						    SCTP_FR_T3_STOPPED);
					}
					continue;
				}
			}
			/* PR-SCTP timed reliability: drop if its lifetime expired. */
			if (PR_SCTP_TTL_ENABLED(chk->flags)) {
				/* Is it expired? */
				if ((now.tv_sec > chk->rec.data.timetodrop.tv_sec) ||
				    ((chk->rec.data.timetodrop.tv_sec == now.tv_sec) &&
				    (now.tv_usec > chk->rec.data.timetodrop.tv_usec))) {
					/* Yes so drop it */
					if (chk->data) {
						(void)sctp_release_pr_sctp_chunk(stcb,
						    chk,
						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
						    &stcb->asoc.sent_queue);
					}
				}
				continue;
			}
			/* PR-SCTP limited retransmissions: drop if limit reached. */
			if (PR_SCTP_RTX_ENABLED(chk->flags)) {
				/* Has it been retransmitted tv_sec times? */
				if (chk->snd_count > chk->rec.data.timetodrop.tv_sec) {
					if (chk->data) {
						(void)sctp_release_pr_sctp_chunk(stcb,
						    chk,
						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
						    &stcb->asoc.sent_queue);
					}
				}
				continue;
			}
			if (chk->sent < SCTP_DATAGRAM_RESEND) {
				/* Newly marked: account for it and pull it from flight. */
				sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
				num_mk++;
				if (fir == 0) {
					fir = 1;
					tsnfirst = chk->rec.data.TSN_seq;
				}
				tsnlast = chk->rec.data.TSN_seq;
				if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
					sctp_log_fr(chk->rec.data.TSN_seq, chk->snd_count,
					    0, SCTP_FR_T3_MARKED);
				}
				if (chk->rec.data.chunk_was_revoked) {
					/* deflate the cwnd */
					chk->whoTo->cwnd -= chk->book_size;
					chk->rec.data.chunk_was_revoked = 0;
				}
				net->marked_retrans++;
				stcb->asoc.marked_retrans++;
				if (sctp_logging_level & SCTP_FLIGHT_LOGGING_ENABLE) {
					sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND_TO,
					    chk->whoTo->flight_size,
					    chk->book_size,
					    (uintptr_t) chk->whoTo,
					    chk->rec.data.TSN_seq);
				}
				sctp_flight_size_decrease(chk);
				sctp_total_flight_decrease(stcb, chk);
				/* Credit the chunk (plus overhead) back to the peer's rwnd. */
				stcb->asoc.peers_rwnd += chk->send_size;
				stcb->asoc.peers_rwnd += sctp_peer_chunk_oh;
			}
			chk->sent = SCTP_DATAGRAM_RESEND;
			SCTP_STAT_INCR(sctps_markedretrans);

			/* reset the TSN for striking and other FR stuff */
			chk->rec.data.doing_fast_retransmit = 0;
			/* Clear any time so NO RTT is being done */
			chk->do_rtt = 0;
			if (alt != net) {
				/* Move the chunk to the alternate destination. */
				sctp_free_remote_addr(chk->whoTo);
				chk->no_fr_allowed = 1;
				chk->whoTo = alt;
				atomic_add_int(&alt->ref_count, 1);
			} else {
				chk->no_fr_allowed = 0;
				if (TAILQ_EMPTY(&stcb->asoc.send_queue)) {
					chk->rec.data.fast_retran_tsn = stcb->asoc.sending_seq;
				} else {
					chk->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.TSN_seq;
				}
			}
			/*
			 * CMT: Do not allow FRs on retransmitted TSNs.
			 */
			if (sctp_cmt_on_off == 1) {
				chk->no_fr_allowed = 1;
			}
		} else if (chk->sent == SCTP_DATAGRAM_ACKED) {
			/* remember highest acked one */
			could_be_sent = chk;
		}
		if (chk->sent == SCTP_DATAGRAM_RESEND) {
			cnt_mk++;
		}
	}
	if ((orig_flight - net->flight_size) != (orig_tf - stcb->asoc.total_flight)) {
		/* we did not subtract the same things? */
		audit_tf = 1;
	}
	if (sctp_logging_level & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
		sctp_log_fr(tsnfirst, tsnlast, num_mk, SCTP_FR_T3_TIMEOUT);
	}
#ifdef SCTP_DEBUG
	/*
	 * NOTE(review): the two SCTPDBG pairs below print the same
	 * information twice (with %ld vs %d for peers_rwnd) - looks like a
	 * merge duplicate; confirm before removing.
	 */
	if (num_mk) {
		SCTPDBG(SCTP_DEBUG_TIMER1, "LAST TSN marked was %x\n",
		    tsnlast);
		SCTPDBG(SCTP_DEBUG_TIMER1, "Num marked for retransmission was %d peer-rwd:%ld\n",
		    num_mk, (u_long)stcb->asoc.peers_rwnd);
		SCTPDBG(SCTP_DEBUG_TIMER1, "LAST TSN marked was %x\n",
		    tsnlast);
		SCTPDBG(SCTP_DEBUG_TIMER1, "Num marked for retransmission was %d peer-rwd:%d\n",
		    num_mk,
		    (int)stcb->asoc.peers_rwnd);
	}
#endif
	*num_marked = num_mk;
	if ((stcb->asoc.sent_queue_retran_cnt == 0) && (could_be_sent)) {
		/* fix it so we retransmit the highest acked anyway */
		sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
		cnt_mk++;
		could_be_sent->sent = SCTP_DATAGRAM_RESEND;
	}
	if (stcb->asoc.sent_queue_retran_cnt != cnt_mk) {
#ifdef INVARIANTS
		SCTP_PRINTF("Local Audit says there are %d for retran asoc cnt:%d we marked:%d this time\n",
		    cnt_mk, stcb->asoc.sent_queue_retran_cnt, num_mk);
#endif
#ifndef SCTP_AUDITING_ENABLED
		/* Trust our local recount over the stale association counter. */
		stcb->asoc.sent_queue_retran_cnt = cnt_mk;
#endif
	}
	/* Now check for a ECN Echo that may be stranded */
	TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
		if ((chk->whoTo == net) &&
		    (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
			sctp_free_remote_addr(chk->whoTo);
			chk->whoTo = alt;
			if (chk->sent != SCTP_DATAGRAM_RESEND) {
				chk->sent = SCTP_DATAGRAM_RESEND;
				sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
			}
			atomic_add_int(&alt->ref_count, 1);
		}
	}
	if (audit_tf) {
		/* Rebuild all flight-size accounting from the sent queue. */
		SCTPDBG(SCTP_DEBUG_TIMER4,
		    "Audit total flight due to negative value net:%p\n",
		    net);
		stcb->asoc.total_flight = 0;
		stcb->asoc.total_flight_count = 0;
		/* Clear all networks flight size */
		TAILQ_FOREACH(lnets, &stcb->asoc.nets, sctp_next) {
			lnets->flight_size = 0;
			SCTPDBG(SCTP_DEBUG_TIMER4,
			    "Net:%p c-f cwnd:%d ssthresh:%d\n",
			    lnets, lnets->cwnd, lnets->ssthresh);
		}
		TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
			if (chk->sent < SCTP_DATAGRAM_RESEND) {
				if (sctp_logging_level & SCTP_FLIGHT_LOGGING_ENABLE) {
					sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
					    chk->whoTo->flight_size,
					    chk->book_size,
					    (uintptr_t) chk->whoTo,
					    chk->rec.data.TSN_seq);
				}
				sctp_flight_size_increase(chk);
				sctp_total_flight_increase(stcb, chk);
			}
		}
	}
	/*
	 * Setup the ecn nonce re-sync point. We do this since
	 * retranmissions are NOT setup for ECN. This means that do to
	 * Karn's rule, we don't know the total of the peers ecn bits.
	 */
	chk = TAILQ_FIRST(&stcb->asoc.send_queue);
	if (chk == NULL) {
		stcb->asoc.nonce_resync_tsn = stcb->asoc.sending_seq;
	} else {
		stcb->asoc.nonce_resync_tsn = chk->rec.data.TSN_seq;
	}
	stcb->asoc.nonce_wait_for_ecne = 0;
	stcb->asoc.nonce_sum_check = 0;
	/* We return 1 if we only have a window probe outstanding */
	/*
	 * NOTE(review): despite the comment above, this function always
	 * returns 0 here; the sole caller ignores the value anyway.
	 */
	return (0);
}
866
867static void
868sctp_move_all_chunks_to_alt(struct sctp_tcb *stcb,
869    struct sctp_nets *net,
870    struct sctp_nets *alt)
871{
872	struct sctp_association *asoc;
873	struct sctp_stream_out *outs;
874	struct sctp_tmit_chunk *chk;
875	struct sctp_stream_queue_pending *sp;
876
877	if (net == alt)
878		/* nothing to do */
879		return;
880
881	asoc = &stcb->asoc;
882
883	/*
884	 * now through all the streams checking for chunks sent to our bad
885	 * network.
886	 */
887	TAILQ_FOREACH(outs, &asoc->out_wheel, next_spoke) {
888		/* now clean up any chunks here */
889		TAILQ_FOREACH(sp, &outs->outqueue, next) {
890			if (sp->net == net) {
891				sctp_free_remote_addr(sp->net);
892				sp->net = alt;
893				atomic_add_int(&alt->ref_count, 1);
894			}
895		}
896	}
897	/* Now check the pending queue */
898	TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
899		if (chk->whoTo == net) {
900			sctp_free_remote_addr(chk->whoTo);
901			chk->whoTo = alt;
902			atomic_add_int(&alt->ref_count, 1);
903		}
904	}
905
906}
907
/*
 * T3-rtx (data retransmission) timer expiration for destination 'net'.
 * Picks an alternate destination (policy depends on the CMT/CMT-PF
 * sysctls), marks outstanding data for retransmission, backs off the
 * RTO/cwnd and runs threshold management.  Returns 1 if the
 * association was destroyed in the process (stcb must not be touched
 * afterwards), 0 otherwise.
 */
int
sctp_t3rxt_timer(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	struct sctp_nets *alt;
	int win_probe, num_mk;

	if (sctp_logging_level & SCTP_FR_LOGGING_ENABLE) {
		sctp_log_fr(0, 0, 0, SCTP_FR_T3_TIMEOUT);
	}
	if (sctp_logging_level & SCTP_CWND_LOGGING_ENABLE) {
		struct sctp_nets *lnet;

		/* Log every net's cwnd, flagging the one that timed out. */
		TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
			if (net == lnet) {
				sctp_log_cwnd(stcb, lnet, 1, SCTP_CWND_LOG_FROM_T3);
			} else {
				sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_LOG_FROM_T3);
			}
		}
	}
	/* Find an alternate and mark those for retransmission */
	/*
	 * A zero peer rwnd with less than one MTU in flight means this
	 * timeout was really for a window probe; it is accounted and
	 * penalized differently below.
	 */
	if ((stcb->asoc.peers_rwnd == 0) &&
	    (stcb->asoc.total_flight < net->mtu)) {
		SCTP_STAT_INCR(sctps_timowindowprobe);
		win_probe = 1;
	} else {
		win_probe = 0;
	}

	/*
	 * JRS 5/14/07 - If CMT PF is on and the destination if not already
	 * in PF state, set the destination to PF state and store the
	 * current time as the time that the destination was last active. In
	 * addition, find an alternate destination with PF-based
	 * find_alt_net().
	 */
	if (sctp_cmt_on_off && sctp_cmt_pf) {
		if ((net->dest_state & SCTP_ADDR_PF) != SCTP_ADDR_PF) {
			net->dest_state |= SCTP_ADDR_PF;
			net->last_active = sctp_get_tick_count();
			SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from active to PF.\n",
			    net);
		}
		alt = sctp_find_alternate_net(stcb, net, 2);
	} else if (sctp_cmt_on_off) {
		/*
		 * CMT: Using RTX_SSTHRESH policy for CMT. If CMT is being
		 * used, then pick dest with largest ssthresh for any
		 * retransmission.
		 */
		alt = net;
		alt = sctp_find_alternate_net(stcb, alt, 1);
		/*
		 * CUCv2: If a different dest is picked for the
		 * retransmission, then new (rtx-)pseudo_cumack needs to be
		 * tracked for orig dest. Let CUCv2 track new (rtx-)
		 * pseudo-cumack always.
		 */
		net->find_pseudo_cumack = 1;
		net->find_rtx_pseudo_cumack = 1;

	} else {		/* CMT is OFF */
		alt = sctp_find_alternate_net(stcb, net, 0);
	}

	(void)sctp_mark_all_for_resend(stcb, net, alt, win_probe, &num_mk);
	/* FR Loss recovery just ended with the T3. */
	stcb->asoc.fast_retran_loss_recovery = 0;

	/* CMT FR loss recovery ended with the T3 */
	net->fast_retran_loss_recovery = 0;

	/*
	 * setup the sat loss recovery that prevents satellite cwnd advance.
	 */
	stcb->asoc.sat_t3_loss_recovery = 1;
	stcb->asoc.sat_t3_recovery_tsn = stcb->asoc.sending_seq;

	/* Backoff the timer and cwnd */
	sctp_backoff_on_timeout(stcb, net, win_probe, num_mk);
	if (win_probe == 0) {
		/* We don't do normal threshold management on window probes */
		if (sctp_threshold_management(inp, stcb, net,
		    stcb->asoc.max_send_times)) {
			/* Association was destroyed */
			return (1);
		} else {
			if (net != stcb->asoc.primary_destination) {
				/* send a immediate HB if our RTO is stale */
				struct timeval now;
				unsigned int ms_goneby;

				(void)SCTP_GETTIME_TIMEVAL(&now);
				if (net->last_sent_time.tv_sec) {
					ms_goneby = (now.tv_sec - net->last_sent_time.tv_sec) * 1000;
				} else {
					ms_goneby = 0;
				}
				if ((ms_goneby > net->RTO) || (net->RTO == 0)) {
					/*
					 * no recent feed back in an RTO or
					 * more, request a RTT update
					 */
					/*
					 * NOTE(review): a negative return
					 * from sctp_send_hb() is treated
					 * here like "association gone".
					 */
					if (sctp_send_hb(stcb, 1, net) < 0)
						return 1;
				}
			}
		}
	} else {
		/*
		 * For a window probe we don't penalize the net's but only
		 * the association. This may fail it if SACKs are not coming
		 * back. If sack's are coming with rwnd locked at 0, we will
		 * continue to hold things waiting for rwnd to raise
		 */
		if (sctp_threshold_management(inp, stcb, NULL,
		    stcb->asoc.max_send_times)) {
			/* Association was destroyed */
			return (1);
		}
	}
	if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
		/* Move all pending over too */
		sctp_move_all_chunks_to_alt(stcb, net, alt);

		/*
		 * Get the address that failed, to force a new src address
		 * selecton and a route allocation.
		 */
		if (net->ro._s_addr) {
			sctp_free_ifa(net->ro._s_addr);
			net->ro._s_addr = NULL;
		}
		net->src_addr_selected = 0;

		/* Force a route allocation too */
		if (net->ro.ro_rt) {
			RTFREE(net->ro.ro_rt);
			net->ro.ro_rt = NULL;
		}
		/* Was it our primary? */
		if ((stcb->asoc.primary_destination == net) && (alt != net)) {
			/*
			 * Yes, note it as such and find an alternate note:
			 * this means HB code must use this to resent the
			 * primary if it goes active AND if someone does a
			 * change-primary then this flag must be cleared
			 * from any net structures.
			 */
			if (sctp_set_primary_addr(stcb,
			    (struct sockaddr *)NULL,
			    alt) == 0) {
				net->dest_state |= SCTP_ADDR_WAS_PRIMARY;
			}
		}
	} else if (sctp_cmt_on_off && sctp_cmt_pf && (net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF) {
		/*
		 * JRS 5/14/07 - If the destination hasn't failed completely
		 * but is in PF state, a PF-heartbeat needs to be sent
		 * manually.
		 */
		if (sctp_send_hb(stcb, 1, net) < 0)
			return 1;
	}
	/*
	 * Special case for cookie-echo'ed case, we don't do output but must
	 * await the COOKIE-ACK before retransmission
	 */
	if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) {
		/*
		 * Here we just reset the timer and start again since we
		 * have not established the asoc
		 */
		sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
		return (0);
	}
	if (stcb->asoc.peer_supports_prsctp) {
		struct sctp_tmit_chunk *lchk;

		lchk = sctp_try_advance_peer_ack_point(stcb, &stcb->asoc);
		/* C3. See if we need to send a Fwd-TSN */
		if (compare_with_wrap(stcb->asoc.advanced_peer_ack_point,
		    stcb->asoc.last_acked_seq, MAX_TSN)) {
			/*
			 * ISSUE with ECN, see FWD-TSN processing for notes
			 * on issues that will occur when the ECN NONCE
			 * stuff is put into SCTP for cross checking.
			 */
			send_forward_tsn(stcb, &stcb->asoc);
			if (lchk) {
				/* Assure a timer is up */
				sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, lchk->whoTo);
			}
		}
	}
	if (sctp_logging_level & SCTP_CWND_MONITOR_ENABLE) {
		sctp_log_cwnd(stcb, net, net->cwnd, SCTP_CWND_LOG_FROM_RTX);
	}
	return (0);
}
1110
/*
 * T1-init timer expiration: retransmit our INIT after threshold
 * management and RTO backoff, possibly switching to an alternate
 * primary destination first.  Returns 1 if the association was
 * destroyed, 0 otherwise.
 */
int
sctp_t1init_timer(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	/* bump the thresholds */
	if (stcb->asoc.delayed_connection) {
		/*
		 * special hook for delayed connection. The library did NOT
		 * complete the rest of its sends.
		 */
		stcb->asoc.delayed_connection = 0;
		sctp_send_initiate(inp, stcb);
		return (0);
	}
	if (SCTP_GET_STATE((&stcb->asoc)) != SCTP_STATE_COOKIE_WAIT) {
		/* Only retransmit the INIT while awaiting the INIT-ACK. */
		return (0);
	}
	if (sctp_threshold_management(inp, stcb, net,
	    stcb->asoc.max_init_times)) {
		/* Association was destroyed */
		return (1);
	}
	stcb->asoc.dropped_special_cnt = 0;
	sctp_backoff_on_timeout(stcb, stcb->asoc.primary_destination, 1, 0);
	/* Clamp the backed-off RTO to the configured INIT-phase maximum. */
	if (stcb->asoc.initial_init_rto_max < net->RTO) {
		net->RTO = stcb->asoc.initial_init_rto_max;
	}
	if (stcb->asoc.numnets > 1) {
		/* If we have more than one addr use it */
		struct sctp_nets *alt;

		alt = sctp_find_alternate_net(stcb, stcb->asoc.primary_destination, 0);
		if ((alt != NULL) && (alt != stcb->asoc.primary_destination)) {
			/* Retarget everything and make the alternate primary. */
			sctp_move_all_chunks_to_alt(stcb, stcb->asoc.primary_destination, alt);
			stcb->asoc.primary_destination = alt;
		}
	}
	/* Send out a new init */
	sctp_send_initiate(inp, stcb);
	return (0);
}
1153
1154/*
1155 * For cookie and asconf we actually need to find and mark for resend, then
1156 * increment the resend counter (after all the threshold management stuff of
1157 * course).
1158 */
/*
 * COOKIE-ECHO timer expiration: locate the queued COOKIE-ECHO chunk,
 * run threshold management and RTO backoff, move the chunk to an
 * alternate destination when one exists, and mark it for
 * retransmission.  Returns 1 if the association was destroyed, else 0.
 */
int
sctp_cookie_timer(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	struct sctp_nets *alt;
	struct sctp_tmit_chunk *cookie;

	/* first before all else we must find the cookie */
	TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, sctp_next) {
		if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
			break;
		}
	}
	if (cookie == NULL) {
		if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) {
			/* FOOBAR! */
			/*
			 * In COOKIE_ECHOED state the chunk must still be
			 * queued; abort the association with a
			 * protocol-violation cause carrying our location
			 * code for post-mortem analysis.
			 */
			struct mbuf *oper;

			oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
			    0, M_DONTWAIT, 1, MT_DATA);
			if (oper) {
				struct sctp_paramhdr *ph;
				uint32_t *ippp;

				SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
				    sizeof(uint32_t);
				ph = mtod(oper, struct sctp_paramhdr *);
				ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
				ph->param_length = htons(SCTP_BUF_LEN(oper));
				ippp = (uint32_t *) (ph + 1);
				*ippp = htonl(SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
			}
			inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_4;
			sctp_abort_an_association(inp, stcb, SCTP_INTERNAL_ERROR,
			    oper);
		} else {
#ifdef INVARIANTS
			panic("Cookie timer expires in wrong state?");
#else
			SCTP_PRINTF("Strange in state %d not cookie-echoed yet c-e timer expires?\n", SCTP_GET_STATE(&stcb->asoc));
			return (0);
#endif
		}
		return (0);
	}
	/* Ok we found the cookie, threshold management next */
	if (sctp_threshold_management(inp, stcb, cookie->whoTo,
	    stcb->asoc.max_init_times)) {
		/* Assoc is over */
		return (1);
	}
	/*
	 * cleared theshold management now lets backoff the address & select
	 * an alternate
	 */
	stcb->asoc.dropped_special_cnt = 0;
	sctp_backoff_on_timeout(stcb, cookie->whoTo, 1, 0);
	alt = sctp_find_alternate_net(stcb, cookie->whoTo, 0);
	if (alt != cookie->whoTo) {
		/* Retarget the cookie, keeping the net refcounts balanced. */
		sctp_free_remote_addr(cookie->whoTo);
		cookie->whoTo = alt;
		atomic_add_int(&alt->ref_count, 1);
	}
	/* Now mark the retran info */
	if (cookie->sent != SCTP_DATAGRAM_RESEND) {
		sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
	}
	cookie->sent = SCTP_DATAGRAM_RESEND;
	/*
	 * Now call the output routine to kick out the cookie again, Note we
	 * don't mark any chunks for retran so that FR will need to kick in
	 * to move these (or a send timer).
	 */
	return (0);
}
1235
/*
 * Stream-reset timer expiration: retransmit the outstanding
 * STREAM-RESET request toward an alternate destination after threshold
 * management and RTO backoff.  Any ECN-ECHO chunk stranded on the
 * timed-out net is retargeted along with it.  Returns 1 if the
 * association was destroyed, else 0.
 */
int
sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	struct sctp_nets *alt;
	struct sctp_tmit_chunk *strrst = NULL, *chk = NULL;

	if (stcb->asoc.stream_reset_outstanding == 0) {
		/* Nothing outstanding; stale timer fire. */
		return (0);
	}
	/* find the existing STRRESET, we use the seq number we sent out on */
	(void)sctp_find_stream_reset(stcb, stcb->asoc.str_reset_seq_out, &strrst);
	if (strrst == NULL) {
		return (0);
	}
	/* do threshold management */
	if (sctp_threshold_management(inp, stcb, strrst->whoTo,
	    stcb->asoc.max_send_times)) {
		/* Assoc is over */
		return (1);
	}
	/*
	 * cleared theshold management now lets backoff the address & select
	 * an alternate
	 */
	sctp_backoff_on_timeout(stcb, strrst->whoTo, 1, 0);
	alt = sctp_find_alternate_net(stcb, strrst->whoTo, 0);
	/* Retarget the request, keeping the net refcounts balanced. */
	sctp_free_remote_addr(strrst->whoTo);
	strrst->whoTo = alt;
	atomic_add_int(&alt->ref_count, 1);

	/* See if a ECN Echo is also stranded */
	TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
		if ((chk->whoTo == net) &&
		    (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
			sctp_free_remote_addr(chk->whoTo);
			if (chk->sent != SCTP_DATAGRAM_RESEND) {
				chk->sent = SCTP_DATAGRAM_RESEND;
				sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
			}
			chk->whoTo = alt;
			atomic_add_int(&alt->ref_count, 1);
		}
	}
	if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
		/*
		 * If the address went un-reachable, we need to move to
		 * alternates for ALL chk's in queue
		 */
		sctp_move_all_chunks_to_alt(stcb, net, alt);
	}
	/* mark the retran info */
	if (strrst->sent != SCTP_DATAGRAM_RESEND)
		sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
	strrst->sent = SCTP_DATAGRAM_RESEND;

	/* restart the timer */
	sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, inp, stcb, strrst->whoTo);
	return (0);
}
1296
/*
 * ASCONF timer expiration: compose and send the first ASCONF if none
 * has been sent yet; otherwise retransmit the queued ASCONF toward an
 * alternate destination after threshold management and backoff.  A
 * peer that has ignored more than max_send_times ASCONFs is declared
 * ASCONF-incapable and cleaned up.  Returns 1 if the association was
 * destroyed, else 0.
 */
int
sctp_asconf_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	struct sctp_nets *alt;
	struct sctp_tmit_chunk *asconf, *chk;

	/* is this the first send, or a retransmission? */
	if (stcb->asoc.asconf_sent == 0) {
		/* compose a new ASCONF chunk and send it */
		sctp_send_asconf(stcb, net);
	} else {
		/* Retransmission of the existing ASCONF needed... */

		/* find the existing ASCONF */
		TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue,
		    sctp_next) {
			if (asconf->rec.chunk_id.id == SCTP_ASCONF) {
				break;
			}
		}
		if (asconf == NULL) {
			/* Nothing queued to retransmit. */
			return (0);
		}
		/* do threshold management */
		if (sctp_threshold_management(inp, stcb, asconf->whoTo,
		    stcb->asoc.max_send_times)) {
			/* Assoc is over */
			return (1);
		}
		/*
		 * PETER? FIX? How will the following code ever run? If the
		 * max_send_times is hit, threshold managment will blow away
		 * the association?
		 */
		if (asconf->snd_count > stcb->asoc.max_send_times) {
			/*
			 * Something is rotten, peer is not responding to
			 * ASCONFs but maybe is to data etc.  e.g. it is not
			 * properly handling the chunk type upper bits Mark
			 * this peer as ASCONF incapable and cleanup
			 */
			SCTPDBG(SCTP_DEBUG_TIMER1, "asconf_timer: Peer has not responded to our repeated ASCONFs\n");
			sctp_asconf_cleanup(stcb, net);
			return (0);
		}
		/*
		 * cleared theshold management now lets backoff the address
		 * & select an alternate
		 */
		sctp_backoff_on_timeout(stcb, asconf->whoTo, 1, 0);
		alt = sctp_find_alternate_net(stcb, asconf->whoTo, 0);
		/* Retarget the ASCONF, keeping the net refcounts balanced. */
		sctp_free_remote_addr(asconf->whoTo);
		asconf->whoTo = alt;
		atomic_add_int(&alt->ref_count, 1);

		/* See if a ECN Echo is also stranded */
		TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
			if ((chk->whoTo == net) &&
			    (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
				sctp_free_remote_addr(chk->whoTo);
				chk->whoTo = alt;
				if (chk->sent != SCTP_DATAGRAM_RESEND) {
					chk->sent = SCTP_DATAGRAM_RESEND;
					sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
				}
				atomic_add_int(&alt->ref_count, 1);
			}
		}
		if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
			/*
			 * If the address went un-reachable, we need to move
			 * to alternates for ALL chk's in queue
			 */
			sctp_move_all_chunks_to_alt(stcb, net, alt);
		}
		/* mark the retran info */
		if (asconf->sent != SCTP_DATAGRAM_RESEND)
			sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
		asconf->sent = SCTP_DATAGRAM_RESEND;
	}
	return (0);
}
1380
1381/*
1382 * For the shutdown and shutdown-ack, we do not keep one around on the
1383 * control queue. This means we must generate a new one and call the general
1384 * chunk output routine, AFTER having done threshold management.
1385 */
1386int
1387sctp_shutdown_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
1388    struct sctp_nets *net)
1389{
1390	struct sctp_nets *alt;
1391
1392	/* first threshold managment */
1393	if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
1394		/* Assoc is over */
1395		return (1);
1396	}
1397	/* second select an alternative */
1398	alt = sctp_find_alternate_net(stcb, net, 0);
1399
1400	/* third generate a shutdown into the queue for out net */
1401	if (alt) {
1402		sctp_send_shutdown(stcb, alt);
1403	} else {
1404		/*
1405		 * if alt is NULL, there is no dest to send to??
1406		 */
1407		return (0);
1408	}
1409	/* fourth restart timer */
1410	sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, inp, stcb, alt);
1411	return (0);
1412}
1413
1414int
1415sctp_shutdownack_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
1416    struct sctp_nets *net)
1417{
1418	struct sctp_nets *alt;
1419
1420	/* first threshold managment */
1421	if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
1422		/* Assoc is over */
1423		return (1);
1424	}
1425	/* second select an alternative */
1426	alt = sctp_find_alternate_net(stcb, net, 0);
1427
1428	/* third generate a shutdown into the queue for out net */
1429	sctp_send_shutdown_ack(stcb, alt);
1430
1431	/* fourth restart timer */
1432	sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, inp, stcb, alt);
1433	return (0);
1434}
1435
/*
 * Consistency audit invoked when the send/sent queues are empty yet
 * total_output_queue_size still claims queued bytes.  Re-attaches any
 * stream with queued data that fell off the output wheel, then either
 * kicks the output path or zeroes the stale byte count.
 */
static void
sctp_audit_stream_queues_for_size(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb)
{
	struct sctp_stream_out *outs;
	struct sctp_stream_queue_pending *sp;
	unsigned int chks_in_queue = 0;
	int being_filled = 0;

	/*
	 * This function is ONLY called when the send/sent queues are empty.
	 */
	if ((stcb == NULL) || (inp == NULL))
		return;

	if (stcb->asoc.sent_queue_retran_cnt) {
		/* With an empty sent queue a retran count must be stale. */
		SCTP_PRINTF("Hmm, sent_queue_retran_cnt is non-zero %d\n",
		    stcb->asoc.sent_queue_retran_cnt);
		stcb->asoc.sent_queue_retran_cnt = 0;
	}
	SCTP_TCB_SEND_LOCK(stcb);
	if (TAILQ_EMPTY(&stcb->asoc.out_wheel)) {
		int i, cnt = 0;

		/* Check to see if a spoke fell off the wheel */
		for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
			if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
				sctp_insert_on_wheel(stcb, &stcb->asoc, &stcb->asoc.strmout[i], 1);
				cnt++;
			}
		}
		if (cnt) {
			/* yep, we lost a spoke or two */
			SCTP_PRINTF("Found an additional %d streams NOT on outwheel, corrected\n", cnt);
		} else {
			/* no spokes lost, */
			stcb->asoc.total_output_queue_size = 0;
		}
		SCTP_TCB_SEND_UNLOCK(stcb);
		return;
	}
	SCTP_TCB_SEND_UNLOCK(stcb);
	/* Check to see if some data queued, if so report it */
	TAILQ_FOREACH(outs, &stcb->asoc.out_wheel, next_spoke) {
		if (!TAILQ_EMPTY(&outs->outqueue)) {
			TAILQ_FOREACH(sp, &outs->outqueue, next) {
				if (sp->msg_is_complete)
					being_filled++;
				chks_in_queue++;
			}
		}
	}
	if (chks_in_queue != stcb->asoc.stream_queue_cnt) {
		/* Counters out of sync: report, but keep going. */
		SCTP_PRINTF("Hmm, stream queue cnt at %d I counted %d in stream out wheel\n",
		    stcb->asoc.stream_queue_cnt, chks_in_queue);
	}
	if (chks_in_queue) {
		/* call the output queue function */
		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3);
		if ((TAILQ_EMPTY(&stcb->asoc.send_queue)) &&
		    (TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
			/*
			 * Probably should go in and make it go back through
			 * and add fragments allowed
			 */
			if (being_filled == 0) {
				SCTP_PRINTF("Still nothing moved %d chunks are stuck\n",
				    chks_in_queue);
			}
		}
	} else {
		/* Nothing is really queued; clear the stale byte count. */
		SCTP_PRINTF("Found no chunks on any queue tot:%lu\n",
		    (u_long)stcb->asoc.total_output_queue_size);
		stcb->asoc.total_output_queue_size = 0;
	}
}
1512
/*
 * Heartbeat timer expiration.  Backs off (and invalidates the cached
 * source address of) a net that did not answer the previous HB, audits
 * the stream queues when the byte count disagrees with empty send/sent
 * queues, then sends heartbeats: a single one when cnt_of_unconf is 0,
 * otherwise up to sctp_hb_maxburst HBs to unconfirmed-but-reachable
 * addresses.  Returns 1 when sctp_send_hb() reports failure (treated
 * as association gone), 0 otherwise.
 */
int
sctp_heartbeat_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
    struct sctp_nets *net, int cnt_of_unconf)
{
	int ret;

	if (net) {
		if (net->hb_responded == 0) {
			if (net->ro._s_addr) {
				/*
				 * Invalidate the src address if we did not
				 * get a response last time.
				 */
				sctp_free_ifa(net->ro._s_addr);
				net->ro._s_addr = NULL;
				net->src_addr_selected = 0;
			}
			sctp_backoff_on_timeout(stcb, net, 1, 0);
		}
		/* Zero PBA, if it needs it */
		if (net->partial_bytes_acked) {
			net->partial_bytes_acked = 0;
		}
	}
	if ((stcb->asoc.total_output_queue_size > 0) &&
	    (TAILQ_EMPTY(&stcb->asoc.send_queue)) &&
	    (TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
		/* Byte count claims data but queues are empty: reconcile. */
		sctp_audit_stream_queues_for_size(inp, stcb);
	}
	/* Send a new HB, this will do threshold managment, pick a new dest */
	if (cnt_of_unconf == 0) {
		if (sctp_send_hb(stcb, 0, NULL) < 0) {
			return (1);
		}
	} else {
		/*
		 * this will send out extra hb's up to maxburst if there are
		 * any unconfirmed addresses.
		 */
		uint32_t cnt_sent = 0;

		TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
			if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) &&
			    (net->dest_state & SCTP_ADDR_REACHABLE)) {
				cnt_sent++;
				if (net->hb_responded == 0) {
					/* Did we respond last time? */
					if (net->ro._s_addr) {
						sctp_free_ifa(net->ro._s_addr);
						net->ro._s_addr = NULL;
						net->src_addr_selected = 0;
					}
				}
				ret = sctp_send_hb(stcb, 1, net);
				if (ret < 0)
					return 1;
				else if (ret == 0) {
					break;
				}
				if (cnt_sent >= sctp_hb_maxburst)
					break;
			}
		}
	}
	return (0);
}
1579
1580int
1581sctp_is_hb_timer_running(struct sctp_tcb *stcb)
1582{
1583	if (SCTP_OS_TIMER_PENDING(&stcb->asoc.hb_timer.timer)) {
1584		/* its running */
1585		return (1);
1586	} else {
1587		/* nope */
1588		return (0);
1589	}
1590}
1591
1592int
1593sctp_is_sack_timer_running(struct sctp_tcb *stcb)
1594{
1595	if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
1596		/* its running */
1597		return (1);
1598	} else {
1599		/* nope */
1600		return (0);
1601	}
1602}
1603
#define SCTP_NUMBER_OF_MTU_SIZES 18
/* Table of well-known path MTU sizes, in strictly increasing order. */
static uint32_t mtu_sizes[] = {
	68,
	296,
	508,
	512,
	544,
	576,
	1006,
	1492,
	1500,
	1536,
	2002,
	2048,
	4352,
	4464,
	8166,
	17914,
	32000,
	65535
};


/*
 * Return the smallest entry in mtu_sizes[] that is strictly larger
 * than cur_mtu, or cur_mtu itself when it already is at (or above)
 * the largest table entry.  The 'inp' argument is unused.
 */
static uint32_t
sctp_getnext_mtu(struct sctp_inpcb *inp, uint32_t cur_mtu)
{
	/* select another MTU that is just bigger than this one */
	size_t i;

	/*
	 * Derive the bound from the table itself instead of trusting
	 * SCTP_NUMBER_OF_MTU_SIZES to stay in sync with the array.
	 */
	for (i = 0; i < sizeof(mtu_sizes) / sizeof(mtu_sizes[0]); i++) {
		if (cur_mtu < mtu_sizes[i]) {
			/* no max_mtu is bigger than this one */
			return (mtu_sizes[i]);
		}
	}
	/* here return the highest allowable */
	return (cur_mtu);
}
1642
1643
/*
 * Path-MTU raise timer: probe whether the path to 'net' can carry the
 * next larger MTU from the mtu_sizes table, refreshing the cached
 * source address first if it was never selected or is being deleted.
 */
void
sctp_pathmtu_timer(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	uint32_t next_mtu;

	/* restart the timer in any case */
	/*
	 * NOTE(review): despite the comment above, the early return below
	 * skips the sctp_timer_start() at the end -- confirm the timer is
	 * meant to stay stopped once we are already at the largest MTU.
	 */
	next_mtu = sctp_getnext_mtu(inp, net->mtu);
	if (next_mtu <= net->mtu) {
		/* nothing to do */
		return;
	} {
		uint32_t mtu;

		/* Re-validate the cached source address before probing. */
		if ((net->src_addr_selected == 0) ||
		    (net->ro._s_addr == NULL) ||
		    (net->ro._s_addr->localifa_flags & SCTP_BEING_DELETED)) {
			if ((net->ro._s_addr != NULL) && (net->ro._s_addr->localifa_flags & SCTP_BEING_DELETED)) {
				/* Drop a source ifa that is going away. */
				sctp_free_ifa(net->ro._s_addr);
				net->ro._s_addr = NULL;
				net->src_addr_selected = 0;
			} else if (net->ro._s_addr == NULL) {
				net->ro._s_addr = sctp_source_address_selection(inp,
				    stcb,
				    (sctp_route_t *) & net->ro,
				    net, 0, stcb->asoc.vrf_id);
			}
			if (net->ro._s_addr)
				net->src_addr_selected = 1;
		}
		if (net->ro._s_addr) {
			/*
			 * NOTE(review): 'net->ro._s_addr.sa' applies '.' to
			 * a pointer ('_s_addr' is dereferenced with '->'
			 * above); presumably this only compiles because the
			 * macro does not expand that argument -- verify the
			 * intended expression.
			 */
			mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_rt);
			/* Only raise net->mtu when the route allows it. */
			if (mtu > next_mtu) {
				net->mtu = next_mtu;
			}
		}
	}
	/* restart the timer */
	sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
}
1685
/*
 * Autoclose timer expiration.  When the autoclose feature is enabled
 * and the association has been idle (no send or receive) for at least
 * sctp_autoclose_ticks, flush pending output and, once the send and
 * sent queues are empty, initiate the SHUTDOWN sequence.  Otherwise
 * the timer is re-armed for the remaining idle time.
 */
void
sctp_autoclose_timer(struct sctp_inpcb *inp,
    struct sctp_tcb *stcb,
    struct sctp_nets *net)
{
	struct timeval tn, *tim_touse;
	struct sctp_association *asoc;
	int ticks_gone_by;

	(void)SCTP_GETTIME_TIMEVAL(&tn);
	if (stcb->asoc.sctp_autoclose_ticks &&
	    sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
		/* Auto close is on */
		asoc = &stcb->asoc;
		/* pick the time to use */
		/* Idle time is measured from the most recent activity. */
		if (asoc->time_last_rcvd.tv_sec >
		    asoc->time_last_sent.tv_sec) {
			tim_touse = &asoc->time_last_rcvd;
		} else {
			tim_touse = &asoc->time_last_sent;
		}
		/* Now has long enough transpired to autoclose? */
		ticks_gone_by = SEC_TO_TICKS(tn.tv_sec - tim_touse->tv_sec);
		if ((ticks_gone_by > 0) &&
		    (ticks_gone_by >= (int)asoc->sctp_autoclose_ticks)) {
			/*
			 * autoclose time has hit, call the output routine,
			 * which should do nothing just to be SURE we don't
			 * have hanging data. We can then safely check the
			 * queues and know that we are clear to send
			 * shutdown
			 */
			sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR);
			/* Are we clean? */
			if (TAILQ_EMPTY(&asoc->send_queue) &&
			    TAILQ_EMPTY(&asoc->sent_queue)) {
				/*
				 * there is nothing queued to send, so I'm
				 * done...
				 */
				if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) {
					/* only send SHUTDOWN 1st time thru */
					sctp_send_shutdown(stcb, stcb->asoc.primary_destination);
					if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
					    (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
						SCTP_STAT_DECR_GAUGE32(sctps_currestab);
					}
					asoc->state = SCTP_STATE_SHUTDOWN_SENT;
					/* Arm the SHUTDOWN and guard timers. */
					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
					    stcb->sctp_ep, stcb,
					    asoc->primary_destination);
					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
					    stcb->sctp_ep, stcb,
					    asoc->primary_destination);
				}
			}
		} else {
			/*
			 * No auto close at this time, reset t-o to check
			 * later
			 */
			int tmp;

			/* fool the timer startup to use the time left */
			tmp = asoc->sctp_autoclose_ticks;
			asoc->sctp_autoclose_ticks -= ticks_gone_by;
			sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb,
			    net);
			/* restore the real tick value */
			asoc->sctp_autoclose_ticks = tmp;
		}
	}
}
1759
/*
 * Iterator timer: (re)run one pass of an endpoint/association
 * iterator.  Walks endpoints whose pcb flags/features match the
 * iterator's filters, invoking the per-endpoint and per-association
 * callbacks, and after SCTP_ITERATOR_MAX_AT_ONCE associations re-arms
 * itself so the walk resumes later instead of running unbounded.
 * Only one iterator runs at a time (SCTP_ITERATOR_LOCK).
 */
void
sctp_iterator_timer(struct sctp_iterator *it)
{
	int iteration_count = 0;
	int inp_skip = 0;

	/*
	 * only one iterator can run at a time. This is the only way we can
	 * cleanly pull ep's from underneath all the running interators when
	 * a ep is freed.
	 */
	SCTP_ITERATOR_LOCK();
	if (it->inp == NULL) {
		/* iterator is complete */
done_with_iterator:
		/* Unlink, fire the completion callback, free the iterator. */
		SCTP_ITERATOR_UNLOCK();
		SCTP_INP_INFO_WLOCK();
		TAILQ_REMOVE(&sctppcbinfo.iteratorhead, it, sctp_nxt_itr);
		/* stopping the callout is not needed, in theory */
		SCTP_INP_INFO_WUNLOCK();
		(void)SCTP_OS_TIMER_STOP(&it->tmr.timer);
		if (it->function_atend != NULL) {
			(*it->function_atend) (it->pointer, it->val);
		}
		SCTP_FREE(it, SCTP_M_ITER);
		return;
	}
select_a_new_ep:
	SCTP_INP_WLOCK(it->inp);
	while (((it->pcb_flags) &&
	    ((it->inp->sctp_flags & it->pcb_flags) != it->pcb_flags)) ||
	    ((it->pcb_features) &&
	    ((it->inp->sctp_features & it->pcb_features) != it->pcb_features))) {
		/* endpoint flags or features don't match, so keep looking */
		if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
			SCTP_INP_WUNLOCK(it->inp);
			goto done_with_iterator;
		}
		SCTP_INP_WUNLOCK(it->inp);
		it->inp = LIST_NEXT(it->inp, sctp_list);
		if (it->inp == NULL) {
			goto done_with_iterator;
		}
		SCTP_INP_WLOCK(it->inp);
	}
	if ((it->inp->inp_starting_point_for_iterator != NULL) &&
	    (it->inp->inp_starting_point_for_iterator != it)) {
		/* Another iterator owns this endpoint; retry later. */
		SCTP_PRINTF("Iterator collision, waiting for one at %p\n",
		    it->inp);
		SCTP_INP_WUNLOCK(it->inp);
		goto start_timer_return;
	}
	/* mark the current iterator on the endpoint */
	it->inp->inp_starting_point_for_iterator = it;
	SCTP_INP_WUNLOCK(it->inp);
	SCTP_INP_RLOCK(it->inp);
	/* now go through each assoc which is in the desired state */
	if (it->done_current_ep == 0) {
		/* Per-endpoint callback runs once per endpoint. */
		if (it->function_inp != NULL)
			inp_skip = (*it->function_inp) (it->inp, it->pointer, it->val);
		it->done_current_ep = 1;
	}
	if (it->stcb == NULL) {
		/* run the per instance function */
		it->stcb = LIST_FIRST(&it->inp->sctp_asoc_list);
	}
	SCTP_INP_RUNLOCK(it->inp);
	if ((inp_skip) || it->stcb == NULL) {
		if (it->function_inp_end != NULL) {
			inp_skip = (*it->function_inp_end) (it->inp,
			    it->pointer,
			    it->val);
		}
		goto no_stcb;
	}
	if ((it->stcb) &&
	    (it->stcb->asoc.stcb_starting_point_for_iterator == it)) {
		/* Resuming: clear our resume marker on this assoc. */
		it->stcb->asoc.stcb_starting_point_for_iterator = NULL;
	}
	while (it->stcb) {
		SCTP_TCB_LOCK(it->stcb);
		if (it->asoc_state && ((it->stcb->asoc.state & it->asoc_state) != it->asoc_state)) {
			/* not in the right state... keep looking */
			SCTP_TCB_UNLOCK(it->stcb);
			goto next_assoc;
		}
		/* mark the current iterator on the assoc */
		it->stcb->asoc.stcb_starting_point_for_iterator = it;
		/* see if we have limited out the iterator loop */
		iteration_count++;
		if (iteration_count > SCTP_ITERATOR_MAX_AT_ONCE) {
	start_timer_return:
			/* set a timer to continue this later */
			if (it->stcb)
				SCTP_TCB_UNLOCK(it->stcb);
			sctp_timer_start(SCTP_TIMER_TYPE_ITERATOR,
			    (struct sctp_inpcb *)it, NULL, NULL);
			SCTP_ITERATOR_UNLOCK();
			return;
		}
		/* run function on this one */
		(*it->function_assoc) (it->inp, it->stcb, it->pointer, it->val);

		/*
		 * we lie here, it really needs to have its own type but
		 * first I must verify that this won't effect things :-0
		 */
		if (it->no_chunk_output == 0)
			sctp_chunk_output(it->inp, it->stcb, SCTP_OUTPUT_FROM_T3);

		SCTP_TCB_UNLOCK(it->stcb);
next_assoc:
		it->stcb = LIST_NEXT(it->stcb, sctp_tcblist);
		if (it->stcb == NULL) {
			/* Last assoc on this ep: run the per-inp end hook. */
			if (it->function_inp_end != NULL) {
				inp_skip = (*it->function_inp_end) (it->inp,
				    it->pointer,
				    it->val);
			}
		}
	}
no_stcb:
	/* done with all assocs on this endpoint, move on to next endpoint */
	it->done_current_ep = 0;
	SCTP_INP_WLOCK(it->inp);
	it->inp->inp_starting_point_for_iterator = NULL;
	SCTP_INP_WUNLOCK(it->inp);
	if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) {
		it->inp = NULL;
	} else {
		SCTP_INP_INFO_RLOCK();
		it->inp = LIST_NEXT(it->inp, sctp_list);
		SCTP_INP_INFO_RUNLOCK();
	}
	if (it->inp == NULL) {
		goto done_with_iterator;
	}
	goto select_a_new_ep;
}
1899