/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"
#include "opt_tcpdebug.h"

/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp. Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire. It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main way it is used by Rack and BBR, it can
 * be used to call tcp_output() of a transport stack at some time in the future.
 * The normal way this is done is that tcp_output() of the stack schedules
 * itself to be called again by calling tcp_hpts_insert(inpcb, slot). The
 * slot is the time from now that the stack wants to be called but it
 * must be converted to tcp_hpts's notion of slot. This is done with
 * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
 * call from the tcp_output() routine might look like:
 *
 * tcp_hpts_insert(inp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add a check
 * near its top to prevent unwanted calls (from user land or the
 * arrival of incoming ACKs). So it would add something like:
 *
 * if (inp->inp_in_hpts)
 *    return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare-bones example and the stack will probably
 * have more considerations than just the above.
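 *
 * Putting those pieces together, a minimal sketch of a paced output
 * path might look like the following (hypothetical stack code, with
 * the actual send logic elided):
 *
 * static int
 * example_output(struct tcpcb *tp)
 * {
 *    if (tp->t_inpcb->inp_in_hpts)
 *       return (0);              <- the pacer will call us back
 *    ... send what the pacing budget currently allows ...
 *    tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(550));
 *    return (0);
 * }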
 *
 * Now the second function (actually two functions I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this? To keep processor locality,
 * and to not have to worry about untangling any recursive
 * locks. The input function is now hooked to the new LRO
 * system as well.
 *
 * In order to use the input redirection function the
 * tcp stack must define an input function for
 * tfb_do_queued_segments(). This function understands
 * how to dequeue an array of packets that were input and
 * knows how to call the correct processing routine.
 *
 * Locking in this is important as well, so most likely the
 * stack will need to define tfb_do_segment_nounlock(),
 * splitting tfb_do_segment() into two parts: a main processing
 * part that does not unlock the INP and returns a value of 1 or 0.
 * It returns 0 if all is well and the lock was not released. It
 * returns 1 if we had to destroy the TCB (a reset received etc).
 * The remains of tfb_do_segment() then become just a simple call
 * to the tfb_do_segment_nounlock() function, checking the return
 * code and possibly unlocking.
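 *
 * A sketch of that split, assuming an argument list that mirrors
 * this era's tfb_tcp_do_segment() method (hypothetical code, the
 * names are illustrative only):
 *
 * static void
 * example_do_segment(struct mbuf *m, struct tcphdr *th,
 *     struct socket *so, struct tcpcb *tp, int drop_hdrlen,
 *     int tlen, uint8_t iptos)
 * {
 *    struct timeval tv;
 *
 *    microuptime(&tv);
 *    if (example_do_segment_nounlock(m, th, so, tp, drop_hdrlen,
 *        tlen, iptos, 0, &tv) == 0)
 *       INP_WUNLOCK(tp->t_inpcb);
 *    ... a return of 1 means the TCB was destroyed and the lock
 *    was already dropped ...
 * }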
 *
 * The stack must also set the flag on the INP that it supports this
 * feature, i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
 * this flag as well and will queue packets when it is set.
 * There are other flags as well: INP_MBUF_QUEUE_READY and
 * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
 * that we are in the pacer for output so there is no
 * need to wake up the hpts system to get immediate
 * input. The second tells the LRO code that it is okay,
 * even if a SACK arrives, to still defer input and let
 * the current hpts timer run (this is usually set when
 * a rack timer is up so we know SACKs are happening
 * on the connection already and don't want to wake up yet).
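 *
 * Enabling the queued-segment path from a stack's init/teardown
 * code might then look like (illustrative only):
 *
 *    inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;    <- on stack init
 *    ...
 *    inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;   <- on stack fini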
 *
 * There is a common function within the rack_bbr_common code,
 * i.e. ctf_do_queued_segments(). This function
 * knows how to take the input queue of packets from
 * tp->t_in_pkts and process them, digging out
 * all the arguments, calling any bpf tap and
 * calling into tfb_do_segment_nounlock(). The common
 * function (ctf_do_queued_segments()) requires that
 * you have defined tfb_do_segment_nounlock() as
 * described above.
 *
 * The second feature of the input side of hpts is the
 * dropping of a connection. This is due to the way that
 * locking may have occurred on the INP_WLOCK. So if
 * a stack wants to drop a connection it calls:
 *
 *     tcp_set_inp_to_drop(inp, ETIMEDOUT)
 *
 * to schedule the tcp_hpts system to call
 *
 *    tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
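 *
 * For example (hypothetical), a stack's timer handler that decides
 * a connection is dead while holding only the INP_WLOCK can simply
 * do:
 *
 *    tcp_set_inp_to_drop(inp, ETIMEDOUT);
 *
 * and return; the hpts input thread later performs the actual
 * tcp_drop() with clean lock ordering.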
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/route.h>
#include <net/vnet.h>

#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>

#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif

MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);

static struct tcp_hptsi tcp_pace;
static int hpts_does_tp_logging = 0;

static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
static void tcp_hptsi(struct tcp_hpts_entry *hpts);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int32_t tcp_hpts_callout_skip_swi = 0;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Hpts controls");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t tcp_hpts_precision = 120;

struct hpts_domain_info {
	int count;
	int cpu[MAXCPU];
};

struct hpts_domain_info hpts_domains[MAXMEMDOM];

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for C_PREL() precision of callout");

counter_u64_t hpts_hopelessly_behind;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
    &hpts_hopelessly_behind,
    "Number of times hpts could not catch up and was behind hopelessly");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

counter_u64_t combined_wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
    &wheel_wrap, "Number of times the pacer lagged enough that the running wheel wrapped");

static int32_t out_ts_percision = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
    &out_ts_percision, 0,
    "Do we use a precise timestamp for every output cts");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &hpts_does_tp_logging, 0,
    "Do we add to any tp that has logging on pacer logs");

static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
    &max_pacer_loops, 10,
    "What is the maximum number of times the pacer will loop trying to catch up");

#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)

static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;

static int
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = hpts_sleep_max;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
		    (new > HPTS_MAX_SLEEP_ALLOWED))
			error = EINVAL;
		else
			hpts_sleep_max = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &hpts_sleep_max, 0,
    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
    "Maximum time hpts will sleep");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
    &tcp_min_hptsi_time, 0,
    "The minimum time the hpts must sleep before processing more slots");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
    &tcp_hpts_callout_skip_swi, 0,
    "Do we have the callout call directly to the hpts?");

static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
	     int ticks_to_run, int idx)
{
	union tcp_log_stackspecific log;

	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	log.u_bbr.flex1 = hpts->p_nxt_slot;
	log.u_bbr.flex2 = hpts->p_cur_slot;
	log.u_bbr.flex3 = hpts->p_prev_slot;
	log.u_bbr.flex4 = idx;
	log.u_bbr.flex5 = hpts->p_curtick;
	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
	log.u_bbr.use_lt_bw = 1;
	log.u_bbr.inflight = ticks_to_run;
	log.u_bbr.applimited = hpts->overidden_sleep;
	log.u_bbr.delivered = hpts->saved_curtick;
	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
	log.u_bbr.epoch = hpts->saved_curslot;
	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
	log.u_bbr.pkts_out = hpts->p_delayed_by;
	log.u_bbr.lost = hpts->p_hpts_sleep_time;
	log.u_bbr.cur_del_rate = hpts->p_runningtick;
	TCP_LOG_EVENTP(tp, NULL,
		       &tp->t_inpcb->inp_socket->so_rcv,
		       &tp->t_inpcb->inp_socket->so_snd,
		       BBR_LOG_HPTSDIAG, 0,
		       0, &log, false, tv);
}

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static void
hpts_timeout_dir(void *arg)
{
	tcp_hpts_thread(arg);
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_hpts == 0) {
		/* We are not on the hpts? */
		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
	if (hpts->p_on_queue_cnt < 0) {
		/* Count should not go negative .. */
#ifdef INVARIANTS
		panic("Hpts goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_queue_cnt = 0;
	}
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if ((noref == 0) && (inp->inp_in_hpts == 1)) {
		/* We are already on the hpts? */
		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}

static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 0) {
		/* We are not on the input hpts? */
		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
	hpts->p_on_inqueue_cnt--;
	if (hpts->p_on_inqueue_cnt < 0) {
#ifdef INVARIANTS
		panic("Hpts in goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_inqueue_cnt = 0;
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		/* We should not be empty with a queue count */
		panic("%s hpts:%p in_hpts input empty but cnt:%d",
		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	if (clear)
		inp->inp_in_input = 0;
}

static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 1) {
		/* We are already on the input hpts? */
		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
	inp->inp_in_input = 1;
	hpts->p_on_inqueue_cnt++;
	in_pcbref(inp);
}

static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

static void
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
	int32_t hpts_num;
	struct tcp_hpts_entry *hpts;

	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	return (hpts);
}

struct tcp_hpts_entry *
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

struct tcp_hpts_entry *
tcp_input_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_input_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_input_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
	int32_t add_freed;

	if (inp->inp_flags2 & INP_FREED) {
		/*
		 * Need to play a special trick so that in_pcbrele_wlocked
		 * does not return 1 when it really should have returned 0.
		 */
		add_freed = 1;
		inp->inp_flags2 &= ~INP_FREED;
	} else {
		add_freed = 0;
	}
#ifndef INP_REF_DEBUG
	if (in_pcbrele_wlocked(inp)) {
		/*
		 * This should not happen. We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts. It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#else
	if (__in_pcbrele_wlocked(inp, line)) {
		/*
		 * This should not happen. We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts. It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#endif
	if (add_freed) {
		inp->inp_flags2 |= INP_FREED;
	}
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input) {
		hpts_sane_input_remove(hpts, inp, 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

/*
 * Called normally with the INP_LOCKED but it
 * does not matter, the hpts lock is the key
 * but the lock order allows us to hold the
 * INP lock and then get the hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_INPUT - remove from the input of the hpts.
 * Note that you can use one or both values together
 * and get two actions.
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
	if (flags & HPTS_REMOVE_INPUT) {
		hpts = tcp_input_lock(inp);
		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
}

static inline int
hpts_tick(uint32_t wheel_tick, uint32_t plus)
{
	/*
	 * Given a slot on the wheel, what slot
	 * is that plus ticks out?
	 */
	KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
	return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
}

static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
	/*
	 * Given a timestamp in wheel ticks (10usec inc's)
	 * map it to our limited space wheel.
	 */
	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}

static inline int
hpts_ticks_diff(int prev_tick, int tick_now)
{
	/*
	 * Given two ticks that are someplace
	 * on our wheel. How far are they apart?
	 */
	if (tick_now > prev_tick)
		return (tick_now - prev_tick);
	else if (tick_now == prev_tick)
		/*
		 * Special case, same means we can go all of our
		 * wheel less one slot.
		 */
		return (NUM_OF_HPTSI_SLOTS - 1);
	else
		return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
}
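
/*
 * Worked example for hpts_ticks_diff() (illustrative numbers): with
 * prev_tick = 7 and tick_now = 10 the distance is simply 3; with
 * prev_tick = 10 and tick_now = 7 we have wrapped, so it is
 * (NUM_OF_HPTSI_SLOTS - 10) + 7; and when the two are equal we
 * deliberately report a full wheel less one slot rather than 0,
 * since a zero distance would let an insert land on the slot the
 * pacer is about to run.
 */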

/*
 * Given a tick on the wheel that is the current time
 * mapped to the wheel (wheel_tick), what is the maximum
 * distance forward that can be obtained without
 * wrapping past either prev_tick or running_tick
 * depending on the hpts state? Also if passed
 * a uint32_t *, fill it with the tick location.
 *
 * Note if you do not give this function the current
 * time (that you think it is) mapped to the wheel
 * then the results will not be what you expect and
 * could lead to invalid inserts.
 */
static inline int32_t
max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
{
	uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;

	if ((hpts->p_hpts_active == 1) &&
	    (hpts->p_wheel_complete == 0)) {
		end_tick = hpts->p_runningtick;
		/* Back up one tick */
		if (end_tick == 0)
			end_tick = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_tick--;
		if (target_tick)
			*target_tick = end_tick;
	} else {
		/*
		 * For the case where we are
		 * not active, or we have
		 * completed the pass over
		 * the wheel, we can use the
		 * prev tick and subtract one from it. This puts us
		 * as far out as possible on the wheel.
		 */
		end_tick = hpts->p_prev_slot;
		if (end_tick == 0)
			end_tick = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_tick--;
		if (target_tick)
			*target_tick = end_tick;
		/*
		 * Now we have close to the full wheel left minus the
		 * time it has been since the pacer went to sleep. Note
		 * that wheel_tick, passed in, should be the current time
		 * from the perspective of the caller, mapped to the wheel.
		 */
		if (hpts->p_prev_slot != wheel_tick)
			dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
		else
			dis_to_travel = 1;
		/*
		 * dis_to_travel in this case is the space from when the
		 * pacer stopped (p_prev_slot) and where our wheel_tick
		 * is now. To know how many slots we can put it in we
		 * subtract from the wheel size. We would not want
		 * to place something after p_prev_slot or it will
		 * get run too soon.
		 */
		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
	}
	/*
	 * So how many slots are open between p_runningtick -> p_cur_slot
	 * that is what is currently un-available for insertion. Special
	 * case when we are at the last slot, this gets 1, so that
	 * the answer to how many slots are available is all but 1.
	 */
	if (hpts->p_runningtick == hpts->p_cur_slot)
		dis_to_travel = 1;
	else
		dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
	/*
	 * How long has the pacer been running?
	 */
	if (hpts->p_cur_slot != wheel_tick) {
		/* The pacer is a bit late */
		pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
	} else {
		/* The pacer is right on time, now == pacers start time */
		pacer_to_now = 0;
	}
	/*
	 * To get the number left we can insert into we simply
	 * subtract the distance the pacer has to run from how
	 * many slots there are.
	 */
	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
	/*
	 * Now how many of those we will eat due to the pacer's
	 * time (p_cur_slot) of start being behind the
	 * real time (wheel_tick)?
	 */
	if (avail_on_wheel <= pacer_to_now) {
		/*
		 * Wheel wrap; we can't fit on the wheel. That
		 * is unusual, the system must be way overloaded!
		 * Insert into the assured tick, and return special
		 * "0".
		 */
		counter_u64_add(combined_wheel_wrap, 1);
		*target_tick = hpts->p_nxt_slot;
		return (0);
	} else {
		/*
		 * We know how many slots are open
		 * on the wheel (the reverse of what
		 * is left to run). Take away the time
		 * the pacer started to now (wheel_tick)
		 * and that tells you how many slots are
		 * open that can be inserted into that won't
		 * be touched by the pacer until later.
		 */
		return (avail_on_wheel - pacer_to_now);
	}
}
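
/*
 * Illustrative example for max_ticks_available() (hypothetical
 * numbers, not values from this file): suppose the wheel had 1000
 * slots, the pacer were asleep with p_prev_slot = 100, and the
 * caller's current time mapped to wheel_tick = 110. The inactive
 * branch above would compute dis_to_travel = 10 and return
 * 1000 - 10 = 990 usable slots, with *target_tick set to 99, one
 * slot before p_prev_slot.
 */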

static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	uint32_t need_wake = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		inp->inp_hpts_request = 0;
		if ((hpts->p_hpts_active == 0) ||
		    (hpts->p_wheel_complete)) {
			/*
			 * A sleeping hpts we want in next slot to run
			 * note that in this state p_prev_slot == p_cur_slot
			 */
			inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
				need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * The hpts system is running and the caller
			 * was awoken by the hpts system.
			 * We can't allow you to go into the same slot we
			 * are in (we don't want a loop :-D).
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			inp->inp_hptsslot = hpts->p_runningtick;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

int
__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
{
	int32_t ret;
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
	/*
	 * Sanity checks for the pacer with invariants
	 * on insert.
	 */
	if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
		panic("hpts:%p inp:%p slot:%d > max",
		      hpts, inp, inp_hptsslot);
	if ((hpts->p_hpts_active) &&
	    (hpts->p_wheel_complete == 0)) {
		/*
		 * If the pacer is processing an arc
		 * of the wheel, we need to make
		 * sure we are not inserting within
		 * that arc.
		 */
		int distance, yet_to_run;

		distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
		if (hpts->p_runningtick != hpts->p_cur_slot)
			yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
		else
			yet_to_run = 0;	/* processing last slot */
		if (yet_to_run > distance) {
			panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
			      hpts, inp, inp_hptsslot,
			      distance, yet_to_run,
			      hpts->p_runningtick, hpts->p_cur_slot);
		}
	}
}
#endif

static void
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
		       struct hpts_diag *diag, struct timeval *tv)
{
	uint32_t need_new_to = 0;
	uint32_t wheel_cts, last_tick;
	int32_t wheel_tick, maxticks;
	int8_t need_wakeup = 0;

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_prev_slot = hpts->p_prev_slot;
		diag->p_runningtick = hpts->p_runningtick;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->p_curtick = hpts->p_curtick;
		diag->p_lasttick = hpts->p_lasttick;
		diag->slot_req = slot;
		diag->p_on_min_sleep = hpts->p_on_min_sleep;
		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
	}
	if (inp->inp_in_hpts == 0) {
		if (slot == 0) {
			/* Immediate */
			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
			return;
		}
		/* Get the current time relative to the wheel */
		wheel_cts = tcp_tv_to_hptstick(tv);
		/* Map it onto the wheel */
		wheel_tick = tick_to_wheel(wheel_cts);
		/* Now what's the max we can place it at? */
		maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
		if (diag) {
			diag->wheel_tick = wheel_tick;
			diag->maxticks = maxticks;
			diag->wheel_cts = wheel_cts;
		}
		if (maxticks == 0) {
			/* The pacer is in a wheel wrap behind, yikes! */
			if (slot > 1) {
				/*
				 * Reduce by 1 to prevent a forever loop in
				 * case something else is wrong. Note this
				 * probably does not hurt because if the
				 * pacer truly is that far behind we will
				 * be > 1 second late calling anyway.
				 */
				slot--;
			}
			inp->inp_hptsslot = last_tick;
			inp->inp_hpts_request = slot;
		} else if (maxticks >= slot) {
			/* It all fits on the wheel */
			inp->inp_hpts_request = 0;
			inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
		} else {
			/* It does not fit */
			inp->inp_hpts_request = slot - maxticks;
			inp->inp_hptsslot = last_tick;
		}
		if (diag) {
			diag->slot_remaining = inp->inp_hpts_request;
			diag->inp_hptsslot = inp->inp_hptsslot;
		}
#ifdef INVARIANTS
		check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
		if ((hpts->p_hpts_active == 0) &&
		    (inp->inp_hpts_request == 0) &&
		    (hpts->p_on_min_sleep == 0)) {
			/*
			 * The hpts is sleeping and not on a minimum
			 * sleep time, we need to figure out where
			 * it will wake up at and if we need to reschedule
			 * its time-out.
			 */
			uint32_t have_slept, yet_to_sleep;

			/* Now do we need to restart the hpts's timer? */
			have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
			if (have_slept < hpts->p_hpts_sleep_time)
				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
			else {
				/* We are overdue */
				yet_to_sleep = 0;
				need_wakeup = 1;
			}
			if (diag) {
				diag->have_slept = have_slept;
				diag->yet_to_sleep = yet_to_sleep;
			}
			if (yet_to_sleep &&
			    (yet_to_sleep > slot)) {
				/*
				 * We need to reschedule the hpts's time-out.
				 */
				hpts->p_hpts_sleep_time = slot;
				need_new_to = slot * HPTS_TICKS_PER_USEC;
			}
		}
		/*
		 * Now how far is the hpts sleeping to? If active is 1, it's
		 * up and ticking and we do nothing; otherwise we may need to
		 * reschedule its callout if need_new_to is set from above.
		 */
		if (need_wakeup) {
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
			if (diag) {
				diag->need_new_to = 0;
				diag->co_ret = 0xffff0000;
			}
		} else if (need_new_to) {
			int32_t co_ret;
			struct timeval tv;
			sbintime_t sb;

			tv.tv_sec = 0;
			tv.tv_usec = 0;
			while (need_new_to > HPTS_USEC_IN_SEC) {
				tv.tv_sec++;
				need_new_to -= HPTS_USEC_IN_SEC;
			}
			tv.tv_usec = need_new_to;
			sb = tvtosbt(tv);
			if (tcp_hpts_callout_skip_swi == 0) {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_swi, hpts, hpts->p_cpu,
				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
			} else {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_dir, hpts,
				    hpts->p_cpu,
				    C_PREL(tcp_hpts_precision));
			}
			if (diag) {
				diag->need_new_to = need_new_to;
				diag->co_ret = co_ret;
			}
		}
	} else {
#ifdef INVARIANTS
		panic("Hpts:%p inp:%p already on hpts and add?", hpts, inp);
#endif
	}
}

uint32_t
tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
{
	struct tcp_hpts_entry *hpts;
	uint32_t slot_on;
	struct timeval tv;

	/*
	 * We now return the next-slot the hpts will be on, beyond its
	 * current run (if up) or where it was when it stopped if it is
	 * sleeping.
	 */
	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	microuptime(&tv);
	tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
	slot_on = hpts->p_nxt_slot;
	mtx_unlock(&hpts->p_mtx);
	return (slot_on);
}

uint32_t
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line)
{
	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}

int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
	int32_t retval = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		retval = 1;
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			retval = 2;
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		retval = 4;
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	return (retval);
}

int32_t
__tcp_queue_to_input(struct inpcb *inp, int line)
{
	struct tcp_hpts_entry *hpts;
	int32_t ret;

	hpts = tcp_input_lock(inp);
	ret = __tcp_queue_to_input_locked(inp, hpts, line);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

void
__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
{
	struct tcp_hpts_entry *hpts;
	struct tcpcb *tp;

	tp = intotcpcb(inp);
	hpts = tcp_input_lock(tp->t_inpcb);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	inp->inp_hpts_drop_reas = reason;
	mtx_unlock(&hpts->p_mtx);
}

static uint16_t
hpts_random_cpu(struct inpcb *inp)
{
	/*
	 * No flow type set; distribute the load randomly.
	 */
	uint16_t cpuid;
	uint32_t ran;

	/*
	 * If one has been set use it i.e. we want both in and out on the
	 * same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* Nothing set, use a random number */
	ran = arc4random();
	cpuid = (ran & 0xffff) % mp_ncpus;
	return (cpuid);
}

static uint16_t
hpts_cpuid(struct inpcb *inp)
{
	u_int cpuid;
#if !defined(RSS) && defined(NUMA)
	struct hpts_domain_info *di;
#endif

	/*
	 * If one has been set use it i.e. we want both in and out on the
	 * same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* If one is set the other must be the same */
#ifdef RSS
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid == NETISR_CPUID_NONE)
		return (hpts_random_cpu(inp));
	else
		return (cpuid);
#else
	/*
	 * We don't have a flowid -> cpuid mapping, so cheat and just map
	 * unknown cpuids to curcpu.  Not the best, but apparently better
	 * than defaulting to swi 0.
	 */

	if (inp->inp_flowtype == M_HASHTYPE_NONE)
		return (hpts_random_cpu(inp));
	/*
	 * Hash to a thread based on the flowid.  If we are using numa,
	 * then restrict the hash to the numa domain where the inp lives.
	 */
#ifdef NUMA
	if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
		di = &hpts_domains[inp->inp_numa_domain];
		cpuid = di->cpu[inp->inp_flowid % di->count];
	} else
#endif
		cpuid = inp->inp_flowid % mp_ncpus;

	return (cpuid);
#endif
}

static void
tcp_drop_in_pkts(struct tcpcb *tp)
{
	struct mbuf *m, *n;

	m = tp->t_in_pkt;
	if (m)
		n = m->m_nextpkt;
	else
		n = NULL;
	tp->t_in_pkt = NULL;
	while (m) {
		m_freem(m);
		m = n;
		if (m)
			n = m->m_nextpkt;
	}
}

/*
 * Do NOT try to optimize the processing of inp's
 * by first pulling off all the inp's into a temporary
 * list (e.g. TAILQ_CONCAT). If you do that the subtle
 * interactions of switching CPU's will kill you because of
 * problems in the linked list manipulation. Basically
 * you would switch cpu's with the hpts mutex locked
 * but then while you were processing one of the inp's
 * some other one that you switched will get a new
 * packet on the different CPU. It will insert it
 * on the new hpts's input list. Creating a temporary
 * link in the inp will not fix it either, since
 * the other hpts will be doing the same thing and
 * you will both end up using the temporary link.
 *
 * You will die in an ASSERT for tailq corruption if you
 * run INVARIANTS or you will die horribly without
 * INVARIANTS in some unknown way with a corrupt linked
 * list.
 */
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	uint16_t drop_reason;
	int16_t set_cpu;
	uint32_t did_prefetch = 0;
	int dropped;

	HPTS_MTX_ASSERT(hpts);
	NET_EPOCH_ASSERT();

	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
		HPTS_MTX_ASSERT(hpts);
		hpts_sane_input_remove(hpts, inp, 0);
		if (inp->inp_input_cpu_set == 0) {
			set_cpu = 1;
		} else {
			set_cpu = 0;
		}
		hpts->p_inp = inp;
		drop_reason = inp->inp_hpts_drop_reas;
		inp->inp_in_input = 0;
		mtx_unlock(&hpts->p_mtx);
		INP_WLOCK(inp);
#ifdef VIMAGE
		CURVNET_SET(inp->inp_vnet);
#endif
		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
		    (inp->inp_flags2 & INP_FREED)) {
out:
			hpts->p_inp = NULL;
			if (in_pcbrele_wlocked(inp) == 0) {
				INP_WUNLOCK(inp);
			}
#ifdef VIMAGE
			CURVNET_RESTORE();
#endif
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		tp = intotcpcb(inp);
		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
			goto out;
		}
		if (drop_reason) {
			/* This tcb is being destroyed for drop_reason */
			tcp_drop_in_pkts(tp);
			tp = tcp_drop(tp, drop_reason);
			if (tp == NULL) {
				INP_WLOCK(inp);
			}
			if (in_pcbrele_wlocked(inp) == 0)
				INP_WUNLOCK(inp);
#ifdef VIMAGE
			CURVNET_RESTORE();
#endif
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		if (set_cpu) {
			/*
			 * Setup so the next time we will move to the right
			 * CPU. This should be a rare event. It will
			 * sometimes happen when we are the client side
			 * (usually not the server). Somehow tcp_output()
			 * gets called before tcp_do_segment() sets the
			 * initial state. This means the r_cpu and r_hpts_cpu
			 * is 0. We get on the hpts, and then tcp_input()
			 * gets called setting up the r_cpu to the correct
			 * value. The hpts goes off and sees the mis-match.
			 * We simply correct it here and the CPU will switch
			 * to the new hpts next time the tcb gets added to
			 * the hpts (not this time) :-)
			 */
			tcp_set_hpts(inp);
		}
		if (tp->t_fb_ptr != NULL) {
			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
			did_prefetch = 1;
		}
		if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
			if (inp->inp_in_input)
				tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
			dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
			if (dropped) {
				/* Re-acquire the wlock so we can release the reference */
				INP_WLOCK(inp);
			}
		} else if (tp->t_in_pkt) {
			/*
			 * We reach here only if we had a
			 * stack that supported INP_SUPPORTS_MBUFQ
			 * and then somehow switched to a stack that
			 * does not. The packets are basically stranded
			 * and would hang with the connection until
			 * cleanup without this code. It's not the
			 * best way, but I know of no other way to
			 * handle it since the stack needs functions
			 * it does not have in order to handle queued
			 * packets.
			 */
			tcp_drop_in_pkts(tp);
		}
		if (in_pcbrele_wlocked(inp) == 0)
			INP_WUNLOCK(inp);
		INP_UNLOCK_ASSERT(inp);
#ifdef VIMAGE
		CURVNET_RESTORE();
#endif
		mtx_lock(&hpts->p_mtx);
		hpts->p_inp = NULL;
	}
}

static void
tcp_hptsi(struct tcp_hpts_entry *hpts)
{
	struct tcpcb *tp;
	struct inpcb *inp = NULL, *ninp;
	struct timeval tv;
	int32_t ticks_to_run, i, error;
	int32_t paced_cnt = 0;
	int32_t loop_cnt = 0;
	int32_t did_prefetch = 0;
	int32_t prefetch_ninp = 0;
	int32_t prefetch_tp = 0;
	int32_t wrap_loop_cnt = 0;
	int16_t set_cpu;

	HPTS_MTX_ASSERT(hpts);
	NET_EPOCH_ASSERT();

	/* record previous info for any logging */
	hpts->saved_lasttick = hpts->p_lasttick;
	hpts->saved_curtick = hpts->p_curtick;
	hpts->saved_curslot = hpts->p_cur_slot;
	hpts->saved_prev_slot = hpts->p_prev_slot;

	hpts->p_lasttick = hpts->p_curtick;
	hpts->p_curtick = tcp_gethptstick(&tv);
	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
	if ((hpts->p_on_queue_cnt == 0) ||
	    (hpts->p_lasttick == hpts->p_curtick)) {
		/*
		 * No time has yet passed,
		 * or nothing to do.
		 */
		hpts->p_prev_slot = hpts->p_cur_slot;
		hpts->p_lasttick = hpts->p_curtick;
		goto no_run;
	}
again:
	hpts->p_wheel_complete = 0;
	HPTS_MTX_ASSERT(hpts);
	ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
	if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
	    (hpts->p_on_queue_cnt != 0)) {
		/*
		 * Wheel wrap is occurring: basically we
		 * are behind and the distance between
		 * runs has spread so much it has exceeded
		 * the time on the wheel (1.024 seconds). This
		 * is ugly and should NOT be happening. We
		 * need to run the entire wheel. We last processed
		 * p_prev_slot, so that needs to be the last slot
		 * we run. The next slot after that should be our
		 * reserved first slot for new, and then starts
		 * the running position. Now the problem is that the
		 * reserved "not yet" slot does not exist,
		 * and there may be inp's in there that need
		 * running. We can merge those into the
		 * first slot at the head.
		 */
		wrap_loop_cnt++;
		hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
		/*
		 * Adjust p_cur_slot to be where we are starting from
		 * hopefully we will catch up (fat chance if something
		 * is broken this bad :( )
		 */
		hpts->p_cur_slot = hpts->p_prev_slot;
		/*
		 * The next slot has guys to run too, and that would
		 * be where we would normally start, lets move them into
		 * the next slot (p_prev_slot + 2) so that we will
		 * run them, the extra 10usecs of late (by being
		 * put behind) does not really matter in this situation.
		 */
#ifdef INVARIANTS
		/*
		 * To prevent a panic we need to update the inpslot to the
		 * new location. This is safe since it takes both the
		 * INP lock and the pacer mutex to change the inp_hptsslot.
		 */
		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
			inp->inp_hptsslot = hpts->p_runningtick;
		}
#endif
		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
			     &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
		ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
		counter_u64_add(wheel_wrap, 1);
	} else {
		/*
		 * Nxt slot is always one after p_runningtick though
		 * it's not used usually unless we are doing wheel wrap.
		 */
		hpts->p_nxt_slot = hpts->p_prev_slot;
		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("hpts:%p in_hpts input empty but cnt:%d",
		      hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_on_queue_cnt == 0) {
		goto no_one;
	}
	HPTS_MTX_ASSERT(hpts);
	for (i = 0; i < ticks_to_run; i++) {
		/*
		 * Calculate our delay, if there are no extra ticks there
		 * was not any (i.e. if ticks_to_run == 1, no delay).
		 */
		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
		HPTS_MTX_ASSERT(hpts);
		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
			/* For debugging */
			hpts->p_inp = inp;
			paced_cnt++;
#ifdef INVARIANTS
			if (hpts->p_runningtick != inp->inp_hptsslot) {
				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
				      hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
			}
#endif
			/* Now pull it */
			if (inp->inp_hpts_cpu_set == 0) {
				set_cpu = 1;
			} else {
				set_cpu = 0;
			}
			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
				/* We prefetch the next inp if possible */
				kern_prefetch(ninp, &prefetch_ninp);
				prefetch_ninp = 1;
			}
			if (inp->inp_hpts_request) {
				/*
				 * This guy is deferred out further in time
				 * than our wheel had available on it.
				 * Push him back on the wheel or run it,
				 * depending.
				 */
				uint32_t maxticks, last_tick, remaining_slots;

				remaining_slots = ticks_to_run - (i + 1);
				if (inp->inp_hpts_request > remaining_slots) {
					/*
					 * How far out can we go?
					 */
					maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
					if (maxticks >= inp->inp_hpts_request) {
						/* We can place it finally to be processed. */
						inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
						inp->inp_hpts_request = 0;
					} else {
						/* Work off some more time */
						inp->inp_hptsslot = last_tick;
						inp->inp_hpts_request -= maxticks;
					}
					hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
					hpts->p_inp = NULL;
					continue;
				}
				inp->inp_hpts_request = 0;
				/* Fall through, we will do it now. */
			}
			/*
			 * We clear the hpts flag here after dealing with
			 * remaining slots. This way anyone looking with the
			 * TCB lock will see it's on the hpts until just
			 * before we unlock.
			 */
			inp->inp_in_hpts = 0;
			mtx_unlock(&hpts->p_mtx);
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp)) {
				mtx_lock(&hpts->p_mtx);
				hpts->p_inp = NULL;
				continue;
			}
			if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
			    (inp->inp_flags2 & INP_FREED)) {
			out_now:
#ifdef INVARIANTS
				if (mtx_owned(&hpts->p_mtx)) {
					panic("Hpts:%p owns mtx prior-to lock line:%d",
					      hpts, __LINE__);
				}
#endif
				INP_WUNLOCK(inp);
				mtx_lock(&hpts->p_mtx);
				hpts->p_inp = NULL;
				continue;
			}
			tp = intotcpcb(inp);
			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
				goto out_now;
			}
			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
				 * the right CPU. This should be a rare
				 * event. It will sometimes happen when we
				 * are the client side (usually not the
				 * server). Somehow tcp_output() gets called
				 * before tcp_do_segment() sets the
				 * initial state. This means the r_cpu and
				 * r_hpts_cpu is 0. We get on the hpts, and
				 * then tcp_input() gets called setting up
				 * the r_cpu to the correct value. The hpts
				 * goes off and sees the mis-match. We
				 * simply correct it here and the CPU will
				 * switch to the new hpts next time the tcb
				 * gets added to the hpts (not this one)
				 * :-)
				 */
				tcp_set_hpts(inp);
			}
#ifdef VIMAGE
			CURVNET_SET(inp->inp_vnet);
#endif
			/* Lets do any logging that we might want to */
			if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
				tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
			}
			/*
			 * There is a hole here, we get the refcnt on the
			 * inp so it will still be preserved but to make
			 * sure we can get the INP we need to hold the p_mtx
			 * above while we pull out the tp/inp, as long as
			 * fini gets the lock first we are assured of having
			 * a sane INP we can lock and test.
			 */
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx before tcp-output:%d",
				      hpts, __LINE__);
			}
#endif
			if (tp->t_fb_ptr != NULL) {
				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
				did_prefetch = 1;
			}
			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
				error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
				if (error) {
					/* The input killed the connection */
					goto skip_pacing;
				}
			}
			inp->inp_hpts_calls = 1;
			error = tp->t_fb->tfb_tcp_output(tp);
			inp->inp_hpts_calls = 0;
			if (ninp && ninp->inp_ppcb) {
				/*
				 * If we have a nxt inp, see if we can
				 * prefetch its ppcb. Note this may seem
				 * "risky" since we have no locks (other
				 * than the previous inp) and there is no
				 * assurance that ninp was not pulled while
				 * we were processing inp and freed. If this
				 * occurred it could mean that either:
				 *
				 * a) Its NULL (which is fine we won't go
				 * here) <or> b) Its valid (which is cool we
				 * will prefetch it) <or> c) The inp got
				 * freed back to the slab which was
				 * reallocated. Then the piece of memory was
				 * re-used and something else (not an
				 * address) is in inp_ppcb. If that occurs
				 * we don't crash, but take a TLB shootdown
				 * performance hit (same as if it was NULL
				 * and we tried to pre-fetch it).
				 *
				 * Considering that the likelihood of <c> is
				 * quite rare we will take a risk on doing
				 * this. If performance drops after testing
				 * we can always take this out. NB: the
				 * kern_prefetch on amd64 actually has
				 * protection against a bad address now via
				 * the DMAP_() tests. This will prevent the
				 * TLB hit, and instead if <c> occurs just
				 * cause us to load cache with a useless
				 * address (to us).
				 */
				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
				prefetch_tp = 1;
			}
			INP_WUNLOCK(inp);
		skip_pacing:
#ifdef VIMAGE
			CURVNET_RESTORE();
#endif
			INP_UNLOCK_ASSERT(inp);
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx prior-to lock line:%d",
				      hpts, __LINE__);
			}
#endif
			mtx_lock(&hpts->p_mtx);
			hpts->p_inp = NULL;
		}
		HPTS_MTX_ASSERT(hpts);
		hpts->p_inp = NULL;
		hpts->p_runningtick++;
		if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
			hpts->p_runningtick = 0;
		}
	}
no_one:
	HPTS_MTX_ASSERT(hpts);
	hpts->p_delayed_by = 0;
	/*
	 * Check to see if we took an excess amount of time and need to run
	 * more ticks (if we did not hit ENOBUFS).
	 */
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("hpts:%p in_hpts input empty but cnt:%d",
		      hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	hpts->p_prev_slot = hpts->p_cur_slot;
	hpts->p_lasttick = hpts->p_curtick;
	if (loop_cnt > max_pacer_loops) {
		/*
		 * Something is seriously slow: we have
		 * looped through processing the wheel
		 * max_pacer_loops times, and each time
		 * by the point we cleared what needed
		 * to run, we still needed to run more.
		 * That means the system is hopelessly
		 * behind and can never catch up :(
		 *
		 * We will just lie to this thread
		 * and let it think p_curtick is
		 * correct. When it next awakens
		 * it will find itself further behind.
		 */
		counter_u64_add(hpts_hopelessly_behind, 1);
		goto no_run;
	}
	hpts->p_curtick = tcp_gethptstick(&tv);
	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
	if ((wrap_loop_cnt < 2) &&
	    (hpts->p_lasttick != hpts->p_curtick)) {
		counter_u64_add(hpts_loops, 1);
		loop_cnt++;
		goto again;
	}
1714no_run:
1715	/*
1716	 * Set flag to tell that we are done for
1717	 * any slot input that happens during
1718	 * input.
1719	 */
1720	hpts->p_wheel_complete = 1;
1721	/*
1722	 * Run any input that may be there not covered
1723	 * in running data.
1724	 */
	if (!TAILQ_EMPTY(&hpts->p_input)) {
		tcp_input_data(hpts, &tv);
		/*
		 * Now did we spend too long running
		 * input and need to run more ticks?
		 */
		KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
			("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
			 hpts->p_prev_slot, hpts->p_cur_slot));
		KASSERT(hpts->p_lasttick == hpts->p_curtick,
			("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
			 hpts->p_lasttick, hpts->p_curtick));
		hpts->p_curtick = tcp_gethptstick(&tv);
		if (hpts->p_lasttick != hpts->p_curtick) {
			counter_u64_add(hpts_loops, 1);
			hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
			goto again;
		}
	}
	{
		uint32_t t = 0, i, fnd = 0;

		if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
			/*
			 * Find the next slot that is occupied and use
			 * that to set the sleep time.
			 */
			for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
				if (!TAILQ_EMPTY(&hpts->p_hptss[t])) {
					fnd = 1;
					break;
				}
				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
			}
			if (fnd) {
				hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
			} else {
#ifdef INVARIANTS
				panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
				counter_u64_add(back_tosleep, 1);
				hpts->p_on_queue_cnt = 0;
				goto non_found;
			}
		} else if (wrap_loop_cnt >= 2) {
			/* Special case handling for a heavily wrapped wheel */
			hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
		} else {
			/* No one is on the wheel; sleep for hpts_sleep_max. */
		non_found:
			hpts->p_hpts_sleep_time = hpts_sleep_max;
		}
	}
}

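/*
 * A sketch of the sleep computation above (illustrative only): if the
 * wheel scan finds the next occupied slot i slots ahead of p_cur_slot,
 * the pacer sleeps for
 *
 *	min(i + 1, hpts_sleep_max)
 *
 * slots, which tcp_hpts_thread() below converts to microseconds by
 * multiplying with HPTS_TICKS_PER_USEC before arming its callout.
 */
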
void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	if ((inp->inp_in_hpts == 0) &&
	    (inp->inp_hpts_cpu_set == 0)) {
		inp->inp_hpts_cpu = hpts_cpuid(inp);
		inp->inp_hpts_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
	hpts = tcp_input_lock(inp);
	if ((inp->inp_input_cpu_set == 0) &&
	    (inp->inp_in_input == 0)) {
		inp->inp_input_cpu = hpts_cpuid(inp);
		inp->inp_input_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
}
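
/*
 * Typical usage (a sketch): a stack pins a connection to an hpts cpu
 * once, early in the connection's life, e.g.:
 *
 *	tcp_set_hpts(tp->t_inpcb);
 *
 * (tcp_set_hpts() is assumed here to be the wrapper macro that
 * supplies __LINE__ to __tcp_set_hpts().) After this the connection's
 * inp_hpts_cpu and inp_input_cpu are fixed and will not be recomputed
 * on later calls.
 */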

uint16_t
tcp_hpts_delayedby(struct inpcb *inp)
{
	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}
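
/*
 * A pacing stack can fold the reported lateness into its next timer;
 * a hypothetical sketch (slot_target is a made-up stack-local pacing
 * delay in slots):
 *
 *	delayed = tcp_hpts_delayedby(tp->t_inpcb);
 *	if (delayed < slot_target)
 *		slot_target -= delayed;
 *
 * so a pacer that ran late does not push the next send even further out.
 */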

static void
tcp_hpts_thread(void *ctx)
{
	struct tcp_hpts_entry *hpts;
	struct epoch_tracker et;
	struct timeval tv;
	sbintime_t sb;

	hpts = (struct tcp_hpts_entry *)ctx;
	mtx_lock(&hpts->p_mtx);
	if (hpts->p_direct_wake) {
		/* Signaled by input */
		callout_stop(&hpts->co);
	} else {
		/* Timed out */
		if (callout_pending(&hpts->co) ||
		    !callout_active(&hpts->co)) {
			mtx_unlock(&hpts->p_mtx);
			return;
		}
		callout_deactivate(&hpts->co);
	}
	hpts->p_hpts_wake_scheduled = 0;
	hpts->p_hpts_active = 1;
	NET_EPOCH_ENTER(et);
	tcp_hptsi(hpts);
	NET_EPOCH_EXIT(et);
	HPTS_MTX_ASSERT(hpts);
	tv.tv_sec = 0;
	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
		hpts->overidden_sleep = tv.tv_usec;
		tv.tv_usec = tcp_min_hptsi_time;
		hpts->p_on_min_sleep = 1;
	} else {
		/* Clear the min sleep flag */
		hpts->overidden_sleep = 0;
		hpts->p_on_min_sleep = 0;
	}
	hpts->p_hpts_active = 0;
	sb = tvtosbt(tv);
	if (tcp_hpts_callout_skip_swi == 0) {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_swi, hpts, hpts->p_cpu,
		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
	} else {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_dir, hpts,
		    hpts->p_cpu,
		    C_PREL(tcp_hpts_precision));
	}
	hpts->p_direct_wake = 0;
	mtx_unlock(&hpts->p_mtx);
}
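
/*
 * Worked example for the sleep setup above (assuming the common
 * HPTS_TICKS_PER_USEC value of 10): a p_hpts_sleep_time of 50 slots
 * gives
 *
 *	tv.tv_usec = 50 * 10 = 500
 *
 * and if tcp_min_hptsi_time were set to 1000, the 500 usec request
 * would be raised to 1000 with p_on_min_sleep set, so the pacer never
 * wakes more often than the configured floor.
 */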

#undef	timersub

static void
tcp_init_hptsi(void *st)
{
	int32_t i, j, error, bound = 0, created = 0;
	size_t sz, asz;
	struct timeval tv;
	sbintime_t sb;
	struct tcp_hpts_entry *hpts;
	struct pcpu *pc;
	cpuset_t cs;
	char unit[16];
	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
	int count, domain;

	tcp_pace.rp_proc = NULL;
	tcp_pace.rp_num_hptss = ncpus;
	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
	hpts_loops = counter_u64_alloc(M_WAITOK);
	back_tosleep = counter_u64_alloc(M_WAITOK);
	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
	wheel_wrap = counter_u64_alloc(M_WAITOK);
	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
		    M_TCPHPTS, M_WAITOK | M_ZERO);
		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
		    M_TCPHPTS, M_WAITOK);
		hpts = tcp_pace.rp_ent[i];
		/*
		 * Init all the hpts structures that are not specifically
		 * zeroed by the allocations. Also attach them to the
		 * appropriate sysctl block.
		 */
		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
		    "hpts", MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&hpts->p_input);
		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
			TAILQ_INIT(&hpts->p_hptss[j]);
		}
		sysctl_ctx_init(&hpts->hpts_ctx);
		snprintf(unit, sizeof(unit), "%d", i);
		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
		    OID_AUTO,
		    unit,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
		    &hpts->p_on_inqueue_cnt, 0,
		    "Count TCB's awaiting input processing");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
		    &hpts->p_on_queue_cnt, 0,
		    "Count TCB's awaiting output processing");
		SYSCTL_ADD_U16(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "active", CTLFLAG_RD,
		    &hpts->p_hpts_active, 0,
		    "Is the hpts active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curslot", CTLFLAG_RD,
		    &hpts->p_cur_slot, 0,
		    "The current slot the running pacer is working toward");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "runtick", CTLFLAG_RD,
		    &hpts->p_runningtick, 0,
		    "The slot the running pacer is currently processing");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curtick", CTLFLAG_RD,
		    &hpts->p_curtick, 0,
		    "The last tick the running pacer mapped onto the wheel");
		hpts->p_hpts_sleep_time = hpts_sleep_max;
		hpts->p_num = i;
		hpts->p_curtick = tcp_gethptstick(&tv);
		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
		hpts->p_cpu = 0xffff;
		hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
		callout_init(&hpts->co, 1);
	}

	/* Don't try to bind to NUMA domains if we don't have any */
	if (vm_ndomains == 1 && tcp_bind_threads == 2)
		tcp_bind_threads = 0;

	/*
	 * Now let's start the ithreads that handle the hpts entries.
	 */
	CPU_FOREACH(i) {
		hpts = tcp_pace.rp_ent[i];
		hpts->p_cpu = i;
		error = swi_add(&hpts->ie, "hpts",
		    tcp_hpts_thread, (void *)hpts,
		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
		if (error) {
			panic("Can't add hpts:%p i:%d err:%d",
			    hpts, i, error);
		}
		created++;
		if (tcp_bind_threads == 1) {
			if (intr_event_bind(hpts->ie, i) == 0)
				bound++;
		} else if (tcp_bind_threads == 2) {
			pc = pcpu_find(i);
			domain = pc->pc_domain;
			CPU_COPY(&cpuset_domain[domain], &cs);
			if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
			    == 0) {
				bound++;
				count = hpts_domains[domain].count;
				hpts_domains[domain].cpu[count] = i;
				hpts_domains[domain].count++;
			}
		}
		tv.tv_sec = 0;
		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
		sb = tvtosbt(tv);
		if (tcp_hpts_callout_skip_swi == 0) {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_swi, hpts, hpts->p_cpu,
			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
		} else {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_dir, hpts,
			    hpts->p_cpu,
			    C_PREL(tcp_hpts_precision));
		}
	}
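	/*
	 * Note on the binding policy used above: tcp_bind_threads == 0
	 * leaves the swi threads unbound, 1 binds each thread to its
	 * cpu, and 2 binds each thread to the cpuset of its cpu's NUMA
	 * domain.
	 */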
	/*
	 * If we somehow have an empty domain, fall back to choosing
	 * among all hpts threads.
	 */
	for (i = 0; i < vm_ndomains; i++) {
		if (hpts_domains[i].count == 0) {
			tcp_bind_threads = 0;
			break;
		}
	}

	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
	    created, bound,
	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
}

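/*
 * Once initialized, each hpts instance exports its state under
 * net.inet.tcp.hpts.<N>. For example (the value shown is made up):
 *
 *	# sysctl net.inet.tcp.hpts.0.out_qcnt
 *	net.inet.tcp.hpts.0.out_qcnt: 12
 *
 * The node names match the SYSCTL_ADD_* calls above.
 */
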
SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);