/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/tom/t4_tom.c 346849 2019-04-28 18:36:54Z np $");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/limits.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/refcount.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/taskqueue.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/scope6_var.h>
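/* TCPSTATES exposes the tcpstates[] name table used in the KTR traces below. */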
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"
#include "tom/t4_tls.h"

static struct protosw toe_protosw;
static struct pr_usrreqs toe_usrreqs;

static struct protosw toe6_protosw;
static struct pr_usrreqs toe6_usrreqs;

/* Module ops */
static int t4_tom_mod_load(void);
static int t4_tom_mod_unload(void);
static int t4_tom_modevent(module_t, int, void *);

/* ULD ops and helpers */
static int t4_tom_activate(struct adapter *);
static int t4_tom_deactivate(struct adapter *);

static struct uld_info tom_uld_info = {
	.uld_id = ULD_TOM,
	.activate = t4_tom_activate,
	.deactivate = t4_tom_deactivate,
};

static void queue_tid_release(struct adapter *, int);
static void release_offload_resources(struct toepcb *);
static int alloc_tid_tabs(struct tid_info *);
static void free_tid_tabs(struct tid_info *);
static int add_lip(struct adapter *, struct in6_addr *);
static int delete_lip(struct adapter *, struct in6_addr *);
static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *);
static void init_clip_table(struct adapter *, struct tom_data *);
static void update_clip(struct adapter *, void *);
static void t4_clip_task(void *, int);
static void update_clip_table(struct adapter *, struct tom_data *);
static void destroy_clip_table(struct adapter *, struct tom_data *);
static void free_tom_data(struct adapter *, struct tom_data *);
static void reclaim_wr_resources(void *, int);

static int in6_ifaddr_gen;
static eventhandler_tag ifaddr_evhandler;
static struct timeout_task clip_task;

struct toepcb *
alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
{
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct toepcb *toep;
	int tx_credits, txsd_total, len;

	/*
	 * The firmware counts tx work request credits in units of 16 bytes
	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
	 * about tx credits if it wants to abort a connection.
	 */
	tx_credits = sc->params.ofldq_wr_cred;
	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);

	/*
	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
	 * immediate payload, and firmware counts tx work request credits in
	 * units of 16 bytes.  Calculate the maximum work requests possible.
	 */
	txsd_total = tx_credits /
	    howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
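	/*
	 * Illustrative arithmetic (the numbers are examples, not taken from
	 * any particular card): if ofldq_wr_cred were 512 and the abort
	 * request cost 1 credit, tx_credits would be 511; if the minimal
	 * work request above rounded up to 3 credits, txsd_total would be
	 * 511 / 3 = 170 send descriptors.
	 */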
	KASSERT(txqid >= vi->first_ofld_txq &&
	    txqid < vi->first_ofld_txq + vi->nofldtxq,
	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
		vi->first_ofld_txq, vi->nofldtxq));

	KASSERT(rxqid >= vi->first_ofld_rxq &&
	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
		vi->first_ofld_rxq, vi->nofldrxq));

	len = offsetof(struct toepcb, txsd) +
	    txsd_total * sizeof(struct ofld_tx_sdesc);

	toep = malloc(len, M_CXGBE, M_ZERO | flags);
	if (toep == NULL)
		return (NULL);

	refcount_init(&toep->refcount, 1);
	toep->td = sc->tom_softc;
	toep->vi = vi;
	toep->tc_idx = -1;
	toep->tx_total = tx_credits;
	toep->tx_credits = tx_credits;
	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
	mbufq_init(&toep->ulp_pduq, INT_MAX);
	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
	toep->txsd_total = txsd_total;
	toep->txsd_avail = txsd_total;
	toep->txsd_pidx = 0;
	toep->txsd_cidx = 0;
	aiotx_init_toep(toep);

	return (toep);
}

struct toepcb *
hold_toepcb(struct toepcb *toep)
{

	refcount_acquire(&toep->refcount);
	return (toep);
}

void
free_toepcb(struct toepcb *toep)
{

	if (refcount_release(&toep->refcount) == 0)
		return;

	KASSERT(!(toep->flags & TPF_ATTACHED),
	    ("%s: attached to an inpcb", __func__));
	KASSERT(!(toep->flags & TPF_CPL_PENDING),
	    ("%s: CPL pending", __func__));

	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		ddp_uninit_toep(toep);
	tls_uninit_toep(toep);
	free(toep, M_CXGBE);
}

/*
 * Set up the socket for TCP offload.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	if (inp->inp_vflag & INP_IPV6)
		so->so_proto = &toe6_protosw;
	else
		so->so_proto = &toe_protosw;
	SOCKBUF_UNLOCK(sb);

	/* Update TCP PCB */
	tp->tod = &td->tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->inp = inp;
	toep->flags |= TPF_ATTACHED;
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct tom_data *td = toep->td;
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->inp = NULL;
	toep->flags &= ~TPF_ATTACHED;
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

static void
release_offload_resources(struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct adapter *sc = td_adapter(td);
	int tid = toep->tid;

	KASSERT(!(toep->flags & TPF_CPL_PENDING),
	    ("%s: %p has CPL pending.", __func__, toep));
	KASSERT(!(toep->flags & TPF_ATTACHED),
	    ("%s: %p is still attached.", __func__, toep));

	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
	    __func__, toep, tid, toep->l2te, toep->ce);

	/*
	 * These queues should have been emptied at approximately the same time
	 * that a normal connection's socket's so_snd would have been purged or
	 * drained.  Do _not_ clean up here.
	 */
	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
#ifdef INVARIANTS
	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		ddp_assert_empty(toep);
#endif

	if (toep->l2te)
		t4_l2t_release(toep->l2te);

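	/*
	 * A connection that holds a CLIP entry is IPv6 and accounts for two
	 * tids; IPv4 connections use one (hence the 2 vs. 1 below).
	 */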
	if (tid >= 0) {
		remove_tid(sc, tid, toep->ce ? 2 : 1);
		release_tid(sc, tid, toep->ctrlq);
	}

	if (toep->ce)
		release_lip(td, toep->ce);

#ifdef RATELIMIT
	if (toep->tc_idx != -1)
		t4_release_cl_rl_kbps(sc, toep->vi->pi->port_id, toep->tc_idx);
#endif
	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	free_toepcb(toep);
}

/*
 * The kernel is done with the TCP PCB and this is our opportunity to unhook the
 * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
 * pending CPL) then it is time to release all resources tied to the toepcb.
 *
 * Also gets called when an offloaded active open fails and the TOM wants the
 * kernel to take the TCP PCB back.
 */
static void
t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{
#if defined(KTR) || defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toep->flags & TPF_ATTACHED,
	    ("%s: not attached", __func__));

#ifdef KTR
	if (tp->t_state == TCPS_SYN_SENT) {
		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
		    __func__, toep->tid, toep, toep->flags, inp,
		    inp->inp_flags);
	} else {
		CTR6(KTR_CXGBE,
		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
		    inp->inp_flags);
	}
#endif

	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toep->flags &= ~TPF_ATTACHED;

	if (!(toep->flags & TPF_CPL_PENDING))
		release_offload_resources(toep);
}

/*
 * setsockopt handler.
 */
static void
t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
{
	struct adapter *sc = tod->tod_softc;
	struct toepcb *toep = tp->t_toe;

	if (dir == SOPT_GET)
		return;

	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);

	switch (name) {
	case TCP_NODELAY:
		if (tp->t_state != TCPS_ESTABLISHED)
			break;
		t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
		    V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1),
		    0, 0);
		break;
	default:
		break;
	}
}

static inline int
get_tcb_bit(u_char *tcb, int bit)
{
	int ix, shift;

	ix = 127 - (bit >> 3);
	shift = bit & 0x7;

	return ((tcb[ix] >> shift) & 1);
}

static inline uint64_t
get_tcb_bits(u_char *tcb, int hi, int lo)
{
	uint64_t rc = 0;

	while (hi >= lo) {
		rc = (rc << 1) | get_tcb_bit(tcb, hi);
		--hi;
	}

	return (rc);
}
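
/*
 * The hi/lo arguments are bit offsets into the 128-byte TCB image, which
 * get_tcb_bit addresses from the end (tcb[127 - (bit >> 3)]).  For example,
 * get_tcb_bits(tcb, 115, 112) in t4_tcp_info below extracts the 4-bit
 * connection state field.
 */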

/*
 * Called by the kernel to allow the TOE driver to "refine" values filled up in
 * the tcp_info for an offloaded connection.
 */
static void
t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti)
{
	int i, j, k, rc;
	struct adapter *sc = tod->tod_softc;
	struct toepcb *toep = tp->t_toe;
	uint32_t addr, v;
	uint32_t buf[TCB_SIZE / sizeof(uint32_t)];
	u_char *tcb, tmp;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	MPASS(ti != NULL);

	addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + toep->tid * TCB_SIZE;
	rc = read_via_memwin(sc, 2, addr, &buf[0], TCB_SIZE);
	if (rc != 0)
		return;

	tcb = (u_char *)&buf[0];
	for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
		for (k = 0; k < 16; k++) {
			tmp = tcb[i + k];
			tcb[i + k] = tcb[j + k];
			tcb[j + k] = tmp;
		}
	}
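	/*
	 * The loop above swaps the TCB's 16-byte chunks end for end, undoing
	 * the order in which the memory window returned them so that the
	 * get_tcb_bits offsets used below line up with the TCB layout.
	 */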

	ti->tcpi_state = get_tcb_bits(tcb, 115, 112);

	v = get_tcb_bits(tcb, 271, 256);
	ti->tcpi_rtt = tcp_ticks_to_us(sc, v);

	v = get_tcb_bits(tcb, 287, 272);
	ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);

	ti->tcpi_snd_ssthresh = get_tcb_bits(tcb, 487, 460);
	ti->tcpi_snd_cwnd = get_tcb_bits(tcb, 459, 432);
	ti->tcpi_rcv_nxt = get_tcb_bits(tcb, 553, 522);

	ti->tcpi_snd_nxt = get_tcb_bits(tcb, 319, 288) -
	    get_tcb_bits(tcb, 375, 348);

	/* Receive window being advertised by us. */
	ti->tcpi_rcv_space = get_tcb_bits(tcb, 581, 554);

	/* Send window ceiling. */
	v = get_tcb_bits(tcb, 159, 144) << get_tcb_bits(tcb, 131, 128);
	ti->tcpi_snd_wnd = min(v, ti->tcpi_snd_cwnd);
}

/*
 * The TOE driver will not receive any more CPLs for the tid associated with the
 * toepcb; release the hold on the inpcb.
 */
void
final_cpl_received(struct toepcb *toep)
{
	struct inpcb *inp = toep->inp;

	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_CPL_PENDING,
	    ("%s: CPL not pending already?", __func__));

	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);

	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		release_ddp_resources(toep);
	toep->inp = NULL;
	toep->flags &= ~TPF_CPL_PENDING;
	mbufq_drain(&toep->ulp_pdu_reclaimq);

	if (!(toep->flags & TPF_ATTACHED))
		release_offload_resources(toep);

	if (!in_pcbrele_wlocked(inp))
		INP_WUNLOCK(inp);
}

void
insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, ntids);
}

void *
lookup_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	return (t->tid_tab[tid]);
}

void
update_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
}

void
remove_tid(struct adapter *sc, int tid, int ntids)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = NULL;
	atomic_subtract_int(&t->tids_in_use, ntids);
}

void
release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
{
	struct wrqe *wr;
	struct cpl_tid_release *req;

	wr = alloc_wrqe(sizeof(*req), ctrlq);
	if (wr == NULL) {
		queue_tid_release(sc, tid);	/* defer */
		return;
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);

	t4_wrq_tx(sc, wr);
}

static void
queue_tid_release(struct adapter *sc, int tid)
{

	CXGBE_UNIMPLEMENTED("deferred tid release");
}

/*
 * What mtu_idx to use, given a 4-tuple.  Note that both s->mss and tcp_mssopt
 * have the MSS that we should advertise in our SYN.  Advertised MSS doesn't
 * account for any TCP options so the effective MSS (only payload, no headers or
 * options) could be different.  We fill up tp->t_maxseg with the effective MSS
 * at the end of the 3-way handshake.
 */
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
    struct offload_settings *s)
{
	unsigned short *mtus = &sc->params.mtus[0];
	int i, mss, mtu;

	MPASS(inc != NULL);

	mss = s->mss > 0 ? s->mss : tcp_mssopt(inc);
	if (inc->inc_flags & INC_ISIPV6)
		mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr);

	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
		continue;
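	/*
	 * The loop leaves i at the largest entry of the (sorted) MTU table
	 * that does not exceed mtu.  E.g. an IPv4 MSS of 1460 gives
	 * mtu = 1460 + 20 + 20 = 1500 and typically selects a 1500-byte
	 * table entry.
	 */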

	return (i);
}

/*
 * Determine the receive window size for a socket.
 */
u_long
select_rcv_wnd(struct socket *so)
{
	unsigned long wnd;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	wnd = sbspace(&so->so_rcv);
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	return min(wnd, MAX_RCV_WND);
}

int
select_rcv_wscale(void)
{
	int wscale = 0;
	unsigned long space = sb_max;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;
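	/*
	 * This picks the smallest scale that lets a TCP_MAXWIN (65535-byte)
	 * window cover the buffer.  E.g. space = 512KB yields wscale = 4,
	 * since 65535 << 3 still falls just short of 512KB.
	 */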

	return (wscale);
}

/*
 * socket so could be a listening socket too.
 */
uint64_t
calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
    int mtu_idx, int rscale, int rx_credits, int ulp_mode,
    struct offload_settings *s)
{
	int keepalive;
	uint64_t opt0;

	MPASS(so != NULL);
	MPASS(vi != NULL);
	KASSERT(rx_credits <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsiz too high", __func__));

	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits) |
	    V_L2T_IDX(e->idx) | V_SMAC_SEL(vi->smt_idx) |
	    V_TX_CHAN(vi->pi->tx_chan);

	keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE;
	opt0 |= V_KEEP_ALIVE(keepalive != 0);

	if (s->nagle < 0) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);

		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
	} else
		opt0 |= V_NAGLE(s->nagle != 0);

	return htobe64(opt0);
}

uint64_t
select_ntuple(struct vi_info *vi, struct l2t_entry *e)
{
	struct adapter *sc = vi->pi->adapter;
	struct tp_params *tp = &sc->params.tp;
	uint16_t viid = vi->viid;
	uint64_t ntuple = 0;

	/*
	 * Initialize each of the fields which we care about which are present
	 * in the Compressed Filter Tuple.
	 */
	if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE)
		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;

	if (tp->port_shift >= 0)
		ntuple |= (uint64_t)e->lport << tp->port_shift;

	if (tp->protocol_shift >= 0)
		ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;

	if (tp->vnic_shift >= 0) {
		uint32_t vf = G_FW_VIID_VIN(viid);
		uint32_t pf = G_FW_VIID_PFN(viid);
		uint32_t vld = G_FW_VIID_VIVLD(viid);

		ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) |
		    V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift;
	}

	if (is_t4(sc))
		return (htobe32((uint32_t)ntuple));
	else
		return (htobe64(V_FILTER_TUPLE(ntuple)));
}

static int
is_tls_sock(struct socket *so, struct adapter *sc)
{
	struct inpcb *inp = sotoinpcb(so);
	int i, rc;

	/* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */
	rc = 0;
	ADAPTER_LOCK(sc);
	for (i = 0; i < sc->tt.num_tls_rx_ports; i++) {
		if (inp->inp_lport == htons(sc->tt.tls_rx_ports[i]) ||
		    inp->inp_fport == htons(sc->tt.tls_rx_ports[i])) {
			rc = 1;
			break;
		}
	}
	ADAPTER_UNLOCK(sc);
	return (rc);
}

int
select_ulp_mode(struct socket *so, struct adapter *sc,
    struct offload_settings *s)
{

	if (can_tls_offload(sc) &&
	    (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc))))
		return (ULP_MODE_TLS);
	else if (s->ddp > 0 ||
	    (s->ddp < 0 && sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0))
		return (ULP_MODE_TCPDDP);
	else
		return (ULP_MODE_NONE);
}

void
set_ulp_mode(struct toepcb *toep, int ulp_mode)
{

	CTR4(KTR_CXGBE, "%s: toep %p (tid %d) ulp_mode %d",
	    __func__, toep, toep->tid, ulp_mode);
	toep->ulp_mode = ulp_mode;
	tls_init_toep(toep);
	if (toep->ulp_mode == ULP_MODE_TCPDDP)
		ddp_init_toep(toep);
}

int
negative_advice(int status)
{

	return (status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE ||
	    status == CPL_ERR_KEEPALV_NEG_ADVICE);
}

static int
alloc_tid_tab(struct tid_info *t, int flags)
{

	MPASS(t->ntids > 0);
	MPASS(t->tid_tab == NULL);

	t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE,
	    M_ZERO | flags);
	if (t->tid_tab == NULL)
		return (ENOMEM);
	atomic_store_rel_int(&t->tids_in_use, 0);

	return (0);
}

static void
free_tid_tab(struct tid_info *t)
{

	KASSERT(t->tids_in_use == 0,
	    ("%s: %d tids still in use.", __func__, t->tids_in_use));

	free(t->tid_tab, M_CXGBE);
	t->tid_tab = NULL;
}

static int
alloc_stid_tab(struct tid_info *t, int flags)
{

	MPASS(t->nstids > 0);
	MPASS(t->stid_tab == NULL);

	t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
	    M_ZERO | flags);
	if (t->stid_tab == NULL)
		return (ENOMEM);
	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
	t->stids_in_use = 0;
	TAILQ_INIT(&t->stids);
	t->nstids_free_head = t->nstids;

	return (0);
}

static void
free_stid_tab(struct tid_info *t)
{

	KASSERT(t->stids_in_use == 0,
	    ("%s: %d stids still in use.", __func__, t->stids_in_use));

	if (mtx_initialized(&t->stid_lock))
		mtx_destroy(&t->stid_lock);
	free(t->stid_tab, M_CXGBE);
	t->stid_tab = NULL;
}

static void
free_tid_tabs(struct tid_info *t)
{

	free_tid_tab(t);
	free_atid_tab(t);
	free_stid_tab(t);
}

static int
alloc_tid_tabs(struct tid_info *t)
{
	int rc;

	rc = alloc_tid_tab(t, M_NOWAIT);
	if (rc != 0)
		goto failed;

	rc = alloc_atid_tab(t, M_NOWAIT);
	if (rc != 0)
		goto failed;

	rc = alloc_stid_tab(t, M_NOWAIT);
	if (rc != 0)
		goto failed;

	return (0);
failed:
	free_tid_tabs(t);
	return (rc);
}

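/*
 * The CLIP (Compressed Local IP) table holds the local IPv6 addresses the
 * hardware can match against.  add_lip and delete_lip below manage entries
 * through the firmware's FW_CLIP_CMD mailbox command.
 */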
static int
add_lip(struct adapter *sc, struct in6_addr *lip)
{
	struct fw_clip_cmd c;

	ASSERT_SYNCHRONIZED_OP(sc);
	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */

	memset(&c, 0, sizeof(c));
	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_WRITE);
	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
	c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
	c.ip_lo = *(uint64_t *)&lip->s6_addr[8];

	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
}

static int
delete_lip(struct adapter *sc, struct in6_addr *lip)
{
	struct fw_clip_cmd c;

	ASSERT_SYNCHRONIZED_OP(sc);
	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */

	memset(&c, 0, sizeof(c));
	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_READ);
	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
	c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
	c.ip_lo = *(uint64_t *)&lip->s6_addr[8];

	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
}

static struct clip_entry *
search_lip(struct tom_data *td, struct in6_addr *lip)
{
	struct clip_entry *ce;

	mtx_assert(&td->clip_table_lock, MA_OWNED);

	TAILQ_FOREACH(ce, &td->clip_table, link) {
		if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
			return (ce);
	}

	return (NULL);
}

struct clip_entry *
hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce)
{

	mtx_lock(&td->clip_table_lock);
	if (ce == NULL)
		ce = search_lip(td, lip);
	if (ce != NULL)
		ce->refcount++;
	mtx_unlock(&td->clip_table_lock);

	return (ce);
}

void
release_lip(struct tom_data *td, struct clip_entry *ce)
{

	mtx_lock(&td->clip_table_lock);
	KASSERT(search_lip(td, &ce->lip) == ce,
	    ("%s: CLIP entry %p not in CLIP table.", __func__, ce));
	KASSERT(ce->refcount > 0,
	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
	--ce->refcount;
	mtx_unlock(&td->clip_table_lock);
}

static void
init_clip_table(struct adapter *sc, struct tom_data *td)
{

	ASSERT_SYNCHRONIZED_OP(sc);

	mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->clip_table);
	td->clip_gen = -1;

	update_clip_table(sc, td);
}

static void
update_clip(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc"))
		return;

	if (uld_active(sc, ULD_TOM))
		update_clip_table(sc, sc->tom_softc);

	end_synchronized_op(sc, LOCK_HELD);
}

static void
t4_clip_task(void *arg, int count)
{

	t4_iterate(update_clip, NULL);
}

static void
update_clip_table(struct adapter *sc, struct tom_data *td)
{
	struct rm_priotracker in6_ifa_tracker;
	struct in6_ifaddr *ia;
	struct in6_addr *lip, tlip;
	struct clip_head stale;
	struct clip_entry *ce, *ce_temp;
	struct vi_info *vi;
	int rc, gen, i, j;
	uintptr_t last_vnet;

	ASSERT_SYNCHRONIZED_OP(sc);

	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
	mtx_lock(&td->clip_table_lock);

	gen = atomic_load_acq_int(&in6_ifaddr_gen);
	if (gen == td->clip_gen)
		goto done;

	TAILQ_INIT(&stale);
	TAILQ_CONCAT(&stale, &td->clip_table, link);

	/*
	 * last_vnet optimizes the common cases where all if_vnet = NULL (no
	 * VIMAGE) or all if_vnet = vnet0.
	 */
	last_vnet = (uintptr_t)(-1);
	for_each_port(sc, i)
	for_each_vi(sc->port[i], j, vi) {
		if (last_vnet == (uintptr_t)vi->ifp->if_vnet)
			continue;

		/* XXX: races with if_vmove */
		CURVNET_SET(vi->ifp->if_vnet);
		TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
			lip = &ia->ia_addr.sin6_addr;

			KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
			    ("%s: mcast address in in6_ifaddr list", __func__));

			if (IN6_IS_ADDR_LOOPBACK(lip))
				continue;
			if (IN6_IS_SCOPE_EMBED(lip)) {
				/* Remove the embedded scope */
				tlip = *lip;
				lip = &tlip;
				in6_clearscope(lip);
			}
			/*
			 * XXX: how to weed out the link local address for the
			 * loopback interface?  It's fe80::1 usually (always?).
			 */

			/*
			 * If it's in the main list then we already know it's
			 * not stale.
			 */
			TAILQ_FOREACH(ce, &td->clip_table, link) {
				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
					goto next;
			}

			/*
			 * If it's in the stale list we should move it to the
			 * main list.
			 */
			TAILQ_FOREACH(ce, &stale, link) {
				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
					TAILQ_REMOVE(&stale, ce, link);
					TAILQ_INSERT_TAIL(&td->clip_table, ce,
					    link);
					goto next;
				}
			}

			/* A new IP6 address; add it to the CLIP table */
			ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
			if (ce == NULL) {
				/* Skip on allocation failure. */
				goto next;
			}
			memcpy(&ce->lip, lip, sizeof(ce->lip));
			ce->refcount = 0;
			rc = add_lip(sc, lip);
			if (rc == 0)
				TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
			else {
				char ip[INET6_ADDRSTRLEN];

				inet_ntop(AF_INET6, &ce->lip, &ip[0],
				    sizeof(ip));
				log(LOG_ERR, "%s: could not add %s (%d)\n",
				    __func__, ip, rc);
				free(ce, M_CXGBE);
			}
next:
			continue;
		}
		CURVNET_RESTORE();
		last_vnet = (uintptr_t)vi->ifp->if_vnet;
	}

	/*
	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
	 * no longer referenced by the driver.
	 */
	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
		if (ce->refcount == 0) {
			rc = delete_lip(sc, &ce->lip);
			if (rc == 0) {
				TAILQ_REMOVE(&stale, ce, link);
				free(ce, M_CXGBE);
			} else {
				char ip[INET6_ADDRSTRLEN];

				inet_ntop(AF_INET6, &ce->lip, &ip[0],
				    sizeof(ip));
				log(LOG_ERR, "%s: could not delete %s (%d)\n",
				    __func__, ip, rc);
			}
		}
	}
	/* The ones that are still referenced need to stay in the CLIP table */
	TAILQ_CONCAT(&td->clip_table, &stale, link);

	td->clip_gen = gen;
done:
	mtx_unlock(&td->clip_table_lock);
	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
}

static void
destroy_clip_table(struct adapter *sc, struct tom_data *td)
{
	struct clip_entry *ce, *ce_temp;

	if (mtx_initialized(&td->clip_table_lock)) {
		mtx_lock(&td->clip_table_lock);
		TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) {
			KASSERT(ce->refcount == 0,
			    ("%s: CLIP entry %p still in use (%d)", __func__,
			    ce, ce->refcount));
			TAILQ_REMOVE(&td->clip_table, ce, link);
			delete_lip(sc, &ce->lip);
			free(ce, M_CXGBE);
		}
		mtx_unlock(&td->clip_table_lock);
		mtx_destroy(&td->clip_table_lock);
	}
}

static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{

	ASSERT_SYNCHRONIZED_OP(sc);

	KASSERT(TAILQ_EMPTY(&td->toep_list),
	    ("%s: TOE PCB list is not empty.", __func__));
	KASSERT(td->lctx_count == 0,
	    ("%s: lctx hash table is not empty.", __func__));

	t4_free_ppod_region(&td->pr);
	destroy_clip_table(sc, td);

	if (td->listen_mask != 0)
		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);

	if (mtx_initialized(&td->unsent_wr_lock))
		mtx_destroy(&td->unsent_wr_lock);
	if (mtx_initialized(&td->lctx_hash_lock))
		mtx_destroy(&td->lctx_hash_lock);
	if (mtx_initialized(&td->toep_list_lock))
		mtx_destroy(&td->toep_list_lock);

	free_tid_tabs(&sc->tids);
	free(td, M_CXGBE);
}

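/*
 * Build a minimal Ethernet + IP/IPv6 + TCP header image for a would-be
 * connection.  The frame is never transmitted; it exists only so that the
 * offload policy's BPF programs (see lookup_offload_policy below) have
 * something to match against.
 */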
static char *
prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
    int *buflen)
{
	char *pkt;
	struct tcphdr *th;
	int ipv6, len;
	const int maxlen =
	    max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
	    max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
	    sizeof(struct tcphdr);

	MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);

	pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
	if (pkt == NULL)
		return (NULL);

	ipv6 = inp->inp_vflag & INP_IPV6;
	len = 0;

	if (vtag == 0xffff) {
		struct ether_header *eh = (void *)pkt;

		if (ipv6)
			eh->ether_type = htons(ETHERTYPE_IPV6);
		else
			eh->ether_type = htons(ETHERTYPE_IP);

		len += sizeof(*eh);
	} else {
		struct ether_vlan_header *evh = (void *)pkt;

		evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
		evh->evl_tag = htons(vtag);
		if (ipv6)
			evh->evl_proto = htons(ETHERTYPE_IPV6);
		else
			evh->evl_proto = htons(ETHERTYPE_IP);

		len += sizeof(*evh);
	}

	if (ipv6) {
		struct ip6_hdr *ip6 = (void *)&pkt[len];

		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_plen = htons(sizeof(struct tcphdr));
		ip6->ip6_nxt = IPPROTO_TCP;
		if (open_type == OPEN_TYPE_ACTIVE) {
			ip6->ip6_src = inp->in6p_laddr;
			ip6->ip6_dst = inp->in6p_faddr;
		} else if (open_type == OPEN_TYPE_LISTEN) {
			ip6->ip6_src = inp->in6p_laddr;
			ip6->ip6_dst = ip6->ip6_src;
		}

		len += sizeof(*ip6);
	} else {
		struct ip *ip = (void *)&pkt[len];

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_p = IPPROTO_TCP;
		if (open_type == OPEN_TYPE_ACTIVE) {
			ip->ip_src = inp->inp_laddr;
			ip->ip_dst = inp->inp_faddr;
		} else if (open_type == OPEN_TYPE_LISTEN) {
			ip->ip_src = inp->inp_laddr;
			ip->ip_dst = ip->ip_src;
		}

		len += sizeof(*ip);
	}

	th = (void *)&pkt[len];
	if (open_type == OPEN_TYPE_ACTIVE) {
		th->th_sport = inp->inp_lport;	/* network byte order already */
		th->th_dport = inp->inp_fport;	/* ditto */
	} else if (open_type == OPEN_TYPE_LISTEN) {
		th->th_sport = inp->inp_lport;	/* network byte order already */
		th->th_dport = th->th_sport;
	}
	len += sizeof(*th);

	*pktlen = *buflen = len;
	return (pkt);
}

const struct offload_settings *
lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
    uint16_t vtag, struct inpcb *inp)
{
	const struct t4_offload_policy *op;
	char *pkt;
	struct offload_rule *r;
	int i, matched = 0, pktlen, buflen;
	static const struct offload_settings allow_offloading_settings = {
		.offload = 1,
		.rx_coalesce = -1,
		.cong_algo = -1,
		.sched_class = -1,
		.tstamp = -1,
		.sack = -1,
		.nagle = -1,
		.ecn = -1,
		.ddp = -1,
		.tls = -1,
		.txq = -1,
		.rxq = -1,
		.mss = -1,
	};
	static const struct offload_settings disallow_offloading_settings = {
		.offload = 0,
		/* rest is irrelevant when offload is off. */
	};

	rw_assert(&sc->policy_lock, RA_LOCKED);

	/*
	 * If there's no Connection Offloading Policy attached to the device
	 * then we need to return a default static policy.  If
	 * "cop_managed_offloading" is true, then we need to disallow
	 * offloading until a COP is attached to the device.  Otherwise we
	 * allow offloading ...
	 */
	op = sc->policy;
	if (op == NULL) {
		if (sc->tt.cop_managed_offloading)
			return (&disallow_offloading_settings);
		else
			return (&allow_offloading_settings);
	}

	switch (open_type) {
	case OPEN_TYPE_ACTIVE:
	case OPEN_TYPE_LISTEN:
		pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
		break;
	case OPEN_TYPE_PASSIVE:
		MPASS(m != NULL);
		pkt = mtod(m, char *);
		MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
		pkt += sizeof(struct cpl_pass_accept_req);
		pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
		buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
		break;
	default:
		MPASS(0);
		return (&disallow_offloading_settings);
	}

	if (pkt == NULL || pktlen == 0 || buflen == 0)
		return (&disallow_offloading_settings);

	r = &op->rule[0];
	for (i = 0; i < op->nrules; i++, r++) {
		if (r->open_type != open_type &&
		    r->open_type != OPEN_TYPE_DONTCARE) {
			continue;
		}
		matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
		if (matched)
			break;
	}

	if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
		free(pkt, M_CXGBE);

	return (matched ? &r->settings : &disallow_offloading_settings);
}

static void
reclaim_wr_resources(void *arg, int count)
{
	struct tom_data *td = arg;
	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
	struct cpl_act_open_req *cpl;
	u_int opcode, atid;
	struct wrqe *wr;
	struct adapter *sc;

	mtx_lock(&td->unsent_wr_lock);
	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
	mtx_unlock(&td->unsent_wr_lock);

	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
		STAILQ_REMOVE_HEAD(&twr_list, link);

		cpl = wrtod(wr);
		opcode = GET_OPCODE(cpl);

		switch (opcode) {
		case CPL_ACT_OPEN_REQ:
		case CPL_ACT_OPEN_REQ6:
			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
			sc = td_adapter(td);

			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
			free(wr, M_CXGBE);
			break;
		default:
			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
			/* WR not freed here; go look at it with a debugger.  */
		}
	}
}

/*
 * Ground control to Major TOM
 * Commencing countdown, engines on
 */
static int
t4_tom_activate(struct adapter *sc)
{
	struct tom_data *td;
	struct toedev *tod;
	struct vi_info *vi;
	struct sge_ofld_rxq *ofld_rxq;
	int i, j, rc, v;

	ASSERT_SYNCHRONIZED_OP(sc);

	/* per-adapter softc for TOM */
	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
	if (td == NULL)
		return (ENOMEM);

	/* List of TOE PCBs and associated lock */
	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->toep_list);

	/* Listen context */
	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
	    &td->listen_mask, HASH_NOWAIT);

	/* List of WRs for which L2 resolution failed */
	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
	STAILQ_INIT(&td->unsent_wr_list);
	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);

	/* TID tables */
	rc = alloc_tid_tabs(&sc->tids);
	if (rc != 0)
		goto done;

	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
	if (rc != 0)
		goto done;
	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);

	/* CLIP table for IPv6 offload */
	init_clip_table(sc, td);

	/* toedev ops */
	tod = &td->tod;
	init_toedev(tod);
	tod->tod_softc = sc;
	tod->tod_connect = t4_connect;
	tod->tod_listen_start = t4_listen_start;
	tod->tod_listen_stop = t4_listen_stop;
	tod->tod_rcvd = t4_rcvd;
	tod->tod_output = t4_tod_output;
	tod->tod_send_rst = t4_send_rst;
	tod->tod_send_fin = t4_send_fin;
	tod->tod_pcb_detach = t4_pcb_detach;
	tod->tod_l2_update = t4_l2_update;
	tod->tod_syncache_added = t4_syncache_added;
	tod->tod_syncache_removed = t4_syncache_removed;
	tod->tod_syncache_respond = t4_syncache_respond;
	tod->tod_offload_socket = t4_offload_socket;
	tod->tod_ctloutput = t4_ctloutput;
#if 0
	tod->tod_tcp_info = t4_tcp_info;
#else
	(void)&t4_tcp_info;
#endif
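	/*
	 * t4_tcp_info is compiled but not hooked up above; the (void) cast
	 * only keeps the compiler from flagging the function as unused.
	 */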

	for_each_port(sc, i) {
		for_each_vi(sc->port[i], v, vi) {
			TOEDEV(vi->ifp) = &td->tod;
			for_each_ofld_rxq(vi, j, ofld_rxq) {
				ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl;
				ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2;
			}
		}
	}

	sc->tom_softc = td;
	register_toedev(sc->tom_softc);

done:
	if (rc != 0)
		free_tom_data(sc, td);
	return (rc);
}

static int
t4_tom_deactivate(struct adapter *sc)
{
	int rc = 0;
	struct tom_data *td = sc->tom_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (td == NULL)
		return (0);	/* XXX. KASSERT? */

	if (sc->offload_map != 0)
		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */

	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
		return (EBUSY);	/* both iWARP and iSCSI rely on the TOE. */

	mtx_lock(&td->toep_list_lock);
	if (!TAILQ_EMPTY(&td->toep_list))
		rc = EBUSY;
	mtx_unlock(&td->toep_list_lock);

	mtx_lock(&td->lctx_hash_lock);
	if (td->lctx_count > 0)
		rc = EBUSY;
	mtx_unlock(&td->lctx_hash_lock);

	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
	mtx_lock(&td->unsent_wr_lock);
	if (!STAILQ_EMPTY(&td->unsent_wr_list))
		rc = EBUSY;
	mtx_unlock(&td->unsent_wr_lock);

	if (rc == 0) {
		unregister_toedev(sc->tom_softc);
		free_tom_data(sc, td);
		sc->tom_softc = NULL;
	}

	return (rc);
}

static void
t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp)
{

	atomic_add_rel_int(&in6_ifaddr_gen, 1);
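	/*
	 * Schedule the CLIP update roughly hz/4 ticks from now.  The negative
	 * ticks value asks the taskqueue not to reschedule a timeout that is
	 * already pending, which coalesces bursts of address changes into a
	 * single table update.
	 */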
	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
}

static int
t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int error;

	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		error = t4_aio_queue_ddp(so, job);
		if (error != EOPNOTSUPP)
			return (error);
	}

	return (t4_aio_queue_aiotx(so, job));
}

static int
t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
{

	if (sopt->sopt_level != IPPROTO_TCP)
		return (tcp_ctloutput(so, sopt));

	switch (sopt->sopt_name) {
	case TCP_TLSOM_SET_TLS_CONTEXT:
	case TCP_TLSOM_GET_TLS_TOM:
	case TCP_TLSOM_CLR_TLS_TOM:
	case TCP_TLSOM_CLR_QUIES:
		return (t4_ctloutput_tls(so, sopt));
	default:
		return (tcp_ctloutput(so, sopt));
	}
}

static int
t4_tom_mod_load(void)
{
	struct protosw *tcp_protosw, *tcp6_protosw;

	/* CPL handlers */
	t4_init_connect_cpl_handlers();
	t4_init_listen_cpl_handlers();
	t4_init_cpl_io_handlers();

	t4_ddp_mod_load();
	t4_tls_mod_load();

	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	if (tcp_protosw == NULL)
		return (ENOPROTOOPT);
	bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw));
	bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs));
	toe_usrreqs.pru_aio_queue = t4_aio_queue_tom;
	toe_protosw.pr_ctloutput = t4_ctloutput_tom;
	toe_protosw.pr_usrreqs = &toe_usrreqs;

	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
	if (tcp6_protosw == NULL)
		return (ENOPROTOOPT);
	bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
	bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs));
	toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom;
	toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
	toe6_protosw.pr_usrreqs = &toe6_usrreqs;

	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,
	    t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);

	return (t4_register_uld(&tom_uld_info));
}

static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
		return;

	/* Try to free resources (works only if no port has IFCAP_TOE) */
	if (uld_active(sc, ULD_TOM))
		t4_deactivate_uld(sc, ULD_TOM);

	end_synchronized_op(sc, 0);
}

static int
t4_tom_mod_unload(void)
{
	t4_iterate(tom_uninit, NULL);

	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
		return (EBUSY);

	if (ifaddr_evhandler) {
		EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler);
		taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL);
	}

	t4_tls_mod_unload();
	t4_ddp_mod_unload();

	t4_uninit_connect_cpl_handlers();
	t4_uninit_listen_cpl_handlers();
	t4_uninit_cpl_io_handlers();

	return (0);
}
#endif	/* TCP_OFFLOAD */

static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = t4_tom_mod_load();
		break;

	case MOD_UNLOAD:
		rc = t4_tom_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif
	return (rc);
}

static moduledata_t t4_tom_moddata = {
	"t4_tom",
	t4_tom_modevent,
	0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);