Side-by-side diff listing (full/compact view) of sys/dev/cxgbe/tom/t4_listen.c:
left column = r245937 (deleted lines), right column = r248925 (added lines).
1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/dev/cxgbe/tom/t4_listen.c 245937 2013-01-26 03:23:28Z np $");
29__FBSDID("$FreeBSD: head/sys/dev/cxgbe/tom/t4_listen.c 248925 2013-03-30 02:26:20Z np $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/protosw.h>
41#include <sys/refcount.h>
42#include <sys/domain.h>
43#include <sys/fnv_hash.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <net/ethernet.h>
47#include <net/if.h>
48#include <net/if_types.h>
49#include <net/if_vlan_var.h>
50#include <net/route.h>
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/ip.h>
54#include <netinet/ip6.h>
55#include <netinet6/scope6_var.h>
56#include <netinet/tcp_timer.h>
57#include <netinet/tcp_var.h>
58#define TCPSTATES
59#include <netinet/tcp_fsm.h>
60#include <netinet/toecore.h>
61
62#include "common/common.h"
63#include "common/t4_msg.h"
64#include "common/t4_regs.h"
65#include "tom/t4_tom_l2t.h"
66#include "tom/t4_tom.h"
67
68/* stid services */
69static int alloc_stid(struct adapter *, struct listen_ctx *, int);
70static struct listen_ctx *lookup_stid(struct adapter *, int);
71static void free_stid(struct adapter *, struct listen_ctx *);
72
73/* lctx services */
74static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
75 struct port_info *);
76static int free_lctx(struct adapter *, struct listen_ctx *);
77static void hold_lctx(struct listen_ctx *);
78static void listen_hash_add(struct adapter *, struct listen_ctx *);
79static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
80static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
81static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
82
83static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *);
84static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
85static void send_reset_synqe(struct toedev *, struct synq_entry *);
86
/*
 * NOTE(review): this span is the r245937 (left/old) side of the side-by-side
 * diff listing.  The functions below (alloc_stid .. release_lctx) are
 * repeated verbatim in the r248925 copy further down in this listing; the
 * two revisions differ only inside send_reset_synqe (roundup -> roundup2).
 * Code left byte-identical here.
 */
87 static int
88 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
89 {
90 struct tid_info *t = &sc->tids;
91 u_int stid, n, f, mask;
92 struct stid_region *sr = &lctx->stid_region;
93 
94 /*
95 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
96 * the TCAM. The start of the stid region is properly aligned (the chip
97 * requires each region to be 128-cell aligned).
98 */
99 n = isipv6 ? 2 : 1;
100 mask = n - 1;
101 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
102 ("%s: stid region (%u, %u) not properly aligned. n = %u",
103 __func__, t->stid_base, t->nstids, n));
104 
105 mtx_lock(&t->stid_lock);
106 if (n > t->nstids - t->stids_in_use) {
107 mtx_unlock(&t->stid_lock);
108 return (-1);
109 }
110 
111 if (t->nstids_free_head >= n) {
112 /*
113 * This allocation will definitely succeed because the region
114 * starts at a good alignment and we just checked we have enough
115 * stids free.
116 */
117 f = t->nstids_free_head & mask;
118 t->nstids_free_head -= n + f;
119 stid = t->nstids_free_head;
120 TAILQ_INSERT_HEAD(&t->stids, sr, link);
121 } else {
122 struct stid_region *s;
123 
124 stid = t->nstids_free_head;
125 TAILQ_FOREACH(s, &t->stids, link) {
126 stid += s->used + s->free;
127 f = stid & mask;
128 if (n <= s->free - f) {
129 stid -= n + f;
130 s->free -= n + f;
131 TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
132 goto allocated;
133 }
134 }
135 
136 if (__predict_false(stid != t->nstids)) {
137 panic("%s: stids TAILQ (%p) corrupt."
138 " At %d instead of %d at the end of the queue.",
139 __func__, &t->stids, stid, t->nstids);
140 }
141 
142 mtx_unlock(&t->stid_lock);
143 return (-1);
144 }
145 
146allocated:
147 sr->used = n;
148 sr->free = f;
149 t->stids_in_use += n;
150 t->stid_tab[stid] = lctx;
151 mtx_unlock(&t->stid_lock);
152 
153 KASSERT(((stid + t->stid_base) & mask) == 0,
154 ("%s: EDOOFUS.", __func__));
155 return (stid + t->stid_base);
156}
157
158 static struct listen_ctx *
159 lookup_stid(struct adapter *sc, int stid)
160 {
161 struct tid_info *t = &sc->tids;
162 
163 return (t->stid_tab[stid - t->stid_base]);
164}
165
166 static void
167 free_stid(struct adapter *sc, struct listen_ctx *lctx)
168 {
169 struct tid_info *t = &sc->tids;
170 struct stid_region *sr = &lctx->stid_region;
171 struct stid_region *s;
172 
173 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
174 
175 mtx_lock(&t->stid_lock);
176 s = TAILQ_PREV(sr, stid_head, link);
177 if (s != NULL)
178 s->free += sr->used + sr->free;
179 else
180 t->nstids_free_head += sr->used + sr->free;
181 KASSERT(t->stids_in_use >= sr->used,
182 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
183 t->stids_in_use, sr->used));
184 t->stids_in_use -= sr->used;
185 TAILQ_REMOVE(&t->stids, sr, link);
186 mtx_unlock(&t->stid_lock);
187}
188
189 static struct listen_ctx *
190 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi)
191 {
192 struct listen_ctx *lctx;
193 
194 INP_WLOCK_ASSERT(inp);
195 
196 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
197 if (lctx == NULL)
198 return (NULL);
199 
200 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
201 if (lctx->stid < 0) {
202 free(lctx, M_CXGBE);
203 return (NULL);
204 }
205 
206 lctx->ctrlq = &sc->sge.ctrlq[pi->port_id];
207 lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq];
208 refcount_init(&lctx->refcount, 1);
209 TAILQ_INIT(&lctx->synq);
210 
211 lctx->inp = inp;
212 in_pcbref(inp);
213 
214 return (lctx);
215}
216
217/* Don't call this directly, use release_lctx instead */
218 static int
219 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
220 {
221 struct inpcb *inp = lctx->inp;
222 
223 INP_WLOCK_ASSERT(inp);
224 KASSERT(lctx->refcount == 0,
225 ("%s: refcount %d", __func__, lctx->refcount));
226 KASSERT(TAILQ_EMPTY(&lctx->synq),
227 ("%s: synq not empty.", __func__));
228 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
229 
230 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
231 __func__, lctx->stid, lctx, lctx->inp);
232 
233 free_stid(sc, lctx);
234 free(lctx, M_CXGBE);
235 
236 return (in_pcbrele_wlocked(inp));
237}
238
239 static void
240 hold_lctx(struct listen_ctx *lctx)
241 {
242 
243 refcount_acquire(&lctx->refcount);
244}
245
246 static inline uint32_t
247 listen_hashfn(void *key, u_long mask)
248 {
249 
250 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
251}
252
253/*
254 * Add a listen_ctx entry to the listen hash table.
255 */
256 static void
257 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
258 {
259 struct tom_data *td = sc->tom_softc;
260 int bucket = listen_hashfn(lctx->inp, td->listen_mask);
261 
262 mtx_lock(&td->lctx_hash_lock);
263 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
264 td->lctx_count++;
265 mtx_unlock(&td->lctx_hash_lock);
266}
267
268/*
269 * Look for the listening socket's context entry in the hash and return it.
270 */
271 static struct listen_ctx *
272 listen_hash_find(struct adapter *sc, struct inpcb *inp)
273 {
274 struct tom_data *td = sc->tom_softc;
275 int bucket = listen_hashfn(inp, td->listen_mask);
276 struct listen_ctx *lctx;
277 
278 mtx_lock(&td->lctx_hash_lock);
279 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
280 if (lctx->inp == inp)
281 break;
282 }
283 mtx_unlock(&td->lctx_hash_lock);
284 
285 return (lctx);
286}
287
288/*
289 * Removes the listen_ctx structure for inp from the hash and returns it.
290 */
291 static struct listen_ctx *
292 listen_hash_del(struct adapter *sc, struct inpcb *inp)
293 {
294 struct tom_data *td = sc->tom_softc;
295 int bucket = listen_hashfn(inp, td->listen_mask);
296 struct listen_ctx *lctx, *l;
297 
298 mtx_lock(&td->lctx_hash_lock);
299 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
300 if (lctx->inp == inp) {
301 LIST_REMOVE(lctx, link);
302 td->lctx_count--;
303 break;
304 }
305 }
306 mtx_unlock(&td->lctx_hash_lock);
307 
308 return (lctx);
309}
310
311/*
312 * Releases a hold on the lctx. Must be called with the listening socket's inp
313 * locked. The inp may be freed by this function and it returns NULL to
314 * indicate this.
315 */
316 static struct inpcb *
317 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
318 {
319 struct inpcb *inp = lctx->inp;
320 int inp_freed = 0;
321 
322 INP_WLOCK_ASSERT(inp);
323 if (refcount_release(&lctx->refcount))
324 inp_freed = free_lctx(sc, lctx);
325 
326 return (inp_freed ? NULL : inp);
327}
328
329static void
330send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
331{
332 struct adapter *sc = tod->tod_softc;
333 struct mbuf *m = synqe->syn;
334 struct ifnet *ifp = m->m_pkthdr.rcvif;
335 struct port_info *pi = ifp->if_softc;
336 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
337 struct wrqe *wr;
338 struct fw_flowc_wr *flowc;
339 struct cpl_abort_req *req;
340 int txqid, rxqid, flowclen;
341 struct sge_wrq *ofld_txq;
342 struct sge_ofld_rxq *ofld_rxq;
343 const int nparams = 6;
344 unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
345
346 INP_WLOCK_ASSERT(synqe->lctx->inp);
347
348 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
349 __func__, synqe, synqe->flags, synqe->tid,
350 synqe->flags & TPF_ABORT_SHUTDOWN ?
351 " (abort already in progress)" : "");
352 if (synqe->flags & TPF_ABORT_SHUTDOWN)
353 return; /* abort already in progress */
354 synqe->flags |= TPF_ABORT_SHUTDOWN;
355
356 get_qids_from_mbuf(m, &txqid, &rxqid);
357 ofld_txq = &sc->sge.ofld_txq[txqid];
358 ofld_rxq = &sc->sge.ofld_rxq[rxqid];
359
360 /* The wrqe will have two WRs - a flowc followed by an abort_req */
361 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
362
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#ifdef TCP_OFFLOAD
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/protosw.h>
41#include <sys/refcount.h>
42#include <sys/domain.h>
43#include <sys/fnv_hash.h>
44#include <sys/socket.h>
45#include <sys/socketvar.h>
46#include <net/ethernet.h>
47#include <net/if.h>
48#include <net/if_types.h>
49#include <net/if_vlan_var.h>
50#include <net/route.h>
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/ip.h>
54#include <netinet/ip6.h>
55#include <netinet6/scope6_var.h>
56#include <netinet/tcp_timer.h>
57#include <netinet/tcp_var.h>
58#define TCPSTATES
59#include <netinet/tcp_fsm.h>
60#include <netinet/toecore.h>
61
62#include "common/common.h"
63#include "common/t4_msg.h"
64#include "common/t4_regs.h"
65#include "tom/t4_tom_l2t.h"
66#include "tom/t4_tom.h"
67
68/* stid services */
69static int alloc_stid(struct adapter *, struct listen_ctx *, int);
70static struct listen_ctx *lookup_stid(struct adapter *, int);
71static void free_stid(struct adapter *, struct listen_ctx *);
72
73/* lctx services */
74static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
75 struct port_info *);
76static int free_lctx(struct adapter *, struct listen_ctx *);
77static void hold_lctx(struct listen_ctx *);
78static void listen_hash_add(struct adapter *, struct listen_ctx *);
79static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
80static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
81static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
82
83static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *);
84static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
85static void send_reset_synqe(struct toedev *, struct synq_entry *);
86
/*
 * Allocate 'n' consecutive, naturally aligned stids (n = 2 for an IPv6
 * listener, 1 for IPv4) from the adapter's stid region, record the
 * allocation in lctx->stid_region (linked into t->stids in address order),
 * and return the absolute stid (relative index + stid_base).
 * Returns -1 if no suitably aligned free range exists.
 */
87 static int
88 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
89 {
90 struct tid_info *t = &sc->tids;
91 u_int stid, n, f, mask;
92 struct stid_region *sr = &lctx->stid_region;
93 
94 /*
95 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
96 * the TCAM. The start of the stid region is properly aligned (the chip
97 * requires each region to be 128-cell aligned).
98 */
99 n = isipv6 ? 2 : 1;
100 mask = n - 1;
101 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
102 ("%s: stid region (%u, %u) not properly aligned. n = %u",
103 __func__, t->stid_base, t->nstids, n));
104 
105 mtx_lock(&t->stid_lock);
106 if (n > t->nstids - t->stids_in_use) {
107 mtx_unlock(&t->stid_lock);
108 return (-1);
109 }
110 
/* Fast path: carve the allocation out of the free run at the head of the
 * region.  'f' stids are left free below the allocation for alignment. */
111 if (t->nstids_free_head >= n) {
112 /*
113 * This allocation will definitely succeed because the region
114 * starts at a good alignment and we just checked we have enough
115 * stids free.
116 */
117 f = t->nstids_free_head & mask;
118 t->nstids_free_head -= n + f;
119 stid = t->nstids_free_head;
120 TAILQ_INSERT_HEAD(&t->stids, sr, link);
121 } else {
122 struct stid_region *s;
123 
/* Slow path: walk existing regions; 'stid' tracks the running end offset
 * of each region's used+free span, looking for a gap that can hold n
 * aligned stids. */
124 stid = t->nstids_free_head;
125 TAILQ_FOREACH(s, &t->stids, link) {
126 stid += s->used + s->free;
127 f = stid & mask;
128 if (n <= s->free - f) {
129 stid -= n + f;
130 s->free -= n + f;
131 TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
132 goto allocated;
133 }
134 }
135 
/* After a full walk 'stid' must equal nstids or the list is corrupt. */
136 if (__predict_false(stid != t->nstids)) {
137 panic("%s: stids TAILQ (%p) corrupt."
138 " At %d instead of %d at the end of the queue.",
139 __func__, &t->stids, stid, t->nstids);
140 }
141 
142 mtx_unlock(&t->stid_lock);
143 return (-1);
144 }
145 
146allocated:
147 sr->used = n;
148 sr->free = f;
149 t->stids_in_use += n;
150 t->stid_tab[stid] = lctx;
151 mtx_unlock(&t->stid_lock);
152 
153 KASSERT(((stid + t->stid_base) & mask) == 0,
154 ("%s: EDOOFUS.", __func__));
155 return (stid + t->stid_base);
156}
157
/* Map an absolute stid back to its listen_ctx via the stid table. */
158 static struct listen_ctx *
159 lookup_stid(struct adapter *sc, int stid)
160 {
161 struct tid_info *t = &sc->tids;
162 
163 return (t->stid_tab[stid - t->stid_base]);
164}
165
/*
 * Return lctx's stid range (used + trailing alignment-free stids) to the
 * free pool: the range is merged into the previous region's free count,
 * or into the free run at the head of the region if there is no
 * predecessor, and the stid_region is unlinked from t->stids.
 */
166 static void
167 free_stid(struct adapter *sc, struct listen_ctx *lctx)
168 {
169 struct tid_info *t = &sc->tids;
170 struct stid_region *sr = &lctx->stid_region;
171 struct stid_region *s;
172 
173 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
174 
175 mtx_lock(&t->stid_lock);
176 s = TAILQ_PREV(sr, stid_head, link);
177 if (s != NULL)
178 s->free += sr->used + sr->free;
179 else
180 t->nstids_free_head += sr->used + sr->free;
181 KASSERT(t->stids_in_use >= sr->used,
182 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
183 t->stids_in_use, sr->used));
184 t->stids_in_use -= sr->used;
185 TAILQ_REMOVE(&t->stids, sr, link);
186 mtx_unlock(&t->stid_lock);
187}
188
/*
 * Allocate and initialize a listen_ctx for the listening inp: reserve an
 * stid, pick the control queue / offload rx queue for port 'pi', and take
 * a reference on the inp.  Returns NULL on allocation or stid failure.
 * Caller must hold the inp write lock.
 */
189 static struct listen_ctx *
190 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi)
191 {
192 struct listen_ctx *lctx;
193 
194 INP_WLOCK_ASSERT(inp);
195 
196 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
197 if (lctx == NULL)
198 return (NULL);
199 
200 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
201 if (lctx->stid < 0) {
202 free(lctx, M_CXGBE);
203 return (NULL);
204 }
205 
206 lctx->ctrlq = &sc->sge.ctrlq[pi->port_id];
207 lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq];
208 refcount_init(&lctx->refcount, 1);
209 TAILQ_INIT(&lctx->synq);
210 
/* The lctx holds a reference on the inp for its whole lifetime. */
211 lctx->inp = inp;
212 in_pcbref(inp);
213 
214 return (lctx);
215}
216
217/* Don't call this directly, use release_lctx instead */
218 static int
219 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
220 {
221 struct inpcb *inp = lctx->inp;
222 
223 INP_WLOCK_ASSERT(inp);
224 KASSERT(lctx->refcount == 0,
225 ("%s: refcount %d", __func__, lctx->refcount));
226 KASSERT(TAILQ_EMPTY(&lctx->synq),
227 ("%s: synq not empty.", __func__));
228 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
229 
230 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
231 __func__, lctx->stid, lctx, lctx->inp);
232 
233 free_stid(sc, lctx);
234 free(lctx, M_CXGBE);
235 
/* Drop the inp ref taken in alloc_lctx; non-zero means inp was freed. */
236 return (in_pcbrele_wlocked(inp));
237}
238
/* Take an additional reference on the lctx. */
239 static void
240 hold_lctx(struct listen_ctx *lctx)
241 {
242 
243 refcount_acquire(&lctx->refcount);
244}
245
/*
 * Hash a pointer key (the inp) into a bucket index.  Note the hash is over
 * the pointer value itself (&key, sizeof(key)), not what it points to.
 */
246 static inline uint32_t
247 listen_hashfn(void *key, u_long mask)
248 {
249 
250 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
251}
252
253/*
254 * Add a listen_ctx entry to the listen hash table.
255 */
256 static void
257 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
258 {
259 struct tom_data *td = sc->tom_softc;
260 int bucket = listen_hashfn(lctx->inp, td->listen_mask);
261 
262 mtx_lock(&td->lctx_hash_lock);
263 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
264 td->lctx_count++;
265 mtx_unlock(&td->lctx_hash_lock);
266}
267
268/*
269 * Look for the listening socket's context entry in the hash and return it.
270 */
271 static struct listen_ctx *
272 listen_hash_find(struct adapter *sc, struct inpcb *inp)
273 {
274 struct tom_data *td = sc->tom_softc;
275 int bucket = listen_hashfn(inp, td->listen_mask);
276 struct listen_ctx *lctx;
277 
278 mtx_lock(&td->lctx_hash_lock);
279 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
280 if (lctx->inp == inp)
281 break;
282 }
283 mtx_unlock(&td->lctx_hash_lock);
284 
/* NULL if the loop ran off the end without a match. */
285 return (lctx);
286}
287
288/*
289 * Removes the listen_ctx structure for inp from the hash and returns it.
290 */
291 static struct listen_ctx *
292 listen_hash_del(struct adapter *sc, struct inpcb *inp)
293 {
294 struct tom_data *td = sc->tom_softc;
295 int bucket = listen_hashfn(inp, td->listen_mask);
296 struct listen_ctx *lctx, *l;
297 
298 mtx_lock(&td->lctx_hash_lock);
299 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
300 if (lctx->inp == inp) {
301 LIST_REMOVE(lctx, link);
302 td->lctx_count--;
303 break;
304 }
305 }
306 mtx_unlock(&td->lctx_hash_lock);
307 
308 return (lctx);
309}
310
311/*
312 * Releases a hold on the lctx. Must be called with the listening socket's inp
313 * locked. The inp may be freed by this function and it returns NULL to
314 * indicate this.
315 */
316 static struct inpcb *
317 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
318 {
319 struct inpcb *inp = lctx->inp;
320 int inp_freed = 0;
321 
322 INP_WLOCK_ASSERT(inp);
323 if (refcount_release(&lctx->refcount))
324 inp_freed = free_lctx(sc, lctx);
325 
326 return (inp_freed ? NULL : inp);
327}
328
/*
 * Abort an embryonic (syn-queue) connection: build a single work request
 * containing a FLOWC WR (required before any other WR on the tid) followed
 * by a CPL_ABORT_REQ, and send it via the L2T entry for the peer.
 * No-op if an abort is already in progress.  Caller holds the listening
 * inp's write lock.
 *
 * NOTE(review): this span contained unresolved diff residue — both the
 * r245937 `roundup` and r248925 `roundup2` forms of lines 363/369 were
 * present, and the `wr == NULL` / `wrtod` block (lines 364-368) appeared
 * twice.  Resolved to the r248925 text (matching this copy's __FBSDID),
 * which uses the power-of-2 rounding macro roundup2.
 */
329 static void
330 send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
331 {
332 struct adapter *sc = tod->tod_softc;
333 struct mbuf *m = synqe->syn;
334 struct ifnet *ifp = m->m_pkthdr.rcvif;
335 struct port_info *pi = ifp->if_softc;
336 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
337 struct wrqe *wr;
338 struct fw_flowc_wr *flowc;
339 struct cpl_abort_req *req;
340 int txqid, rxqid, flowclen;
341 struct sge_wrq *ofld_txq;
342 struct sge_ofld_rxq *ofld_rxq;
343 const int nparams = 6;
344 unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
345 
346 INP_WLOCK_ASSERT(synqe->lctx->inp);
347 
348 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
349 __func__, synqe, synqe->flags, synqe->tid,
350 synqe->flags & TPF_ABORT_SHUTDOWN ?
351 " (abort already in progress)" : "");
352 if (synqe->flags & TPF_ABORT_SHUTDOWN)
353 return; /* abort already in progress */
354 synqe->flags |= TPF_ABORT_SHUTDOWN;
355 
356 get_qids_from_mbuf(m, &txqid, &rxqid);
357 ofld_txq = &sc->sge.ofld_txq[txqid];
358 ofld_rxq = &sc->sge.ofld_rxq[rxqid];
359 
360 /* The wrqe will have two WRs - a flowc followed by an abort_req */
361 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
362 
363 wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
364 if (wr == NULL) {
365 /* XXX */
366 panic("%s: allocation failure.", __func__);
367 }
368 flowc = wrtod(wr);
/* The abort_req starts at the next EQ_ESIZE boundary after the flowc. */
369 req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
370 
371 /* First the flowc ... */
372 memset(flowc, 0, wr->wr_len);
373 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
374 V_FW_FLOWC_WR_NPARAMS(nparams));
375 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
376 V_FW_WR_FLOWID(synqe->tid));
377 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
378 flowc->mnemval[0].val = htobe32(pfvf);
379 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
380 flowc->mnemval[1].val = htobe32(pi->tx_chan);
381 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
382 flowc->mnemval[2].val = htobe32(pi->tx_chan);
383 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
384 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
385 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
386 flowc->mnemval[4].val = htobe32(512);
387 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
388 flowc->mnemval[5].val = htobe32(512);
389 synqe->flags |= TPF_FLOWC_WR_SENT;
390 
391 /* ... then ABORT request */
392 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
393 req->rsvd0 = 0; /* don't have a snd_nxt */
394 req->rsvd1 = 1; /* no data sent yet */
395 req->cmd = CPL_ABORT_SEND_RST;
396 
397 t4_l2t_send(sc, wr, e);
398}
399
/*
 * Send a CPL_PASS_OPEN_REQ to start an IPv4 hardware listener for lctx's
 * stid, bound to the inp's local port/address (peer wildcarded).  SYNs are
 * steered to lctx's offload rx queue.  Returns 0 or ENOMEM.
 */
400 static int
401 create_server(struct adapter *sc, struct listen_ctx *lctx)
402 {
403 struct wrqe *wr;
404 struct cpl_pass_open_req *req;
405 struct inpcb *inp = lctx->inp;
406 
407 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
408 if (wr == NULL) {
409 log(LOG_ERR, "%s: allocation failure", __func__);
410 return (ENOMEM);
411 }
412 req = wrtod(wr);
413 
414 INIT_TP_WR(req, 0);
415 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
416 req->local_port = inp->inp_lport;
417 req->peer_port = 0;
418 req->local_ip = inp->inp_laddr.s_addr;
419 req->peer_ip = 0;
420 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
421 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
422 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
423 
424 t4_wrq_tx(sc, wr);
425 return (0);
426}
427
/* IPv6 variant of create_server: CPL_PASS_OPEN_REQ6 with a 128-bit local
 * address split into hi/lo 64-bit halves. */
428 static int
429 create_server6(struct adapter *sc, struct listen_ctx *lctx)
430 {
431 struct wrqe *wr;
432 struct cpl_pass_open_req6 *req;
433 struct inpcb *inp = lctx->inp;
434 
435 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
436 if (wr == NULL) {
437 log(LOG_ERR, "%s: allocation failure", __func__);
438 return (ENOMEM);
439 }
440 req = wrtod(wr);
441 
442 INIT_TP_WR(req, 0);
443 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
444 req->local_port = inp->inp_lport;
445 req->peer_port = 0;
446 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
447 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
448 req->peer_ip_hi = 0;
449 req->peer_ip_lo = 0;
450 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
451 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
452 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
453 
454 t4_wrq_tx(sc, wr);
455 return (0);
456}
457
/*
 * Send a CPL_CLOSE_LISTSRV_REQ to tear down the hardware listener; the
 * reply (do_close_server_rpl) completes the cleanup.  Always returns 0
 * (allocation failure panics).
 */
458 static int
459 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
460 {
461 struct wrqe *wr;
462 struct cpl_close_listsvr_req *req;
463 
464 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
465 if (wr == NULL) {
466 /* XXX */
467 panic("%s: allocation failure.", __func__);
468 }
469 req = wrtod(wr);
470 
471 INIT_TP_WR(req, 0);
472 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
473 lctx->stid));
474 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
475 req->rsvd = htobe16(0);
476 
477 t4_wrq_tx(sc, wr);
478 return (0);
479}
480
481/*
482 * Start a listening server by sending a passive open request to HW.
483 *
484 * Can't take adapter lock here and access to sc->flags, sc->open_device_map,
485 * sc->offload_map, if_capenable are all race prone.
486 */
487 int
488 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
489 {
490 struct adapter *sc = tod->tod_softc;
491 struct port_info *pi;
492 struct inpcb *inp = tp->t_inpcb;
493 struct listen_ctx *lctx;
494 int i, rc;
495 
496 INP_WLOCK_ASSERT(inp);
497 
498#if 0
499 ADAPTER_LOCK(sc);
500 if (IS_BUSY(sc)) {
501 log(LOG_ERR, "%s: listen request ignored, %s is busy",
502 __func__, device_get_nameunit(sc->dev));
503 goto done;
504 }
505 
506 KASSERT(sc->flags & TOM_INIT_DONE,
507 ("%s: TOM not initialized", __func__));
508#endif
509 
510 if ((sc->open_device_map & sc->offload_map) == 0)
511 goto done; /* no port that's UP with IFCAP_TOE enabled */
512 
513 /*
514 * Find a running port with IFCAP_TOE (4 or 6). We'll use the first
515 * such port's queues to send the passive open and receive the reply to
516 * it.
517 *
518 * XXX: need a way to mark a port in use by offload. if_cxgbe should
519 * then reject any attempt to bring down such a port (and maybe reject
520 * attempts to disable IFCAP_TOE on that port too?).
521 */
522 for_each_port(sc, i) {
523 if (isset(&sc->open_device_map, i) &&
524 sc->port[i]->ifp->if_capenable & IFCAP_TOE)
525 break;
526 }
527 KASSERT(i < sc->params.nports,
528 ("%s: no running port with TOE capability enabled.", __func__));
529 pi = sc->port[i];
530 
531 if (listen_hash_find(sc, inp) != NULL)
532 goto done; /* already setup */
533 
534 lctx = alloc_lctx(sc, inp, pi);
535 if (lctx == NULL) {
536 log(LOG_ERR,
537 "%s: listen request ignored, %s couldn't allocate lctx\n",
538 __func__, device_get_nameunit(sc->dev));
539 goto done;
540 }
541 listen_hash_add(sc, lctx);
542 
543 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
544 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
545 inp->inp_vflag);
546 
547 if (inp->inp_vflag & INP_IPV6)
548 rc = create_server6(sc, lctx);
549 else
550 rc = create_server(sc, lctx);
551 if (rc != 0) {
/* Undo: remove from hash and drop the lctx ref taken in alloc_lctx. */
552 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
553 __func__, device_get_nameunit(sc->dev), rc);
554 (void) listen_hash_del(sc, inp);
555 inp = release_lctx(sc, lctx);
556 /* can't be freed, host stack has a reference */
557 KASSERT(inp != NULL, ("%s: inp freed", __func__));
558 goto done;
559 }
/* Cleared by do_pass_open_rpl when the chip acknowledges the listener. */
560 lctx->flags |= LCTX_RPL_PENDING;
561done:
562#if 0
563 ADAPTER_UNLOCK(sc);
564#endif
565 return (0);
566}
567
/*
 * Stop the hardware listener for this inp: remove it from the listen hash,
 * reset any embryonic connections still on the synq, and ask the chip to
 * close the server.  Returns ENOENT if there is no hardware listener, or
 * EINPROGRESS if the PASS_OPEN reply is still outstanding (cleanup then
 * happens in do_pass_open_rpl).  Caller holds the inp write lock.
 */
568 int
569 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
570 {
571 struct listen_ctx *lctx;
572 struct adapter *sc = tod->tod_softc;
573 struct inpcb *inp = tp->t_inpcb;
574 struct synq_entry *synqe;
575 
576 INP_WLOCK_ASSERT(inp);
577 
578 lctx = listen_hash_del(sc, inp);
579 if (lctx == NULL)
580 return (ENOENT); /* no hardware listener for this inp */
581 
582 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
583 lctx, lctx->flags);
584 
585 /*
586 * If the reply to the PASS_OPEN is still pending we'll wait for it to
587 * arrive and clean up when it does.
588 */
589 if (lctx->flags & LCTX_RPL_PENDING) {
590 KASSERT(TAILQ_EMPTY(&lctx->synq),
591 ("%s: synq not empty.", __func__));
592 return (EINPROGRESS);
593 }
594 
595 /*
596 * The host stack will abort all the connections on the listening
597 * socket's so_comp. It doesn't know about the connections on the synq
598 * so we need to take care of those.
599 */
600 TAILQ_FOREACH(synqe, &lctx->synq, link) {
601 if (synqe->flags & TPF_SYNQE_HAS_L2TE)
602 send_reset_synqe(tod, synqe);
603 }
604 
605 destroy_server(sc, lctx);
606 return (0);
607}
608
/* Take an additional reference on a syn-queue entry. */
609 static inline void
610 hold_synqe(struct synq_entry *synqe)
611 {
612 
613 refcount_acquire(&synqe->refcnt);
614}
615
/*
 * Drop a reference on a syn-queue entry.  On the last reference the saved
 * SYN mbuf is freed; the entry itself is freed only if it was heap
 * allocated (TPF_SYNQE_NEEDFREE) — note the flag is read before m_freem,
 * as the entry may live inside the mbuf being freed.
 */
616 static inline void
617 release_synqe(struct synq_entry *synqe)
618 {
619 
620 if (refcount_release(&synqe->refcnt)) {
621 int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
622 
623 m_freem(synqe->syn);
624 if (needfree)
625 free(synqe, M_CXGBE);
626 }
627}
628
/* TOE callback: the stack added our synqe to its syncache — take a ref. */
629 void
630 t4_syncache_added(struct toedev *tod __unused, void *arg)
631 {
632 struct synq_entry *synqe = arg;
633 
634 hold_synqe(synqe);
635}
636
/* TOE callback: the stack removed our synqe from its syncache — drop it. */
637 void
638 t4_syncache_removed(struct toedev *tod __unused, void *arg)
639 {
640 struct synq_entry *synqe = arg;
641 
642 release_synqe(synqe);
643}
644
645/* XXX */
646extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
647
/*
 * TOE callback to "send" the SYN|ACK: instead of transmitting the mbuf 'm'
 * the stack built, fire the pre-staged work request stashed in synqe->wr.
 * The atomic read-and-clear ensures the WR is sent at most once; EALREADY
 * is returned if it was already consumed.  The sequence number and
 * timestamp from the stack's reply are saved for later use.
 */
648 int
649 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
650 {
651 struct adapter *sc = tod->tod_softc;
652 struct synq_entry *synqe = arg;
653 struct wrqe *wr;
654 struct l2t_entry *e;
655 struct tcpopt to;
656 struct ip *ip = mtod(m, struct ip *);
657 struct tcphdr *th;
658 
659 wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
660 if (wr == NULL) {
661 m_freem(m);
662 return (EALREADY);
663 }
664 
/* m starts at the IP header; locate the TCP header for v4 or v6. */
665 if (ip->ip_v == IPVERSION)
666 th = (void *)(ip + 1);
667 else
668 th = (void *)((struct ip6_hdr *)ip + 1);
669 bzero(&to, sizeof(to));
670 tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
671 TO_SYN);
672 
673 /* save these for later */
674 synqe->iss = be32toh(th->th_seq);
675 synqe->ts = to.to_tsval;
676 
677 e = &sc->l2t->l2tab[synqe->l2e_idx];
678 t4_l2t_send(sc, wr, e);
679 
680 m_freem(m); /* don't need this any more */
681 return (0);
682}
683
/*
 * Handle CPL_PASS_OPEN_RPL: the chip's reply to create_server/
 * create_server6.  Clears LCTX_RPL_PENDING and then reconciles the
 * hardware listener state with what the host stack did in the meantime
 * (socket may have been closed, or the open may have failed).
 */
684 static int
685 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
686 struct mbuf *m)
687 {
688 struct adapter *sc = iq->adapter;
689 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
690 int stid = GET_TID(cpl);
691 unsigned int status = cpl->status;
692 struct listen_ctx *lctx = lookup_stid(sc, stid);
693 struct inpcb *inp = lctx->inp;
694#ifdef INVARIANTS
695 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
696#endif
697 
698 KASSERT(opcode == CPL_PASS_OPEN_RPL,
699 ("%s: unexpected opcode 0x%x", __func__, opcode));
700 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
701 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
702 
703 INP_WLOCK(inp);
704 
705 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
706 __func__, stid, status, lctx->flags);
707 
708 lctx->flags &= ~LCTX_RPL_PENDING;
709 
710 if (status != CPL_ERR_NONE)
711 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
712 
713#ifdef INVARIANTS
714 /*
715 * If the inp has been dropped (listening socket closed) then
716 * listen_stop must have run and taken the inp out of the hash.
717 */
718 if (inp->inp_flags & INP_DROPPED) {
719 KASSERT(listen_hash_del(sc, inp) == NULL,
720 ("%s: inp %p still in listen hash", __func__, inp));
721 }
722#endif
723 
/* Socket closed AND hardware open failed: nothing in HW to tear down,
 * just drop the lctx reference.  (& binds tighter than &&, so this reads
 * (flags & INP_DROPPED) && (status != CPL_ERR_NONE).) */
724 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
725 if (release_lctx(sc, lctx) != NULL)
726 INP_WUNLOCK(inp);
727 return (status);
728 }
729 
730 /*
731 * Listening socket stopped listening earlier and now the chip tells us
732 * it has started the hardware listener. Stop it; the lctx will be
733 * released in do_close_server_rpl.
734 */
735 if (inp->inp_flags & INP_DROPPED) {
736 destroy_server(sc, lctx);
737 INP_WUNLOCK(inp);
738 return (status);
739 }
740 
741 /*
742 * Failed to start hardware listener. Take inp out of the hash and
743 * release our reference on it. An error message has been logged
744 * already.
745 */
746 if (status != CPL_ERR_NONE) {
747 listen_hash_del(sc, inp);
748 if (release_lctx(sc, lctx) != NULL)
749 INP_WUNLOCK(inp);
750 return (status);
751 }
752 
753 /* hardware listener open for business */
754 
755 INP_WUNLOCK(inp);
756 return (status);
757}
758
/*
 * Handler for CPL_CLOSE_LISTSVR_RPL, the chip's reply to destroy_server.
 * On success, drop the lctx reference (which may also unlock/release the
 * inp).  On failure the lctx is deliberately left alone — the hardware
 * listener is still up, so its state must not be torn down.
 */
759static int
760do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
761    struct mbuf *m)
762{
763	struct adapter *sc = iq->adapter;
764	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
765	int stid = GET_TID(cpl);
766	unsigned int status = cpl->status;
767	struct listen_ctx *lctx = lookup_stid(sc, stid);
768	struct inpcb *inp = lctx->inp;
769#ifdef INVARIANTS
770	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
771#endif
772
773	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
774	    ("%s: unexpected opcode 0x%x", __func__, opcode));
775	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
776	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
777
778	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
779
780	if (status != CPL_ERR_NONE) {
781		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
782		    __func__, status, stid);
783		return (status);
784	}
785
786	INP_WLOCK(inp);
787	inp = release_lctx(sc, lctx);
788	if (inp != NULL)
789		INP_WUNLOCK(inp);
790
791	return (status);
792}
793
/*
 * Final teardown of an embryonic connection: unlink the synqe from its
 * listener's syn queue, return the tid and L2T entry to the hardware,
 * and drop the synq list's reference on the entry.
 *
 * Called with the inp write lock held; the lock is RELEASED here (via
 * release_lctx) before the tid/L2T release, so callers must not touch
 * the inp afterwards.
 */
794static void
795done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
796{
797	struct listen_ctx *lctx = synqe->lctx;
798	struct inpcb *inp = lctx->inp;
799	struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc;
800	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
801
802	INP_WLOCK_ASSERT(inp);
803
804	TAILQ_REMOVE(&lctx->synq, synqe, link);
805	inp = release_lctx(sc, lctx);
806	if (inp)
807		INP_WUNLOCK(inp);
808	remove_tid(sc, synqe->tid);
809	release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]);
810	t4_l2t_release(e);
811	release_synqe(synqe);	/* removed from synq list */
812}
813
/*
 * Handler for CPL_ABORT_REQ_RSS on a tid that maps to a syn queue entry
 * (the peer or the chip aborted an embryonic connection).  Negative
 * advice is ignored.  If we had already initiated an abort ourselves
 * (TPF_ABORT_SHUTDOWN) the eventual abort reply does the cleanup;
 * otherwise everything is torn down here.  In all non-ignored cases the
 * chip is owed a CPL_ABORT_RPL, sent with CPL_ABORT_NO_RST.
 */
814int
815do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
816    struct mbuf *m)
817{
818	struct adapter *sc = iq->adapter;
819	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
820	unsigned int tid = GET_TID(cpl);
821	struct synq_entry *synqe = lookup_tid(sc, tid);
822	struct listen_ctx *lctx = synqe->lctx;
823	struct inpcb *inp = lctx->inp;
824	int txqid;
825	struct sge_wrq *ofld_txq;
826#ifdef INVARIANTS
827	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
828#endif
829
830	KASSERT(opcode == CPL_ABORT_REQ_RSS,
831	    ("%s: unexpected opcode 0x%x", __func__, opcode));
832	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
833	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
834
835	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
836	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
837
838	if (negative_advice(cpl->status))
839		return (0);	/* Ignore negative advice */
840
841	INP_WLOCK(inp);
842
	/* Pick up the tx queue chosen for this connection at SYN time. */
843	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
844	ofld_txq = &sc->sge.ofld_txq[txqid];
845
846	/*
847	 * If we'd initiated an abort earlier the reply to it is responsible for
848	 * cleaning up resources.  Otherwise we tear everything down right here
849	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
850	 */
851	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
852		INP_WUNLOCK(inp);
853		goto done;
854	}
855
856	done_with_synqe(sc, synqe);
857	/* inp lock released by done_with_synqe */
858done:
859	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
860	return (0);
861}
862
/*
 * Handler for CPL_ABORT_RPL_RSS on a syn queue entry's tid: the chip has
 * acknowledged an abort we initiated (TPF_ABORT_SHUTDOWN must be set),
 * so finish tearing down the embryonic connection.
 */
863int
864do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
865    struct mbuf *m)
866{
867	struct adapter *sc = iq->adapter;
868	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
869	unsigned int tid = GET_TID(cpl);
870	struct synq_entry *synqe = lookup_tid(sc, tid);
871	struct listen_ctx *lctx = synqe->lctx;
872	struct inpcb *inp = lctx->inp;
873#ifdef INVARIANTS
874	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
875#endif
876
877	KASSERT(opcode == CPL_ABORT_RPL_RSS,
878	    ("%s: unexpected opcode 0x%x", __func__, opcode));
879	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
880	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
881
882	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
883	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
884
885	INP_WLOCK(inp);
886	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
887	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
888	    __func__, synqe, synqe->flags));
889
890	done_with_synqe(sc, synqe);
891	/* inp lock released by done_with_synqe */
892
893	return (0);
894}
895
/*
 * TOE callback: the syncache entry was expanded into a full socket.
 * Attach the toepcb (stashed after the CPL in the SYN mbuf by the
 * pass_establish path) to the new socket, move the connection to
 * ESTABLISHED, and repoint the tid from the synqe to the toepcb.
 */
896void
897t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
898{
899	struct adapter *sc = tod->tod_softc;
900	struct synq_entry *synqe = arg;
901#ifdef INVARIANTS
902	struct inpcb *inp = sotoinpcb(so);
903#endif
904	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
905	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
906
907	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
908	INP_WLOCK_ASSERT(inp);
909	KASSERT(synqe->flags & TPF_SYNQE,
910	    ("%s: %p not a synq_entry?", __func__, arg));
911
912	offload_socket(so, toep);
913	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
914	toep->flags |= TPF_CPL_PENDING;
915	update_tid(sc, synqe->tid, toep);
916	synqe->flags |= TPF_SYNQE_EXPANDED;
917}
918
/*
 * Pick random offload tx and rx queues for a new connection on this port
 * and stash their ids in the SYN mbuf's flowid (txqid in the high 16
 * bits, rxqid in the low 16) for later retrieval by get_qids_from_mbuf.
 */
919static inline void
920save_qids_in_mbuf(struct mbuf *m, struct port_info *pi)
921{
922	uint32_t txqid, rxqid;
923
924	txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
925	rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
926
927	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
928}
929
/*
 * Recover the tx/rx queue ids stored in the mbuf's flowid by
 * save_qids_in_mbuf.  Either output pointer may be NULL.
 */
930static inline void
931get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
932{
933
934	if (txqid)
935		*txqid = m->m_pkthdr.flowid >> 16;
936	if (rxqid)
937		*rxqid = m->m_pkthdr.flowid & 0xffff;
938}
939
940/*
941 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
942 * store some state temporarily.
943 */
944static struct synq_entry *
945mbuf_to_synqe(struct mbuf *m)
946{
370
371 /* First the flowc ... */
372 memset(flowc, 0, wr->wr_len);
373 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
374 V_FW_FLOWC_WR_NPARAMS(nparams));
375 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
376 V_FW_WR_FLOWID(synqe->tid));
377 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
378 flowc->mnemval[0].val = htobe32(pfvf);
379 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
380 flowc->mnemval[1].val = htobe32(pi->tx_chan);
381 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
382 flowc->mnemval[2].val = htobe32(pi->tx_chan);
383 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
384 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
385 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
386 flowc->mnemval[4].val = htobe32(512);
387 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
388 flowc->mnemval[5].val = htobe32(512);
389 synqe->flags |= TPF_FLOWC_WR_SENT;
390
391 /* ... then ABORT request */
392 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
393 req->rsvd0 = 0; /* don't have a snd_nxt */
394 req->rsvd1 = 1; /* no data sent yet */
395 req->cmd = CPL_ABORT_SEND_RST;
396
397 t4_l2t_send(sc, wr, e);
398}
399
/*
 * Send a CPL_PASS_OPEN_REQ to start an IPv4 hardware listener for the
 * given listen context.  Wildcard peer address/port; SYNs are steered to
 * the lctx's offload rx queue.  The reply arrives as CPL_PASS_OPEN_RPL.
 * Returns 0 or ENOMEM.
 */
400static int
401create_server(struct adapter *sc, struct listen_ctx *lctx)
402{
403	struct wrqe *wr;
404	struct cpl_pass_open_req *req;
405	struct inpcb *inp = lctx->inp;
406
407	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
408	if (wr == NULL) {
409		log(LOG_ERR, "%s: allocation failure", __func__);
410		return (ENOMEM);
411	}
412	req = wrtod(wr);
413
414	INIT_TP_WR(req, 0);
415	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	/* inp_lport/inp_laddr are already in network byte order. */
416	req->local_port = inp->inp_lport;
417	req->peer_port = 0;
418	req->local_ip = inp->inp_laddr.s_addr;
419	req->peer_ip = 0;
420	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
421	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
422	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
423
424	t4_wrq_tx(sc, wr);
425	return (0);
426}
427
/*
 * IPv6 counterpart of create_server: send a CPL_PASS_OPEN_REQ6 for the
 * listen context.  Returns 0 or ENOMEM.
 */
428static int
429create_server6(struct adapter *sc, struct listen_ctx *lctx)
430{
431	struct wrqe *wr;
432	struct cpl_pass_open_req6 *req;
433	struct inpcb *inp = lctx->inp;
434
435	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
436	if (wr == NULL) {
437		log(LOG_ERR, "%s: allocation failure", __func__);
438		return (ENOMEM);
439	}
440	req = wrtod(wr);
441
442	INIT_TP_WR(req, 0);
443	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
444	req->local_port = inp->inp_lport;
445	req->peer_port = 0;
	/*
	 * NOTE(review): copies the in6 address as two uint64_t loads;
	 * assumes s6_addr is suitably aligned — confirm on strict-alignment
	 * architectures.
	 */
446	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
447	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
448	req->peer_ip_hi = 0;
449	req->peer_ip_lo = 0;
450	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
451	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
452	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
453
454	t4_wrq_tx(sc, wr);
455	return (0);
456}
457
/*
 * Send a CPL_CLOSE_LISTSVR_REQ to stop the hardware listener for this
 * listen context.  The reply (CPL_CLOSE_LISTSRV_RPL) releases the lctx.
 * Panics on work-request allocation failure (marked XXX — there is no
 * recovery path wired up for that case yet).
 */
458static int
459destroy_server(struct adapter *sc, struct listen_ctx *lctx)
460{
461	struct wrqe *wr;
462	struct cpl_close_listsvr_req *req;
463
464	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
465	if (wr == NULL) {
466		/* XXX */
467		panic("%s: allocation failure.", __func__);
468	}
469	req = wrtod(wr);
470
471	INIT_TP_WR(req, 0);
472	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
473	    lctx->stid));
474	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
475	req->rsvd = htobe16(0);
476
477	t4_wrq_tx(sc, wr);
478	return (0);
479}
480
481/*
482 * Start a listening server by sending a passive open request to HW.
483 *
484 * Can't take adapter lock here and access to sc->flags, sc->open_device_map,
485 * sc->offload_map, if_capenable are all race prone.
486 */
/*
 * TOE callback: a socket entered LISTEN.  Allocate a listen context,
 * hash it by inp, and send the passive open to the chip via the first
 * running port with IFCAP_TOE.  Always returns 0; failures are logged
 * and simply leave the listener un-offloaded (the host stack still
 * handles it).  Called with the inp write lock held.
 */
487int
488t4_listen_start(struct toedev *tod, struct tcpcb *tp)
489{
490	struct adapter *sc = tod->tod_softc;
491	struct port_info *pi;
492	struct inpcb *inp = tp->t_inpcb;
493	struct listen_ctx *lctx;
494	int i, rc;
495
496	INP_WLOCK_ASSERT(inp);
497
498#if 0
499	ADAPTER_LOCK(sc);
500	if (IS_BUSY(sc)) {
501		log(LOG_ERR, "%s: listen request ignored, %s is busy",
502		    __func__, device_get_nameunit(sc->dev));
503		goto done;
504	}
505
506	KASSERT(sc->flags & TOM_INIT_DONE,
507	    ("%s: TOM not initialized", __func__));
508#endif
509
510	if ((sc->open_device_map & sc->offload_map) == 0)
511		goto done;	/* no port that's UP with IFCAP_TOE enabled */
512
513	/*
514	 * Find a running port with IFCAP_TOE (4 or 6).  We'll use the first
515	 * such port's queues to send the passive open and receive the reply to
516	 * it.
517	 *
518	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
519	 * then reject any attempt to bring down such a port (and maybe reject
520	 * attempts to disable IFCAP_TOE on that port too?).
521	 */
522	for_each_port(sc, i) {
523		if (isset(&sc->open_device_map, i) &&
524		    sc->port[i]->ifp->if_capenable & IFCAP_TOE)
525			break;
526	}
527	KASSERT(i < sc->params.nports,
528	    ("%s: no running port with TOE capability enabled.", __func__));
529	pi = sc->port[i];
530
531	if (listen_hash_find(sc, inp) != NULL)
532		goto done;	/* already setup */
533
534	lctx = alloc_lctx(sc, inp, pi);
535	if (lctx == NULL) {
536		log(LOG_ERR,
537		    "%s: listen request ignored, %s couldn't allocate lctx\n",
538		    __func__, device_get_nameunit(sc->dev));
539		goto done;
540	}
541	listen_hash_add(sc, lctx);
542
543	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
544	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
545	    inp->inp_vflag);
546
547	if (inp->inp_vflag & INP_IPV6)
548		rc = create_server6(sc, lctx);
549	else
550		rc = create_server(sc, lctx);
551	if (rc != 0) {
552		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
553		    __func__, device_get_nameunit(sc->dev), rc);
554		(void) listen_hash_del(sc, inp);
555		inp = release_lctx(sc, lctx);
556		/* can't be freed, host stack has a reference */
557		KASSERT(inp != NULL, ("%s: inp freed", __func__));
558		goto done;
559	}
	/* Cleared by do_pass_open_rpl when the chip answers. */
560	lctx->flags |= LCTX_RPL_PENDING;
561done:
562#if 0
563	ADAPTER_UNLOCK(sc);
564#endif
565	return (0);
566}
567
/*
 * TOE callback: a listening socket is going away.  Unhash the listen
 * context and either defer cleanup to the pending PASS_OPEN reply
 * (EINPROGRESS), or reset any embryonic connections still on the synq
 * and ask the chip to close the hardware listener.  Returns ENOENT if
 * this inp had no hardware listener.  Called with the inp write lock
 * held.
 */
568int
569t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
570{
571	struct listen_ctx *lctx;
572	struct adapter *sc = tod->tod_softc;
573	struct inpcb *inp = tp->t_inpcb;
574	struct synq_entry *synqe;
575
576	INP_WLOCK_ASSERT(inp);
577
578	lctx = listen_hash_del(sc, inp);
579	if (lctx == NULL)
580		return (ENOENT);	/* no hardware listener for this inp */
581
582	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
583	    lctx, lctx->flags);
584
585	/*
586	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
587	 * arrive and clean up when it does.
588	 */
589	if (lctx->flags & LCTX_RPL_PENDING) {
590		KASSERT(TAILQ_EMPTY(&lctx->synq),
591		    ("%s: synq not empty.", __func__));
592		return (EINPROGRESS);
593	}
594
595	/*
596	 * The host stack will abort all the connections on the listening
597	 * socket's so_comp.  It doesn't know about the connections on the synq
598	 * so we need to take care of those.
599	 */
600	TAILQ_FOREACH(synqe, &lctx->synq, link) {
601		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
602			send_reset_synqe(tod, synqe);
603	}
604
605	destroy_server(sc, lctx);
606	return (0);
607}
608
609static inline void
610hold_synqe(struct synq_entry *synqe)
611{
612
613 refcount_acquire(&synqe->refcnt);
614}
615
616static inline void
617release_synqe(struct synq_entry *synqe)
618{
619
620 if (refcount_release(&synqe->refcnt)) {
621 int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
622
623 m_freem(synqe->syn);
624 if (needfree)
625 free(synqe, M_CXGBE);
626 }
627}
628
629void
630t4_syncache_added(struct toedev *tod __unused, void *arg)
631{
632 struct synq_entry *synqe = arg;
633
634 hold_synqe(synqe);
635}
636
637void
638t4_syncache_removed(struct toedev *tod __unused, void *arg)
639{
640 struct synq_entry *synqe = arg;
641
642 release_synqe(synqe);
643}
644
645/* XXX */
646extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
647
648int
649t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
650{
651 struct adapter *sc = tod->tod_softc;
652 struct synq_entry *synqe = arg;
653 struct wrqe *wr;
654 struct l2t_entry *e;
655 struct tcpopt to;
656 struct ip *ip = mtod(m, struct ip *);
657 struct tcphdr *th;
658
659 wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
660 if (wr == NULL) {
661 m_freem(m);
662 return (EALREADY);
663 }
664
665 if (ip->ip_v == IPVERSION)
666 th = (void *)(ip + 1);
667 else
668 th = (void *)((struct ip6_hdr *)ip + 1);
669 bzero(&to, sizeof(to));
670 tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
671 TO_SYN);
672
673 /* save these for later */
674 synqe->iss = be32toh(th->th_seq);
675 synqe->ts = to.to_tsval;
676
677 e = &sc->l2t->l2tab[synqe->l2e_idx];
678 t4_l2t_send(sc, wr, e);
679
680 m_freem(m); /* don't need this any more */
681 return (0);
682}
683
684static int
685do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
686 struct mbuf *m)
687{
688 struct adapter *sc = iq->adapter;
689 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
690 int stid = GET_TID(cpl);
691 unsigned int status = cpl->status;
692 struct listen_ctx *lctx = lookup_stid(sc, stid);
693 struct inpcb *inp = lctx->inp;
694#ifdef INVARIANTS
695 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
696#endif
697
698 KASSERT(opcode == CPL_PASS_OPEN_RPL,
699 ("%s: unexpected opcode 0x%x", __func__, opcode));
700 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
701 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
702
703 INP_WLOCK(inp);
704
705 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
706 __func__, stid, status, lctx->flags);
707
708 lctx->flags &= ~LCTX_RPL_PENDING;
709
710 if (status != CPL_ERR_NONE)
711 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
712
713#ifdef INVARIANTS
714 /*
715 * If the inp has been dropped (listening socket closed) then
716 * listen_stop must have run and taken the inp out of the hash.
717 */
718 if (inp->inp_flags & INP_DROPPED) {
719 KASSERT(listen_hash_del(sc, inp) == NULL,
720 ("%s: inp %p still in listen hash", __func__, inp));
721 }
722#endif
723
724 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
725 if (release_lctx(sc, lctx) != NULL)
726 INP_WUNLOCK(inp);
727 return (status);
728 }
729
730 /*
731 * Listening socket stopped listening earlier and now the chip tells us
732 * it has started the hardware listener. Stop it; the lctx will be
733 * released in do_close_server_rpl.
734 */
735 if (inp->inp_flags & INP_DROPPED) {
736 destroy_server(sc, lctx);
737 INP_WUNLOCK(inp);
738 return (status);
739 }
740
741 /*
742 * Failed to start hardware listener. Take inp out of the hash and
743 * release our reference on it. An error message has been logged
744 * already.
745 */
746 if (status != CPL_ERR_NONE) {
747 listen_hash_del(sc, inp);
748 if (release_lctx(sc, lctx) != NULL)
749 INP_WUNLOCK(inp);
750 return (status);
751 }
752
753 /* hardware listener open for business */
754
755 INP_WUNLOCK(inp);
756 return (status);
757}
758
759static int
760do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
761 struct mbuf *m)
762{
763 struct adapter *sc = iq->adapter;
764 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
765 int stid = GET_TID(cpl);
766 unsigned int status = cpl->status;
767 struct listen_ctx *lctx = lookup_stid(sc, stid);
768 struct inpcb *inp = lctx->inp;
769#ifdef INVARIANTS
770 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
771#endif
772
773 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
774 ("%s: unexpected opcode 0x%x", __func__, opcode));
775 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
776 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
777
778 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
779
780 if (status != CPL_ERR_NONE) {
781 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
782 __func__, status, stid);
783 return (status);
784 }
785
786 INP_WLOCK(inp);
787 inp = release_lctx(sc, lctx);
788 if (inp != NULL)
789 INP_WUNLOCK(inp);
790
791 return (status);
792}
793
794static void
795done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
796{
797 struct listen_ctx *lctx = synqe->lctx;
798 struct inpcb *inp = lctx->inp;
799 struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc;
800 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
801
802 INP_WLOCK_ASSERT(inp);
803
804 TAILQ_REMOVE(&lctx->synq, synqe, link);
805 inp = release_lctx(sc, lctx);
806 if (inp)
807 INP_WUNLOCK(inp);
808 remove_tid(sc, synqe->tid);
809 release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]);
810 t4_l2t_release(e);
811 release_synqe(synqe); /* removed from synq list */
812}
813
814int
815do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
816 struct mbuf *m)
817{
818 struct adapter *sc = iq->adapter;
819 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
820 unsigned int tid = GET_TID(cpl);
821 struct synq_entry *synqe = lookup_tid(sc, tid);
822 struct listen_ctx *lctx = synqe->lctx;
823 struct inpcb *inp = lctx->inp;
824 int txqid;
825 struct sge_wrq *ofld_txq;
826#ifdef INVARIANTS
827 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
828#endif
829
830 KASSERT(opcode == CPL_ABORT_REQ_RSS,
831 ("%s: unexpected opcode 0x%x", __func__, opcode));
832 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
833 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
834
835 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
836 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
837
838 if (negative_advice(cpl->status))
839 return (0); /* Ignore negative advice */
840
841 INP_WLOCK(inp);
842
843 get_qids_from_mbuf(synqe->syn, &txqid, NULL);
844 ofld_txq = &sc->sge.ofld_txq[txqid];
845
846 /*
847 * If we'd initiated an abort earlier the reply to it is responsible for
848 * cleaning up resources. Otherwise we tear everything down right here
849 * right now. We owe the T4 a CPL_ABORT_RPL no matter what.
850 */
851 if (synqe->flags & TPF_ABORT_SHUTDOWN) {
852 INP_WUNLOCK(inp);
853 goto done;
854 }
855
856 done_with_synqe(sc, synqe);
857 /* inp lock released by done_with_synqe */
858done:
859 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
860 return (0);
861}
862
863int
864do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
865 struct mbuf *m)
866{
867 struct adapter *sc = iq->adapter;
868 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
869 unsigned int tid = GET_TID(cpl);
870 struct synq_entry *synqe = lookup_tid(sc, tid);
871 struct listen_ctx *lctx = synqe->lctx;
872 struct inpcb *inp = lctx->inp;
873#ifdef INVARIANTS
874 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
875#endif
876
877 KASSERT(opcode == CPL_ABORT_RPL_RSS,
878 ("%s: unexpected opcode 0x%x", __func__, opcode));
879 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
880 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
881
882 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
883 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
884
885 INP_WLOCK(inp);
886 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
887 ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
888 __func__, synqe, synqe->flags));
889
890 done_with_synqe(sc, synqe);
891 /* inp lock released by done_with_synqe */
892
893 return (0);
894}
895
896void
897t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
898{
899 struct adapter *sc = tod->tod_softc;
900 struct synq_entry *synqe = arg;
901#ifdef INVARIANTS
902 struct inpcb *inp = sotoinpcb(so);
903#endif
904 struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
905 struct toepcb *toep = *(struct toepcb **)(cpl + 1);
906
907 INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
908 INP_WLOCK_ASSERT(inp);
909 KASSERT(synqe->flags & TPF_SYNQE,
910 ("%s: %p not a synq_entry?", __func__, arg));
911
912 offload_socket(so, toep);
913 make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
914 toep->flags |= TPF_CPL_PENDING;
915 update_tid(sc, synqe->tid, toep);
916 synqe->flags |= TPF_SYNQE_EXPANDED;
917}
918
919static inline void
920save_qids_in_mbuf(struct mbuf *m, struct port_info *pi)
921{
922 uint32_t txqid, rxqid;
923
924 txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
925 rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
926
927 m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
928}
929
930static inline void
931get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
932{
933
934 if (txqid)
935 *txqid = m->m_pkthdr.flowid >> 16;
936 if (rxqid)
937 *rxqid = m->m_pkthdr.flowid & 0xffff;
938}
939
940/*
941 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
942 * store some state temporarily.
943 */
/*
 * Carve a synq_entry out of the trailing space of the mbuf carrying the
 * PASS_ACCEPT_REQ, falling back to malloc (flagged TPF_SYNQE_NEEDFREE so
 * release_synqe knows to free it) when the mbuf has no room.  Returns
 * NULL only on allocation failure.
 *
 * NOTE(review): the next two numbered lines are both "line 947" from the
 * underlying diff — the old roundup() variant and the new roundup2()
 * variant; only the roundup2() line belongs in the current revision.
 */
944static struct synq_entry *
945mbuf_to_synqe(struct mbuf *m)
946{
947	int len = roundup(sizeof (struct synq_entry), 8);
947	int len = roundup2(sizeof (struct synq_entry), 8);
948	int tspace = M_TRAILINGSPACE(m);
949	struct synq_entry *synqe = NULL;
950
951	if (tspace < len) {
952		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
953		if (synqe == NULL)
954			return (NULL);
955		synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
956	} else {
957		synqe = (void *)(m->m_data + m->m_len + tspace - len);
958		synqe->flags = TPF_SYNQE;
959	}
960
961	return (synqe);
962}
963
/*
 * Translate the TCP options the chip parsed out of the SYN (struct
 * tcp_options from the CPL) into the host stack's struct tcpopt.
 */
964static void
965t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
966{
967	bzero(to, sizeof(*to));
968
969	if (t4opt->mss) {
970		to->to_flags |= TOF_MSS;
971		to->to_mss = be16toh(t4opt->mss);
972	}
973
974	if (t4opt->wsf) {
975		to->to_flags |= TOF_SCALE;
976		to->to_wscale = t4opt->wsf;
977	}
978
979	if (t4opt->tstamp)
980		to->to_flags |= TOF_TS;
981
982	if (t4opt->sack)
983		to->to_flags |= TOF_SACKPERM;
984}
985
986/*
987 * Options2 for passive open.
988 */
989static uint32_t
990calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
991 const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
992{
993 uint32_t opt2 = 0;
994 struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
995
996 if (V_tcp_do_rfc1323) {
997 if (tcpopt->tstamp)
998 opt2 |= F_TSTAMPS_EN;
999 if (tcpopt->sack)
1000 opt2 |= F_SACK_EN;
1001 if (tcpopt->wsf > 0)
1002 opt2 |= F_WND_SCALE_EN;
1003 }
1004
1005 if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
1006 opt2 |= F_CCTRL_ECN;
1007
1008 opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
948 int tspace = M_TRAILINGSPACE(m);
949 struct synq_entry *synqe = NULL;
950
951 if (tspace < len) {
952 synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
953 if (synqe == NULL)
954 return (NULL);
955 synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
956 } else {
957 synqe = (void *)(m->m_data + m->m_len + tspace - len);
958 synqe->flags = TPF_SYNQE;
959 }
960
961 return (synqe);
962}
963
964static void
965t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
966{
967 bzero(to, sizeof(*to));
968
969 if (t4opt->mss) {
970 to->to_flags |= TOF_MSS;
971 to->to_mss = be16toh(t4opt->mss);
972 }
973
974 if (t4opt->wsf) {
975 to->to_flags |= TOF_SCALE;
976 to->to_wscale = t4opt->wsf;
977 }
978
979 if (t4opt->tstamp)
980 to->to_flags |= TOF_TS;
981
982 if (t4opt->sack)
983 to->to_flags |= TOF_SACKPERM;
984}
985
986/*
987 * Options2 for passive open.
988 */
/*
 * Build the opt2 field for a passive open reply: RFC1323 options the
 * peer asked for (gated on the host sysctl), ECN, the tx modulation
 * queue, rx coalescing, and the RSS rx queue.  Returned in big-endian,
 * ready for the CPL.
 *
 * NOTE(review): lines numbered 1009-1013 below appear twice because this
 * span interleaves the old and new sides of the underlying diff; the
 * current revision keeps only the is_t4()-conditional coalescing block.
 */
989static uint32_t
990calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
991    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
992{
993	uint32_t opt2 = 0;
994	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
995
996	if (V_tcp_do_rfc1323) {
997		if (tcpopt->tstamp)
998			opt2 |= F_TSTAMPS_EN;
999		if (tcpopt->sack)
1000			opt2 |= F_SACK_EN;
1001		if (tcpopt->wsf > 0)
1002			opt2 |= F_WND_SCALE_EN;
1003	}
1004
1005	if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
1006		opt2 |= F_CCTRL_ECN;
1007
1008	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
1009	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
1010	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1009	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1010	if (is_t4(sc))
1011		opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
1012	else
1013		opt2 |= F_T5_OPT_2_VALID | V_RX_COALESCE(M_RX_COALESCE);
1011
1012#ifdef USE_DDP_RX_FLOW_CONTROL
1013	if (ulp_mode == ULP_MODE_TCPDDP)
1014		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
1015#endif
1016
1017	return htobe32(opt2);
1018}
1019
1020/* XXX: duplication. */
/*
 * Convert a TCP header's multi-byte fields to host byte order in place,
 * just as tcp_input() does (duplicated here — see XXX above).
 */
1021static inline void
1022tcp_fields_to_host(struct tcphdr *th)
1023{
1024
1025	th->th_seq = ntohl(th->th_seq);
1026	th->th_ack = ntohl(th->th_ack);
1027	th->th_win = ntohs(th->th_win);
1028	th->th_urp = ntohs(th->th_urp);
1029}
1030
/*
 * Extract the connection 4-tuple (into *inc) and a host-order copy of
 * the TCP header (into *th) from the Ethernet/IP/TCP frame that follows
 * the CPL_PASS_ACCEPT_REQ in the mbuf.  Either output pointer may be
 * NULL.  Header offsets come from the lengths the chip encoded in
 * cpl->hdr_len.
 */
1031static void
1032pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
1033    struct tcphdr *th)
1034{
1035	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1036	const struct ether_header *eh;
1037	unsigned int hlen = be32toh(cpl->hdr_len);
1038	uintptr_t l3hdr;
1039	const struct tcphdr *tcp;
1040
1041	eh = (const void *)(cpl + 1);
1042	l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
1043	tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
1044
1045	if (inc) {
1046		bzero(inc, sizeof(*inc));
1047		inc->inc_fport = tcp->th_sport;
1048		inc->inc_lport = tcp->th_dport;
1049		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
1050			const struct ip *ip = (const void *)l3hdr;
1051
1052			inc->inc_faddr = ip->ip_src;
1053			inc->inc_laddr = ip->ip_dst;
1054		} else {
1055			const struct ip6_hdr *ip6 = (const void *)l3hdr;
1056
1057			inc->inc_flags |= INC_ISIPV6;
1058			inc->inc6_faddr = ip6->ip6_src;
1059			inc->inc6_laddr = ip6->ip6_dst;
1060		}
1061	}
1062
1063	if (th) {
1064		bcopy(tcp, th, sizeof(*th));
1065		tcp_fields_to_host(th);	/* just like tcp_input */
1066	}
1067}
1068
/*
 * Return 1 if the given IPv6 address (scope cleared/re-set for this
 * interface, as in ip6_input) is configured on ifp, else 0.
 */
1069static int
1070ifnet_has_ip6(struct ifnet *ifp, struct in6_addr *ip6)
1071{
1072	struct ifaddr *ifa;
1073	struct sockaddr_in6 *sin6;
1074	int found = 0;
1075	struct in6_addr in6 = *ip6;
1076
1077	/* Just as in ip6_input */
	/*
	 * NOTE(review): in6_clearscope() is called twice on the same
	 * address here — looks like one call was meant for a second
	 * address or is simply redundant; confirm against ip6_input().
	 */
1078	if (in6_clearscope(&in6) || in6_clearscope(&in6))
1079		return (0);
1080	in6_setscope(&in6, ifp, NULL);
1081
1082	if_addr_rlock(ifp);
1083	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1084		sin6 = (void *)ifa->ifa_addr;
1085		if (sin6->sin6_family != AF_INET6)
1086			continue;
1087
1088		if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &in6)) {
1089			found = 1;
1090			break;
1091		}
1092	}
1093	if_addr_runlock(ifp);
1094
1095	return (found);
1096}
1097
/*
 * Resolve the L2 table entry for the next hop toward the peer in *inc.
 * Link-local IPv6 peers skip the route lookup.  Otherwise a route is
 * looked up and must egress via ifp (NULL is returned if it doesn't or
 * if no route exists); the gateway is used as next hop when the route
 * has one, else the destination itself.
 */
1098static struct l2t_entry *
1099get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
1100    struct in_conninfo *inc)
1101{
1102	struct rtentry *rt;
1103	struct l2t_entry *e;
1104	struct sockaddr_in6 sin6;
1105	struct sockaddr *dst = (void *)&sin6;
1106
1107	if (inc->inc_flags & INC_ISIPV6) {
1108		dst->sa_len = sizeof(struct sockaddr_in6);
1109		dst->sa_family = AF_INET6;
1110		((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
1111
1112		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
1113			/* no need for route lookup */
1114			e = t4_l2t_get(pi, ifp, dst);
1115			return (e);
1116		}
1117	} else {
		/* sockaddr_in fits inside the sockaddr_in6 on the stack. */
1118		dst->sa_len = sizeof(struct sockaddr_in);
1119		dst->sa_family = AF_INET;
1120		((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
1121	}
1122
1123	rt = rtalloc1(dst, 0, 0);
1124	if (rt == NULL)
1125		return (NULL);
1126	else {
1127		struct sockaddr *nexthop;
1128
1129		RT_UNLOCK(rt);
1130		if (rt->rt_ifp != ifp)
1131			e = NULL;
1132		else {
1133			if (rt->rt_flags & RTF_GATEWAY)
1134				nexthop = rt->rt_gateway;
1135			else
1136				nexthop = dst;
1137			e = t4_l2t_get(pi, ifp, nexthop);
1138		}
1139		RTFREE(rt);
1140	}
1141
1142	return (e);
1143}
1144
/*
 * Return 1 if the given IPv4 address is configured on ifp, else 0.
 */
1145static int
1146ifnet_has_ip(struct ifnet *ifp, struct in_addr in)
1147{
1148	struct ifaddr *ifa;
1149	struct sockaddr_in *sin;
1150	int found = 0;
1151
1152	if_addr_rlock(ifp);
1153	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1154		sin = (void *)ifa->ifa_addr;
1155		if (sin->sin_family != AF_INET)
1156			continue;
1157
1158		if (sin->sin_addr.s_addr == in.s_addr) {
1159			found = 1;
1160			break;
1161		}
1162	}
1163	if_addr_runlock(ifp);
1164
1165	return (found);
1166}
1167
1168#define REJECT_PASS_ACCEPT() do { \
1169 reject_reason = __LINE__; \
1170 goto reject; \
1171} while (0)
1172
1173/*
1174 * The context associated with a tid entry via insert_tid could be a synq_entry
1175 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags.
1176 */
1177CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1178
1179/*
1180 * Incoming SYN on a listening socket.
1181 *
1182 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1183 * etc.
1184 */
/* Returns 0 if the SYN was handled; otherwise a line number identifying why it
 * was rejected (same convention as REJECT_PASS_ACCEPT). */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Recover the SYN's addresses/ports and TCP options from the CPL. */
	pass_accept_req_to_protohdrs(m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
	hw_ifp = pi->ifp;	/* the cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!ifnet_has_ip6(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!ifnet_has_ip(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();
	}

	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;		/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, pi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	/* Build the CPL_PASS_ACCEPT_RPL that t4_syncache_respond will send. */
	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
		ulp_mode = ULP_MODE_TCPDDP;
		synqe->flags |= TPF_SYNQE_TCPDDP;
	} else
		ulp_mode = ULP_MODE_NONE;
	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits, ulp_mode);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);

	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	m = NULL;	/* synqe owns the SYN mbuf from here on */
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
	 * pcb locks.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		remove_tid(sc, synqe->tid);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	/* Hand the SYN to the kernel's TCP stack for normal processing. */
	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}
1431
/*
 * Reconstruct the protocol headers (in_conninfo, tcphdr, tcpopt) for the ACK
 * that completed the 3-way handshake, from the original SYN stashed in the
 * synqe and the CPL_PASS_ESTABLISH delivered by the chip.
 */
static void
synqe_to_protohdrs(struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}
1452
/*
 * CPL_PASS_ESTABLISH: the hardware has completed the 3-way handshake for a
 * tid that we replied to with a CPL_PASS_ACCEPT_RPL.  Expand the syncache
 * entry into a full connection and hand it over to a freshly allocated
 * toepcb.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct port_info *pi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	/* Listening socket already closed; nothing more to do here. */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	pi = ifp->if_softc;
	KASSERT(pi->adapter == sc,
	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	/* Allocate the offload PCB that will take over this connection. */
	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_tcpddp_ulp_mode(toep);
	else
		toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		struct inpcb *new_inp = sotoinpcb(so);

		INP_WLOCK(new_inp);
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
		INP_WUNLOCK(new_inp);
	}

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}
1588
/* Register the CPL handlers for passive (server-side) connection setup. */
void
t4_init_listen_cpl_handlers(struct adapter *sc)
{

	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
1598#endif
1014
1015#ifdef USE_DDP_RX_FLOW_CONTROL
1016 if (ulp_mode == ULP_MODE_TCPDDP)
1017 opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
1018#endif
1019
1020 return htobe32(opt2);
1021}
1022
1023/* XXX: duplication. */
1024static inline void
1025tcp_fields_to_host(struct tcphdr *th)
1026{
1027
1028 th->th_seq = ntohl(th->th_seq);
1029 th->th_ack = ntohl(th->th_ack);
1030 th->th_win = ntohs(th->th_win);
1031 th->th_urp = ntohs(th->th_urp);
1032}
1033
/*
 * Parse the Ethernet/IP/TCP headers that follow the CPL_PASS_ACCEPT_REQ in
 * 'm' and fill in 'inc' (addresses and ports) and/or 'th' (TCP header, in
 * host byte order).  Either output pointer may be NULL if not wanted.
 */
static void
pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
    struct tcphdr *th)
{
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	const struct ether_header *eh;
	unsigned int hlen = be32toh(cpl->hdr_len);
	uintptr_t l3hdr;
	const struct tcphdr *tcp;

	/* The L2/L3 header lengths are encoded in the CPL's hdr_len field. */
	eh = (const void *)(cpl + 1);
	l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
	tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));

	if (inc) {
		bzero(inc, sizeof(*inc));
		inc->inc_fport = tcp->th_sport;
		inc->inc_lport = tcp->th_dport;
		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
			const struct ip *ip = (const void *)l3hdr;

			inc->inc_faddr = ip->ip_src;
			inc->inc_laddr = ip->ip_dst;
		} else {
			const struct ip6_hdr *ip6 = (const void *)l3hdr;

			inc->inc_flags |= INC_ISIPV6;
			inc->inc6_faddr = ip6->ip6_src;
			inc->inc6_laddr = ip6->ip6_dst;
		}
	}

	if (th) {
		bcopy(tcp, th, sizeof(*th));
		tcp_fields_to_host(th);	/* just like tcp_input */
	}
}
1071
1072static int
1073ifnet_has_ip6(struct ifnet *ifp, struct in6_addr *ip6)
1074{
1075 struct ifaddr *ifa;
1076 struct sockaddr_in6 *sin6;
1077 int found = 0;
1078 struct in6_addr in6 = *ip6;
1079
1080 /* Just as in ip6_input */
1081 if (in6_clearscope(&in6) || in6_clearscope(&in6))
1082 return (0);
1083 in6_setscope(&in6, ifp, NULL);
1084
1085 if_addr_rlock(ifp);
1086 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1087 sin6 = (void *)ifa->ifa_addr;
1088 if (sin6->sin6_family != AF_INET6)
1089 continue;
1090
1091 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &in6)) {
1092 found = 1;
1093 break;
1094 }
1095 }
1096 if_addr_runlock(ifp);
1097
1098 return (found);
1099}
1100
/*
 * Resolve the L2 (next hop) entry to use to reach the peer in 'inc' via
 * interface 'ifp'.  Returns NULL if no route is found or the route for the
 * peer does not go out of 'ifp'.
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
    struct in_conninfo *inc)
{
	struct rtentry *rt;
	struct l2t_entry *e;
	struct sockaddr_in6 sin6;
	struct sockaddr *dst = (void *)&sin6;	/* big enough for v4 or v6 */

	if (inc->inc_flags & INC_ISIPV6) {
		dst->sa_len = sizeof(struct sockaddr_in6);
		dst->sa_family = AF_INET6;
		((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;

		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
			/* no need for route lookup */
			e = t4_l2t_get(pi, ifp, dst);
			return (e);
		}
	} else {
		dst->sa_len = sizeof(struct sockaddr_in);
		dst->sa_family = AF_INET;
		((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
	}

	rt = rtalloc1(dst, 0, 0);
	if (rt == NULL)
		return (NULL);
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		/* Don't offload if the route doesn't use this interface. */
		if (rt->rt_ifp != ifp)
			e = NULL;
		else {
			/* Use the gateway address when the peer is off-link. */
			if (rt->rt_flags & RTF_GATEWAY)
				nexthop = rt->rt_gateway;
			else
				nexthop = dst;
			e = t4_l2t_get(pi, ifp, nexthop);
		}
		RTFREE(rt);
	}

	return (e);
}
1147
1148static int
1149ifnet_has_ip(struct ifnet *ifp, struct in_addr in)
1150{
1151 struct ifaddr *ifa;
1152 struct sockaddr_in *sin;
1153 int found = 0;
1154
1155 if_addr_rlock(ifp);
1156 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1157 sin = (void *)ifa->ifa_addr;
1158 if (sin->sin_family != AF_INET)
1159 continue;
1160
1161 if (sin->sin_addr.s_addr == in.s_addr) {
1162 found = 1;
1163 break;
1164 }
1165 }
1166 if_addr_runlock(ifp);
1167
1168 return (found);
1169}
1170
1171#define REJECT_PASS_ACCEPT() do { \
1172 reject_reason = __LINE__; \
1173 goto reject; \
1174} while (0)
1175
1176/*
1177 * The context associated with a tid entry via insert_tid could be a synq_entry
1178 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags.
1179 */
1180CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
1181
1182/*
1183 * Incoming SYN on a listening socket.
1184 *
1185 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1186 * etc.
1187 */
/* Returns 0 if the SYN was handled; otherwise a line number identifying why it
 * was rejected (same convention as REJECT_PASS_ACCEPT). */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct ifnet *hw_ifp, *ifp;
	struct l2t_entry *e = NULL;
	int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
	struct synq_entry *synqe = NULL;
	int reject_reason;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	/* Recover the SYN's addresses/ports and TCP options from the CPL. */
	pass_accept_req_to_protohdrs(m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
	hw_ifp = pi->ifp;	/* the cxgbeX ifnet */
	m->m_pkthdr.rcvif = hw_ifp;
	tod = TOEDEV(hw_ifp);

	/*
	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
	 * doesn't match anything on this interface.
	 *
	 * XXX: lagg support, lagg + vlan support.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp = VLAN_DEVAT(hw_ifp, vid);
		if (ifp == NULL)
			REJECT_PASS_ACCEPT();
	} else
		ifp = hw_ifp;

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	if (inc.inc_flags & INC_ISIPV6) {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP6 address on this ifnet.  This
		 * is more restrictive than in6_localip.
		 */
		if (!ifnet_has_ip6(ifp, &inc.inc6_laddr))
			REJECT_PASS_ACCEPT();
	} else {

		/* Don't offload if the ifcap isn't enabled */
		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
			REJECT_PASS_ACCEPT();

		/*
		 * SYN must be directed to an IP address on this ifnet.  This
		 * is more restrictive than in_localip.
		 */
		if (!ifnet_has_ip(ifp, inc.inc_laddr))
			REJECT_PASS_ACCEPT();
	}

	e = get_l2te_for_nexthop(pi, ifp, &inc);
	if (e == NULL)
		REJECT_PASS_ACCEPT();

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;		/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, pi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	/* Build the CPL_PASS_ACCEPT_RPL that t4_syncache_respond will send. */
	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) {
		ulp_mode = ULP_MODE_TCPDDP;
		synqe->flags |= TPF_SYNQE_TCPDDP;
	} else
		ulp_mode = ULP_MODE_NONE;
	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits, ulp_mode);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);

	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	m = NULL;	/* synqe owns the SYN mbuf from here on */
	refcount_init(&synqe->refcnt, 1);	/* 1 means extra hold */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
	 * pcb locks.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_NOWAIT);
		if (m)
			m->m_pkthdr.rcvif = hw_ifp;

		remove_tid(sc, synqe->tid);
		free(wr, M_CXGBE);

		/* Yank the synqe out of the lctx synq. */
		INP_WLOCK(inp);
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		REJECT_PASS_ACCEPT();
	}

	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);

	INP_WLOCK(inp);
	synqe->flags |= TPF_SYNQE_HAS_L2TE;
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * Listening socket closed but tod_listen_stop did not abort
		 * this tid because there was no L2T entry for the tid at that
		 * time.  Abort it now.  The reply to the abort will clean up.
		 */
		CTR6(KTR_CXGBE,
		    "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
		    __func__, stid, tid, lctx, synqe, synqe->flags);
		if (!(synqe->flags & TPF_SYNQE_EXPANDED))
			send_reset_synqe(tod, synqe);
		INP_WUNLOCK(inp);

		release_synqe(synqe);	/* extra hold */
		return (__LINE__);
	}
	INP_WUNLOCK(inp);

	release_synqe(synqe);	/* extra hold */
	return (0);
reject:
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	/* Hand the SYN to the kernel's TCP stack for normal processing. */
	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		hw_ifp->if_input(hw_ifp, m);
	}

	return (reject_reason);
}
1434
/*
 * Reconstruct the protocol headers (in_conninfo, tcphdr, tcpopt) for the ACK
 * that completed the 3-way handshake, from the original SYN stashed in the
 * synqe and the CPL_PASS_ESTABLISH delivered by the chip.
 */
static void
synqe_to_protohdrs(struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}
1455
/*
 * CPL_PASS_ESTABLISH: the hardware has completed the 3-way handshake for a
 * tid that we replied to with a CPL_PASS_ACCEPT_RPL.  Expand the syncache
 * entry into a full connection and hand it over to a freshly allocated
 * toepcb.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct port_info *pi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	/* Listening socket already closed; nothing more to do here. */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {

		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
			    ("%s: listen socket closed but tid %u not aborted.",
			    __func__, tid));
		}

		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	pi = ifp->if_softc;
	KASSERT(pi->adapter == sc,
	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	/* Allocate the offload PCB that will take over this connection. */
	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/*
		 * The reply to this abort will perform final cleanup.  There is
		 * no need to check for HAS_L2TE here.  We can be here only if
		 * we responded to the PASS_ACCEPT_REQ, and our response had the
		 * L2T idx.
		 */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	if (synqe->flags & TPF_SYNQE_TCPDDP)
		set_tcpddp_ulp_mode(toep);
	else
		toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/*
	 * This is for the unlikely case where the syncache entry that we added
	 * has been evicted from the syncache, but the syncache_expand above
	 * works because of syncookies.
	 *
	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
	 * anyone accept'ing a connection before we've installed our hooks, but
	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
	 */
	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
		struct inpcb *new_inp = sotoinpcb(so);

		INP_WLOCK(new_inp);
		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
		t4_offload_socket(TOEDEV(ifp), synqe, so);
		INP_WUNLOCK(new_inp);
	}

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}
1591
/* Register the CPL handlers for passive (server-side) connection setup. */
void
t4_init_listen_cpl_handlers(struct adapter *sc)
{

	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
1601#endif