iw_cxgb_cm.c revision 178786
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c 178786 2008-05-05 18:46:18Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/pciio.h>
#include <sys/conf.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus_dma.h>
#include <sys/rman.h>
#include <sys/ioccom.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/proc.h>
#include <sys/uio.h>

#include <net/route.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp.h>
#include <netinet/tcpip.h>

#include <contrib/rdma/ib_verbs.h>


#ifdef CONFIG_DEFINED
#include <cxgb_include.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/iw_cxgb/iw_cxgb_wr.h>
#include <ulp/iw_cxgb/iw_cxgb_hal.h>
#include <ulp/iw_cxgb/iw_cxgb_provider.h>
#include <ulp/iw_cxgb/iw_cxgb_cm.h>
#include <ulp/iw_cxgb/iw_cxgb.h>
#else
#include <dev/cxgb/cxgb_include.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h>
#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h>
#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h>
#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h>
#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h>
#endif

#ifdef KTR
static char *states[] = {
	"idle",
	"listen",
	"connecting",
	"mpa_wait_req",
	"mpa_req_sent",
	"mpa_req_rcvd",
	"mpa_rep_sent",
	"fpdu_mode",
	"aborting",
	"closing",
	"moribund",
	"dead",
	NULL,
};
#endif
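
/*
 * Endpoint state flow, as implemented by the code below (a sketch derived
 * from the state_set()/__state_set() calls in this file, not a normative
 * diagram):
 *
 * Active side:   idle -> connecting (iwch_connect)
 *                     -> mpa_req_sent (send_mpa_req)
 *                     -> fpdu_mode (process_mpa_reply)
 *
 * Passive side:  listen (iwch_create_listen)
 *                     -> mpa_wait_req (process_newconn, child endpoint)
 *                     -> mpa_req_rcvd (process_mpa_request)
 *                     -> mpa_rep_sent (send_mpa_reply)
 *                     -> fpdu_mode (iwch_accept_cr)
 *
 * Teardown:      fpdu_mode -> closing -> moribund -> dead for a graceful
 *                close; any state -> aborting -> dead on error or abort.
 */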

SYSCTL_NODE(_hw, OID_AUTO, iw_cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters");

static int ep_timeout_secs = 10;
TUNABLE_INT("hw.iw_cxgb.ep_timeout_secs", &ep_timeout_secs);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RDTUN, &ep_timeout_secs, 0,
    "CM Endpoint operation timeout in seconds (default=10)");

static int mpa_rev = 1;
TUNABLE_INT("hw.iw_cxgb.mpa_rev", &mpa_rev);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RDTUN, &mpa_rev, 0,
    "MPA Revision, 0 supports amso1100, 1 is spec compliant. (default=1)");

static int markers_enabled = 0;
TUNABLE_INT("hw.iw_cxgb.markers_enabled", &markers_enabled);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RDTUN, &markers_enabled, 0,
    "Enable MPA MARKERS (default=0, disabled)");

static int crc_enabled = 1;
TUNABLE_INT("hw.iw_cxgb.crc_enabled", &crc_enabled);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RDTUN, &crc_enabled, 0,
    "Enable MPA CRC (default=1, enabled)");

static int rcv_win = 256 * 1024;
TUNABLE_INT("hw.iw_cxgb.rcv_win", &rcv_win);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RDTUN, &rcv_win, 0,
    "TCP receive window in bytes (default=256KB)");

static int snd_win = 32 * 1024;
TUNABLE_INT("hw.iw_cxgb.snd_win", &snd_win);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, snd_win, CTLFLAG_RDTUN, &snd_win, 0,
    "TCP send window in bytes (default=32KB)");

static int nocong = 0;
TUNABLE_INT("hw.iw_cxgb.nocong", &nocong);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, nocong, CTLFLAG_RDTUN, &nocong, 0,
    "Turn off congestion control (default=0)");

static int cong_flavor = 1;
TUNABLE_INT("hw.iw_cxgb.cong_flavor", &cong_flavor);
SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RDTUN, &cong_flavor, 0,
    "TCP Congestion control flavor (default=1)");

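/*
 * Usage example (hypothetical values): the knobs above are loader tunables,
 * visible read-only through sysctl once the module is loaded.  In
 * /boot/loader.conf:
 *
 *	hw.iw_cxgb.mpa_rev="1"
 *	hw.iw_cxgb.crc_enabled="1"
 *	hw.iw_cxgb.rcv_win="262144"
 *
 * and at runtime:
 *
 *	sysctl hw.iw_cxgb.ep_timeout_secs
 */
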
static void ep_timeout(void *arg);
static void connect_reply_upcall(struct iwch_ep *ep, int status);
static void iwch_so_upcall(struct socket *so, void *arg, int waitflag);

/*
 * Cruft to offload socket upcalls onto thread.
 */
static struct mtx req_lock;
static TAILQ_HEAD(iwch_ep_list, iwch_ep_common) req_list;
static struct task iw_cxgb_task;
static struct taskqueue *iw_cxgb_taskq;
static void process_req(void *ctx, int pending);

static void
start_ep_timer(struct iwch_ep *ep)
{
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	if (callout_pending(&ep->timer)) {
		CTR2(KTR_IW_CXGB, "%s stopped / restarted timer ep %p", __FUNCTION__, ep);
		callout_deactivate(&ep->timer);
		callout_drain(&ep->timer);
	} else {
		/*
		 * XXX this looks racy
		 */
		get_ep(&ep->com);
		callout_init(&ep->timer, TRUE);
	}
	callout_reset(&ep->timer, ep_timeout_secs * hz, ep_timeout, ep);
}

static void
stop_ep_timer(struct iwch_ep *ep)
{
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	callout_drain(&ep->timer);
	put_ep(&ep->com);
}
static int
set_tcpinfo(struct iwch_ep *ep)
{
	struct tcp_info ti;
	struct sockopt sopt;
	int err;

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_INFO;
	sopt.sopt_val = (caddr_t)&ti;
	sopt.sopt_valsize = sizeof ti;
	sopt.sopt_td = NULL;

	err = sogetopt(ep->com.so, &sopt);
	if (err) {
		printf("%s can't get tcpinfo\n", __FUNCTION__);
		return -err;
	}
	if (!(ti.tcpi_options & TCPI_OPT_TOE)) {
		printf("%s connection NOT OFFLOADED!\n", __FUNCTION__);
		return -EINVAL;
	}

	ep->snd_seq = ti.tcpi_snd_nxt;
	ep->rcv_seq = ti.tcpi_rcv_nxt;
	ep->emss = ti.__tcpi_snd_mss - sizeof(struct tcpiphdr);
	ep->hwtid = TOEPCB(ep->com.so)->tp_tid; /* XXX */
	if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS)
		ep->emss -= 12;
	if (ep->emss < 128)
		ep->emss = 128;
	return 0;
}
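
/*
 * Worked example for the emss calculation in set_tcpinfo() (illustrative
 * numbers only): if __tcpi_snd_mss reports a 1500-byte segment, then
 * emss = 1500 - sizeof(struct tcpiphdr) (40) = 1460.  If TCP timestamps
 * were negotiated, another 12 bytes of options are deducted, giving 1448.
 * emss is never allowed to fall below the 128-byte floor.
 */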

static enum iwch_ep_state
state_read(struct iwch_ep_common *epc)
{
	enum iwch_ep_state state;

	mtx_lock(&epc->lock);
	state = epc->state;
	mtx_unlock(&epc->lock);
	return state;
}

static void
__state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
{
	epc->state = new;
}

static void
state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
{

	mtx_lock(&epc->lock);
	CTR3(KTR_IW_CXGB, "%s - %s -> %s", __FUNCTION__, states[epc->state], states[new]);
	__state_set(epc, new);
	mtx_unlock(&epc->lock);
	return;
}

static void *
alloc_ep(int size, int flags)
{
	struct iwch_ep_common *epc;

	epc = malloc(size, M_DEVBUF, flags);
	if (epc) {
		memset(epc, 0, size);
		refcount_init(&epc->refcount, 1);
		mtx_init(&epc->lock, "iwch_epc lock", NULL, MTX_DEF|MTX_DUPOK);
		cv_init(&epc->waitq, "iwch_epc cv");
	}
	CTR2(KTR_IW_CXGB, "%s alloc ep %p", __FUNCTION__, epc);
	return epc;
}

void
__free_ep(struct iwch_ep_common *epc)
{
	CTR3(KTR_IW_CXGB, "%s ep %p state %s", __FUNCTION__, epc, states[state_read(epc)]);
	KASSERT(!epc->so, ("%s warning ep->so %p \n", __FUNCTION__, epc->so));
	KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __FUNCTION__, epc));
	free(epc, M_DEVBUF);
}

int
iwch_quiesce_tid(struct iwch_ep *ep)
{
#ifdef notyet
	struct cpl_set_tcb_field *req;
	struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT);

	if (m == NULL)
		return (-ENOMEM);
	req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req));
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
	req->reply = 0;
	req->cpu_idx = 0;
	req->word = htons(W_TCB_RX_QUIESCE);
	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
	req->val = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);

	m_set_priority(m, CPL_PRIORITY_DATA);
	cxgb_ofld_send(ep->com.tdev, m);
#endif
	return 0;
}

int
iwch_resume_tid(struct iwch_ep *ep)
{
#ifdef notyet
	struct cpl_set_tcb_field *req;
	struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT);

	if (m == NULL)
		return (-ENOMEM);
	req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req));
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
	req->reply = 0;
	req->cpu_idx = 0;
	req->word = htons(W_TCB_RX_QUIESCE);
	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
	req->val = 0;

	m_set_priority(m, CPL_PRIORITY_DATA);
	cxgb_ofld_send(ep->com.tdev, m);
#endif
	return 0;
}

/*
 * Look up a route to the peer.  Only peer_ip is consulted; the remaining
 * arguments are currently unused.
 */
static struct rtentry *
find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port,
    __be16 peer_port, u8 tos)
{
	struct route iproute;
	struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst;

	bzero(&iproute, sizeof iproute);
	dst->sin_family = AF_INET;
	dst->sin_len = sizeof *dst;
	dst->sin_addr.s_addr = peer_ip;

	rtalloc(&iproute);
	return iproute.ro_rt;
}

static void
close_socket(struct iwch_ep_common *epc)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]);
	SOCK_LOCK(epc->so);
	epc->so->so_upcall = NULL;
	epc->so->so_upcallarg = NULL;
	epc->so->so_rcv.sb_flags &= ~SB_UPCALL;
	SOCK_UNLOCK(epc->so);
	soshutdown(epc->so, SHUT_WR|SHUT_RD);
	epc->so = NULL;
}

static void
shutdown_socket(struct iwch_ep_common *epc)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]);
	soshutdown(epc->so, SHUT_WR);
}

static void
abort_socket(struct iwch_ep *ep)
{
	struct sockopt sopt;
	int err;
	struct linger l;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	l.l_onoff = 1;
	l.l_linger = 0;

	/* linger_time of 0 forces RST to be sent */
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_LINGER;
	sopt.sopt_val = (caddr_t)&l;
	sopt.sopt_valsize = sizeof l;
	sopt.sopt_td = NULL;
	err = sosetopt(ep->com.so, &sopt);
	if (err)
		printf("%s can't set linger to 0, no RST! err %d\n", __FUNCTION__, err);
}

static void
send_mpa_req(struct iwch_ep *ep)
{
	int mpalen;
	struct mpa_message *mpa;
	struct mbuf *m;
	int err;

	CTR3(KTR_IW_CXGB, "%s ep %p pd_len %d", __FUNCTION__, ep, ep->plen);

	mpalen = sizeof(*mpa) + ep->plen;
	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL) {
		connect_reply_upcall(ep, -ENOMEM);
		return;
	}
	if (mpalen > MHLEN) {
		/* The MPA message plus private data needs a cluster. */
		MCLGET(m, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			connect_reply_upcall(ep, -ENOMEM);
			return;
		}
	}
	mpa = mtod(m, struct mpa_message *);
	m->m_len = mpalen;
	m->m_pkthdr.len = mpalen;
	memset(mpa, 0, sizeof(*mpa));
	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
		     (markers_enabled ? MPA_MARKERS : 0);
	mpa->private_data_size = htons(ep->plen);
	mpa->revision = mpa_rev;
	if (ep->plen)
		memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);

	err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread);
	if (err) {
		/* sosend() consumes the mbuf even on error. */
		connect_reply_upcall(ep, -err);
		return;
	}

	start_ep_timer(ep);
	state_set(&ep->com, MPA_REQ_SENT);
	return;
}

static int
send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
{
	int mpalen;
	struct mpa_message *mpa;
	struct mbuf *m;
	int err;

	CTR3(KTR_IW_CXGB, "%s ep %p plen %d", __FUNCTION__, ep, plen);

	mpalen = sizeof(*mpa) + plen;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL) {
		printf("%s - cannot alloc mbuf!\n", __FUNCTION__);
		return (-ENOMEM);
	}
	if (mpalen > MHLEN) {
		/* The MPA message plus private data needs a cluster. */
		MCLGET(m, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return (-ENOMEM);
		}
	}
	mpa = mtod(m, struct mpa_message *);
	m->m_len = mpalen;
	m->m_pkthdr.len = mpalen;
	memset(mpa, 0, sizeof(*mpa));
	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
	mpa->flags = MPA_REJECT;
	mpa->revision = mpa_rev;
	mpa->private_data_size = htons(plen);
	if (plen)
		memcpy(mpa->private_data, pdata, plen);
	err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread);
	PANIC_IF(err);
	return 0;
}

static int
send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
{
	int mpalen;
	struct mpa_message *mpa;
	struct mbuf *m;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p plen %d", __FUNCTION__, ep, ep->com.so, plen);

	mpalen = sizeof(*mpa) + plen;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL) {
		printf("%s - cannot alloc mbuf!\n", __FUNCTION__);
		return (-ENOMEM);
	}
	if (mpalen > MHLEN) {
		/* The MPA message plus private data needs a cluster. */
		MCLGET(m, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return (-ENOMEM);
		}
	}
	mpa = mtod(m, struct mpa_message *);
	m->m_len = mpalen;
	m->m_pkthdr.len = mpalen;
	memset(mpa, 0, sizeof(*mpa));
	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
		     (markers_enabled ? MPA_MARKERS : 0);
	mpa->revision = mpa_rev;
	mpa->private_data_size = htons(plen);
	if (plen)
		memcpy(mpa->private_data, pdata, plen);

	state_set(&ep->com, MPA_REP_SENT);
	return sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT,
		ep->com.thread);
}
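
/*
 * For reference, the MPA start frame assembled by the three senders above
 * (struct mpa_message itself is declared in iw_cxgb_cm.h; this sketch only
 * reflects how its fields are used here, per the MPA spec, RFC 5044):
 *
 *	key[16]			"MPA ID Req Frame" / "MPA ID Rep Frame"
 *	flags			MPA_CRC, MPA_MARKERS, MPA_REJECT bits
 *	revision		mpa_rev (0 for amso1100 compat, 1 per spec)
 *	private_data_size	private data length, network byte order
 *	private_data[]		up to MPA_MAX_PRIVATE_DATA bytes
 */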

static void
close_complete_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CLOSE;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "close complete delivered ep %p cm_id %p tid %d",
		     ep, ep->com.cm_id, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
		ep->com.cm_id->rem_ref(ep->com.cm_id);
		ep->com.cm_id = NULL;
		ep->com.qp = NULL;
	}
}

static void
abort_connection(struct iwch_ep *ep)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	state_set(&ep->com, ABORTING);
	abort_socket(ep);
	close_socket(&ep->com);
	close_complete_upcall(ep);
	state_set(&ep->com, DEAD);
	put_ep(&ep->com);
}

static void
peer_close_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_DISCONNECT;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "peer close delivered ep %p cm_id %p tid %d",
		     ep, ep->com.cm_id, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
	}
}

static void
peer_abort_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CLOSE;
	event.status = ECONNRESET;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "abort delivered ep %p cm_id %p tid %d", ep,
		     ep->com.cm_id, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
		ep->com.cm_id->rem_ref(ep->com.cm_id);
		ep->com.cm_id = NULL;
		ep->com.qp = NULL;
	}
}

static void
connect_reply_upcall(struct iwch_ep *ep, int status)
{
	struct iw_cm_event event;

	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], status);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CONNECT_REPLY;
	event.status = status;
	event.local_addr = ep->com.local_addr;
	event.remote_addr = ep->com.remote_addr;

	if ((status == 0) || (status == ECONNREFUSED)) {
		event.private_data_len = ep->plen;
		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
	}
	if (ep->com.cm_id) {
		CTR4(KTR_IW_CXGB, "%s ep %p tid %d status %d", __FUNCTION__, ep,
		     ep->hwtid, status);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
	}
	if (status < 0) {
		if (ep->com.cm_id)
			ep->com.cm_id->rem_ref(ep->com.cm_id);
		ep->com.cm_id = NULL;
		ep->com.qp = NULL;
	}
}

static void
connect_request_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CONNECT_REQUEST;
	event.local_addr = ep->com.local_addr;
	event.remote_addr = ep->com.remote_addr;
	event.private_data_len = ep->plen;
	event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
	event.provider_data = ep;
	event.so = ep->com.so;
	if (state_read(&ep->parent_ep->com) != DEAD)
		ep->parent_ep->com.cm_id->event_handler(
						ep->parent_ep->com.cm_id,
						&event);
	put_ep(&ep->parent_ep->com);
	ep->parent_ep = NULL;
}

static void
established_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_ESTABLISHED;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "%s ep %p tid %d", __FUNCTION__, ep, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
	}
}

static void
process_mpa_reply(struct iwch_ep *ep)
{
	struct mpa_message *mpa;
	u16 plen;
	struct iwch_qp_attributes attrs;
	enum iwch_qp_attr_mask mask;
	int err;
	struct mbuf *top, *m;
	int flags = MSG_DONTWAIT;
	struct uio uio;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	/*
	 * Stop mpa timer.  If it expired, then the state has
	 * changed and we bail since ep_timeout already aborted
	 * the connection.
	 */
	stop_ep_timer(ep);
	if (state_read(&ep->com) != MPA_REQ_SENT)
		return;

	bzero(&uio, sizeof(uio));
	uio.uio_resid = 1000000;
	uio.uio_td = ep->com.thread;
	err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags);
	if (err) {
		if (err == EWOULDBLOCK) {
			start_ep_timer(ep);
			return;
		}
		err = -err;
		goto err;
	}

	if (ep->com.so->so_rcv.sb_mb) {
		printf("%s data after soreceive called! so %p sb_mb %p top %p\n",
			__FUNCTION__, ep->com.so, ep->com.so->so_rcv.sb_mb, top);
	}

	m = top;
	do {
		/*
		 * If we get more than the supported amount of private data
		 * then we must fail this connection.
		 */
		if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) {
			err = (-EINVAL);
			goto err;
		}

		/*
		 * copy the new data into our accumulation buffer.
		 */
		m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len]));
		ep->mpa_pkt_len += m->m_len;
		if (!m->m_next)
			m = m->m_nextpkt;
		else
			m = m->m_next;
	} while (m);

	m_freem(top);

	/*
	 * if we don't even have the mpa message, then bail.
	 */
	if (ep->mpa_pkt_len < sizeof(*mpa))
		return;
	mpa = (struct mpa_message *)ep->mpa_pkt;

	/* Validate MPA header. */
	if (mpa->revision != mpa_rev) {
		CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision);
		err = EPROTO;
		goto err;
	}
	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
		CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key);
		err = EPROTO;
		goto err;
	}

	plen = ntohs(mpa->private_data_size);

	/*
	 * Fail if there's too much private data.
	 */
	if (plen > MPA_MAX_PRIVATE_DATA) {
		CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen);
		err = EPROTO;
		goto err;
	}

	/*
	 * If plen does not account for pkt size
	 */
	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
		CTR2(KTR_IW_CXGB, "%s pkt too big %d", __FUNCTION__, ep->mpa_pkt_len);
		err = EPROTO;
		goto err;
	}

	ep->plen = (u8) plen;

	/*
	 * If we don't have all the pdata yet, then bail.
	 * We'll continue processing when more data arrives.
	 */
	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
		return;

	if (mpa->flags & MPA_REJECT) {
		err = ECONNREFUSED;
		goto err;
	}

	/*
	 * If we get here we have accumulated the entire mpa
	 * start reply message including private data. And
	 * the MPA header is valid.
	 */
	CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__);
	state_set(&ep->com, FPDU_MODE);
	ep->mpa_attr.crc_enabled = ((mpa->flags & MPA_CRC) || crc_enabled) ? 1 : 0;
	ep->mpa_attr.recv_marker_enabled = markers_enabled;
	ep->mpa_attr.xmit_marker_enabled = (mpa->flags & MPA_MARKERS) ? 1 : 0;
	ep->mpa_attr.version = mpa_rev;
	err = set_tcpinfo(ep);
	if (err) {
		printf("%s set_tcpinfo error\n", __FUNCTION__);
		goto err;
	}
	CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, "
	     "xmit_marker_enabled=%d, version=%d", __FUNCTION__,
	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);

	attrs.mpa_attr = ep->mpa_attr;
	attrs.max_ird = ep->ird;
	attrs.max_ord = ep->ord;
	attrs.llp_stream_handle = ep;
	attrs.next_state = IWCH_QP_STATE_RTS;

	mask = IWCH_QP_ATTR_NEXT_STATE |
	    IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR |
	    IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD;

	/* bind QP and TID with INIT_WR */
	err = iwch_modify_qp(ep->com.qp->rhp,
			     ep->com.qp, mask, &attrs, 1);
	if (!err)
		goto out;
err:
	/*
	 * Deliver the connect reply before tearing the endpoint down;
	 * abort_connection() drops the cm_id reference, after which the
	 * reply could no longer reach the consumer.
	 */
	connect_reply_upcall(ep, err);
	abort_connection(ep);
	return;
out:
	connect_reply_upcall(ep, err);
	return;
}

static void
process_mpa_request(struct iwch_ep *ep)
{
	struct mpa_message *mpa;
	u16 plen;
	int flags = MSG_DONTWAIT;
	struct mbuf *top, *m;
	int err;
	struct uio uio;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	/*
	 * Stop mpa timer.  If it expired, then the state has
	 * changed and we bail since ep_timeout already aborted
	 * the connection.
	 */
	stop_ep_timer(ep);
	if (state_read(&ep->com) != MPA_REQ_WAIT)
		return;

	bzero(&uio, sizeof(uio));
	uio.uio_resid = 1000000;
	uio.uio_td = ep->com.thread;
	err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags);
	if (err) {
		if (err == EWOULDBLOCK) {
			start_ep_timer(ep);
			return;
		}
		err = -err;
		goto err;
	}

	m = top;
	do {

		/*
		 * If we get more than the supported amount of private data
		 * then we must fail this connection.
		 */
		if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) {
			CTR2(KTR_IW_CXGB, "%s mpa message too big %d", __FUNCTION__,
				ep->mpa_pkt_len + m->m_len);
			goto err;
		}


		/*
		 * Copy the new data into our accumulation buffer.
		 */
		m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len]));
		ep->mpa_pkt_len += m->m_len;

		if (!m->m_next)
			m = m->m_nextpkt;
		else
			m = m->m_next;
	} while (m);

	m_freem(top);

	/*
	 * If we don't even have the mpa message, then bail.
	 * We'll continue processing when more data arrives.
	 */
	if (ep->mpa_pkt_len < sizeof(*mpa)) {
		start_ep_timer(ep);
		CTR2(KTR_IW_CXGB, "%s not enough header %d...waiting...", __FUNCTION__,
			ep->mpa_pkt_len);
		return;
	}
	mpa = (struct mpa_message *) ep->mpa_pkt;

	/*
	 * Validate MPA Header.
	 */
	if (mpa->revision != mpa_rev) {
		CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision);
		goto err;
	}

	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
		CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key);
		goto err;
	}

	plen = ntohs(mpa->private_data_size);

	/*
	 * Fail if there's too much private data.
	 */
	if (plen > MPA_MAX_PRIVATE_DATA) {
		CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen);
		goto err;
	}

	/*
	 * If plen does not account for pkt size
	 */
	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
		CTR2(KTR_IW_CXGB, "%s more data after private data %d", __FUNCTION__,
			ep->mpa_pkt_len);
		goto err;
	}
	ep->plen = (u8) plen;

	/*
	 * If we don't have all the pdata yet, then bail.
	 */
	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) {
		start_ep_timer(ep);
		CTR2(KTR_IW_CXGB, "%s more mpa msg to come %d", __FUNCTION__,
			ep->mpa_pkt_len);
		return;
	}

	/*
	 * If we get here we have accumulated the entire mpa
	 * start request message including private data.
	 */
	ep->mpa_attr.crc_enabled = ((mpa->flags & MPA_CRC) || crc_enabled) ? 1 : 0;
	ep->mpa_attr.recv_marker_enabled = markers_enabled;
	ep->mpa_attr.xmit_marker_enabled = (mpa->flags & MPA_MARKERS) ? 1 : 0;
	ep->mpa_attr.version = mpa_rev;
	if (set_tcpinfo(ep)) {
		printf("%s set_tcpinfo error\n", __FUNCTION__);
		goto err;
	}
	CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, "
	     "xmit_marker_enabled=%d, version=%d", __FUNCTION__,
	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);

	state_set(&ep->com, MPA_REQ_RCVD);

	/* drive upcall */
	connect_request_upcall(ep);
	return;
err:
	abort_connection(ep);
	return;
}

static void
process_peer_close(struct iwch_ep *ep)
{
	struct iwch_qp_attributes attrs;
	int disconnect = 1;
	int release = 0;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	mtx_lock(&ep->com.lock);
	switch (ep->com.state) {
	case MPA_REQ_WAIT:
		__state_set(&ep->com, CLOSING);
		break;
	case MPA_REQ_SENT:
		__state_set(&ep->com, CLOSING);
		connect_reply_upcall(ep, -ECONNRESET);
		break;
	case MPA_REQ_RCVD:

		/*
		 * We're gonna mark this puppy DEAD, but keep
		 * the reference on it until the ULP accepts or
		 * rejects the CR.
		 */
		__state_set(&ep->com, CLOSING);
		get_ep(&ep->com);
		break;
	case MPA_REP_SENT:
		__state_set(&ep->com, CLOSING);
		break;
	case FPDU_MODE:
		start_ep_timer(ep);
		__state_set(&ep->com, CLOSING);
		attrs.next_state = IWCH_QP_STATE_CLOSING;
		iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
			       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
		peer_close_upcall(ep);
		break;
	case ABORTING:
		disconnect = 0;
		break;
	case CLOSING:
		__state_set(&ep->com, MORIBUND);
		disconnect = 0;
		break;
	case MORIBUND:
		stop_ep_timer(ep);
		if (ep->com.cm_id && ep->com.qp) {
			attrs.next_state = IWCH_QP_STATE_IDLE;
			iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
				       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
		}
		close_socket(&ep->com);
		close_complete_upcall(ep);
		__state_set(&ep->com, DEAD);
		release = 1;
		disconnect = 0;
		break;
	case DEAD:
		disconnect = 0;
		break;
	default:
		PANIC_IF(1);
	}
	mtx_unlock(&ep->com.lock);
	if (disconnect)
		iwch_ep_disconnect(ep, 0, M_NOWAIT);
	if (release)
		put_ep(&ep->com);
	return;
}

static void
process_conn_error(struct iwch_ep *ep)
{
	struct iwch_qp_attributes attrs;
	int ret;
	int state;

	state = state_read(&ep->com);
	CTR5(KTR_IW_CXGB, "%s ep %p so %p so->so_error %u state %s", __FUNCTION__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]);
	switch (state) {
	case MPA_REQ_WAIT:
		stop_ep_timer(ep);
		break;
	case MPA_REQ_SENT:
		stop_ep_timer(ep);
		connect_reply_upcall(ep, -ECONNRESET);
		break;
	case MPA_REP_SENT:
		ep->com.rpl_err = ECONNRESET;
		CTR1(KTR_IW_CXGB, "waking up ep %p", ep);
		break;
	case MPA_REQ_RCVD:

		/*
		 * We're gonna mark this puppy DEAD, but keep
		 * the reference on it until the ULP accepts or
		 * rejects the CR.
		 */
		get_ep(&ep->com);
		break;
	case MORIBUND:
	case CLOSING:
		stop_ep_timer(ep);
		/*FALLTHROUGH*/
	case FPDU_MODE:
		if (ep->com.cm_id && ep->com.qp) {
			attrs.next_state = IWCH_QP_STATE_ERROR;
			ret = iwch_modify_qp(ep->com.qp->rhp,
				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
				     &attrs, 1);
			if (ret)
				log(LOG_ERR,
				       "%s - qp <- error failed!\n",
				       __FUNCTION__);
		}
		peer_abort_upcall(ep);
		break;
	case ABORTING:
		break;
	case DEAD:
		CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__,
			ep->com.so->so_error);
		return;
	default:
		PANIC_IF(1);
		break;
	}

	if (state != ABORTING) {
		close_socket(&ep->com);
		state_set(&ep->com, DEAD);
		put_ep(&ep->com);
	}
	return;
}

static void
process_close_complete(struct iwch_ep *ep)
{
	struct iwch_qp_attributes attrs;
	int release = 0;

	PANIC_IF(!ep);
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	/* The cm_id may be null if we failed to connect */
	mtx_lock(&ep->com.lock);
	switch (ep->com.state) {
	case CLOSING:
		__state_set(&ep->com, MORIBUND);
		break;
	case MORIBUND:
		stop_ep_timer(ep);
		if ((ep->com.cm_id) && (ep->com.qp)) {
			attrs.next_state = IWCH_QP_STATE_IDLE;
			iwch_modify_qp(ep->com.qp->rhp,
					     ep->com.qp,
					     IWCH_QP_ATTR_NEXT_STATE,
					     &attrs, 1);
		}
		close_socket(&ep->com);
		close_complete_upcall(ep);
		__state_set(&ep->com, DEAD);
		release = 1;
		break;
	case ABORTING:
		break;
	case DEAD:
	default:
		PANIC_IF(1);
		break;
	}
	mtx_unlock(&ep->com.lock);
	if (release)
		put_ep(&ep->com);
	return;
}

/*
 * T3A does 3 things when a TERM is received:
 * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet
 * 2) generate an async event on the QP with the TERMINATE opcode
 * 3) post a TERMINATE opcode CQE into the associated CQ.
 *
 * For (1), we save the message in the qp for later consumption by the consumer.
 * For (2), we move the QP into TERMINATE, post a QP event and disconnect.
 * For (3), we toss the CQE in cxio_poll_cq().
 *
 * terminate() handles case (1)...
 */
static int
terminate(struct t3cdev *tdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);
	struct iwch_ep *ep = so->so_upcallarg;

	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	m_adj(m, sizeof(struct cpl_rdma_terminate));
	CTR2(KTR_IW_CXGB, "%s saving %d bytes of term msg", __FUNCTION__, m->m_len);
	m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer);
	ep->com.qp->attr.terminate_msg_len = m->m_len;
	ep->com.qp->attr.is_terminate_local = 0;
	return CPL_RET_BUF_DONE;
}

static int
ec_status(struct t3cdev *tdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);
	struct cpl_rdma_ec_status *rep = cplhdr(m);
	struct iwch_ep *ep;
	struct iwch_qp_attributes attrs;
	int release = 0;

	/* Sanity-check before dereferencing so or ep. */
	ep = so ? so->so_upcallarg : NULL;
	if (!so || !ep) {
		panic("bogosity ep %p state %d, so %p state %x\n", ep,
		    ep ? ep->com.state : -1, so, so ? so->so_state : -1);
	}
	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s ec_status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], rep->status);
	mtx_lock(&ep->com.lock);
	switch (ep->com.state) {
	case CLOSING:
		if (!rep->status)
			__state_set(&ep->com, MORIBUND);
		else
			__state_set(&ep->com, ABORTING);
		break;
	case MORIBUND:
		stop_ep_timer(ep);
		if (!rep->status) {
			if ((ep->com.cm_id) && (ep->com.qp)) {
				attrs.next_state = IWCH_QP_STATE_IDLE;
				iwch_modify_qp(ep->com.qp->rhp,
					     ep->com.qp,
					     IWCH_QP_ATTR_NEXT_STATE,
					     &attrs, 1);
			}
			close_socket(&ep->com);
			close_complete_upcall(ep);
			__state_set(&ep->com, DEAD);
			release = 1;
		}
		break;
	case DEAD:
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
	}
	mtx_unlock(&ep->com.lock);
	if (rep->status) {
		log(LOG_ERR, "%s BAD CLOSE - Aborting tid %u\n",
		       __FUNCTION__, ep->hwtid);
		attrs.next_state = IWCH_QP_STATE_ERROR;
		iwch_modify_qp(ep->com.qp->rhp,
			       ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
			       &attrs, 1);
	}
	if (release)
		put_ep(&ep->com);
	return CPL_RET_BUF_DONE;
}

static void
ep_timeout(void *arg)
{
	struct iwch_ep *ep = (struct iwch_ep *)arg;
	struct iwch_qp_attributes attrs;
	int err = 0;

	mtx_lock(&ep->com.lock);
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	switch (ep->com.state) {
	case MPA_REQ_SENT:
		connect_reply_upcall(ep, -ETIMEDOUT);
		break;
	case MPA_REQ_WAIT:
		break;
	case CLOSING:
	case MORIBUND:
		if (ep->com.cm_id && ep->com.qp)
			err = 1;
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
	}
	__state_set(&ep->com, ABORTING);
	mtx_unlock(&ep->com.lock);
	if (err) {
		attrs.next_state = IWCH_QP_STATE_ERROR;
		iwch_modify_qp(ep->com.qp->rhp,
			     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
			     &attrs, 1);
	}
	abort_connection(ep);
	put_ep(&ep->com);
}

int
iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
{
	int err;
	struct iwch_ep *ep = to_ep(cm_id);

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	if (state_read(&ep->com) == DEAD) {
		put_ep(&ep->com);
		return (-ECONNRESET);
	}
	PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD);
	if (mpa_rev == 0) {
		abort_connection(ep);
	} else {
		err = send_mpa_reject(ep, pdata, pdata_len);
		err = soshutdown(ep->com.so, SHUT_RDWR);
	}
	return 0;
}

int
iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
{
	int err;
	struct iwch_qp_attributes attrs;
	enum iwch_qp_attr_mask mask;
	struct iwch_ep *ep = to_ep(cm_id);
	struct iwch_dev *h = to_iwch_dev(cm_id->device);
	struct iwch_qp *qp = get_qhp(h, conn_param->qpn);

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	if (state_read(&ep->com) == DEAD)
		return (-ECONNRESET);

	PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD);
	PANIC_IF(!qp);

	if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
	    (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
		abort_connection(ep);
		return (-EINVAL);
	}

	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->com.qp = qp;

	ep->com.rpl_err = 0;
	ep->com.rpl_done = 0;
	ep->ird = conn_param->ird;
	ep->ord = conn_param->ord;
	CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord);
	get_ep(&ep->com);

	/* bind QP to EP and move to RTS */
	attrs.mpa_attr = ep->mpa_attr;
	attrs.max_ird = ep->ird;
	attrs.max_ord = ep->ord;
	attrs.llp_stream_handle = ep;
	attrs.next_state = IWCH_QP_STATE_RTS;

	/* bind QP and TID with INIT_WR */
	mask = IWCH_QP_ATTR_NEXT_STATE |
			     IWCH_QP_ATTR_LLP_STREAM_HANDLE |
			     IWCH_QP_ATTR_MPA_ATTR |
			     IWCH_QP_ATTR_MAX_IRD |
			     IWCH_QP_ATTR_MAX_ORD;

	err = iwch_modify_qp(ep->com.qp->rhp,
			     ep->com.qp, mask, &attrs, 1);

	if (err)
		goto err;

	err = send_mpa_reply(ep, conn_param->private_data,
			     conn_param->private_data_len);
	if (err)
		goto err;
	state_set(&ep->com, FPDU_MODE);
	established_upcall(ep);
	put_ep(&ep->com);
	return 0;
err:
	ep->com.cm_id = NULL;
	ep->com.qp = NULL;
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
	return err;
}
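
/*
 * A sketch of how the two entry points above are reached (hypothetical
 * consumer code; real consumers go through the iw_cm core, which routes
 * into these functions via the cm_id).  connect_request_upcall() delivers
 * IW_CM_EVENT_CONNECT_REQUEST once a valid MPA request has accumulated;
 * the consumer's event handler then either accepts:
 *
 *	struct iw_cm_conn_param conn_param;
 *
 *	conn_param.qpn = qpn;			(QP to bind; assumed)
 *	conn_param.ird = conn_param.ord = 1;	(assumed depths)
 *	conn_param.private_data = NULL;
 *	conn_param.private_data_len = 0;
 *	err = iwch_accept_cr(cm_id, &conn_param);
 *
 * which binds the QP, sends the MPA reply, and moves to FPDU_MODE, or
 * rejects:
 *
 *	err = iwch_reject_cr(cm_id, pdata, pdata_len);
 *
 * which sends an MPA reply with MPA_REJECT set and shuts the stream down.
 */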

static int
init_sock(struct iwch_ep_common *epc)
{
	int err;
	struct sockopt sopt;
	int on = 1;

	epc->so->so_upcall = iwch_so_upcall;
	epc->so->so_upcallarg = epc;
	epc->so->so_rcv.sb_flags |= SB_UPCALL;
	epc->so->so_state |= SS_NBIO;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_NO_DDP;
	sopt.sopt_val = (caddr_t)&on;
	sopt.sopt_valsize = sizeof on;
	sopt.sopt_td = NULL;
	err = sosetopt(epc->so, &sopt);
	if (err)
		printf("%s can't set SO_NO_DDP err %d\n", __FUNCTION__, err);
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_NODELAY;
	sopt.sopt_val = (caddr_t)&on;
	sopt.sopt_valsize = sizeof on;
	sopt.sopt_td = NULL;
	err = sosetopt(epc->so, &sopt);
	if (err)
		printf("%s can't set TCP_NODELAY err %d\n", __FUNCTION__, err);

	return 0;
}

static int
is_loopback_dst(struct iw_cm_id *cm_id)
{
	uint16_t port = cm_id->remote_addr.sin_port;
	struct ifaddr *ifa;

	cm_id->remote_addr.sin_port = 0;
	ifa = ifa_ifwithaddr((struct sockaddr *)&cm_id->remote_addr);
	cm_id->remote_addr.sin_port = port;
	return (ifa != NULL);
}

int
iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
{
	int err = 0;
	struct iwch_dev *h = to_iwch_dev(cm_id->device);
	struct iwch_ep *ep;
	struct rtentry *rt;
	struct toedev *tdev;

	if (is_loopback_dst(cm_id)) {
		err = -ENOSYS;
		goto out;
	}

	ep = alloc_ep(sizeof(*ep), M_NOWAIT);
	if (!ep) {
		printf("%s - cannot alloc ep.\n", __FUNCTION__);
		err = (-ENOMEM);
		goto out;
	}
	callout_init(&ep->timer, TRUE);
	ep->plen = conn_param->private_data_len;
	if (ep->plen)
		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
		       conn_param->private_data, ep->plen);
	ep->ird = conn_param->ird;
	ep->ord = conn_param->ord;

	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->com.qp = get_qhp(h, conn_param->qpn);
	ep->com.thread = curthread;
	PANIC_IF(!ep->com.qp);
	CTR4(KTR_IW_CXGB, "%s qpn 0x%x qp %p cm_id %p", __FUNCTION__, conn_param->qpn,
	     ep->com.qp, cm_id);

	ep->com.so = cm_id->so;
	err = init_sock(&ep->com);
	if (err)
		goto fail2;

	/* find a route */
	rt = find_route(cm_id->local_addr.sin_addr.s_addr,
			cm_id->remote_addr.sin_addr.s_addr,
			cm_id->local_addr.sin_port,
			cm_id->remote_addr.sin_port, IPTOS_LOWDELAY);
	if (!rt) {
		printf("%s - cannot find route.\n", __FUNCTION__);
		err = EHOSTUNREACH;
		goto fail2;
	}

	if (!(rt->rt_ifp->if_capenable & IFCAP_TOE)) {
		printf("%s - interface not TOE capable.\n", __FUNCTION__);
		err = ENOSYS;
		goto fail3;
	}
	tdev = TOEDEV(rt->rt_ifp);
	if (tdev == NULL) {
		printf("%s - No toedev for interface.\n", __FUNCTION__);
		err = ENOSYS;
		goto fail3;
	}
	if (!tdev->tod_can_offload(tdev, ep->com.so)) {
		printf("%s - interface cannot offload!\n", __FUNCTION__);
		err = ENOSYS;
		goto fail3;
	}
	RTFREE(rt);

	state_set(&ep->com, CONNECTING);
	ep->com.local_addr = cm_id->local_addr;
	ep->com.remote_addr = cm_id->remote_addr;
	err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr,
		ep->com.thread);
	if (err)
		goto fail2;
	goto out;
fail3:
	RTFREE(rt);
fail2:
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
out:
	return err;
}

int
iwch_create_listen(struct iw_cm_id *cm_id, int backlog)
{
	int err = 0;
	struct iwch_listen_ep *ep;

	ep = alloc_ep(sizeof(*ep), M_NOWAIT);
	if (!ep) {
		printf("%s - cannot alloc ep.\n", __FUNCTION__);
		err = ENOMEM;
		goto out;
	}
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->backlog = backlog;
	ep->com.local_addr = cm_id->local_addr;
	ep->com.thread = curthread;
	state_set(&ep->com, LISTEN);

	ep->com.so = cm_id->so;
	err = init_sock(&ep->com);
	if (err)
		goto fail;

	err = solisten(ep->com.so, ep->backlog, ep->com.thread);
	if (!err) {
		cm_id->provider_data = ep;
		goto out;
	}
	close_socket(&ep->com);
fail:
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
out:
	return err;
}

int
iwch_destroy_listen(struct iw_cm_id *cm_id)
{
	struct iwch_listen_ep *ep = to_listen_ep(cm_id);

	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);

	state_set(&ep->com, DEAD);
	close_socket(&ep->com);
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
	return 0;
}

int
iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags)
{
	int close = 0;

	PANIC_IF(!ep);
	PANIC_IF(!ep->com.so);

	mtx_lock(&ep->com.lock);

	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep,
	     ep->com.so, states[ep->com.state], abrupt);

	if (ep->com.state == DEAD) {
		CTR2(KTR_IW_CXGB, "%s already dead ep %p", __FUNCTION__, ep);
		goto out;
	}

	if (abrupt) {
		if (ep->com.state != ABORTING) {
			ep->com.state = ABORTING;
			close = 1;
		}
		goto out;
	}

	switch (ep->com.state) {
	case MPA_REQ_WAIT:
	case MPA_REQ_SENT:
	case MPA_REQ_RCVD:
	case MPA_REP_SENT:
	case FPDU_MODE:
		start_ep_timer(ep);
		ep->com.state = CLOSING;
		close = 1;
		break;
	case CLOSING:
		ep->com.state = MORIBUND;
		close = 1;
		break;
	case MORIBUND:
	case ABORTING:
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
		break;
	}
out:
	mtx_unlock(&ep->com.lock);
	if (close) {
		if (abrupt)
			abort_connection(ep);
		else
			shutdown_socket(&ep->com);
	}
	return 0;
}

static void
process_data(struct iwch_ep *ep)
{
	struct sockaddr_in *local, *remote;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	switch (state_read(&ep->com)) {
	case MPA_REQ_SENT:
		process_mpa_reply(ep);
		break;
	case MPA_REQ_WAIT:

		/*
		 * XXX
		 * Set local and remote addrs here because when we
		 * dequeue the newly accepted socket, they aren't set
		 * yet in the pcb!
		 */
		in_getsockaddr(ep->com.so, (struct sockaddr **)&local);
		in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote);
		CTR3(KTR_IW_CXGB, "%s local %s remote %s", __FUNCTION__,
			inet_ntoa(local->sin_addr),
			inet_ntoa(remote->sin_addr));
		ep->com.local_addr = *local;
		ep->com.remote_addr = *remote;
		free(local, M_SONAME);
		free(remote, M_SONAME);
		process_mpa_request(ep);
		break;
	default:
		if (ep->com.so->so_rcv.sb_cc)
			printf("%s Unexpected streaming data."
			       " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n",
			       __FUNCTION__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state,
			       ep->com.so->so_rcv.sb_cc, ep->com.so->so_rcv.sb_mb);
		break;
	}
	return;
}

static void
process_connected(struct iwch_ep *ep)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) {
		send_mpa_req(ep);
	} else {
		connect_reply_upcall(ep, -ep->com.so->so_error);
		close_socket(&ep->com);
		state_set(&ep->com, DEAD);
		put_ep(&ep->com);
	}
}

static struct socket *
dequeue_socket(struct socket *head, struct sockaddr_in **remote, struct iwch_ep *child_ep)
{
	struct socket *so;

	ACCEPT_LOCK();
	so = TAILQ_FIRST(&head->so_comp);
	if (!so) {
		ACCEPT_UNLOCK();
		return NULL;
	}
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	SOCK_LOCK(so);
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	soref(so);
	so->so_rcv.sb_flags |= SB_UPCALL;
	so->so_state |= SS_NBIO;
	so->so_upcall = iwch_so_upcall;
	so->so_upcallarg = child_ep;
	PANIC_IF(!(so->so_state & SS_ISCONNECTED));
	PANIC_IF(so->so_error);
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	soaccept(so, (struct sockaddr **)remote);
	return so;
}

static void
process_newconn(struct iwch_ep *parent_ep)
{
	struct socket *child_so;
	struct iwch_ep *child_ep;
	struct sockaddr_in *remote;

	CTR3(KTR_IW_CXGB, "%s parent ep %p so %p", __FUNCTION__, parent_ep, parent_ep->com.so);
	child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT);
	if (!child_ep) {
		log(LOG_ERR, "%s - failed to allocate ep entry!\n",
		       __FUNCTION__);
		return;
	}
	child_so = dequeue_socket(parent_ep->com.so, &remote, child_ep);
	if (!child_so) {
		log(LOG_ERR, "%s - failed to dequeue child socket!\n",
		       __FUNCTION__);
		__free_ep(&child_ep->com);
		return;
	}
	CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__,
		inet_ntoa(remote->sin_addr), ntohs(remote->sin_port));
	child_ep->com.so = child_so;
	child_ep->com.cm_id = NULL;
	child_ep->com.thread = parent_ep->com.thread;
	free(remote, M_SONAME);
	get_ep(&parent_ep->com);
	child_ep->parent_ep = parent_ep;
	callout_init(&child_ep->timer, TRUE);
	state_set(&child_ep->com, MPA_REQ_WAIT);
	start_ep_timer(child_ep);

	/* maybe the request has already been queued up on the socket... */
	process_mpa_request(child_ep);
}

static void
iwch_so_upcall(struct socket *so, void *arg, int waitflag)
{
	struct iwch_ep *ep = arg;

	CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]);
	mtx_lock(&req_lock);
	if (ep && ep->com.so && !ep->com.entry.tqe_prev) {
		get_ep(&ep->com);
		TAILQ_INSERT_TAIL(&req_list, &ep->com, entry);
		taskqueue_enqueue(iw_cxgb_taskq, &iw_cxgb_task);
	}
	mtx_unlock(&req_lock);
}
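
/*
 * Note on the upcall scheme (a summary of this file's own pattern): socket
 * upcalls run in contexts that must not sleep, so iwch_so_upcall() only
 * queues the endpoint on req_list and kicks iw_cxgb_taskq; process_req()
 * below drains the list from a taskqueue thread and does the real work.
 * An upcall is armed by pointing so_upcall/so_upcallarg at the endpoint
 * and setting SB_UPCALL (see init_sock() and dequeue_socket()), and is
 * disarmed again in close_socket().
 */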

static void
process_socket_event(struct iwch_ep *ep)
{
	int state = state_read(&ep->com);
	struct socket *so = ep->com.so;

	CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]);
	if (state == CONNECTING) {
		process_connected(ep);
		return;
	}

	if (state == LISTEN) {
		process_newconn(ep);
		return;
	}

	/* connection error */
	if (so->so_error) {
		process_conn_error(ep);
		return;
	}

	/* peer close */
	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) {
		process_peer_close(ep);
		return;
	}

	/* close complete */
	if (so->so_state & (SS_ISDISCONNECTED)) {
		process_close_complete(ep);
		return;
	}

	/* rx data */
	process_data(ep);
	return;
}

static void
process_req(void *ctx, int pending)
{
	struct iwch_ep_common *epc;

	CTR1(KTR_IW_CXGB, "%s enter", __FUNCTION__);
	mtx_lock(&req_lock);
	while (!TAILQ_EMPTY(&req_list)) {
		epc = TAILQ_FIRST(&req_list);
		TAILQ_REMOVE(&req_list, epc, entry);
		epc->entry.tqe_prev = NULL;
		mtx_unlock(&req_lock);
		if (epc->so)
			process_socket_event((struct iwch_ep *)epc);
		put_ep(epc);
		mtx_lock(&req_lock);
	}
	mtx_unlock(&req_lock);
}

int
iwch_cm_init(void)
{
	TAILQ_INIT(&req_list);
	mtx_init(&req_lock, "iw_cxgb req_list lock", NULL, MTX_DEF);
	iw_cxgb_taskq = taskqueue_create("iw_cxgb_taskq", M_NOWAIT,
		taskqueue_thread_enqueue, &iw_cxgb_taskq);
	if (iw_cxgb_taskq == NULL) {
		printf("failed to allocate iw_cxgb taskqueue\n");
		return (ENOMEM);
	}
	taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq");
	TASK_INIT(&iw_cxgb_task, 0, process_req, NULL);
	t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, terminate);
	t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, ec_status);
	return 0;
}

void
iwch_cm_term(void)
{
	t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, NULL);
	t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, NULL);
	taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task);
	taskqueue_free(iw_cxgb_taskq);
}