iw_cxgb_cm.c revision 193272
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c 193272 2009-06-01 21:17:03Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/pciio.h>
#include <sys/conf.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus_dma.h>
#include <sys/rman.h>
#include <sys/ioccom.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/proc.h>
#include <sys/uio.h>

#include <net/route.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp.h>
#include <netinet/tcpip.h>

#include <contrib/rdma/ib_verbs.h>

#include <cxgb_include.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/iw_cxgb/iw_cxgb_wr.h>
#include <ulp/iw_cxgb/iw_cxgb_hal.h>
#include <ulp/iw_cxgb/iw_cxgb_provider.h>
#include <ulp/iw_cxgb/iw_cxgb_cm.h>
#include <ulp/iw_cxgb/iw_cxgb.h>

#ifdef KTR
static char *states[] = {
	"idle",
	"listen",
	"connecting",
	"mpa_wait_req",
	"mpa_req_sent",
	"mpa_req_rcvd",
	"mpa_rep_sent",
	"fpdu_mode",
	"aborting",
	"closing",
	"moribund",
	"dead",
	NULL,
};
#endif

SYSCTL_NODE(_hw, OID_AUTO, cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters");

static int ep_timeout_secs = 10;
TUNABLE_INT("hw.iw_cxgb.ep_timeout_secs", &ep_timeout_secs);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RDTUN, &ep_timeout_secs, 0,
    "CM Endpoint operation timeout in seconds (default=10)");

static int mpa_rev = 1;
TUNABLE_INT("hw.iw_cxgb.mpa_rev", &mpa_rev);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RDTUN, &mpa_rev, 0,
    "MPA Revision, 0 supports amso1100, 1 is spec compliant. (default=1)");

static int markers_enabled = 0;
TUNABLE_INT("hw.iw_cxgb.markers_enabled", &markers_enabled);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RDTUN, &markers_enabled, 0,
    "Enable MPA MARKERS (default(0)=disabled)");

static int crc_enabled = 1;
TUNABLE_INT("hw.iw_cxgb.crc_enabled", &crc_enabled);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RDTUN, &crc_enabled, 0,
    "Enable MPA CRC (default(1)=enabled)");

static int rcv_win = 256 * 1024;
TUNABLE_INT("hw.iw_cxgb.rcv_win", &rcv_win);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RDTUN, &rcv_win, 0,
    "TCP receive window in bytes (default=256KB)");

static int snd_win = 32 * 1024;
TUNABLE_INT("hw.iw_cxgb.snd_win", &snd_win);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, snd_win, CTLFLAG_RDTUN, &snd_win, 0,
    "TCP send window in bytes (default=32KB)");

static unsigned int nocong = 0;
TUNABLE_INT("hw.iw_cxgb.nocong", &nocong);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, nocong, CTLFLAG_RDTUN, &nocong, 0,
    "Turn off congestion control (default=0)");

static unsigned int cong_flavor = 1;
TUNABLE_INT("hw.iw_cxgb.cong_flavor", &cong_flavor);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RDTUN, &cong_flavor, 0,
    "TCP Congestion control flavor (default=1)");

static void ep_timeout(void *arg);
static void connect_reply_upcall(struct iwch_ep *ep, int status);
static int iwch_so_upcall(struct socket *so, void *arg, int waitflag);

/*
 * Cruft to offload socket upcalls onto thread.
 */
static struct mtx req_lock;
static TAILQ_HEAD(iwch_ep_list, iwch_ep_common) req_list;
static struct task iw_cxgb_task;
static struct taskqueue *iw_cxgb_taskq;
static void process_req(void *ctx, int pending);

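/*
 * Arm the endpoint's MPA/close timer.  The endpoint holds an extra
 * reference while the timer is pending; a pending callout is stopped
 * and re-armed without taking another reference.
 */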
static void
start_ep_timer(struct iwch_ep *ep)
{
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	if (callout_pending(&ep->timer)) {
		CTR2(KTR_IW_CXGB, "%s stopped / restarted timer ep %p", __FUNCTION__, ep);
		callout_deactivate(&ep->timer);
		callout_drain(&ep->timer);
	} else {
		/*
		 * XXX this looks racy
		 */
		get_ep(&ep->com);
		callout_init(&ep->timer, TRUE);
	}
	callout_reset(&ep->timer, ep_timeout_secs * hz, ep_timeout, ep);
}

static void
stop_ep_timer(struct iwch_ep *ep)
{
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	callout_drain(&ep->timer);
	put_ep(&ep->com);
}

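/*
 * Snapshot connection state from the TCP stack via the TCP_INFO
 * socket option: starting send/receive sequence numbers, the
 * effective MSS (less any timestamp option overhead), and the
 * hardware tid.  Fails if the connection was not offloaded.
 */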
static int set_tcpinfo(struct iwch_ep *ep)
{
	struct tcp_info ti;
	struct sockopt sopt;
	int err;

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_INFO;
	sopt.sopt_val = (caddr_t)&ti;
	sopt.sopt_valsize = sizeof ti;
	sopt.sopt_td = NULL;

	err = sogetopt(ep->com.so, &sopt);
	if (err) {
		printf("%s can't get tcpinfo\n", __FUNCTION__);
		return -err;
	}
	if (!(ti.tcpi_options & TCPI_OPT_TOE)) {
		printf("%s connection NOT OFFLOADED!\n", __FUNCTION__);
		return -EINVAL;
	}

	ep->snd_seq = ti.tcpi_snd_nxt;
	ep->rcv_seq = ti.tcpi_rcv_nxt;
	ep->emss = ti.__tcpi_snd_mss - sizeof(struct tcpiphdr);
	ep->hwtid = TOEPCB(ep->com.so)->tp_tid; /* XXX */
	if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS)
		ep->emss -= 12;
	if (ep->emss < 128)
		ep->emss = 128;
	return 0;
}

static enum iwch_ep_state
state_read(struct iwch_ep_common *epc)
{
	enum iwch_ep_state state;

	mtx_lock(&epc->lock);
	state = epc->state;
	mtx_unlock(&epc->lock);
	return state;
}

static void
__state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
{
	epc->state = new;
}

static void
state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
{

	mtx_lock(&epc->lock);
	CTR3(KTR_IW_CXGB, "%s - %s -> %s", __FUNCTION__, states[epc->state], states[new]);
	__state_set(epc, new);
	mtx_unlock(&epc->lock);
	return;
}

static void *
alloc_ep(int size, int flags)
{
	struct iwch_ep_common *epc;

	epc = malloc(size, M_DEVBUF, flags);
	if (epc) {
		memset(epc, 0, size);
		refcount_init(&epc->refcount, 1);
		mtx_init(&epc->lock, "iwch_epc lock", NULL, MTX_DEF|MTX_DUPOK);
		cv_init(&epc->waitq, "iwch_epc cv");
	}
	CTR2(KTR_IW_CXGB, "%s alloc ep %p", __FUNCTION__, epc);
	return epc;
}

void __free_ep(struct iwch_ep_common *epc)
{
	CTR3(KTR_IW_CXGB, "%s ep %p state %s", __FUNCTION__, epc, states[state_read(epc)]);
	KASSERT(!epc->so, ("%s warning ep->so %p \n", __FUNCTION__, epc->so));
	KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __FUNCTION__, epc));
	free(epc, M_DEVBUF);
}

int
iwch_quiesce_tid(struct iwch_ep *ep)
{
#ifdef notyet
	struct cpl_set_tcb_field *req;
	struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT);

	if (m == NULL)
		return (-ENOMEM);
	req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req));
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
	req->reply = 0;
	req->cpu_idx = 0;
	req->word = htons(W_TCB_RX_QUIESCE);
	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
	req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE);

	m_set_priority(m, CPL_PRIORITY_DATA);
	cxgb_ofld_send(ep->com.tdev, m);
#endif
	return 0;
}

int
iwch_resume_tid(struct iwch_ep *ep)
{
#ifdef notyet
	struct cpl_set_tcb_field *req;
	struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT);

	if (m == NULL)
		return (-ENOMEM);
	req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req));
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
	req->reply = 0;
	req->cpu_idx = 0;
	req->word = htons(W_TCB_RX_QUIESCE);
	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
	req->val = 0;

	m_set_priority(m, CPL_PRIORITY_DATA);
	cxgb_ofld_send(ep->com.tdev, m);
#endif
	return 0;
}

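/*
 * Look up a route to the peer.  Only the destination address is
 * consulted; the caller is responsible for releasing the returned
 * rtentry with RTFREE().
 */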
static struct rtentry *
find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port,
    __be16 peer_port, u8 tos)
{
	struct route iproute;
	struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst;

	bzero(&iproute, sizeof iproute);
	dst->sin_family = AF_INET;
	dst->sin_len = sizeof *dst;
	dst->sin_addr.s_addr = peer_ip;

	rtalloc(&iproute);
	return iproute.ro_rt;
}

static void
close_socket(struct iwch_ep_common *epc)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]);
	SOCK_LOCK(epc->so);
	soupcall_clear(epc->so, SO_RCV);
	SOCK_UNLOCK(epc->so);
	/* SHUT_WR|SHUT_RD is not a valid 'how'; SHUT_RDWR shuts down both */
	soshutdown(epc->so, SHUT_RDWR);
	epc->so = NULL;
}

static void
shutdown_socket(struct iwch_ep_common *epc)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]);
	soshutdown(epc->so, SHUT_WR);
}

static void
abort_socket(struct iwch_ep *ep)
{
	struct sockopt sopt;
	int err;
	struct linger l;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	l.l_onoff = 1;
	l.l_linger = 0;

	/* linger_time of 0 forces RST to be sent */
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_LINGER;
	sopt.sopt_val = (caddr_t)&l;
	sopt.sopt_valsize = sizeof l;
	sopt.sopt_td = NULL;
	err = sosetopt(ep->com.so, &sopt);
	if (err)
		printf("%s can't set linger to 0, no RST! err %d\n", __FUNCTION__, err);
}

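/*
 * Stream the MPA request (key, CRC/marker flags, revision, and any
 * private data) to the peer, then arm the MPA timer and move to
 * MPA_REQ_SENT.
 */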
static void
send_mpa_req(struct iwch_ep *ep)
{
	int mpalen;
	struct mpa_message *mpa;
	struct mbuf *m;
	int err;

	CTR3(KTR_IW_CXGB, "%s ep %p pd_len %d", __FUNCTION__, ep, ep->plen);

	mpalen = sizeof(*mpa) + ep->plen;
	m = m_gethdr(mpalen, M_NOWAIT);
	if (m == NULL) {
		connect_reply_upcall(ep, -ENOMEM);
		return;
	}
	mpa = mtod(m, struct mpa_message *);
	m->m_len = mpalen;
	m->m_pkthdr.len = mpalen;
	memset(mpa, 0, sizeof(*mpa));
	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
		     (markers_enabled ? MPA_MARKERS : 0);
	mpa->private_data_size = htons(ep->plen);
	mpa->revision = mpa_rev;
	if (ep->plen)
		memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);

	err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread);
	if (err) {
		/* sosend() consumes the mbuf chain even on failure */
		connect_reply_upcall(ep, -err);
		return;
	}

	start_ep_timer(ep);
	state_set(&ep->com, MPA_REQ_SENT);
	return;
}

static int
send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
{
	int mpalen;
	struct mpa_message *mpa;
	struct mbuf *m;
	int err;

	CTR3(KTR_IW_CXGB, "%s ep %p plen %d", __FUNCTION__, ep, plen);

	mpalen = sizeof(*mpa) + plen;

	m = m_gethdr(mpalen, M_NOWAIT);
	if (m == NULL) {
		printf("%s - cannot alloc mbuf!\n", __FUNCTION__);
		return (-ENOMEM);
	}
	mpa = mtod(m, struct mpa_message *);
	m->m_len = mpalen;
	m->m_pkthdr.len = mpalen;
	memset(mpa, 0, sizeof(*mpa));
	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
	mpa->flags = MPA_REJECT;
	mpa->revision = mpa_rev;
	mpa->private_data_size = htons(plen);
	if (plen)
		memcpy(mpa->private_data, pdata, plen);
	err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread);
	PANIC_IF(err);
	return 0;
}

static int
send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
{
	int mpalen;
	struct mpa_message *mpa;
	struct mbuf *m;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p plen %d", __FUNCTION__, ep, ep->com.so, plen);

	mpalen = sizeof(*mpa) + plen;

	m = m_gethdr(mpalen, M_NOWAIT);
	if (m == NULL) {
		printf("%s - cannot alloc mbuf!\n", __FUNCTION__);
		return (-ENOMEM);
	}
	mpa = mtod(m, struct mpa_message *);
	m->m_len = mpalen;
	m->m_pkthdr.len = mpalen;
	memset(mpa, 0, sizeof(*mpa));
	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
		     (markers_enabled ? MPA_MARKERS : 0);
	mpa->revision = mpa_rev;
	mpa->private_data_size = htons(plen);
	if (plen)
		memcpy(mpa->private_data, pdata, plen);

	state_set(&ep->com, MPA_REP_SENT);
	return sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT,
		ep->com.thread);
}

static void
close_complete_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CLOSE;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "close complete delivered ep %p cm_id %p tid %d",
		     ep, ep->com.cm_id, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
		ep->com.cm_id->rem_ref(ep->com.cm_id);
		ep->com.cm_id = NULL;
		ep->com.qp = NULL;
	}
}

static void
abort_connection(struct iwch_ep *ep)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	state_set(&ep->com, ABORTING);
	abort_socket(ep);
	close_socket(&ep->com);
	close_complete_upcall(ep);
	state_set(&ep->com, DEAD);
	put_ep(&ep->com);
}

static void
peer_close_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_DISCONNECT;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "peer close delivered ep %p cm_id %p tid %d",
		     ep, ep->com.cm_id, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
	}
}

static void
peer_abort_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CLOSE;
	event.status = ECONNRESET;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "abort delivered ep %p cm_id %p tid %d", ep,
		     ep->com.cm_id, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
		ep->com.cm_id->rem_ref(ep->com.cm_id);
		ep->com.cm_id = NULL;
		ep->com.qp = NULL;
	}
}

static void
connect_reply_upcall(struct iwch_ep *ep, int status)
{
	struct iw_cm_event event;

	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], status);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CONNECT_REPLY;
	event.status = status;
	event.local_addr = ep->com.local_addr;
	event.remote_addr = ep->com.remote_addr;

	if ((status == 0) || (status == ECONNREFUSED)) {
		event.private_data_len = ep->plen;
		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
	}
	if (ep->com.cm_id) {
		CTR4(KTR_IW_CXGB, "%s ep %p tid %d status %d", __FUNCTION__, ep,
		     ep->hwtid, status);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
	}
	if (status < 0) {
		ep->com.cm_id->rem_ref(ep->com.cm_id);
		ep->com.cm_id = NULL;
		ep->com.qp = NULL;
	}
}

static void
connect_request_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_CONNECT_REQUEST;
	event.local_addr = ep->com.local_addr;
	event.remote_addr = ep->com.remote_addr;
	event.private_data_len = ep->plen;
	event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
	event.provider_data = ep;
	event.so = ep->com.so;
	if (state_read(&ep->parent_ep->com) != DEAD)
		ep->parent_ep->com.cm_id->event_handler(
						ep->parent_ep->com.cm_id,
						&event);
	put_ep(&ep->parent_ep->com);
	ep->parent_ep = NULL;
}

static void
established_upcall(struct iwch_ep *ep)
{
	struct iw_cm_event event;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	memset(&event, 0, sizeof(event));
	event.event = IW_CM_EVENT_ESTABLISHED;
	if (ep->com.cm_id) {
		CTR3(KTR_IW_CXGB, "%s ep %p tid %d", __FUNCTION__, ep, ep->hwtid);
		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
	}
}

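/*
 * Read streamed data off the socket and accumulate it in ep->mpa_pkt
 * until the complete MPA reply (header plus private data) has
 * arrived.  A valid, accepting reply fixes up the negotiated MPA
 * attributes and moves the QP to RTS; a reject or malformed reply
 * aborts the connection.
 */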
static void
process_mpa_reply(struct iwch_ep *ep)
{
	struct mpa_message *mpa;
	u16 plen;
	struct iwch_qp_attributes attrs;
	enum iwch_qp_attr_mask mask;
	int err;
	struct mbuf *top, *m;
	int flags = MSG_DONTWAIT;
	struct uio uio;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	/*
	 * Stop mpa timer.  If it expired, then the state has
	 * changed and we bail since ep_timeout already aborted
	 * the connection.
	 */
	stop_ep_timer(ep);
	if (state_read(&ep->com) != MPA_REQ_SENT)
		return;

	uio.uio_resid = 1000000;
	uio.uio_td = ep->com.thread;
	err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags);
	if (err) {
		if (err == EWOULDBLOCK) {
			start_ep_timer(ep);
			return;
		}
		err = -err;
		goto err;
	}

	if (ep->com.so->so_rcv.sb_mb) {
		printf("%s data after soreceive called! so %p sb_mb %p top %p\n",
			__FUNCTION__, ep->com.so, ep->com.so->so_rcv.sb_mb, top);
	}

	m = top;
	do {
		/*
		 * If we get more than the supported amount of private data
		 * then we must fail this connection.
		 */
		if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) {
			err = (-EINVAL);
			goto err;
		}

		/*
		 * copy the new data into our accumulation buffer.
		 */
		m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len]));
		ep->mpa_pkt_len += m->m_len;
		if (!m->m_next)
			m = m->m_nextpkt;
		else
			m = m->m_next;
	} while (m);

	m_freem(top);

	/*
	 * if we don't even have the mpa message, then bail.
	 */
	if (ep->mpa_pkt_len < sizeof(*mpa))
		return;
	mpa = (struct mpa_message *)ep->mpa_pkt;

	/* Validate MPA header. */
	if (mpa->revision != mpa_rev) {
		CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision);
		err = EPROTO;
		goto err;
	}
	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
		CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key);
		err = EPROTO;
		goto err;
	}

	plen = ntohs(mpa->private_data_size);

	/*
	 * Fail if there's too much private data.
	 */
	if (plen > MPA_MAX_PRIVATE_DATA) {
		CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen);
		err = EPROTO;
		goto err;
	}

	/*
	 * If plen does not account for pkt size
	 */
	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
		CTR2(KTR_IW_CXGB, "%s pkt too big %d", __FUNCTION__, ep->mpa_pkt_len);
		err = EPROTO;
		goto err;
	}

	ep->plen = (u8) plen;

	/*
	 * If we don't have all the pdata yet, then bail.
	 * We'll continue processing when more data arrives.
	 */
	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
		return;

	if (mpa->flags & MPA_REJECT) {
		err = ECONNREFUSED;
		goto err;
	}

	/*
	 * If we get here we have accumulated the entire mpa
	 * start reply message including private data. And
	 * the MPA header is valid.
	 */
	CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__);
	state_set(&ep->com, FPDU_MODE);
	ep->mpa_attr.crc_enabled = ((mpa->flags & MPA_CRC) | crc_enabled) ? 1 : 0;
	ep->mpa_attr.recv_marker_enabled = markers_enabled;
	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
	ep->mpa_attr.version = mpa_rev;
	if (set_tcpinfo(ep)) {
		printf("%s set_tcpinfo error\n", __FUNCTION__);
		err = -EINVAL;
		goto err;
	}
	CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, "
	     "xmit_marker_enabled=%d, version=%d", __FUNCTION__,
	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);

	attrs.mpa_attr = ep->mpa_attr;
	attrs.max_ird = ep->ird;
	attrs.max_ord = ep->ord;
	attrs.llp_stream_handle = ep;
	attrs.next_state = IWCH_QP_STATE_RTS;

	mask = IWCH_QP_ATTR_NEXT_STATE |
	    IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR |
	    IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD;

	/* bind QP and TID with INIT_WR */
	err = iwch_modify_qp(ep->com.qp->rhp,
			     ep->com.qp, mask, &attrs, 1);
	if (!err)
		goto out;
err:
	abort_connection(ep);
out:
	connect_reply_upcall(ep, err);
	return;
}

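/*
 * Accumulate and validate a streamed MPA request.  Once the full
 * request (header plus private data) has arrived, record the peer's
 * MPA attributes and deliver a connect request upcall to the
 * listening endpoint's cm_id.
 */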
static void
process_mpa_request(struct iwch_ep *ep)
{
	struct mpa_message *mpa;
	u16 plen;
	int flags = MSG_DONTWAIT;
	struct mbuf *top, *m;
	int err;
	struct uio uio;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	/*
	 * Stop mpa timer.  If it expired, then the state has
	 * changed and we bail since ep_timeout already aborted
	 * the connection.
	 */
	stop_ep_timer(ep);
	if (state_read(&ep->com) != MPA_REQ_WAIT)
		return;

	uio.uio_resid = 1000000;
	uio.uio_td = ep->com.thread;
	err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags);
	if (err) {
		if (err == EWOULDBLOCK) {
			start_ep_timer(ep);
			return;
		}
		err = -err;
		goto err;
	}

	m = top;
	do {

		/*
		 * If we get more than the supported amount of private data
		 * then we must fail this connection.
		 */
		if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) {
			CTR2(KTR_IW_CXGB, "%s mpa message too big %d", __FUNCTION__,
				ep->mpa_pkt_len + m->m_len);
			goto err;
		}

		/*
		 * Copy the new data into our accumulation buffer.
		 */
		m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len]));
		ep->mpa_pkt_len += m->m_len;

		if (!m->m_next)
			m = m->m_nextpkt;
		else
			m = m->m_next;
	} while (m);

	m_freem(top);

	/*
	 * If we don't even have the mpa message, then bail.
	 * We'll continue processing when more data arrives.
	 */
	if (ep->mpa_pkt_len < sizeof(*mpa)) {
		start_ep_timer(ep);
		CTR2(KTR_IW_CXGB, "%s not enough header %d...waiting...", __FUNCTION__,
			ep->mpa_pkt_len);
		return;
	}
	mpa = (struct mpa_message *) ep->mpa_pkt;

	/*
	 * Validate MPA Header.
	 */
	if (mpa->revision != mpa_rev) {
		CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision);
		goto err;
	}

	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
		CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key);
		goto err;
	}

	plen = ntohs(mpa->private_data_size);

	/*
	 * Fail if there's too much private data.
	 */
	if (plen > MPA_MAX_PRIVATE_DATA) {
		CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen);
		goto err;
	}

	/*
	 * If plen does not account for pkt size
	 */
	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
		CTR2(KTR_IW_CXGB, "%s more data after private data %d", __FUNCTION__,
			ep->mpa_pkt_len);
		goto err;
	}
	ep->plen = (u8) plen;

	/*
	 * If we don't have all the pdata yet, then bail.
	 */
	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) {
		start_ep_timer(ep);
		CTR2(KTR_IW_CXGB, "%s more mpa msg to come %d", __FUNCTION__,
			ep->mpa_pkt_len);
		return;
	}

	/*
	 * If we get here we have accumulated the entire mpa
	 * start request message including private data.
	 */
	ep->mpa_attr.crc_enabled = ((mpa->flags & MPA_CRC) | crc_enabled) ? 1 : 0;
	ep->mpa_attr.recv_marker_enabled = markers_enabled;
	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
	ep->mpa_attr.version = mpa_rev;
	if (set_tcpinfo(ep)) {
		printf("%s set_tcpinfo error\n", __FUNCTION__);
		goto err;
	}
	CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, "
	     "xmit_marker_enabled=%d, version=%d", __FUNCTION__,
	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);

	state_set(&ep->com, MPA_REQ_RCVD);

	/* drive upcall */
	connect_request_upcall(ep);
	return;
err:
	abort_connection(ep);
	return;
}

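/*
 * The peer closed its half of the connection.  Advance the endpoint
 * state machine, notifying the ULP where required, and on the final
 * MORIBUND -> DEAD transition idle the QP, release the socket, and
 * drop the endpoint reference.
 */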
static void
process_peer_close(struct iwch_ep *ep)
{
	struct iwch_qp_attributes attrs;
	int disconnect = 1;
	int release = 0;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	mtx_lock(&ep->com.lock);
	switch (ep->com.state) {
	case MPA_REQ_WAIT:
		__state_set(&ep->com, CLOSING);
		break;
	case MPA_REQ_SENT:
		__state_set(&ep->com, CLOSING);
		connect_reply_upcall(ep, -ECONNRESET);
		break;
	case MPA_REQ_RCVD:

		/*
		 * We're gonna mark this puppy DEAD, but keep
		 * the reference on it until the ULP accepts or
		 * rejects the CR.
		 */
		__state_set(&ep->com, CLOSING);
		get_ep(&ep->com);
		break;
	case MPA_REP_SENT:
		__state_set(&ep->com, CLOSING);
		break;
	case FPDU_MODE:
		start_ep_timer(ep);
		__state_set(&ep->com, CLOSING);
		attrs.next_state = IWCH_QP_STATE_CLOSING;
		iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
			       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
		peer_close_upcall(ep);
		break;
	case ABORTING:
		disconnect = 0;
		break;
	case CLOSING:
		__state_set(&ep->com, MORIBUND);
		disconnect = 0;
		break;
	case MORIBUND:
		stop_ep_timer(ep);
		if (ep->com.cm_id && ep->com.qp) {
			attrs.next_state = IWCH_QP_STATE_IDLE;
			iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
				       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
		}
		close_socket(&ep->com);
		close_complete_upcall(ep);
		__state_set(&ep->com, DEAD);
		release = 1;
		disconnect = 0;
		break;
	case DEAD:
		disconnect = 0;
		break;
	default:
		PANIC_IF(1);
	}
	mtx_unlock(&ep->com.lock);
	if (disconnect)
		iwch_ep_disconnect(ep, 0, M_NOWAIT);
	if (release)
		put_ep(&ep->com);
	return;
}

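/*
 * The socket reported an error (typically an abortive reset from the
 * peer).  Deliver the upcall appropriate to the current state, move
 * the QP to ERROR if it is still bound, and tear the endpoint down
 * unless an abort is already in progress.
 */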
static void
process_conn_error(struct iwch_ep *ep)
{
	struct iwch_qp_attributes attrs;
	int ret;
	int state;

	state = state_read(&ep->com);
	CTR5(KTR_IW_CXGB, "%s ep %p so %p so->so_error %u state %s", __FUNCTION__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]);
	switch (state) {
	case MPA_REQ_WAIT:
		stop_ep_timer(ep);
		break;
	case MPA_REQ_SENT:
		stop_ep_timer(ep);
		connect_reply_upcall(ep, -ECONNRESET);
		break;
	case MPA_REP_SENT:
		ep->com.rpl_err = ECONNRESET;
		CTR1(KTR_IW_CXGB, "waking up ep %p", ep);
		break;
	case MPA_REQ_RCVD:

		/*
		 * We're gonna mark this puppy DEAD, but keep
		 * the reference on it until the ULP accepts or
		 * rejects the CR.
		 */
		get_ep(&ep->com);
		break;
	case MORIBUND:
	case CLOSING:
		stop_ep_timer(ep);
		/*FALLTHROUGH*/
	case FPDU_MODE:
		if (ep->com.cm_id && ep->com.qp) {
			attrs.next_state = IWCH_QP_STATE_ERROR;
			ret = iwch_modify_qp(ep->com.qp->rhp,
				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
				     &attrs, 1);
			if (ret)
				log(LOG_ERR,
				       "%s - qp <- error failed!\n",
				       __FUNCTION__);
		}
		peer_abort_upcall(ep);
		break;
	case ABORTING:
		break;
	case DEAD:
		CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__,
			ep->com.so->so_error);
		return;
	default:
		PANIC_IF(1);
		break;
	}

	if (state != ABORTING) {
		close_socket(&ep->com);
		state_set(&ep->com, DEAD);
		put_ep(&ep->com);
	}
	return;
}

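/*
 * Our half-close has completed.  In MORIBUND the shutdown handshake
 * is finished: idle the QP, release the socket, deliver the
 * close-complete upcall, and drop the endpoint reference.
 */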
static void
process_close_complete(struct iwch_ep *ep)
{
	struct iwch_qp_attributes attrs;
	int release = 0;

	PANIC_IF(!ep);
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	/* The cm_id may be null if we failed to connect */
	mtx_lock(&ep->com.lock);
	switch (ep->com.state) {
	case CLOSING:
		__state_set(&ep->com, MORIBUND);
		break;
	case MORIBUND:
		stop_ep_timer(ep);
		if ((ep->com.cm_id) && (ep->com.qp)) {
			attrs.next_state = IWCH_QP_STATE_IDLE;
			iwch_modify_qp(ep->com.qp->rhp,
					     ep->com.qp,
					     IWCH_QP_ATTR_NEXT_STATE,
					     &attrs, 1);
		}
		close_socket(&ep->com);
		close_complete_upcall(ep);
		__state_set(&ep->com, DEAD);
		release = 1;
		break;
	case ABORTING:
		break;
	case DEAD:
	default:
		PANIC_IF(1);
		break;
	}
	mtx_unlock(&ep->com.lock);
	if (release)
		put_ep(&ep->com);
	return;
}

/*
 * T3A does 3 things when a TERM is received:
 * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet
 * 2) generate an async event on the QP with the TERMINATE opcode
 * 3) post a TERMINATE opcode cqe into the associated CQ.
 *
 * For (1), we save the message in the qp for later consumer consumption.
 * For (2), we move the QP into TERMINATE, post a QP event and disconnect.
 * For (3), we toss the CQE in cxio_poll_cq().
 *
 * terminate() handles case (1)...
 */
static int
terminate(struct t3cdev *tdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);
	struct iwch_ep *ep = so->so_rcv.sb_upcallarg;

	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	m_adj(m, sizeof(struct cpl_rdma_terminate));
	CTR2(KTR_IW_CXGB, "%s saving %d bytes of term msg", __FUNCTION__, m->m_len);
	m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer);
	ep->com.qp->attr.terminate_msg_len = m->m_len;
	ep->com.qp->attr.is_terminate_local = 0;
	return CPL_RET_BUF_DONE;
}

static int
ec_status(struct t3cdev *tdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);
	struct cpl_rdma_ec_status *rep = cplhdr(m);
	struct iwch_ep *ep;
	struct iwch_qp_attributes attrs;
	int release = 0;

	ep = so->so_rcv.sb_upcallarg;
	if (!so || !ep) {
		panic("bogosity ep %p state %d, so %p state %x\n", ep, ep ? ep->com.state : -1, so, so ? so->so_state : -1);
	}
	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s ec_status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], rep->status);
	mtx_lock(&ep->com.lock);
	switch (ep->com.state) {
	case CLOSING:
		if (!rep->status)
			__state_set(&ep->com, MORIBUND);
		else
			__state_set(&ep->com, ABORTING);
		break;
	case MORIBUND:
		stop_ep_timer(ep);
		if (!rep->status) {
			if ((ep->com.cm_id) && (ep->com.qp)) {
				attrs.next_state = IWCH_QP_STATE_IDLE;
				iwch_modify_qp(ep->com.qp->rhp,
					     ep->com.qp,
					     IWCH_QP_ATTR_NEXT_STATE,
					     &attrs, 1);
			}
			close_socket(&ep->com);
			close_complete_upcall(ep);
			__state_set(&ep->com, DEAD);
			release = 1;
		}
		break;
	case DEAD:
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
	}
	mtx_unlock(&ep->com.lock);
	if (rep->status) {
		log(LOG_ERR, "%s BAD CLOSE - Aborting tid %u\n",
		       __FUNCTION__, ep->hwtid);
		attrs.next_state = IWCH_QP_STATE_ERROR;
		iwch_modify_qp(ep->com.qp->rhp,
			       ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
			       &attrs, 1);
	}
	if (release)
		put_ep(&ep->com);
	return CPL_RET_BUF_DONE;
}

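/*
 * The MPA/close timer fired.  Report a timeout to the ULP if an MPA
 * request was outstanding, push the QP to ERROR for a stalled close,
 * and abort the connection.
 */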
static void
ep_timeout(void *arg)
{
	struct iwch_ep *ep = (struct iwch_ep *)arg;
	struct iwch_qp_attributes attrs;
	int err = 0;

	mtx_lock(&ep->com.lock);
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	switch (ep->com.state) {
	case MPA_REQ_SENT:
		connect_reply_upcall(ep, -ETIMEDOUT);
		break;
	case MPA_REQ_WAIT:
		break;
	case CLOSING:
	case MORIBUND:
		if (ep->com.cm_id && ep->com.qp)
			err = 1;
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
	}
	__state_set(&ep->com, ABORTING);
	mtx_unlock(&ep->com.lock);
	if (err) {
		attrs.next_state = IWCH_QP_STATE_ERROR;
		iwch_modify_qp(ep->com.qp->rhp,
			     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
			     &attrs, 1);
	}
	abort_connection(ep);
	put_ep(&ep->com);
}

int
iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
{
	int err;
	struct iwch_ep *ep = to_ep(cm_id);
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	if (state_read(&ep->com) == DEAD) {
		put_ep(&ep->com);
		return (-ECONNRESET);
	}
	PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD);
	if (mpa_rev == 0) {
		abort_connection(ep);
	} else {
		err = send_mpa_reject(ep, pdata, pdata_len);
		err = soshutdown(ep->com.so, SHUT_RDWR);
	}
	return 0;
}

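/*
 * The ULP accepted the connect request: validate the requested
 * IRD/ORD against device limits, bind the cm_id and QP to the
 * endpoint, move the QP to RTS, and send the MPA reply (with any
 * private data) to the peer.
 */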
int
iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
{
	int err;
	struct iwch_qp_attributes attrs;
	enum iwch_qp_attr_mask mask;
	struct iwch_ep *ep = to_ep(cm_id);
	struct iwch_dev *h = to_iwch_dev(cm_id->device);
	struct iwch_qp *qp = get_qhp(h, conn_param->qpn);

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	if (state_read(&ep->com) == DEAD)
		return (-ECONNRESET);

	PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD);
	PANIC_IF(!qp);

	if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
	    (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
		abort_connection(ep);
		return (-EINVAL);
	}

	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->com.qp = qp;

	ep->com.rpl_err = 0;
	ep->com.rpl_done = 0;
	ep->ird = conn_param->ird;
	ep->ord = conn_param->ord;
	CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord);
	get_ep(&ep->com);

	/* bind QP to EP and move to RTS */
	attrs.mpa_attr = ep->mpa_attr;
	attrs.max_ird = ep->ird;
	attrs.max_ord = ep->ord;
	attrs.llp_stream_handle = ep;
	attrs.next_state = IWCH_QP_STATE_RTS;

	/* bind QP and TID with INIT_WR */
	mask = IWCH_QP_ATTR_NEXT_STATE |
			     IWCH_QP_ATTR_LLP_STREAM_HANDLE |
			     IWCH_QP_ATTR_MPA_ATTR |
			     IWCH_QP_ATTR_MAX_IRD |
			     IWCH_QP_ATTR_MAX_ORD;

	err = iwch_modify_qp(ep->com.qp->rhp,
			     ep->com.qp, mask, &attrs, 1);

	if (err)
		goto err;

	err = send_mpa_reply(ep, conn_param->private_data,
			     conn_param->private_data_len);
	if (err)
		goto err;
	state_set(&ep->com, FPDU_MODE);
	established_upcall(ep);
	put_ep(&ep->com);
	return 0;
err:
	ep->com.cm_id = NULL;
	ep->com.qp = NULL;
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
	return err;
}

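/*
 * Prepare a socket for MPA streaming: install the receive upcall,
 * mark it non-blocking, and disable DDP and Nagle.  Socket option
 * failures are logged but not treated as fatal.
 */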
static int init_sock(struct iwch_ep_common *epc)
{
	int err;
	struct sockopt sopt;
	int on = 1;

	SOCK_LOCK(epc->so);
	soupcall_set(epc->so, SO_RCV, iwch_so_upcall, epc);
	epc->so->so_state |= SS_NBIO;
	SOCK_UNLOCK(epc->so);
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_NO_DDP;
	sopt.sopt_val = (caddr_t)&on;
	sopt.sopt_valsize = sizeof on;
	sopt.sopt_td = NULL;
	err = sosetopt(epc->so, &sopt);
	if (err)
		printf("%s can't set SO_NO_DDP err %d\n", __FUNCTION__, err);
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_NODELAY;
	sopt.sopt_val = (caddr_t)&on;
	sopt.sopt_valsize = sizeof on;
	sopt.sopt_td = NULL;
	err = sosetopt(epc->so, &sopt);
	if (err)
		printf("%s can't set TCP_NODELAY err %d\n", __FUNCTION__, err);

	return 0;
}

static int
is_loopback_dst(struct iw_cm_id *cm_id)
{
	uint16_t port = cm_id->remote_addr.sin_port;
	struct ifaddr *ifa;

	cm_id->remote_addr.sin_port = 0;
	ifa = ifa_ifwithaddr((struct sockaddr *)&cm_id->remote_addr);
	cm_id->remote_addr.sin_port = port;
	return (ifa != NULL);
}

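/*
 * Active open: allocate an endpoint, verify that the route to the
 * peer goes out a TOE-capable interface that can offload this
 * socket, and start a non-blocking connect.  The MPA exchange is
 * driven from the socket upcall once the connect completes.
 */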
int
iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
{
	int err = 0;
	struct iwch_dev *h = to_iwch_dev(cm_id->device);
	struct iwch_ep *ep;
	struct rtentry *rt;
	struct toedev *tdev;

	if (is_loopback_dst(cm_id)) {
		err = -ENOSYS;
		goto out;
	}

	ep = alloc_ep(sizeof(*ep), M_NOWAIT);
	if (!ep) {
		printf("%s - cannot alloc ep.\n", __FUNCTION__);
		err = (-ENOMEM);
		goto out;
	}
	callout_init(&ep->timer, TRUE);
	ep->plen = conn_param->private_data_len;
	if (ep->plen)
		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
		       conn_param->private_data, ep->plen);
	ep->ird = conn_param->ird;
	ep->ord = conn_param->ord;

	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->com.qp = get_qhp(h, conn_param->qpn);
	ep->com.thread = curthread;
	PANIC_IF(!ep->com.qp);
	CTR4(KTR_IW_CXGB, "%s qpn 0x%x qp %p cm_id %p", __FUNCTION__, conn_param->qpn,
	     ep->com.qp, cm_id);

	ep->com.so = cm_id->so;
	err = init_sock(&ep->com);
	if (err)
		goto fail2;

	/* find a route */
	rt = find_route(cm_id->local_addr.sin_addr.s_addr,
			cm_id->remote_addr.sin_addr.s_addr,
			cm_id->local_addr.sin_port,
			cm_id->remote_addr.sin_port, IPTOS_LOWDELAY);
	if (!rt) {
		printf("%s - cannot find route.\n", __FUNCTION__);
		err = EHOSTUNREACH;
		goto fail2;
	}

	if (!(rt->rt_ifp->if_capenable & IFCAP_TOE)) {
		printf("%s - interface not TOE capable.\n", __FUNCTION__);
		err = EOPNOTSUPP;
		goto fail3;
	}
	tdev = TOEDEV(rt->rt_ifp);
	if (tdev == NULL) {
		printf("%s - no toedev for interface.\n", __FUNCTION__);
		err = EOPNOTSUPP;
		goto fail3;
	}
	if (!tdev->tod_can_offload(tdev, ep->com.so)) {
		printf("%s - interface cannot offload!\n", __FUNCTION__);
		err = EOPNOTSUPP;
		goto fail3;
	}
	RTFREE(rt);

	state_set(&ep->com, CONNECTING);
	ep->com.local_addr = cm_id->local_addr;
	ep->com.remote_addr = cm_id->remote_addr;
	err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr,
		ep->com.thread);
	if (!err)
		goto out;
fail3:
	RTFREE(rt);
fail2:
	put_ep(&ep->com);
out:
	return err;
}

int
iwch_create_listen(struct iw_cm_id *cm_id, int backlog)
{
	int err = 0;
	struct iwch_listen_ep *ep;

	ep = alloc_ep(sizeof(*ep), M_NOWAIT);
	if (!ep) {
		printf("%s - cannot alloc ep.\n", __FUNCTION__);
		err = ENOMEM;
		goto out;
	}
	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);
	cm_id->add_ref(cm_id);
	ep->com.cm_id = cm_id;
	ep->backlog = backlog;
	ep->com.local_addr = cm_id->local_addr;
	ep->com.thread = curthread;
	state_set(&ep->com, LISTEN);

	ep->com.so = cm_id->so;
	err = init_sock(&ep->com);
	if (err)
		goto fail;

	err = solisten(ep->com.so, ep->backlog, ep->com.thread);
	if (!err) {
		cm_id->provider_data = ep;
		goto out;
	}
	close_socket(&ep->com);
fail:
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
out:
	return err;
}

int
iwch_destroy_listen(struct iw_cm_id *cm_id)
{
	struct iwch_listen_ep *ep = to_listen_ep(cm_id);

	CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep);

	state_set(&ep->com, DEAD);
	close_socket(&ep->com);
	cm_id->rem_ref(cm_id);
	put_ep(&ep->com);
	return 0;
}

int
iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags)
{
	int close = 0;

	PANIC_IF(!ep);
	PANIC_IF(!ep->com.so);

	mtx_lock(&ep->com.lock);

	CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep,
	     ep->com.so, states[ep->com.state], abrupt);

	if (ep->com.state == DEAD) {
		CTR2(KTR_IW_CXGB, "%s already dead ep %p", __FUNCTION__, ep);
		goto out;
	}

	if (abrupt) {
		if (ep->com.state != ABORTING) {
			ep->com.state = ABORTING;
			close = 1;
		}
		goto out;
	}

	switch (ep->com.state) {
	case MPA_REQ_WAIT:
	case MPA_REQ_SENT:
	case MPA_REQ_RCVD:
	case MPA_REP_SENT:
	case FPDU_MODE:
		start_ep_timer(ep);
		ep->com.state = CLOSING;
		close = 1;
		break;
	case CLOSING:
		ep->com.state = MORIBUND;
		close = 1;
		break;
	case MORIBUND:
	case ABORTING:
		break;
	default:
		panic("unknown state: %d\n", ep->com.state);
		break;
	}
out:
	mtx_unlock(&ep->com.lock);
	if (close) {
		if (abrupt)
			abort_connection(ep);
		else
			shutdown_socket(&ep->com);
	}
	return 0;
}

static void
process_data(struct iwch_ep *ep)
{
	struct sockaddr_in *local, *remote;

	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);

	switch (state_read(&ep->com)) {
	case MPA_REQ_SENT:
		process_mpa_reply(ep);
		break;
	case MPA_REQ_WAIT:

		/*
		 * XXX
		 * Set local and remote addrs here because when we
		 * dequeue the newly accepted socket, they aren't set
		 * yet in the pcb!
		 */
		in_getsockaddr(ep->com.so, (struct sockaddr **)&local);
		in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote);
		CTR3(KTR_IW_CXGB, "%s local %s remote %s", __FUNCTION__,
			inet_ntoa(local->sin_addr),
			inet_ntoa(remote->sin_addr));
		ep->com.local_addr = *local;
		ep->com.remote_addr = *remote;
		free(local, M_SONAME);
		free(remote, M_SONAME);
		process_mpa_request(ep);
		break;
	default:
		if (ep->com.so->so_rcv.sb_cc)
			printf("%s Unexpected streaming data."
			       " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n",
			       __FUNCTION__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state,
			       ep->com.so->so_rcv.sb_cc, ep->com.so->so_rcv.sb_mb);
		break;
	}
	return;
}

static void
process_connected(struct iwch_ep *ep)
{
	CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]);
	if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) {
		send_mpa_req(ep);
	} else {
		connect_reply_upcall(ep, -ep->com.so->so_error);
		close_socket(&ep->com);
		state_set(&ep->com, DEAD);
		put_ep(&ep->com);
	}
}

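/*
 * Pull the next completed connection off the listening socket's
 * accept queue (the equivalent of accept(2)), install the child
 * endpoint's receive upcall, and return the referenced, connected
 * socket.
 */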
static struct socket *
dequeue_socket(struct socket *head, struct sockaddr_in **remote, struct iwch_ep *child_ep)
{
	struct socket *so;

	ACCEPT_LOCK();
	so = TAILQ_FIRST(&head->so_comp);
	if (!so) {
		ACCEPT_UNLOCK();
		return NULL;
	}
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	SOCK_LOCK(so);
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	soref(so);
	soupcall_set(so, SO_RCV, iwch_so_upcall, child_ep);
	so->so_state |= SS_NBIO;
	PANIC_IF(!(so->so_state & SS_ISCONNECTED));
	PANIC_IF(so->so_error);
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	soaccept(so, (struct sockaddr **)remote);
	return so;
}

static void
process_newconn(struct iwch_ep *parent_ep)
{
	struct socket *child_so;
	struct iwch_ep *child_ep;
	struct sockaddr_in *remote;

	CTR3(KTR_IW_CXGB, "%s parent ep %p so %p", __FUNCTION__, parent_ep, parent_ep->com.so);
	child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT);
	if (!child_ep) {
		log(LOG_ERR, "%s - failed to allocate ep entry!\n",
		       __FUNCTION__);
		return;
	}
	child_so = dequeue_socket(parent_ep->com.so, &remote, child_ep);
	if (!child_so) {
		log(LOG_ERR, "%s - failed to dequeue child socket!\n",
		       __FUNCTION__);
		__free_ep(&child_ep->com);
		return;
	}
	CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__,
		inet_ntoa(remote->sin_addr), ntohs(remote->sin_port));
	child_ep->com.so = child_so;
	child_ep->com.cm_id = NULL;
	child_ep->com.thread = parent_ep->com.thread;
	child_ep->parent_ep = parent_ep;
	free(remote, M_SONAME);
	get_ep(&parent_ep->com);
	callout_init(&child_ep->timer, TRUE);
	state_set(&child_ep->com, MPA_REQ_WAIT);
	start_ep_timer(child_ep);

	/* maybe the request has already been queued up on the socket... */
	process_mpa_request(child_ep);
}

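/*
 * Socket receive upcall.  This runs in the socket layer's context,
 * so do no real work here: take an endpoint reference, queue the
 * endpoint on req_list, and let the taskqueue do the processing.
 */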
static int
iwch_so_upcall(struct socket *so, void *arg, int waitflag)
{
	struct iwch_ep *ep = arg;

	CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]);
	mtx_lock(&req_lock);
	if (ep && ep->com.so && !ep->com.entry.tqe_prev) {
		get_ep(&ep->com);
		TAILQ_INSERT_TAIL(&req_list, &ep->com, entry);
		taskqueue_enqueue(iw_cxgb_taskq, &iw_cxgb_task);
	}
	mtx_unlock(&req_lock);
	return (SU_OK);
}

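/*
 * Figure out what the upcall is telling us by inspecting socket and
 * endpoint state: a connect completion, a new child connection, a
 * socket error, a peer close, a close completion, or plain receive
 * data.
 */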
static void
process_socket_event(struct iwch_ep *ep)
{
	int state = state_read(&ep->com);
	struct socket *so = ep->com.so;

	CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]);
	if (state == CONNECTING) {
		process_connected(ep);
		return;
	}

	if (state == LISTEN) {
		process_newconn(ep);
		return;
	}

	/* connection error */
	if (so->so_error) {
		process_conn_error(ep);
		return;
	}

	/* peer close */
	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) {
		process_peer_close(ep);
		return;
	}

	/* close complete */
	if (so->so_state & (SS_ISDISCONNECTED)) {
		process_close_complete(ep);
		return;
	}

	/* rx data */
	process_data(ep);
	return;
}

static void
process_req(void *ctx, int pending)
{
	struct iwch_ep_common *epc;

	CTR1(KTR_IW_CXGB, "%s enter", __FUNCTION__);
	mtx_lock(&req_lock);
	while (!TAILQ_EMPTY(&req_list)) {
		epc = TAILQ_FIRST(&req_list);
		TAILQ_REMOVE(&req_list, epc, entry);
		epc->entry.tqe_prev = NULL;
		mtx_unlock(&req_lock);
		if (epc->so)
			process_socket_event((struct iwch_ep *)epc);
		put_ep(epc);
		mtx_lock(&req_lock);
	}
	mtx_unlock(&req_lock);
}

int
iwch_cm_init(void)
{
	TAILQ_INIT(&req_list);
	mtx_init(&req_lock, "iw_cxgb req_list lock", NULL, MTX_DEF);
	iw_cxgb_taskq = taskqueue_create("iw_cxgb_taskq", M_NOWAIT,
		taskqueue_thread_enqueue, &iw_cxgb_taskq);
	if (iw_cxgb_taskq == NULL) {
		printf("failed to allocate iw_cxgb taskqueue\n");
		return (ENOMEM);
	}
	taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq");
	TASK_INIT(&iw_cxgb_task, 0, process_req, NULL);
	t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, terminate);
	t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, ec_status);
	return 0;
}

void
iwch_cm_term(void)
{
	t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, NULL);
	t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, NULL);
	taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task);
	taskqueue_free(iw_cxgb_taskq);
}