/* ipoib_ib.c revision 341891 */
/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "ipoib.h"

#include <rdma/ib_cache.h>

#include <security/mac/mac_framework.h>

#include <linux/delay.h>
#include <linux/dma-mapping.h>

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

static DEFINE_MUTEX(pkey_mutex);

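/*
 * Allocate an ipoib_ah wrapper and create the underlying IB address
 * handle.  Returns NULL if allocation or ib_create_ah() fails.
 */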
struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv,
				 struct ib_pd *pd, struct ib_ah_attr *attr)
{
	struct ipoib_ah *ah;

	ah = kmalloc(sizeof *ah, GFP_KERNEL);
	if (!ah)
		return NULL;

	ah->priv      = priv;
	ah->last_send = 0;
	kref_init(&ah->ref);

	ah->ah = ib_create_ah(pd, attr);
	if (IS_ERR(ah->ah)) {
		kfree(ah);
		ah = NULL;
	} else
		ipoib_dbg(priv, "Created ah %p\n", ah->ah);

	return ah;
}

void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = ah->priv;

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

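/*
 * Unmap the DMA mapping of every mbuf in a receive request's chain.
 */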
void
ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req)
{
	struct mbuf *m;
	int i;

	for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++)
		ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len,
		    DMA_FROM_DEVICE);
}

void
ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length)
{

	m_adj(mb, -(mb->m_pkthdr.len - length));
}

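/*
 * Allocate an mbuf chain of at least 'size' bytes and DMA map each
 * mbuf for receive.  On a mapping failure all earlier mappings are
 * undone and the chain is freed.
 */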
struct mbuf *
ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req,
    int size)
{
	struct mbuf *mb, *m;
	int i, j;

	rx_req->mb = NULL;
	mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (mb == NULL)
		return (NULL);
	for (i = 0, m = mb; m != NULL; m = m->m_next, i++) {
		m->m_len = M_SIZE(m);
		mb->m_pkthdr.len += m->m_len;
		rx_req->mapping[i] = ib_dma_map_single(priv->ca,
		    mtod(m, void *), m->m_len, DMA_FROM_DEVICE);
		if (unlikely(ib_dma_mapping_error(priv->ca,
		    rx_req->mapping[i])))
			goto error;

	}
	rx_req->mb = mb;
	return (mb);
error:
	for (j = 0, m = mb; j < i; m = m->m_next, j++)
		ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len,
		    DMA_FROM_DEVICE);
	m_freem(mb);
	return (NULL);
}

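/*
 * Build the scatter list for receive slot 'id' and post it to the QP.
 * On failure the buffer is unmapped and freed.
 */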
static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id)
{
	struct ipoib_rx_buf *rx_req;
	struct ib_recv_wr *bad_wr;
	struct mbuf *m;
	int ret;
	int i;

	rx_req = &priv->rx_ring[id];
	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->rx_sge[i].addr = rx_req->mapping[i];
		priv->rx_sge[i].length = m->m_len;
	}
	priv->rx_wr.num_sge = i;
	priv->rx_wr.wr_id = id | IPOIB_OP_RECV;

	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]);
		m_freem(priv->rx_ring[id].mb);
		priv->rx_ring[id].mb = NULL;
	}

	return ret;
}

static struct mbuf *
ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id)
{

	return ipoib_alloc_map_mb(priv, &priv->rx_ring[id],
	    priv->max_ib_mtu + IB_GRH_BYTES);
}

static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv)
{
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_alloc_rx_mb(priv, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(priv, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

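/*
 * Handle a receive completion: validate the work request id, drop
 * locally looped-back multicast, replace the receive buffer, and pass
 * the packet to the network stack before reposting the slot.
 */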
static void
ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_rx_buf saverx;
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct ifnet *dev = priv->dev;
	struct ipoib_header *eh;
	struct mbuf *mb;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	mb  = priv->rx_ring[wr_id].mb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			ipoib_warn(priv, "failed recv event "
				   "(status=%d, wrid=%d vend_err %x)\n",
				   wc->status, wr_id, wc->vendor_err);
			goto repost;
		}
		if (mb) {
			ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]);
			m_freem(mb);
			priv->rx_ring[wr_id].mb = NULL;
		}
		return;
	}

	/*
	 * Drop packets that this interface sent, i.e. multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
		goto repost;

	memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx));
	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) {
		memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx));
		if_inc_counter(dev, IFCOUNTER_IQDROPS, 1);
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_unmap_rx(priv, &saverx);
	ipoib_dma_mb(priv, mb, wc->byte_len);

	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);
	mb->m_pkthdr.rcvif = dev;
	m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN);
	eh = mtod(mb, struct ipoib_header *);
	bzero(eh->hwaddr, 4);	/* Zero the queue pair, only dgid is in grh */

	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;

	dev->if_input(dev, mb);

repost:
	if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed "
			   "for buf %d\n", wr_id);
}

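/*
 * DMA map an mbuf chain for transmit.  Empty mbufs are removed and, if
 * the chain still has more than 'max' segments, it is defragmented.
 * Returns 0 on success or -EIO with nothing left mapped.
 */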
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m, *p;
	int error;
	int i;

	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
		if (m->m_len != 0)
			continue;
		if (p == NULL)
			panic("ipoib_dma_map_tx: First mbuf empty\n");
		p->m_next = m_free(m);
		m = p;
		i--;
	}
	i--;
	if (i >= max) {
		tx_req->mb = mb = m_defrag(mb, M_NOWAIT);
		if (mb == NULL)
			return -EIO;
		for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
		if (i >= max)
			return -EIO;
	}
	error = 0;
	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
					       m->m_len, DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
			error = -EIO;
			break;
		}
	}
	if (error) {
		int end;

		end = i;
		for (m = mb, i = 0; i < end; m = m->m_next, i++)
			ib_dma_unmap_single(ca, mapping[i], m->m_len,
					    DMA_TO_DEVICE);
	}
	return error;
}

void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++)
		ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE);
}

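/*
 * Handle a send completion: unmap and free the transmit buffer,
 * advance the tail, and clear IFF_DRV_OACTIVE once the ring drains
 * to half full.
 */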
static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ifnet *dev = priv->dev;
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, tx_req);

	if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);

	m_freem(tx_req->mb);

	++priv->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    (dev->if_drv_flags & IFF_DRV_OACTIVE) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		dev->if_drv_flags &= ~IFF_DRV_OACTIVE;

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR)
		ipoib_warn(priv, "failed send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
}

int
ipoib_poll_tx(struct ipoib_dev_priv *priv)
{
	int n, i;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i) {
		struct ib_wc *wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(priv, wc);
		else
			ipoib_ib_handle_tx_wc(priv, wc);
	}

	return n == MAX_SEND_CQE;
}

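/*
 * Drain the receive CQ, dispatching each completion to the connected
 * mode or datagram handler, and rearm the CQ before returning.
 */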
static void
ipoib_poll(struct ipoib_dev_priv *priv)
{
	int n, i;

poll_more:
	spin_lock(&priv->drain_lock);
	for (;;) {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if ((wc->wr_id & IPOIB_OP_RECV) == 0)
				panic("ipoib_poll: Bad wr_id 0x%jX\n",
				    (intmax_t)wc->wr_id);
			if (wc->wr_id & IPOIB_OP_CM)
				ipoib_cm_handle_rx_wc(priv, wc);
			else
				ipoib_ib_handle_rx_wc(priv, wc);
		}

		if (n != IPOIB_NUM_WC)
			break;
	}
	spin_unlock(&priv->drain_lock);

	if (ib_req_notify_cq(priv->recv_cq,
	    IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS))
		goto poll_more;
}

void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = dev_ptr;

	ipoib_poll(priv);
}

static void drain_tx_cq(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev = priv->dev;

	spin_lock(&priv->lock);
	while (ipoib_poll_tx(priv))
		; /* nothing */

	if (dev->if_drv_flags & IFF_DRV_OACTIVE)
		mod_timer(&priv->poll_timer, jiffies + 1);

	spin_unlock(&priv->lock);
}

void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = dev_ptr;

	mod_timer(&priv->poll_timer, jiffies);
}

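/*
 * Fill in the gather list and UD work request for one mbuf chain and
 * post it to the send queue; 'head'/'hlen' select an LSO send.
 */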
static inline int
post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
    struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head,
    int hlen)
{
	struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr         = mapping[i];
		priv->tx_sge[i].length       = m->m_len;
	}
	priv->tx_wr.wr.num_sge	= i;
	priv->tx_wr.wr.wr_id	= wr_id;
	priv->tx_wr.remote_qpn	= qpn;
	priv->tx_wr.ah		= address;

	if (head) {
		priv->tx_wr.mss		= 0; /* XXX mb_shinfo(mb)->gso_size; */
		priv->tx_wr.header	= head;
		priv->tx_wr.hlen	= hlen;
		priv->tx_wr.wr.opcode	= IB_WR_LSO;
	} else
		priv->tx_wr.wr.opcode	= IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
}

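/*
 * Transmit one datagram-mode packet: strip the pseudoheader, enforce
 * the multicast MTU, map the mbuf chain, and post the send, stopping
 * the interface queue when the TX ring fills.
 */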
void
ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
    struct ipoib_ah *address, u32 qpn)
{
	struct ifnet *dev = priv->dev;
	struct ipoib_tx_buf *tx_req;
	int hlen;
	void *phead;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
		while (ipoib_poll_tx(priv))
			; /* nothing */

	m_adj(mb, sizeof (struct ipoib_pseudoheader));
	if (0 /* XXX segment offload mb_is_gso(mb) */) {
		/* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
		phead = mtod(mb, void *);
		if (mb->m_len < hlen) {
			ipoib_warn(priv, "linear data too small\n");
			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
			m_freem(mb);
			return;
		}
		m_adj(mb, hlen);
	} else {
		if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   mb->m_pkthdr.len, priv->mcast_mtu);
			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
			ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu);
			return;
		}
		phead = NULL;
		hlen  = 0;
	}

	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
		       mb->m_pkthdr.len, address, qpn);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->mb = mb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) {
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		if (tx_req->mb)
			m_freem(tx_req->mb);
		return;
	}

	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;

	if (++priv->tx_outstanding == ipoib_sendq_size) {
		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
			ipoib_warn(priv, "request notify on send CQ failed\n");
		dev->if_drv_flags |= IFF_DRV_OACTIVE;
	}

	if (unlikely(post_send(priv,
	    priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn,
	    tx_req, phead, hlen))) {
		ipoib_warn(priv, "post_send failed\n");
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		--priv->tx_outstanding;
		ipoib_dma_unmap_tx(priv->ca, tx_req);
		m_freem(mb);
		if (dev->if_drv_flags & IFF_DRV_OACTIVE)
			dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
	} else {
		address->last_send = priv->tx_head;
		++priv->tx_head;
	}
}

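/*
 * Free any dead address handles whose last use has completed (the TX
 * tail has passed their last_send counter).
 */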
static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
{
	struct ipoib_ah *ah, *tah;
	LIST_HEAD(remove_list);
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			ib_destroy_ah(ah->ah);
			kfree(ah);
		}

	spin_unlock_irqrestore(&priv->lock, flags);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);

	__ipoib_reap_ah(priv);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
				   HZ);
}

static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv)
{
	unsigned long begin;

	begin = jiffies;

	while (!list_empty(&priv->dead_ahs)) {
		__ipoib_reap_ah(priv);

		if (time_after(jiffies, begin + HZ)) {
			ipoib_warn(priv, "timing out; will leak address handles\n");
			break;
		}

		msleep(1);
	}
}

static void ipoib_ib_tx_timer_func(unsigned long ctx)
{
	drain_tx_cq((struct ipoib_dev_priv *)ctx);
}

int ipoib_ib_dev_open(struct ipoib_dev_priv *priv)
{
	int ret;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
		return -1;
	}
	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	ret = ipoib_init_qp(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		ipoib_ib_dev_stop(priv, 1);
		return -1;
	}

	ret = ipoib_cm_dev_open(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		ipoib_ib_dev_stop(priv, 1);
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;
}

static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv)
{
	u16 pkey_index = 0;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	else
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
}

int ipoib_ib_dev_up(struct ipoib_dev_priv *priv)
{

	ipoib_pkey_dev_check_presence(priv);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return 0;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	return ipoib_mcast_start_thread(priv);
}

int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush)
{

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	if_link_state_change(priv->dev, LINK_STATE_DOWN);

	/* Shutdown the P_Key thread if still active */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		set_bit(IPOIB_PKEY_STOP, &priv->flags);
		cancel_delayed_work(&priv->pkey_poll_task);
		mutex_unlock(&pkey_mutex);
		if (flush)
			flush_workqueue(ipoib_workqueue);
	}

	ipoib_mcast_stop_thread(priv, flush);
	ipoib_mcast_dev_flush(priv);

	ipoib_flush_paths(priv);

	return 0;
}

static int recvs_pending(struct ipoib_dev_priv *priv)
{
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].mb)
			++pending;

	return pending;
}

static void check_qp_movement_and_print(struct ipoib_dev_priv *priv,
					struct ib_qp *qp,
					enum ib_qp_state new_state)
{
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP (%d)\n", __func__, ret);
		return;
	}

	/* print according to the new state and the previous state */
	if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) {
		ipoib_dbg(priv, "Failed to modify QP %d->%d, acceptable\n",
			  qp_attr.qp_state, new_state);
	} else {
		ipoib_warn(priv, "Failed to modify QP %d->%d\n",
			   qp_attr.qp_state, new_state);
	}
}

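/*
 * Poll both CQs to completion while the device is being brought down,
 * converting successful receive completions to flush errors so no
 * packets are passed up the stack.
 */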
void ipoib_drain_cq(struct ipoib_dev_priv *priv)
{
	int i, n;

	spin_lock(&priv->drain_lock);
	do {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0)
				panic("ipoib_drain_cq:  Bad wrid 0x%jX\n",
				    (intmax_t)priv->ibwc[i].wr_id);
			if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
				ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
			else
				ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
		}
	} while (n == IPOIB_NUM_WC);
	spin_unlock(&priv->drain_lock);

	spin_lock(&priv->lock);
	while (ipoib_poll_tx(priv))
		; /* nothing */

	spin_unlock(&priv->lock);
}

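/*
 * Stop the IB side of the device: move the QP to the error state, wait
 * up to five seconds for outstanding work requests to complete (freeing
 * them if the hardware appears wedged), then reset the QP and reap all
 * address handles.
 */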
int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush)
{
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	ipoib_cm_dev_stop(priv);

	/*
	 * Move our QP to the error state and then reinitialize
	 * when all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail, recvs_pending(priv));

			/*
			 * assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv->ca, tx_req);
				m_freem(tx_req->mb);
				++priv->tx_tail;
				--priv->tx_outstanding;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->mb)
					continue;
				ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]);
				m_freem(rx_req->mb);
				rx_req->mb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(priv);

		msleep(1);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	del_timer_sync(&priv->poll_timer);
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	/* Wait for all AHs to be reaped */
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	if (flush)
		flush_workqueue(ipoib_workqueue);

	ipoib_ah_dev_cleanup(priv);

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
{
	struct ifnet *dev = priv->dev;

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;

	if (ipoib_transport_dev_init(priv, ca)) {
		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
		return -ENODEV;
	}

	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
		    (unsigned long) priv);

	if (dev->if_flags & IFF_UP) {
		if (ipoib_ib_dev_open(priv)) {
			ipoib_transport_dev_cleanup(priv);
			return -ENODEV;
		}
	}

	return 0;
}

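/*
 * Common body of the light/normal/heavy flush tasks: flush any child
 * interfaces first, then, depending on the flush level, invalidate
 * paths, take the device down, or restart the QP (heavy flushes only,
 * and only when the P_Key index has changed).
 */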
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
				enum ipoib_flush_level level)
{
	struct ipoib_dev_priv *cpriv;
	u16 new_index;

	mutex_lock(&priv->vlan_mutex);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, level);

	mutex_unlock(&priv->vlan_mutex);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (level == IPOIB_FLUSH_HEAVY) {
		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
			ipoib_ib_dev_down(priv, 0);
			ipoib_ib_dev_stop(priv, 0);
			if (ipoib_pkey_dev_delay_open(priv))
				return;
		}

		/* restart QP only if P_Key index is changed */
		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
		    new_index == priv->pkey_index) {
			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
			return;
		}
		priv->pkey_index = new_index;
	}

	if (level == IPOIB_FLUSH_LIGHT) {
		ipoib_mark_paths_invalid(priv);
		ipoib_mcast_dev_flush(priv);
	}

	if (level >= IPOIB_FLUSH_NORMAL)
		ipoib_ib_dev_down(priv, 0);

	if (level == IPOIB_FLUSH_HEAVY) {
		ipoib_ib_dev_stop(priv, 0);
		ipoib_ib_dev_open(priv);
	}

	/*
	 * The device could have been brought down between the start of this
	 * flush and now; don't bring it back up if it's not configured up.
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		if (level >= IPOIB_FLUSH_NORMAL)
			ipoib_ib_dev_up(priv);
		ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush_light(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_light);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
}

void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_normal);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
}

void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_heavy);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
}

void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv)
{

	ipoib_dbg(priv, "cleaning up ib_dev\n");

	ipoib_mcast_stop_thread(priv, 1);
	ipoib_mcast_dev_flush(priv);

	ipoib_ah_dev_cleanup(priv);
	ipoib_transport_dev_cleanup(priv);
}

/*
 * Delayed P_Key Assignment Interim Support
 *
 * The following is an initial implementation of a delayed P_Key assignment
 * mechanism. It uses the same approach implemented for the multicast
 * group join. The single goal of this implementation is to quickly address
 * Bug #2507. This implementation will probably be removed when the P_Key
 * change async notification is available.
 */

void ipoib_pkey_poll(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);

	ipoib_pkey_dev_check_presence(priv);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
		ipoib_open(priv);
	else {
		mutex_lock(&pkey_mutex);
		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
					   &priv->pkey_poll_task,
					   HZ);
		mutex_unlock(&pkey_mutex);
	}
}

int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv)
{

	/*
	 * Look for the interface pkey value in the IB Port P_Key table and
	 * set the interface pkey assignment flag.
	 */
	ipoib_pkey_dev_check_presence(priv);

	/* P_Key value not assigned yet - start polling */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
		queue_delayed_work(ipoib_workqueue,
				   &priv->pkey_poll_task,
				   HZ);
		mutex_unlock(&pkey_mutex);
		return 1;
	}

	return 0;
}