/*
 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: ipoib_cm.c,v 1.1.1.1 2007/08/03 18:52:32 Exp $
 */

#include <rdma/ib_cm.h>
#include <rdma/ib_cache.h>
#include <net/dst.h>
#include <net/icmp.h>
#include <linux/icmpv6.h>
#include <linux/delay.h>

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
MODULE_PARM_DESC(cm_data_debug_level,
		 "Enable data path debug tracing for connected mode if > 0");
#endif

#include "ipoib.h"

#define IPOIB_CM_IETF_ID 0x1000000000000000ULL

#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)

static struct ib_qp_attr ipoib_cm_err_attr = {
	.qp_state = IB_QPS_ERR
};

#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff

static struct ib_send_wr ipoib_cm_rx_drain_wr = {
	.wr_id  = IPOIB_CM_RX_DRAIN_WRID,
	.opcode = IB_WR_SEND,
};

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event);

static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
				  u64 mapping[IPOIB_CM_RX_SG])
{
	int i;

	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);

	for (i = 0; i < frags; ++i)
		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
}

static int ipoib_cm_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_recv_wr *bad_wr;
	int i, ret;

	priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;

	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];

	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
				      priv->cm.srq_ring[id].mapping);
		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
		priv->cm.srq_ring[id].skb = NULL;
	}

	return ret;
}

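/*
 * Allocate a receive skb with an IPOIB_CM_HEAD_SIZE linear part plus
 * "frags" page fragments, DMA-map every piece and record the mappings
 * so they can be unmapped later.  On any failure the partial work is
 * undone and NULL is returned.
 */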
static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
					     u64 mapping[IPOIB_CM_RX_SG])
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	int i;

	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
	if (unlikely(!skb))
		return NULL;

	/*
	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
	 * IP header to a multiple of 16.
	 */
	skb_reserve(skb, 12);

	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
				       DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
		dev_kfree_skb_any(skb);
		return NULL;
	}

	for (i = 0; i < frags; i++) {
		struct page *page = alloc_page(GFP_ATOMIC);

		if (!page)
			goto partial_error;
		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);

		mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
						 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
			goto partial_error;
	}

	priv->cm.srq_ring[id].skb = skb;
	return skb;

partial_error:

	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);

	for (; i > 0; --i)
		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);

	dev_kfree_skb_any(skb);
	return NULL;
}

static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
{
	struct ib_send_wr *bad_wr;
	struct ipoib_cm_rx *p;

	/* We only reserved 1 extra slot in CQ for drain WRs, so
	 * make sure we have at most 1 outstanding WR. */
	if (list_empty(&priv->cm.rx_flush_list) ||
	    !list_empty(&priv->cm.rx_drain_list))
		return;

	/*
	 * QPs on the flush list are in the error state.  This way, a "flush
	 * error" WC will be immediately generated for each WR we post.
	 */
	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
		ipoib_warn(priv, "failed to post drain wr\n");

	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
}

static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
{
	struct ipoib_cm_rx *p = ctx;
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	unsigned long flags;

	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
		return;

	spin_lock_irqsave(&priv->lock, flags);
	list_move(&p->list, &priv->cm.rx_flush_list);
	p->state = IPOIB_CM_RX_FLUSH;
	ipoib_cm_start_rx_drain(priv);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
					   struct ipoib_cm_rx *p)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_init_attr attr = {
		.event_handler = ipoib_cm_rx_event_handler,
		.send_cq = priv->cq, /* For drain WR */
		.recv_cq = priv->cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = 1, /* For drain WR */
		.cap.max_send_sge = 1,
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = p,
	};
	return ib_create_qp(priv->pd, &attr);
}

static int ipoib_cm_modify_rx_qp(struct net_device *dev,
				 struct ib_cm_id *cm_id, struct ib_qp *qp,
				 unsigned psn)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	qp_attr.qp_state = IB_QPS_INIT;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
		return ret;
	}
	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}
	qp_attr.rq_psn = psn;
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return 0;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return 0;
	}

	return 0;
}

static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
			     unsigned psn)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_data data = {};
	struct ib_cm_rep_param rep = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);

	rep.private_data = &data;
	rep.private_data_len = sizeof data;
	rep.flow_control = 0;
	rep.rnr_retry_count = req->rnr_retry_count;
	rep.target_ack_delay = 20;
	rep.srq = 1;
	rep.qp_num = qp->qp_num;
	rep.starting_psn = psn;
	return ib_send_cm_rep(cm_id, &rep);
}

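/*
 * Passive side: handle an incoming CM REQ by creating an RC QP that
 * shares our SRQ, bringing it to RTS, and replying with a REP that
 * carries our datagram QPN and buffer size in the private data.
 */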
static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct net_device *dev = cm_id->context;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_rx *p;
	unsigned psn;
	int ret;

	ipoib_dbg(priv, "REQ arrived\n");
	p = kzalloc(sizeof *p, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->dev = dev;
	p->id = cm_id;
	cm_id->context = p;
	p->state = IPOIB_CM_RX_LIVE;
	p->jiffies = jiffies;
	INIT_LIST_HEAD(&p->list);

	p->qp = ipoib_cm_create_rx_qp(dev, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		goto err_qp;
	}

	psn = random32() & 0xffffff;
	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
	if (ret)
		goto err_modify;

	spin_lock_irq(&priv->lock);
	queue_delayed_work(ipoib_workqueue,
			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	/* Add this entry to passive ids list head, but do not re-add it
	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
	p->jiffies = jiffies;
	if (p->state == IPOIB_CM_RX_LIVE)
		list_move(&p->list, &priv->cm.passive_ids);
	spin_unlock_irq(&priv->lock);

	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
	if (ret) {
		ipoib_warn(priv, "failed to send REP: %d\n", ret);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
	}
	return 0;

err_modify:
	ib_destroy_qp(p->qp);
err_qp:
	kfree(p);
	return ret;
}

static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_rx *p;
	struct ipoib_dev_priv *priv;

	switch (event->event) {
	case IB_CM_REQ_RECEIVED:
		return ipoib_cm_req_handler(cm_id, event);
	case IB_CM_DREQ_RECEIVED:
		p = cm_id->context;
		ib_send_cm_drep(cm_id, NULL, 0);
		/* Fall through */
	case IB_CM_REJ_RECEIVED:
		p = cm_id->context;
		priv = netdev_priv(p->dev);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
		/* Fall through */
	default:
		return 0;
	}
}

/* Adjust length of skb with fragments to match received data */
static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
			  unsigned int length, struct sk_buff *toskb)
{
	int i, num_frags;
	unsigned int size;

	/* put header into skb */
	size = min(length, hdr_space);
	skb->tail += size;
	skb->len += size;
	length -= size;

	num_frags = skb_shinfo(skb)->nr_frags;
	for (i = 0; i < num_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		if (length == 0) {
			/* don't need this page */
			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
			--skb_shinfo(skb)->nr_frags;
		} else {
			size = min(length, (unsigned) PAGE_SIZE);

			frag->size = size;
			skb->data_len += size;
			skb->truesize += size;
			skb->len += size;
			length -= size;
		}
	}
}

void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
	struct sk_buff *skb, *newskb;
	struct ipoib_cm_rx *p;
	unsigned long flags;
	u64 mapping[IPOIB_CM_RX_SG];
	int frags;

	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
			spin_lock_irqsave(&priv->lock, flags);
			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
			ipoib_cm_start_rx_drain(priv);
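			/* QPs moved to the reap list above are now fully
			 * drained; let the reap task destroy them. */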
			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
			spin_unlock_irqrestore(&priv->lock, flags);
		} else
			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
				   wr_id, ipoib_recvq_size);
		return;
	}

	skb = priv->cm.srq_ring[wr_id].skb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		ipoib_dbg(priv, "cm recv error "
			  "(status=%d, wrid=%d vend_err %x)\n",
			  wc->status, wr_id, wc->vendor_err);
		++priv->stats.rx_dropped;
		goto repost;
	}

	if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) {
		p = wc->qp->qp_context;
		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
			spin_lock_irqsave(&priv->lock, flags);
			p->jiffies = jiffies;
			/* Move this entry to list head, but do not re-add it
			 * if it has been moved out of list. */
			if (p->state == IPOIB_CM_RX_LIVE)
				list_move(&p->list, &priv->cm.passive_ids);
			spin_unlock_irqrestore(&priv->lock, flags);
		}
	}

	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;

	newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
	if (unlikely(!newskb)) {
		/*
		 * If we can't allocate a new RX buffer, dump
		 * this packet and reuse the old buffer.
		 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		++priv->stats.rx_dropped;
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_reset_mac_header(skb);
	skb_pull(skb, IPOIB_ENCAP_LEN);

	dev->last_rx = jiffies;
	++priv->stats.rx_packets;
	priv->stats.rx_bytes += skb->len;

	skb->dev = dev;
	skb->pkt_type = PACKET_HOST;
	netif_receive_skb(skb);

repost:
	if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_cm_post_receive failed "
			   "for buf %d\n", wr_id);
}

static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    unsigned int wr_id,
			    u64 addr, int len)
{
	struct ib_send_wr *bad_wr;

	priv->tx_sge.addr = addr;
	priv->tx_sge.length = len;

	priv->tx_wr.wr_id = wr_id;

	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
}

void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_tx_buf *tx_req;
	u64 addr;

	if (unlikely(skb->len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   skb->len, tx->mtu);
		++priv->stats.tx_dropped;
		++priv->stats.tx_errors;
		ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
		return;
	}

	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, skb->len, tx->qp->qp_num);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
		++priv->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return;
	}

	tx_req->mapping = addr;

	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
			       addr, skb->len))) {
		ipoib_warn(priv, "post_send failed\n");
		++priv->stats.tx_errors;
		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
		dev_kfree_skb_any(skb);
	} else {
		dev->trans_start = jiffies;
		++tx->tx_head;

		if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
				  tx->qp->qp_num);
			netif_stop_queue(dev);
			set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
		}
	}
}

static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
				  struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;
	unsigned long flags;

	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &tx->tx_ring[wr_id];

	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);

	++priv->stats.tx_packets;
	priv->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	spin_lock_irqsave(&priv->tx_lock, flags);
	++tx->tx_tail;
	if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
	    tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
		clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
		netif_wake_queue(dev);
	}

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_neigh *neigh;

		ipoib_dbg(priv, "failed cm send event "
			  "(status=%d, wrid=%d vend_err %x)\n",
			  wc->status, wr_id, wc->vendor_err);

		spin_lock(&priv->lock);
		neigh = tx->neigh;

		if (neigh) {
			neigh->cm = NULL;
			list_del(&neigh->list);
			if (neigh->ah)
				ipoib_put_ah(neigh->ah);
			ipoib_neigh_free(dev, neigh);

			tx->neigh = NULL;
		}

		/* queue would be re-started anyway when TX is destroyed,
		 * but it makes sense to do it ASAP here. */
		if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
			netif_wake_queue(dev);

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);

		spin_unlock(&priv->lock);
	}

	spin_unlock_irqrestore(&priv->tx_lock, flags);
}

static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
{
	struct ipoib_cm_tx *tx = tx_ptr;
	int n, i;

	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	do {
		n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
		for (i = 0; i < n; ++i)
			ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
	} while (n == IPOIB_NUM_WC);
}

int ipoib_cm_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int ret;

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
		return 0;

	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
	if (IS_ERR(priv->cm.id)) {
		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
		ret = PTR_ERR(priv->cm.id);
		goto err_cm;
	}

	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
			   0, NULL);
	if (ret) {
		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
		goto err_listen;
	}

	return 0;

err_listen:
	ib_destroy_cm_id(priv->cm.id);
err_cm:
	priv->cm.id = NULL;
	return ret;
}

void ipoib_cm_dev_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_rx *p, *n;
	unsigned long begin;
	LIST_HEAD(list);
	int ret;

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
		return;

	ib_destroy_cm_id(priv->cm.id);
	priv->cm.id = NULL;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	/* Wait for all RX to be drained */
	begin = jiffies;

	while (!list_empty(&priv->cm.rx_error_list) ||
	       !list_empty(&priv->cm.rx_flush_list) ||
	       !list_empty(&priv->cm.rx_drain_list)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "RX drain timing out\n");

			/*
			 * assume the HW is wedged and just free up everything.
			 */
			list_splice_init(&priv->cm.rx_flush_list, &list);
			list_splice_init(&priv->cm.rx_error_list, &list);
			list_splice_init(&priv->cm.rx_drain_list, &list);
			break;
		}
		spin_unlock_irq(&priv->lock);
		msleep(1);
		ipoib_drain_cq(dev);
		spin_lock_irq(&priv->lock);
	}

	list_splice_init(&priv->cm.rx_reap_list, &list);

	spin_unlock_irq(&priv->lock);

	list_for_each_entry_safe(p, n, &list, list) {
		ib_destroy_cm_id(p->id);
		ib_destroy_qp(p->qp);
		kfree(p);
	}

	cancel_delayed_work(&priv->cm.stale_task);
}

static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct ipoib_cm_tx *p = cm_id->context;
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	struct ipoib_cm_data *data = event->private_data;
	struct sk_buff_head skqueue;
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	struct sk_buff *skb;

	p->mtu = be32_to_cpu(data->mtu);

	if (p->mtu <= IPOIB_ENCAP_LEN) {
		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
			   p->mtu, IPOIB_ENCAP_LEN);
		return -EINVAL;
	}

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}

	qp_attr.rq_psn = 0;
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return ret;
	}

	skb_queue_head_init(&skqueue);

	spin_lock_irq(&priv->lock);
	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
	if (p->neigh)
		while ((skb = __skb_dequeue(&p->neigh->queue)))
			__skb_queue_tail(&skqueue, skb);
	spin_unlock_irq(&priv->lock);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = p->dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}

	ret = ib_send_cm_rtu(cm_id, NULL, 0);
	if (ret) {
		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
		return ret;
	}
	return 0;
}

static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_init_attr attr = {};
	attr.recv_cq = priv->cq;
	attr.srq = priv->cm.srq;
	attr.cap.max_send_wr = ipoib_sendq_size;
	attr.cap.max_send_sge = 1;
	attr.sq_sig_type = IB_SIGNAL_ALL_WR;
	attr.qp_type = IB_QPT_RC;
	attr.send_cq = cq;
	return ib_create_qp(priv->pd, &attr);
}

static int ipoib_cm_send_req(struct net_device *dev,
			     struct ib_cm_id *id, struct ib_qp *qp,
			     u32 qpn,
			     struct ib_sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_data data = {};
	struct ib_cm_req_param req = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);

	req.primary_path = pathrec;
	req.alternate_path = NULL;
	req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
	req.qp_num = qp->qp_num;
	req.qp_type = qp->qp_type;
	req.private_data = &data;
	req.private_data_len = sizeof data;
	req.flow_control = 0;

	req.starting_psn = 0;

	/*
	 * Pick some arbitrary defaults here; we could make these
	 * module parameters if anyone cared about setting them.
	 */
	req.responder_resources = 4;
	req.remote_cm_response_timeout = 20;
	req.local_cm_response_timeout = 20;
	req.retry_count = 0; /* RFC draft warns against retries */
	req.rnr_retry_count = 0; /* RFC draft warns against retries */
	req.max_cm_retries = 15;
	req.srq = 1;
	return ib_send_cm_req(id, &req);
}

static int ipoib_cm_modify_tx_init(struct net_device *dev,
				   struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
	if (ret) {
		ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
	qp_attr.port_num = priv->port;
	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;

	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
		return ret;
	}
	return 0;
}

static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
			    struct ib_sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	int ret;

	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
			     GFP_KERNEL);
	if (!p->tx_ring) {
		ipoib_warn(priv, "failed to allocate tx ring\n");
		ret = -ENOMEM;
		goto err_tx;
	}

	p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
			     ipoib_sendq_size + 1, 0);
	if (IS_ERR(p->cq)) {
		ret = PTR_ERR(p->cq);
		ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
		goto err_cq;
	}

	ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
	if (ret) {
		ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
		goto err_req_notify;
	}

	p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
		goto err_qp;
	}

	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
	if (IS_ERR(p->id)) {
		ret = PTR_ERR(p->id);
		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
		goto err_id;
	}

	ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
		goto err_modify;
	}

	ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
	if (ret) {
		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
		goto err_send_cm;
	}

	ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n",
		  p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn);

	return 0;

err_send_cm:
err_modify:
	ib_destroy_cm_id(p->id);
err_id:
	p->id = NULL;
	ib_destroy_qp(p->qp);
err_req_notify:
err_qp:
	p->qp = NULL;
	ib_destroy_cq(p->cq);
err_cq:
	p->cq = NULL;
err_tx:
	return ret;
}

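/*
 * Tear down an active-side connection: destroy its CM ID, QP and CQ,
 * then unmap and free any skbs still left in the TX ring.
 */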
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);

	if (p->id)
		ib_destroy_cm_id(p->id);

	if (p->qp)
		ib_destroy_qp(p->qp);

	if (p->cq)
		ib_destroy_cq(p->cq);

	if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
		netif_wake_queue(p->dev);

	if (p->tx_ring) {
		while ((int) p->tx_tail - (int) p->tx_head < 0) {
			tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
			ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
					    DMA_TO_DEVICE);
			dev_kfree_skb_any(tx_req->skb);
			++p->tx_tail;
		}

		kfree(p->tx_ring);
	}

	kfree(p);
}

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_tx *tx = cm_id->context;
	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
	struct net_device *dev = priv->dev;
	struct ipoib_neigh *neigh;
	int ret;

	switch (event->event) {
	case IB_CM_DREQ_RECEIVED:
		ipoib_dbg(priv, "DREQ received.\n");
		ib_send_cm_drep(cm_id, NULL, 0);
		break;
	case IB_CM_REP_RECEIVED:
		ipoib_dbg(priv, "REP received.\n");
		ret = ipoib_cm_rep_handler(cm_id, event);
		if (ret)
			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
				       NULL, 0, NULL, 0);
		break;
	case IB_CM_REQ_ERROR:
	case IB_CM_REJ_RECEIVED:
	case IB_CM_TIMEWAIT_EXIT:
		ipoib_dbg(priv, "CM error %d.\n", event->event);
		spin_lock_irq(&priv->tx_lock);
		spin_lock(&priv->lock);
		neigh = tx->neigh;

		if (neigh) {
			neigh->cm = NULL;
			list_del(&neigh->list);
			if (neigh->ah)
				ipoib_put_ah(neigh->ah);
			ipoib_neigh_free(dev, neigh);

			tx->neigh = NULL;
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		spin_unlock(&priv->lock);
		spin_unlock_irq(&priv->tx_lock);
		break;
	default:
		break;
	}

	return 0;
}

struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
				       struct ipoib_neigh *neigh)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_tx *tx;

	tx = kzalloc(sizeof *tx, GFP_ATOMIC);
	if (!tx)
		return NULL;

	neigh->cm = tx;
	tx->neigh = neigh;
	tx->path = path;
	tx->dev = dev;
	list_add(&tx->list, &priv->cm.start_list);
	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
	queue_work(ipoib_workqueue, &priv->cm.start_task);
	return tx;
}

void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
		list_move(&tx->list, &priv->cm.reap_list);
		queue_work(ipoib_workqueue, &priv->cm.reap_task);
		ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n",
			  IPOIB_GID_ARG(tx->neigh->dgid));
		tx->neigh = NULL;
	}
}

static void ipoib_cm_tx_start(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.start_task);
	struct net_device *dev = priv->dev;
	struct ipoib_neigh *neigh;
	struct ipoib_cm_tx *p;
	unsigned long flags;
	int ret;

	struct ib_sa_path_rec pathrec;
	u32 qpn;

	spin_lock_irqsave(&priv->tx_lock, flags);
	spin_lock(&priv->lock);
	while (!list_empty(&priv->cm.start_list)) {
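		/* Take the next pending connection off the list; both locks
		 * are dropped below because ipoib_cm_tx_init() can sleep. */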
		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
		list_del_init(&p->list);
		neigh = p->neigh;
		qpn = IPOIB_QPN(neigh->neighbour->ha);
		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
		spin_unlock(&priv->lock);
		spin_unlock_irqrestore(&priv->tx_lock, flags);
		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
		spin_lock_irqsave(&priv->tx_lock, flags);
		spin_lock(&priv->lock);
		if (ret) {
			neigh = p->neigh;
			if (neigh) {
				neigh->cm = NULL;
				list_del(&neigh->list);
				if (neigh->ah)
					ipoib_put_ah(neigh->ah);
				ipoib_neigh_free(dev, neigh);
			}
			list_del(&p->list);
			kfree(p);
		}
	}
	spin_unlock(&priv->lock);
	spin_unlock_irqrestore(&priv->tx_lock, flags);
}

static void ipoib_cm_tx_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.reap_task);
	struct ipoib_cm_tx *p;

	spin_lock_irq(&priv->tx_lock);
	spin_lock(&priv->lock);
	while (!list_empty(&priv->cm.reap_list)) {
		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
		list_del(&p->list);
		spin_unlock(&priv->lock);
		spin_unlock_irq(&priv->tx_lock);
		ipoib_cm_tx_destroy(p);
		spin_lock_irq(&priv->tx_lock);
		spin_lock(&priv->lock);
	}
	spin_unlock(&priv->lock);
	spin_unlock_irq(&priv->tx_lock);
}

static void ipoib_cm_skb_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.skb_task);
	struct net_device *dev = priv->dev;
	struct sk_buff *skb;

	unsigned mtu = priv->mcast_mtu;

	spin_lock_irq(&priv->tx_lock);
	spin_lock(&priv->lock);
	while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
		spin_unlock(&priv->lock);
		spin_unlock_irq(&priv->tx_lock);
		if (skb->protocol == htons(ETH_P_IP))
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		else if (skb->protocol == htons(ETH_P_IPV6))
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
#endif
		dev_kfree_skb_any(skb);
		spin_lock_irq(&priv->tx_lock);
		spin_lock(&priv->lock);
	}
	spin_unlock(&priv->lock);
	spin_unlock_irq(&priv->tx_lock);
}

void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
			   unsigned int mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int e = skb_queue_empty(&priv->cm.skb_queue);

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	skb_queue_tail(&priv->cm.skb_queue, skb);
	if (e)
		queue_work(ipoib_workqueue, &priv->cm.skb_task);
}

static void ipoib_cm_rx_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.rx_reap_task);
	struct ipoib_cm_rx *p, *n;
	LIST_HEAD(list);

	spin_lock_irq(&priv->lock);
	list_splice_init(&priv->cm.rx_reap_list, &list);
	spin_unlock_irq(&priv->lock);

	list_for_each_entry_safe(p, n, &list, list) {
		ib_destroy_cm_id(p->id);
		ib_destroy_qp(p->qp);
		kfree(p);
	}
}

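/*
 * Scan the passive connection list (kept in LRU order) and push any
 * connection idle for longer than IPOIB_CM_RX_TIMEOUT into the error
 * state so it gets drained and reaped.
 */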
static void ipoib_cm_stale_task(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.stale_task.work);
	struct ipoib_cm_rx *p;
	int ret;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		/* List is sorted by LRU, start from tail,
		 * stop when we see a recently used entry */
		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
			break;
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	if (!list_empty(&priv->cm.passive_ids))
		queue_delayed_work(ipoib_workqueue,
				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	spin_unlock_irq(&priv->lock);
}

static ssize_t show_mode(struct device *d, struct device_attribute *attr,
			 char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		return sprintf(buf, "connected\n");
	else
		return sprintf(buf, "datagram\n");
}

static ssize_t set_mode(struct device *d, struct device_attribute *attr,
			const char *buf, size_t count)
{
	struct net_device *dev = to_net_dev(d);
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* flush paths if we switch modes so that connections are restarted */
	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		ipoib_warn(priv, "enabling connected mode "
			   "will cause multicast packet drops\n");
		ipoib_flush_paths(dev);
		return count;
	}

	if (!strcmp(buf, "datagram\n")) {
		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		dev->mtu = min(priv->mcast_mtu, dev->mtu);
		ipoib_flush_paths(dev);
		return count;
	}

	return -EINVAL;
}

static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);

int ipoib_cm_add_mode_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_mode);
}

int ipoib_cm_dev_init(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_srq_init_attr srq_init_attr = {
		.attr = {
			.max_wr  = ipoib_recvq_size,
			.max_sge = IPOIB_CM_RX_SG
		}
	};
	int ret, i;

	INIT_LIST_HEAD(&priv->cm.passive_ids);
	INIT_LIST_HEAD(&priv->cm.reap_list);
	INIT_LIST_HEAD(&priv->cm.start_list);
	INIT_LIST_HEAD(&priv->cm.rx_error_list);
	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);

	skb_queue_head_init(&priv->cm.skb_queue);

	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
	if (IS_ERR(priv->cm.srq)) {
		ret = PTR_ERR(priv->cm.srq);
		priv->cm.srq = NULL;
		return ret;
	}

	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
				    GFP_KERNEL);
	if (!priv->cm.srq_ring) {
		printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
		       priv->ca->name, ipoib_recvq_size);
		ipoib_cm_dev_cleanup(dev);
		return -ENOMEM;
	}

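	/*
	 * Set up the scatter/gather template shared by every SRQ receive:
	 * one IPOIB_CM_HEAD_SIZE header buffer followed by full-page
	 * fragments.
	 */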
	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
		priv->cm.rx_sge[i].lkey = priv->mr->lkey;

	priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
	for (i = 1; i < IPOIB_CM_RX_SG; ++i)
		priv->cm.rx_sge[i].length = PAGE_SIZE;
	priv->cm.rx_wr.next = NULL;
	priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
	priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
					   priv->cm.srq_ring[i].mapping)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			ipoib_cm_dev_cleanup(dev);
			return -ENOMEM;
		}
		if (ipoib_cm_post_receive(dev, i)) {
			ipoib_warn(priv, "ipoib_cm_post_receive failed for buf %d\n", i);
			ipoib_cm_dev_cleanup(dev);
			return -EIO;
		}
	}

	priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
	return 0;
}

void ipoib_cm_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int i, ret;

	if (!priv->cm.srq)
		return;

	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");

	ret = ib_destroy_srq(priv->cm.srq);
	if (ret)
		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);

	priv->cm.srq = NULL;
	if (!priv->cm.srq_ring)
		return;
	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->cm.srq_ring[i].skb) {
			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
					      priv->cm.srq_ring[i].mapping);
			dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
			priv->cm.srq_ring[i].skb = NULL;
		}
	kfree(priv->cm.srq_ring);
	priv->cm.srq_ring = NULL;
}