rdsv3_impl.c revision 12198:4db936bda957
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24#include <sys/types.h> 25#include <sys/stream.h> 26#include <sys/dlpi.h> 27#include <sys/stropts.h> 28#include <sys/strsun.h> 29#include <sys/sysmacros.h> 30#include <sys/strlog.h> 31#include <sys/ddi.h> 32#include <sys/cmn_err.h> 33#include <sys/socket.h> 34#include <net/if.h> 35#include <net/if_types.h> 36#include <netinet/in.h> 37#include <sys/ethernet.h> 38#include <inet/arp.h> 39#include <inet/ip.h> 40#include <inet/ip6.h> 41#include <inet/ip_ire.h> 42#include <inet/ip_if.h> 43#include <inet/ip_ftable.h> 44 45#include <sys/sunddi.h> 46#include <sys/ksynch.h> 47 48#include <sys/rds.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/sockio.h> 52#include <sys/sysmacros.h> 53#include <inet/common.h> 54#include <inet/ip.h> 55#include <net/if_types.h> 56 57#include <sys/ib/clients/rdsv3/rdsv3.h> 58#include <sys/ib/clients/rdsv3/rdma.h> 59#include <sys/ib/clients/rdsv3/ib.h> 60#include <sys/ib/clients/rdsv3/rdsv3_impl.h> 61#include <sys/ib/clients/rdsv3/rdsv3_debug.h> 62 63#include <sys/dls.h> 64#include <sys/mac.h> 65#include <sys/mac_client.h> 66#include <sys/mac_provider.h> 67#include <sys/mac_client_priv.h> 68 69ddi_taskq_t *rdsv3_taskq = NULL; 70extern kmem_cache_t *rdsv3_alloc_cache; 71 72extern unsigned int ip_ocsum(ushort_t *address, int halfword_count, 73 unsigned int sum); 74 75/* 76 * Check if the IP interface named by `lifrp' is RDS-capable. 77 */ 78boolean_t 79rdsv3_capable_interface(struct lifreq *lifrp) 80{ 81 char ifname[LIFNAMSIZ]; 82 char drv[MAXLINKNAMELEN]; 83 uint_t ppa; 84 char *cp; 85 86 RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter"); 87 88 if (lifrp->lifr_type == IFT_IB) 89 return (B_TRUE); 90 91 /* 92 * Strip off the logical interface portion before getting 93 * intimate with the name. 94 */ 95 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 96 if ((cp = strchr(ifname, ':')) != NULL) 97 *cp = '\0'; 98 99 if (strcmp("lo0", ifname) == 0) { 100 /* 101 * loopback is considered RDS-capable 102 */ 103 return (B_TRUE); 104 } 105 106 return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && 107 rdsv3_if_lookup_by_name(drv)); 108} 109 110int 111rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs) 112{ 113 struct lifnum lifn; 114 struct lifconf lifc; 115 struct lifreq *lp, *rlp, lifr; 116 int rval = 0; 117 int numifs; 118 int bufsize, rbufsize; 119 void *buf, *rbuf; 120 int i, j, n, rc; 121 122 *ipaddrs = NULL; 123 *size = 0; 124 *nifs = 0; 125 126 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter"); 127 128retry_count: 129 /* snapshot the current number of interfaces */ 130 lifn.lifn_family = PF_UNSPEC; 131 lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; 132 lifn.lifn_count = 0; 133 rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval, 134 CRED()); 135 if (rval != 0) { 136 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", 137 "ksocket_ioctl returned: %d", rval); 138 return (rval); 139 } 140 141 numifs = lifn.lifn_count; 142 if (numifs <= 0) { 143 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found"); 144 return (0); 145 } 146 147 /* allocate extra room in case more interfaces appear */ 148 numifs += 10; 149 150 /* get the interface names and ip addresses */ 151 bufsize = numifs * sizeof (struct lifreq); 152 buf = kmem_alloc(bufsize, KM_SLEEP); 153 154 lifc.lifc_family = AF_UNSPEC; 155 lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; 156 lifc.lifc_len = bufsize; 157 lifc.lifc_buf = buf; 158 rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED()); 159 if (rc != 0) { 160 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed"); 161 kmem_free(buf, bufsize); 162 return (rc); 163 } 164 /* if our extra room is used up, try again */ 165 if (bufsize <= lifc.lifc_len) { 166 kmem_free(buf, bufsize); 167 buf = NULL; 168 goto retry_count; 169 } 170 /* calc actual number of ifconfs */ 171 n = lifc.lifc_len / sizeof (struct lifreq); 172 173 /* 174 * Count the RDS interfaces 175 */ 176 for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) { 177 178 /* 179 * Copy as the SIOCGLIFFLAGS ioctl is destructive 180 */ 181 bcopy(lp, &lifr, sizeof (struct lifreq)); 182 /* 183 * fetch the flags using the socket of the correct family 184 */ 185 switch (lifr.lifr_addr.ss_family) { 186 case AF_INET: 187 rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, 188 &rval, CRED()); 189 break; 190 default: 191 continue; 192 } 193 194 if (rc != 0) continue; 195 196 /* 197 * If we got the flags, skip uninteresting 198 * interfaces based on flags 199 */ 200 if ((lifr.lifr_flags & IFF_UP) != IFF_UP) 201 continue; 202 if (lifr.lifr_flags & 203 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 204 continue; 205 if (!rdsv3_capable_interface(&lifr)) 206 continue; 207 j++; 208 } 209 210 if (j <= 0) { 211 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces"); 212 kmem_free(buf, bufsize); 213 return (rval); 214 } 215 216 numifs = j; 217 218 /* This is the buffer we pass back */ 219 rbufsize = numifs * sizeof (struct lifreq); 220 rbuf = kmem_alloc(rbufsize, KM_SLEEP); 221 rlp = (struct lifreq *)rbuf; 222 223 /* 224 * Examine the array of interfaces and filter uninteresting ones 225 */ 226 for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) { 227 228 /* 229 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive 230 */ 231 bcopy(lp, &lifr, sizeof (struct lifreq)); 232 /* 233 * fetch the flags using the socket of the correct family 234 */ 235 switch (lifr.lifr_addr.ss_family) { 236 case AF_INET: 237 rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, 238 &rval, CRED()); 239 break; 240 default: 241 continue; 242 } 243 244 245 if (rc != 0) { 246 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", 247 "ksocket_ioctl failed" " for %s", lifr.lifr_name); 248 continue; 249 } 250 251 /* 252 * If we got the flags, skip uninteresting 253 * interfaces based on flags 254 */ 255 if ((lifr.lifr_flags & IFF_UP) != IFF_UP) 256 continue; 257 if (lifr.lifr_flags & 258 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 259 continue; 260 if (!rdsv3_capable_interface(&lifr)) 261 continue; 262 263 /* save the record */ 264 bcopy(lp, rlp, sizeof (struct lifreq)); 265 rlp++; 266 } 267 268 kmem_free(buf, bufsize); 269 270 *ipaddrs = rbuf; 271 *size = rbufsize; 272 *nifs = numifs; 273 274 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return"); 275 276 return (rval); 277} 278 279/* 280 * Check if the IP interface named by `ifrp' is RDS-capable. 281 */ 282boolean_t 283rdsv3_capable_interface_old(struct ifreq *ifrp) 284{ 285 char ifname[IFNAMSIZ]; 286 char drv[MAXLINKNAMELEN]; 287 uint_t ppa; 288 char *cp; 289 290 RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter"); 291 292 /* 293 * Strip off the logical interface portion before getting 294 * intimate with the name. 295 */ 296 (void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ); 297 if ((cp = strchr(ifname, ':')) != NULL) 298 *cp = '\0'; 299 300 RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname); 301 302 if ((strcmp("lo0", ifname) == 0) || 303 (strncmp("ibd", ifname, 3) == 0)) { 304 /* 305 * loopback and IB are considered RDS-capable 306 */ 307 return (B_TRUE); 308 } 309 310 return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && 311 rdsv3_if_lookup_by_name(drv)); 312} 313 314int 315rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs) 316{ 317 uint_t ifn; 318 struct ifconf ifc; 319 struct ifreq *lp, *rlp, ifr; 320 int rval = 0; 321 int numifs; 322 int bufsize, rbufsize; 323 void *buf, *rbuf; 324 int i, j, n, rc; 325 326 *ipaddrs = NULL; 327 *size = 0; 328 *nifs = 0; 329 330 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter"); 331 332retry_count: 333 rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval, 334 CRED()); 335 if (rval != 0) { 336 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 337 "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval); 338 return (rval); 339 } 340 341 numifs = ifn; 342 if (numifs <= 0) { 343 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found"); 344 return (0); 345 } 346 347 /* allocate extra room in case more interfaces appear */ 348 numifs += 10; 349 350 /* get the interface names and ip addresses */ 351 bufsize = numifs * sizeof (struct ifreq); 352 buf = kmem_alloc(bufsize, KM_SLEEP); 353 354 ifc.ifc_len = bufsize; 355 ifc.ifc_buf = buf; 356 rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED()); 357 if (rc != 0) { 358 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 359 "SIOCGLIFCONF failed: %d", rc); 360 kmem_free(buf, bufsize); 361 return (rc); 362 } 363 /* if our extra room is used up, try again */ 364 if (bufsize <= ifc.ifc_len) { 365 kmem_free(buf, bufsize); 366 buf = NULL; 367 goto retry_count; 368 } 369 /* calc actual number of ifconfs */ 370 n = ifc.ifc_len / sizeof (struct ifreq); 371 372 /* 373 * Count the RDS interfaces 374 */ 375 for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) { 376 377 /* 378 * Copy as the SIOCGIFFLAGS ioctl is destructive 379 */ 380 bcopy(lp, &ifr, sizeof (struct ifreq)); 381 /* 382 * fetch the flags using the socket of the correct family 383 */ 384 switch (ifr.ifr_addr.sa_family) { 385 case AF_INET: 386 rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, 387 &rval, CRED()); 388 break; 389 default: 390 continue; 391 } 392 393 if (rc != 0) continue; 394 395 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 396 "1. ifr_name: %s, flags: %d", ifr.ifr_name, 397 (ushort_t)ifr.ifr_flags); 398 399 /* 400 * If we got the flags, skip uninteresting 401 * interfaces based on flags 402 */ 403 if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) 404 continue; 405 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 406 "2. ifr_name: %s, flags: %d", ifr.ifr_name, 407 (ushort_t)ifr.ifr_flags); 408 if (((ushort_t)ifr.ifr_flags) & 409 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 410 continue; 411 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 412 "3. ifr_name: %s, flags: %d", ifr.ifr_name, 413 (ushort_t)ifr.ifr_flags); 414 if (!rdsv3_capable_interface_old(&ifr)) 415 continue; 416 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 417 "4. ifr_name: %s, flags: %d", ifr.ifr_name, 418 (ushort_t)ifr.ifr_flags); 419 j++; 420 } 421 422 if (j <= 0) { 423 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces"); 424 kmem_free(buf, bufsize); 425 return (rval); 426 } 427 428 numifs = j; 429 430 /* This is the buffer we pass back */ 431 rbufsize = numifs * sizeof (struct ifreq); 432 rbuf = kmem_alloc(rbufsize, KM_SLEEP); 433 rlp = (struct ifreq *)rbuf; 434 435 /* 436 * Examine the array of interfaces and filter uninteresting ones 437 */ 438 for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) { 439 440 /* 441 * Copy the address as the SIOCGIFFLAGS ioctl is destructive 442 */ 443 bcopy(lp, &ifr, sizeof (struct ifreq)); 444 /* 445 * fetch the flags using the socket of the correct family 446 */ 447 switch (ifr.ifr_addr.sa_family) { 448 case AF_INET: 449 rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, 450 &rval, CRED()); 451 break; 452 default: 453 continue; 454 } 455 456 457 if (rc != 0) { 458 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 459 "ksocket_ioctl failed: %d for %s", 460 rc, ifr.ifr_name); 461 continue; 462 } 463 464 /* 465 * If we got the flags, skip uninteresting 466 * interfaces based on flags 467 */ 468 if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) 469 continue; 470 if (((ushort_t)ifr.ifr_flags) & 471 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 472 continue; 473 if (!rdsv3_capable_interface_old(&ifr)) 474 continue; 475 476 /* save the record */ 477 bcopy(lp, rlp, sizeof (struct ifreq)); 478 rlp++; 479 } 480 481 kmem_free(buf, bufsize); 482 483 *ipaddrs = rbuf; 484 *size = rbufsize; 485 *nifs = numifs; 486 487 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return"); 488 489 return (rval); 490} 491 492boolean_t 493rdsv3_isloopback(ipaddr_t addr) 494{ 495 ip_stack_t *ipst; 496 497 ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; 498 ASSERT(ipst != NULL); 499 if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) { 500 netstack_rele(ipst->ips_netstack); 501 return (B_FALSE); 502 } 503 netstack_rele(ipst->ips_netstack); 504 return (B_TRUE); 505} 506 507/* 508 * Work Queue Implementation 509 */ 510 511#define RDSV3_WQ_THREAD_IDLE 0 512#define RDSV3_WQ_THREAD_RUNNING 1 513#define RDSV3_WQ_THREAD_FLUSHING 2 514#define RDSV3_WQ_THREAD_EXITING 3 515 516/* worker thread */ 517void 518rdsv3_worker_thread(void *arg) 519{ 520 rdsv3_workqueue_struct_t *wq = arg; 521 rdsv3_work_t *work; 522 523 RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq); 524 525 mutex_enter(&wq->wq_lock); 526 work = list_remove_head(&wq->wq_queue); 527 while (work) { 528 mutex_exit(&wq->wq_lock); 529 530 /* process work */ 531 work->func(work); 532 533 mutex_enter(&wq->wq_lock); 534 work = list_remove_head(&wq->wq_queue); 535 } 536 537 /* No more work, go home, until called again */ 538 if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) { 539 wq->wq_state = RDSV3_WQ_THREAD_IDLE; 540 } 541 mutex_exit(&wq->wq_lock); 542 543 RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq); 544} 545 546/* XXX */ 547void 548rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq) 549{ 550 RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq); 551 552 mutex_enter(&wq->wq_lock); 553 switch (wq->wq_state) { 554 case RDSV3_WQ_THREAD_IDLE: 555 /* nothing to do */ 556 ASSERT(list_is_empty(&wq->wq_queue)); 557 break; 558 559 case RDSV3_WQ_THREAD_RUNNING: 560 wq->wq_state = RDSV3_WQ_THREAD_FLUSHING; 561 /* FALLTHRU */ 562 case RDSV3_WQ_THREAD_FLUSHING: 563 /* already flushing, wait until the flushing is complete */ 564 do { 565 mutex_exit(&wq->wq_lock); 566 delay(drv_usectohz(1000000)); 567 mutex_enter(&wq->wq_lock); 568 } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); 569 break; 570 case RDSV3_WQ_THREAD_EXITING: 571 mutex_exit(&wq->wq_lock); 572 rdsv3_worker_thread(wq); 573 return; 574 } 575 mutex_exit(&wq->wq_lock); 576 577 RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq); 578} 579 580void 581rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp) 582{ 583 RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp); 584 585 mutex_enter(&wq->wq_lock); 586 587 if (list_link_active(&wp->work_item)) { 588 /* This is already in the queue, ignore this call */ 589 mutex_exit(&wq->wq_lock); 590 RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp); 591 return; 592 } 593 594 switch (wq->wq_state) { 595 case RDSV3_WQ_THREAD_RUNNING: 596 list_insert_tail(&wq->wq_queue, wp); 597 mutex_exit(&wq->wq_lock); 598 break; 599 600 case RDSV3_WQ_THREAD_FLUSHING: 601 do { 602 mutex_exit(&wq->wq_lock); 603 delay(drv_usectohz(1000000)); 604 mutex_enter(&wq->wq_lock); 605 } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); 606 607 if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) { 608 list_insert_tail(&wq->wq_queue, wp); 609 mutex_exit(&wq->wq_lock); 610 break; 611 } 612 /* FALLTHRU */ 613 614 case RDSV3_WQ_THREAD_IDLE: 615 list_insert_tail(&wq->wq_queue, wp); 616 wq->wq_state = RDSV3_WQ_THREAD_RUNNING; 617 mutex_exit(&wq->wq_lock); 618 619 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq, 620 DDI_SLEEP); 621 break; 622 623 case RDSV3_WQ_THREAD_EXITING: 624 mutex_exit(&wq->wq_lock); 625 break; 626 } 627 628 RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp); 629} 630 631/* timeout handler for delayed work queuing */ 632void 633rdsv3_work_timeout_handler(void *arg) 634{ 635 rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg; 636 637 RDSV3_DPRINTF4("rdsv3_work_timeout_handler", 638 "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work); 639 640 mutex_enter(&dwp->lock); 641 dwp->timeid = 0; 642 mutex_exit(&dwp->lock); 643 644 mutex_enter(&dwp->wq->wq_lock); 645 dwp->wq->wq_pending--; 646 if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) { 647 mutex_exit(&dwp->wq->wq_lock); 648 return; 649 } 650 mutex_exit(&dwp->wq->wq_lock); 651 652 rdsv3_queue_work(dwp->wq, &dwp->work); 653 654 RDSV3_DPRINTF4("rdsv3_work_timeout_handler", 655 "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work); 656} 657 658void 659rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq, 660 rdsv3_delayed_work_t *dwp, uint_t delay) 661{ 662 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", 663 "Enter(wq: %p, wp: %p)", wq, dwp); 664 665 if (delay == 0) { 666 rdsv3_queue_work(wq, &dwp->work); 667 return; 668 } 669 670 mutex_enter(&wq->wq_lock); 671 if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) { 672 mutex_exit(&wq->wq_lock); 673 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", 674 "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp); 675 return; 676 } 677 wq->wq_pending++; 678 mutex_exit(&wq->wq_lock); 679 680 mutex_enter(&dwp->lock); 681 if (dwp->timeid == 0) { 682 dwp->wq = wq; 683 dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp, 684 jiffies + (delay * rdsv3_one_sec_in_hz)); 685 mutex_exit(&dwp->lock); 686 } else { 687 mutex_exit(&dwp->lock); 688 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p", 689 dwp); 690 mutex_enter(&wq->wq_lock); 691 wq->wq_pending--; 692 mutex_exit(&wq->wq_lock); 693 } 694 695 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", 696 "Return(wq: %p, wp: %p)", wq, dwp); 697} 698 699void 700rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp) 701{ 702 RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", 703 "Enter(wq: %p, dwp: %p)", dwp->wq, dwp); 704 705 mutex_enter(&dwp->lock); 706 if (dwp->timeid != 0) { 707 (void) untimeout(dwp->timeid); 708 dwp->timeid = 0; 709 } else { 710 RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", 711 "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp); 712 mutex_exit(&dwp->lock); 713 return; 714 } 715 mutex_exit(&dwp->lock); 716 717 mutex_enter(&dwp->wq->wq_lock); 718 dwp->wq->wq_pending--; 719 mutex_exit(&dwp->wq->wq_lock); 720 721 RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", 722 "Return(wq: %p, dwp: %p)", dwp->wq, dwp); 723} 724 725void 726rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq) 727{ 728 RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter"); 729 730 ASSERT(wq); 731 732 mutex_enter(&wq->wq_lock); 733 wq->wq_state = RDSV3_WQ_THREAD_EXITING; 734 735 while (wq->wq_pending > 0) { 736 mutex_exit(&wq->wq_lock); 737 delay(drv_usectohz(1000000)); 738 mutex_enter(&wq->wq_lock); 739 }; 740 mutex_exit(&wq->wq_lock); 741 742 rdsv3_flush_workqueue(wq); 743 744 list_destroy(&wq->wq_queue); 745 mutex_destroy(&wq->wq_lock); 746 kmem_free(wq, sizeof (rdsv3_workqueue_struct_t)); 747 748 ASSERT(rdsv3_taskq); 749 ddi_taskq_destroy(rdsv3_taskq); 750 751 wq = NULL; 752 rdsv3_taskq = NULL; 753 754 RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return"); 755} 756 757/* ARGSUSED */ 758void 759rdsv3_rdma_init_worker(struct rdsv3_work_s *work) 760{ 761 rdsv3_rdma_init(); 762} 763 764#define RDSV3_NUM_TASKQ_THREADS 4 765rdsv3_workqueue_struct_t * 766rdsv3_create_task_workqueue(char *name) 767{ 768 rdsv3_workqueue_struct_t *wq; 769 770 RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)", 771 rdsv3_dev_info); 772 773 rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name, 774 RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0); 775 if (rdsv3_taskq == NULL) { 776 RDSV3_DPRINTF1(__FILE__, 777 "ddi_taskq_create failed for rdsv3_taskq"); 778 return (NULL); 779 } 780 781 wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP); 782 if (wq == NULL) { 783 RDSV3_DPRINTF1(__FILE__, "kmem_zalloc failed for wq"); 784 ddi_taskq_destroy(rdsv3_taskq); 785 return (NULL); 786 } 787 788 list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s), 789 offsetof(struct rdsv3_work_s, work_item)); 790 mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL); 791 wq->wq_state = RDSV3_WQ_THREAD_IDLE; 792 wq->wq_pending = 0; 793 rdsv3_one_sec_in_hz = drv_usectohz(1000000); 794 795 RDSV3_DPRINTF2("create_singlethread_workqueue", "Return"); 796 797 return (wq); 798} 799 800/* 801 * Implementation for struct sock 802 */ 803 804void 805rdsv3_sock_exit_data(struct rsock *sk) 806{ 807 struct rdsv3_sock *rs = sk->sk_protinfo; 808 809 RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); 810 811 ASSERT(rs != NULL); 812 ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD)); 813 814 rs->rs_sk = NULL; 815 816 list_destroy(&rs->rs_send_queue); 817 list_destroy(&rs->rs_notify_queue); 818 list_destroy(&rs->rs_recv_queue); 819 820 rw_destroy(&rs->rs_recv_lock); 821 mutex_destroy(&rs->rs_lock); 822 823 mutex_destroy(&rs->rs_rdma_lock); 824 avl_destroy(&rs->rs_rdma_keys); 825 826 rdsv3_exit_waitqueue(sk->sk_sleep); 827 kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t)); 828 mutex_destroy(&sk->sk_lock); 829 830 kmem_cache_free(rdsv3_alloc_cache, sk); 831 RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); 832} 833 834/* XXX - figure out right values */ 835#define RDSV3_RECV_HIWATER (256 * 1024) 836#define RDSV3_RECV_LOWATER 128 837#define RDSV3_XMIT_HIWATER (256 * 1024) 838#define RDSV3_XMIT_LOWATER 1024 839 840struct rsock * 841rdsv3_sk_alloc() 842{ 843 struct rsock *sk; 844 845 sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP); 846 if (sk == NULL) { 847 RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed"); 848 return (NULL); 849 } 850 851 bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock)); 852 return (sk); 853} 854 855void 856rdsv3_sock_init_data(struct rsock *sk) 857{ 858 sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP); 859 rdsv3_init_waitqueue(sk->sk_sleep); 860 861 mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL); 862 sk->sk_refcount = 1; 863 sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1); 864 sk->sk_sndbuf = RDSV3_XMIT_HIWATER; 865 sk->sk_rcvbuf = RDSV3_RECV_HIWATER; 866} 867 868/* XXX - not complete */ 869void 870rdsv3_poll_wait(struct rsock *sk, rdsv3_wait_queue_t *waitq, short events) 871{ 872 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 873 874 if (events & POLLIN) { 875 rw_enter(&rs->rs_recv_lock, RW_READER); 876 while (list_is_empty(&rs->rs_recv_queue) && 877 list_is_empty(&rs->rs_notify_queue)) { 878 rw_exit(&rs->rs_recv_lock); 879 mutex_enter(&waitq->waitq_mutex); 880 (void) cv_wait_sig(&waitq->waitq_cv, 881 &waitq->waitq_mutex); 882 mutex_exit(&waitq->waitq_mutex); 883 rw_enter(&rs->rs_recv_lock, RW_READER); 884 } 885 rw_exit(&rs->rs_recv_lock); 886 } 887} 888 889/* 890 * Connection cache 891 */ 892/* ARGSUSED */ 893int 894rdsv3_conn_constructor(void *buf, void *arg, int kmflags) 895{ 896 struct rdsv3_connection *conn = buf; 897 898 bzero(conn, sizeof (struct rdsv3_connection)); 899 900 conn->c_next_tx_seq = 1; 901 mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL); 902 mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL); 903 list_create(&conn->c_send_queue, sizeof (struct rdsv3_message), 904 offsetof(struct rdsv3_message, m_conn_item)); 905 list_create(&conn->c_retrans, sizeof (struct rdsv3_message), 906 offsetof(struct rdsv3_message, m_conn_item)); 907 return (0); 908} 909 910/* ARGSUSED */ 911void 912rdsv3_conn_destructor(void *buf, void *arg) 913{ 914 struct rdsv3_connection *conn = buf; 915 916 ASSERT(list_is_empty(&conn->c_send_queue)); 917 ASSERT(list_is_empty(&conn->c_retrans)); 918 list_destroy(&conn->c_send_queue); 919 list_destroy(&conn->c_retrans); 920 mutex_destroy(&conn->c_send_lock); 921 mutex_destroy(&conn->c_lock); 922} 923 924int 925rdsv3_conn_compare(const void *conn1, const void *conn2) 926{ 927 uint32_be_t laddr1, faddr1, laddr2, faddr2; 928 929 laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr; 930 laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr; 931 932 if (laddr1 == laddr2) { 933 faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr; 934 faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr; 935 if (faddr1 == faddr2) 936 return (0); 937 if (faddr1 < faddr2) 938 return (-1); 939 return (1); 940 } 941 942 if (laddr1 < laddr2) 943 return (-1); 944 945 return (1); 946} 947 948/* loop.c */ 949extern kmutex_t loop_conns_lock; 950extern list_t loop_conns; 951 952struct rdsv3_loop_connection 953{ 954 struct list_node loop_node; 955 struct rdsv3_connection *conn; 956}; 957 958void 959rdsv3_loop_init(void) 960{ 961 list_create(&loop_conns, sizeof (struct rdsv3_loop_connection), 962 offsetof(struct rdsv3_loop_connection, loop_node)); 963 mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL); 964} 965 966/* rdma.c */ 967/* IB Rkey is used here for comparison */ 968int 969rdsv3_mr_compare(const void *mr1, const void *mr2) 970{ 971 uint32_t key1 = *(uint32_t *)mr1; 972 uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key; 973 974 if (key1 < key2) 975 return (-1); 976 if (key1 > key2) 977 return (1); 978 return (0); 979} 980 981/* transport.c */ 982extern list_t transports; 983extern krwlock_t trans_sem; 984 985void 986rdsv3_trans_exit(void) 987{ 988 struct rdsv3_transport *trans; 989 990 RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter"); 991 992 /* currently, only IB transport */ 993 rw_enter(&trans_sem, RW_READER); 994 if (!list_is_empty(&transports)) 995 trans = list_head(&transports); 996 else 997 trans = NULL; 998 rw_exit(&trans_sem); 999 1000 /* trans->exit() will remove the trans from the list */ 1001 if (trans) 1002 trans->exit(); 1003 1004 list_destroy(&transports); 1005 rw_destroy(&trans_sem); 1006 1007 RDSV3_DPRINTF2("rdsv3_trans_exit", "Return"); 1008} 1009 1010void 1011rdsv3_trans_init() 1012{ 1013 RDSV3_DPRINTF2("rdsv3_trans_init", "Enter"); 1014 1015 list_create(&transports, sizeof (struct rdsv3_transport), 1016 offsetof(struct rdsv3_transport, t_item)); 1017 rw_init(&trans_sem, NULL, RW_DRIVER, NULL); 1018 1019 RDSV3_DPRINTF2("rdsv3_trans_init", "Return"); 1020} 1021 1022int 1023rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size, 1024 void *payload) 1025{ 1026 struct cmsghdr *cp; 1027 char *bp; 1028 size_t cmlen; 1029 size_t cmspace; 1030 size_t bufsz; 1031 1032 RDSV3_DPRINTF4("rdsv3_put_cmsg", 1033 "Enter(msg: %p level: %d type: %d sz: %d)", 1034 msg, level, type, size); 1035 1036 if (msg == NULL || msg->msg_controllen == 0 || payload == NULL) { 1037 return (0); 1038 } 1039 /* check for first cmsg or this is another cmsg to be appended */ 1040 if (msg->msg_control == NULL) 1041 msg->msg_controllen = 0; 1042 1043 cmlen = CMSG_LEN(size); 1044 cmspace = CMSG_SPACE(size); 1045 bufsz = msg->msg_controllen + cmspace; 1046 1047 /* extend the existing cmsg to append the next cmsg */ 1048 bp = kmem_alloc(bufsz, KM_SLEEP); 1049 if (msg->msg_control) { 1050 bcopy(msg->msg_control, bp, msg->msg_controllen); 1051 kmem_free(msg->msg_control, (size_t)msg->msg_controllen); 1052 } 1053 1054 /* assign payload the proper cmsg location */ 1055 cp = (struct cmsghdr *)(bp + msg->msg_controllen); 1056 cp->cmsg_len = cmlen; 1057 cp->cmsg_level = level; 1058 cp->cmsg_type = type; 1059 1060 bcopy(payload, CMSG_DATA(cp), cmlen - 1061 (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr))); 1062 1063 msg->msg_control = bp; 1064 msg->msg_controllen = bufsz; 1065 1066 RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len); 1067 1068 return (0); 1069} 1070 1071/* bind.c */ 1072extern kmutex_t rdsv3_bind_lock; 1073extern avl_tree_t rdsv3_bind_tree; 1074 1075/* ARGSUSED */ 1076int 1077rdsv3_verify_bind_address(ipaddr_t addr) 1078{ 1079 return (1); 1080} 1081 1082/* XXX - need to enhance to compare IP address and port */ 1083int 1084rdsv3_bind_node_compare(const void *a, const void *b) 1085{ 1086 uint16_be_t port = *(in_port_t *)a; 1087 struct rdsv3_sock *rs = (struct rdsv3_sock *)b; 1088 1089 RDSV3_DPRINTF5("rdsv3_bind_node_compare", "Enter (%x %x)", port, 1090 rs->rs_bound_port); 1091 1092 if (port > rs->rs_bound_port) 1093 return (+1); 1094 else if (port < rs->rs_bound_port) 1095 return (-1); 1096 1097 return (0); 1098} 1099 1100void 1101rdsv3_bind_tree_init() 1102{ 1103 RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter"); 1104 1105 mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL); 1106 avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare, 1107 sizeof (struct rdsv3_sock), 1108 offsetof(struct rdsv3_sock, rs_bound_node)); 1109 1110 RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return"); 1111} 1112 1113void 1114rdsv3_bind_tree_exit() 1115{ 1116 RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter"); 1117 1118 ASSERT(avl_is_empty(&rdsv3_bind_tree)); 1119 avl_destroy(&rdsv3_bind_tree); 1120 mutex_destroy(&rdsv3_bind_lock); 1121 1122 RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return"); 1123} 1124 1125/* checksum */ 1126uint16_t 1127rdsv3_ip_fast_csum(void *hdr, size_t length) 1128{ 1129 return (0xffff & 1130 (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0))); 1131} 1132 1133/* scatterlist implementation */ 1134/* ARGSUSED */ 1135caddr_t 1136rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat, 1137 uint_t offset) 1138{ 1139 return (0); 1140} 1141 1142uint_t 1143rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat, 1144 uint_t num) 1145{ 1146 struct rdsv3_scatterlist *s, *first; 1147 ibt_iov_t *iov; 1148 ibt_wr_ds_t *sgl; 1149 ibt_iov_attr_t iov_attr; 1150 ibt_send_wr_t swr; 1151 uint_t i; 1152 1153 RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num); 1154 1155 s = first = &scat[0]; 1156 ASSERT(first->mihdl == NULL); 1157 1158 iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP); 1159 sgl = kmem_zalloc((num * 2) * sizeof (ibt_wr_ds_t), KM_SLEEP); 1160 1161 for (i = 0; i < num; i++, s++) { 1162 iov[i].iov_addr = s->vaddr; 1163 iov[i].iov_len = s->length; 1164 } 1165 1166 iov_attr.iov_as = NULL; 1167 iov_attr.iov = iov; 1168 iov_attr.iov_buf = NULL; 1169 iov_attr.iov_list_len = num; 1170 iov_attr.iov_wr_nds = num * 2; 1171 iov_attr.iov_lso_hdr_sz = 0; 1172 iov_attr.iov_flags = IBT_IOV_SLEEP; 1173 1174 swr.wr_sgl = sgl; 1175 1176 i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev), 1177 &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl); 1178 kmem_free(iov, num * sizeof (ibt_iov_t)); 1179 if (i != IBT_SUCCESS) { 1180 RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg", 1181 "ibt_map_mem_iov returned: %d", i); 1182 return (0); 1183 } 1184 1185 s = first; 1186 for (i = 0; i < num; i++, s++, sgl++) { 1187 s->sgl = sgl; 1188 } 1189 1190 return (num); 1191} 1192 1193void 1194rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat, 1195 uint_t num) 1196{ 1197 /* Zero length messages have no scatter gather entries */ 1198 if (num != 0) { 1199 ASSERT(scat->mihdl != NULL); 1200 ASSERT(scat->sgl != NULL); 1201 1202 (void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl); 1203 1204 kmem_free(scat->sgl, (num * 2) * sizeof (ibt_wr_ds_t)); 1205 scat->sgl = NULL; 1206 scat->mihdl = NULL; 1207 } 1208} 1209 1210int 1211rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) 1212{ 1213 caddr_t addr; 1214 size_t size; 1215 ibt_mr_attr_t mr_attr; 1216 ibt_mr_desc_t mr_desc; 1217 ibt_mr_hdl_t mr_hdl; 1218 int ret; 1219 1220 RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev); 1221 1222 ASSERT(ic->i_mr == NULL); 1223 1224 size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) * 1225 sizeof (struct rdsv3_header); 1226 1227 addr = kmem_zalloc(size, KM_NOSLEEP); 1228 if (addr == NULL) 1229 return (-1); 1230 1231 mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr; 1232 mr_attr.mr_len = size; 1233 mr_attr.mr_as = NULL; 1234 mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE; 1235 ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd), 1236 &mr_attr, &mr_hdl, &mr_desc); 1237 if (ret != IBT_SUCCESS) { 1238 RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs", 1239 "ibt_register_mr returned: " "%d", ret); 1240 return (-1); 1241 } 1242 1243 ic->i_mr = 1244 (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr), 1245 KM_SLEEP); 1246 ic->i_mr->addr = addr; 1247 ic->i_mr->size = size; 1248 ic->i_mr->hdl = mr_hdl; 1249 ic->i_mr->lkey = mr_desc.md_lkey; 1250 1251 ic->i_send_hdrs = (struct rdsv3_header *)addr; 1252 ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr; 1253 1254 ic->i_recv_hdrs = (struct rdsv3_header *)(addr + 1255 (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); 1256 ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr + 1257 (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); 1258 ic->i_recv_tasklet_cpuid = -1; 1259 1260 ic->i_ack = (struct rdsv3_header *)(addr + 1261 ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * 1262 sizeof (struct rdsv3_header))); 1263 ic->i_ack_dma = (uint64_t)(uintptr_t)(addr + 1264 ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * 1265 sizeof (struct rdsv3_header))); 1266 1267 RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev); 1268 1269 return (0); 1270} 1271 1272void 1273rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) 1274{ 1275 RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev); 1276 ASSERT(ic->i_mr != NULL); 1277 1278 ic->i_send_hdrs = NULL; 1279 ic->i_send_hdrs_dma = NULL; 1280 1281 ic->i_recv_hdrs = NULL; 1282 ic->i_recv_hdrs_dma = NULL; 1283 1284 ic->i_ack = NULL; 1285 ic->i_ack_dma = NULL; 1286 1287 (void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl); 1288 1289 kmem_free(ic->i_mr->addr, ic->i_mr->size); 1290 kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr)); 1291 1292 ic->i_mr = NULL; 1293 RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev); 1294} 1295