icl_soft.c revision 263743
1/*- 2 * Copyright (c) 2012 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: head/sys/dev/iscsi/icl.c 263743 2014-03-25 19:17:22Z trasz $ 30 */ 31 32/* 33 * iSCSI Common Layer. It's used by both the initiator and target to send 34 * and receive iSCSI PDUs. 35 */ 36 37#include <sys/param.h> 38#include <sys/capsicum.h> 39#include <sys/condvar.h> 40#include <sys/conf.h> 41#include <sys/file.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/mbuf.h> 46#include <sys/mutex.h> 47#include <sys/module.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/systm.h> 52#include <sys/sx.h> 53#include <sys/uio.h> 54#include <vm/uma.h> 55#include <netinet/in.h> 56#include <netinet/tcp.h> 57 58#include "icl.h" 59#include "iscsi_proto.h" 60 61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); 62static int debug = 1; 63TUNABLE_INT("kern.icl.debug", &debug); 64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW, 65 &debug, 1, "Enable debug messages"); 66static int partial_receive_len = 1 * 1024; /* XXX: More? */ 67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len); 68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW, 69 &partial_receive_len, 1 * 1024, "Minimum read size for partially received " 70 "data segment"); 71 72static uma_zone_t icl_conn_zone; 73static uma_zone_t icl_pdu_zone; 74 75static volatile u_int icl_ncons; 76 77#define ICL_DEBUG(X, ...) \ 78 do { \ 79 if (debug > 1) \ 80 printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ 81 } while (0) 82 83#define ICL_WARN(X, ...) \ 84 do { \ 85 if (debug > 0) { \ 86 printf("WARNING: %s: " X "\n", \ 87 __func__, ## __VA_ARGS__); \ 88 } \ 89 } while (0) 90 91#define ICL_CONN_LOCK(X) mtx_lock(&X->ic_lock) 92#define ICL_CONN_UNLOCK(X) mtx_unlock(&X->ic_lock) 93#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(&X->ic_lock, MA_OWNED) 94 95static void 96icl_conn_fail(struct icl_conn *ic) 97{ 98 if (ic->ic_socket == NULL) 99 return; 100 101 /* 102 * XXX 103 */ 104 ic->ic_socket->so_error = EDOOFUS; 105 (ic->ic_error)(ic); 106} 107 108static struct mbuf * 109icl_conn_receive(struct icl_conn *ic, size_t len) 110{ 111 struct uio uio; 112 struct socket *so; 113 struct mbuf *m; 114 int error, flags; 115 116 so = ic->ic_socket; 117 118 memset(&uio, 0, sizeof(uio)); 119 uio.uio_resid = len; 120 121 flags = MSG_DONTWAIT; 122 error = soreceive(so, NULL, &uio, &m, NULL, &flags); 123 if (error != 0) { 124 ICL_DEBUG("soreceive error %d", error); 125 return (NULL); 126 } 127 if (uio.uio_resid != 0) { 128 m_freem(m); 129 ICL_DEBUG("short read"); 130 return (NULL); 131 } 132 133 return (m); 134} 135 136static struct icl_pdu * 137icl_pdu_new(struct icl_conn *ic, int flags) 138{ 139 struct icl_pdu *ip; 140 141#ifdef DIAGNOSTIC 142 refcount_acquire(&ic->ic_outstanding_pdus); 143#endif 144 ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO); 145 if (ip == NULL) { 146 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 147#ifdef DIAGNOSTIC 148 refcount_release(&ic->ic_outstanding_pdus); 149#endif 150 return (NULL); 151 } 152 153 ip->ip_conn = ic; 154 155 return (ip); 156} 157 158void 159icl_pdu_free(struct icl_pdu *ip) 160{ 161 struct icl_conn *ic; 162 163 ic = ip->ip_conn; 164 165 m_freem(ip->ip_bhs_mbuf); 166 m_freem(ip->ip_ahs_mbuf); 167 m_freem(ip->ip_data_mbuf); 168 uma_zfree(icl_pdu_zone, ip); 169#ifdef DIAGNOSTIC 170 refcount_release(&ic->ic_outstanding_pdus); 171#endif 172} 173 174/* 175 * Allocate icl_pdu with empty BHS to fill up by the caller. 176 */ 177struct icl_pdu * 178icl_pdu_new_bhs(struct icl_conn *ic, int flags) 179{ 180 struct icl_pdu *ip; 181 182 ip = icl_pdu_new(ic, flags); 183 if (ip == NULL) 184 return (NULL); 185 186 ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs), 187 flags, MT_DATA, M_PKTHDR); 188 if (ip->ip_bhs_mbuf == NULL) { 189 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 190 icl_pdu_free(ip); 191 return (NULL); 192 } 193 ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); 194 memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); 195 ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); 196 197 return (ip); 198} 199 200static int 201icl_pdu_ahs_length(const struct icl_pdu *request) 202{ 203 204 return (request->ip_bhs->bhs_total_ahs_len * 4); 205} 206 207size_t 208icl_pdu_data_segment_length(const struct icl_pdu *request) 209{ 210 uint32_t len = 0; 211 212 len += request->ip_bhs->bhs_data_segment_len[0]; 213 len <<= 8; 214 len += request->ip_bhs->bhs_data_segment_len[1]; 215 len <<= 8; 216 len += request->ip_bhs->bhs_data_segment_len[2]; 217 218 return (len); 219} 220 221static void 222icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) 223{ 224 225 response->ip_bhs->bhs_data_segment_len[2] = len; 226 response->ip_bhs->bhs_data_segment_len[1] = len >> 8; 227 response->ip_bhs->bhs_data_segment_len[0] = len >> 16; 228} 229 230static size_t 231icl_pdu_padding(const struct icl_pdu *ip) 232{ 233 234 if ((ip->ip_data_len % 4) != 0) 235 return (4 - (ip->ip_data_len % 4)); 236 237 return (0); 238} 239 240static size_t 241icl_pdu_size(const struct icl_pdu *response) 242{ 243 size_t len; 244 245 KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); 246 247 len = sizeof(struct iscsi_bhs) + response->ip_data_len + 248 icl_pdu_padding(response); 249 if (response->ip_conn->ic_header_crc32c) 250 len += ISCSI_HEADER_DIGEST_SIZE; 251 if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) 252 len += ISCSI_DATA_DIGEST_SIZE; 253 254 return (len); 255} 256 257static int 258icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) 259{ 260 struct mbuf *m; 261 262 m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs)); 263 if (m == NULL) { 264 ICL_DEBUG("failed to receive BHS"); 265 return (-1); 266 } 267 268 request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs)); 269 if (request->ip_bhs_mbuf == NULL) { 270 ICL_WARN("m_pullup failed"); 271 return (-1); 272 } 273 request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *); 274 275 /* 276 * XXX: For architectures with strict alignment requirements 277 * we may need to allocate ip_bhs and copy the data into it. 278 * For some reason, though, not doing this doesn't seem 279 * to cause problems; tested on sparc64. 280 */ 281 282 *availablep -= sizeof(struct iscsi_bhs); 283 return (0); 284} 285 286static int 287icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) 288{ 289 290 request->ip_ahs_len = icl_pdu_ahs_length(request); 291 if (request->ip_ahs_len == 0) 292 return (0); 293 294 request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, 295 request->ip_ahs_len); 296 if (request->ip_ahs_mbuf == NULL) { 297 ICL_DEBUG("failed to receive AHS"); 298 return (-1); 299 } 300 301 *availablep -= request->ip_ahs_len; 302 return (0); 303} 304 305static uint32_t 306icl_mbuf_to_crc32c(const struct mbuf *m0) 307{ 308 uint32_t digest = 0xffffffff; 309 const struct mbuf *m; 310 311 for (m = m0; m != NULL; m = m->m_next) 312 digest = calculate_crc32c(digest, 313 mtod(m, const void *), m->m_len); 314 315 digest = digest ^ 0xffffffff; 316 317 return (digest); 318} 319 320static int 321icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) 322{ 323 struct mbuf *m; 324 uint32_t received_digest, valid_digest; 325 326 if (request->ip_conn->ic_header_crc32c == false) 327 return (0); 328 329 m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE); 330 if (m == NULL) { 331 ICL_DEBUG("failed to receive header digest"); 332 return (-1); 333 } 334 335 CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); 336 m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest); 337 m_freem(m); 338 339 *availablep -= ISCSI_HEADER_DIGEST_SIZE; 340 341 /* 342 * XXX: Handle AHS. 343 */ 344 valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 345 if (received_digest != valid_digest) { 346 ICL_WARN("header digest check failed; got 0x%x, " 347 "should be 0x%x", received_digest, valid_digest); 348 return (-1); 349 } 350 351 return (0); 352} 353 354/* 355 * Return the number of bytes that should be waiting in the receive socket 356 * before icl_pdu_receive_data_segment() gets called. 357 */ 358static size_t 359icl_pdu_data_segment_receive_len(const struct icl_pdu *request) 360{ 361 size_t len; 362 363 len = icl_pdu_data_segment_length(request); 364 if (len == 0) 365 return (0); 366 367 /* 368 * Account for the parts of data segment already read from 369 * the socket buffer. 370 */ 371 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 372 len -= request->ip_data_len; 373 374 /* 375 * Don't always wait for the full data segment to be delivered 376 * to the socket; this might badly affect performance due to 377 * TCP window scaling. 378 */ 379 if (len > partial_receive_len) { 380#if 0 381 ICL_DEBUG("need %zd bytes of data, limiting to %zd", 382 len, partial_receive_len)); 383#endif 384 len = partial_receive_len; 385 386 return (len); 387 } 388 389 /* 390 * Account for padding. Note that due to the way code is written, 391 * the icl_pdu_receive_data_segment() must always receive padding 392 * along with the last part of data segment, because it would be 393 * impossible to tell whether we've already received the full data 394 * segment including padding, or without it. 395 */ 396 if ((len % 4) != 0) 397 len += 4 - (len % 4); 398 399#if 0 400 ICL_DEBUG("need %zd bytes of data", len)); 401#endif 402 403 return (len); 404} 405 406static int 407icl_pdu_receive_data_segment(struct icl_pdu *request, 408 size_t *availablep, bool *more_neededp) 409{ 410 struct icl_conn *ic; 411 size_t len, padding = 0; 412 struct mbuf *m; 413 414 ic = request->ip_conn; 415 416 *more_neededp = false; 417 ic->ic_receive_len = 0; 418 419 len = icl_pdu_data_segment_length(request); 420 if (len == 0) 421 return (0); 422 423 if ((len % 4) != 0) 424 padding = 4 - (len % 4); 425 426 /* 427 * Account for already received parts of data segment. 428 */ 429 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 430 len -= request->ip_data_len; 431 432 if (len + padding > *availablep) { 433 /* 434 * Not enough data in the socket buffer. Receive as much 435 * as we can. Don't receive padding, since, obviously, it's 436 * not the end of data segment yet. 437 */ 438#if 0 439 ICL_DEBUG("limited from %zd to %zd", 440 len + padding, *availablep - padding)); 441#endif 442 len = *availablep - padding; 443 *more_neededp = true; 444 padding = 0; 445 } 446 447 /* 448 * Must not try to receive padding without at least one byte 449 * of actual data segment. 450 */ 451 if (len > 0) { 452 m = icl_conn_receive(request->ip_conn, len + padding); 453 if (m == NULL) { 454 ICL_DEBUG("failed to receive data segment"); 455 return (-1); 456 } 457 458 if (request->ip_data_mbuf == NULL) 459 request->ip_data_mbuf = m; 460 else 461 m_cat(request->ip_data_mbuf, m); 462 463 request->ip_data_len += len; 464 *availablep -= len + padding; 465 } else 466 ICL_DEBUG("len 0"); 467 468 if (*more_neededp) 469 ic->ic_receive_len = 470 icl_pdu_data_segment_receive_len(request); 471 472 return (0); 473} 474 475static int 476icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) 477{ 478 struct mbuf *m; 479 uint32_t received_digest, valid_digest; 480 481 if (request->ip_conn->ic_data_crc32c == false) 482 return (0); 483 484 if (request->ip_data_len == 0) 485 return (0); 486 487 m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE); 488 if (m == NULL) { 489 ICL_DEBUG("failed to receive data digest"); 490 return (-1); 491 } 492 493 CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); 494 m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest); 495 m_freem(m); 496 497 *availablep -= ISCSI_DATA_DIGEST_SIZE; 498 499 /* 500 * Note that ip_data_mbuf also contains padding; since digest 501 * calculation is supposed to include that, we iterate over 502 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. 503 */ 504 valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 505 if (received_digest != valid_digest) { 506 ICL_WARN("data digest check failed; got 0x%x, " 507 "should be 0x%x", received_digest, valid_digest); 508 return (-1); 509 } 510 511 return (0); 512} 513 514/* 515 * Somewhat contrary to the name, this attempts to receive only one 516 * "part" of PDU at a time; call it repeatedly until it returns non-NULL. 517 */ 518static struct icl_pdu * 519icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) 520{ 521 struct icl_pdu *request; 522 struct socket *so; 523 size_t len; 524 int error; 525 bool more_needed; 526 527 so = ic->ic_socket; 528 529 if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { 530 KASSERT(ic->ic_receive_pdu == NULL, 531 ("ic->ic_receive_pdu != NULL")); 532 request = icl_pdu_new(ic, M_NOWAIT); 533 if (request == NULL) { 534 ICL_DEBUG("failed to allocate PDU; " 535 "dropping connection"); 536 icl_conn_fail(ic); 537 return (NULL); 538 } 539 ic->ic_receive_pdu = request; 540 } else { 541 KASSERT(ic->ic_receive_pdu != NULL, 542 ("ic->ic_receive_pdu == NULL")); 543 request = ic->ic_receive_pdu; 544 } 545 546 if (*availablep < ic->ic_receive_len) { 547#if 0 548 ICL_DEBUG("not enough data; need %zd, " 549 "have %zd", ic->ic_receive_len, *availablep); 550#endif 551 return (NULL); 552 } 553 554 switch (ic->ic_receive_state) { 555 case ICL_CONN_STATE_BHS: 556 //ICL_DEBUG("receiving BHS"); 557 error = icl_pdu_receive_bhs(request, availablep); 558 if (error != 0) { 559 ICL_DEBUG("failed to receive BHS; " 560 "dropping connection"); 561 break; 562 } 563 564 /* 565 * We don't enforce any limit for AHS length; 566 * its length is stored in 8 bit field. 567 */ 568 569 len = icl_pdu_data_segment_length(request); 570 if (len > ic->ic_max_data_segment_length) { 571 ICL_WARN("received data segment " 572 "length %zd is larger than negotiated " 573 "MaxDataSegmentLength %zd; " 574 "dropping connection", 575 len, ic->ic_max_data_segment_length); 576 error = EINVAL; 577 break; 578 } 579 580 ic->ic_receive_state = ICL_CONN_STATE_AHS; 581 ic->ic_receive_len = icl_pdu_ahs_length(request); 582 break; 583 584 case ICL_CONN_STATE_AHS: 585 //ICL_DEBUG("receiving AHS"); 586 error = icl_pdu_receive_ahs(request, availablep); 587 if (error != 0) { 588 ICL_DEBUG("failed to receive AHS; " 589 "dropping connection"); 590 break; 591 } 592 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; 593 if (ic->ic_header_crc32c == false) 594 ic->ic_receive_len = 0; 595 else 596 ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE; 597 break; 598 599 case ICL_CONN_STATE_HEADER_DIGEST: 600 //ICL_DEBUG("receiving header digest"); 601 error = icl_pdu_check_header_digest(request, availablep); 602 if (error != 0) { 603 ICL_DEBUG("header digest failed; " 604 "dropping connection"); 605 break; 606 } 607 608 ic->ic_receive_state = ICL_CONN_STATE_DATA; 609 ic->ic_receive_len = 610 icl_pdu_data_segment_receive_len(request); 611 break; 612 613 case ICL_CONN_STATE_DATA: 614 //ICL_DEBUG("receiving data segment"); 615 error = icl_pdu_receive_data_segment(request, availablep, 616 &more_needed); 617 if (error != 0) { 618 ICL_DEBUG("failed to receive data segment;" 619 "dropping connection"); 620 break; 621 } 622 623 if (more_needed) 624 break; 625 626 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST; 627 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) 628 ic->ic_receive_len = 0; 629 else 630 ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE; 631 break; 632 633 case ICL_CONN_STATE_DATA_DIGEST: 634 //ICL_DEBUG("receiving data digest"); 635 error = icl_pdu_check_data_digest(request, availablep); 636 if (error != 0) { 637 ICL_DEBUG("data digest failed; " 638 "dropping connection"); 639 break; 640 } 641 642 /* 643 * We've received complete PDU; reset the receive state machine 644 * and return the PDU. 645 */ 646 ic->ic_receive_state = ICL_CONN_STATE_BHS; 647 ic->ic_receive_len = sizeof(struct iscsi_bhs); 648 ic->ic_receive_pdu = NULL; 649 return (request); 650 651 default: 652 panic("invalid ic_receive_state %d\n", ic->ic_receive_state); 653 } 654 655 if (error != 0) { 656 icl_pdu_free(request); 657 icl_conn_fail(ic); 658 } 659 660 return (NULL); 661} 662 663static void 664icl_conn_receive_pdus(struct icl_conn *ic, size_t available) 665{ 666 struct icl_pdu *response; 667 struct socket *so; 668 669 so = ic->ic_socket; 670 671 /* 672 * This can never happen; we're careful to only mess with ic->ic_socket 673 * pointer when the send/receive threads are not running. 674 */ 675 KASSERT(so != NULL, ("NULL socket")); 676 677 for (;;) { 678 if (ic->ic_disconnecting) 679 return; 680 681 if (so->so_error != 0) { 682 ICL_DEBUG("connection error %d; " 683 "dropping connection", so->so_error); 684 icl_conn_fail(ic); 685 return; 686 } 687 688 /* 689 * Loop until we have a complete PDU or there is not enough 690 * data in the socket buffer. 691 */ 692 if (available < ic->ic_receive_len) { 693#if 0 694 ICL_DEBUG("not enough data; have %zd, " 695 "need %zd", available, 696 ic->ic_receive_len); 697#endif 698 return; 699 } 700 701 response = icl_conn_receive_pdu(ic, &available); 702 if (response == NULL) 703 continue; 704 705 if (response->ip_ahs_len > 0) { 706 ICL_WARN("received PDU with unsupported " 707 "AHS; opcode 0x%x; dropping connection", 708 response->ip_bhs->bhs_opcode); 709 icl_pdu_free(response); 710 icl_conn_fail(ic); 711 return; 712 } 713 714 (ic->ic_receive)(response); 715 } 716} 717 718static void 719icl_receive_thread(void *arg) 720{ 721 struct icl_conn *ic; 722 size_t available; 723 struct socket *so; 724 725 ic = arg; 726 so = ic->ic_socket; 727 728 ICL_CONN_LOCK(ic); 729 ic->ic_receive_running = true; 730 ICL_CONN_UNLOCK(ic); 731 732 for (;;) { 733 if (ic->ic_disconnecting) { 734 //ICL_DEBUG("terminating"); 735 break; 736 } 737 738 SOCKBUF_LOCK(&so->so_rcv); 739 available = so->so_rcv.sb_cc; 740 if (available < ic->ic_receive_len) { 741 so->so_rcv.sb_lowat = ic->ic_receive_len; 742 cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); 743 } 744 SOCKBUF_UNLOCK(&so->so_rcv); 745 746 icl_conn_receive_pdus(ic, available); 747 } 748 749 ICL_CONN_LOCK(ic); 750 ic->ic_receive_running = false; 751 ICL_CONN_UNLOCK(ic); 752 kthread_exit(); 753} 754 755static int 756icl_soupcall_receive(struct socket *so, void *arg, int waitflag) 757{ 758 struct icl_conn *ic; 759 760 ic = arg; 761 cv_signal(&ic->ic_receive_cv); 762 return (SU_OK); 763} 764 765static int 766icl_pdu_send(struct icl_pdu *request) 767{ 768 size_t padding, pdu_len; 769 uint32_t digest, zero = 0; 770 int error, ok; 771 struct socket *so; 772 struct icl_conn *ic; 773 774 ic = request->ip_conn; 775 so = request->ip_conn->ic_socket; 776 777 ICL_CONN_LOCK_ASSERT(ic); 778 779 icl_pdu_set_data_segment_length(request, request->ip_data_len); 780 781 pdu_len = icl_pdu_size(request); 782 783 if (ic->ic_header_crc32c) { 784 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 785 ok = m_append(request->ip_bhs_mbuf, sizeof(digest), 786 (void *)&digest); 787 if (ok != 1) { 788 ICL_WARN("failed to append header digest"); 789 return (1); 790 } 791 } 792 793 if (request->ip_data_len != 0) { 794 padding = icl_pdu_padding(request); 795 if (padding > 0) { 796 ok = m_append(request->ip_data_mbuf, padding, 797 (void *)&zero); 798 if (ok != 1) { 799 ICL_WARN("failed to append padding"); 800 return (1); 801 } 802 } 803 804 if (ic->ic_data_crc32c) { 805 digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 806 807 ok = m_append(request->ip_data_mbuf, sizeof(digest), 808 (void *)&digest); 809 if (ok != 1) { 810 ICL_WARN("failed to append header digest"); 811 return (1); 812 } 813 } 814 815 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); 816 request->ip_data_mbuf = NULL; 817 } 818 819 request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; 820 821 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, 822 NULL, MSG_DONTWAIT, curthread); 823 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */ 824 if (error != 0) { 825 ICL_DEBUG("sosend error %d", error); 826 return (error); 827 } 828 829 return (0); 830} 831 832static void 833icl_conn_send_pdus(struct icl_conn *ic) 834{ 835 struct icl_pdu *request; 836 struct socket *so; 837 size_t available, size; 838 int error; 839 840 ICL_CONN_LOCK_ASSERT(ic); 841 842 so = ic->ic_socket; 843 844 SOCKBUF_LOCK(&so->so_snd); 845 available = sbspace(&so->so_snd); 846 SOCKBUF_UNLOCK(&so->so_snd); 847 848 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 849 if (ic->ic_disconnecting) 850 return; 851 852 request = TAILQ_FIRST(&ic->ic_to_send); 853 size = icl_pdu_size(request); 854 if (available < size) { 855 /* 856 * Set the low watermark on the socket, 857 * to avoid waking up until there is enough 858 * space. 859 */ 860 SOCKBUF_LOCK(&so->so_snd); 861 so->so_snd.sb_lowat = size; 862 SOCKBUF_UNLOCK(&so->so_snd); 863#if 1 864 ICL_DEBUG("no space to send; " 865 "have %zd, need %zd", 866 available, size); 867#endif 868 return; 869 } 870 available -= size; 871 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next); 872 error = icl_pdu_send(request); 873 if (error != 0) { 874 ICL_DEBUG("failed to send PDU; " 875 "dropping connection"); 876 icl_conn_fail(ic); 877 return; 878 } 879 icl_pdu_free(request); 880 } 881} 882 883static void 884icl_send_thread(void *arg) 885{ 886 struct icl_conn *ic; 887 888 ic = arg; 889 890 ICL_CONN_LOCK(ic); 891 ic->ic_send_running = true; 892 893 for (;;) { 894 if (ic->ic_disconnecting) { 895 //ICL_DEBUG("terminating"); 896 break; 897 } 898 icl_conn_send_pdus(ic); 899 cv_wait(&ic->ic_send_cv, &ic->ic_lock); 900 } 901 902 ic->ic_send_running = false; 903 ICL_CONN_UNLOCK(ic); 904 kthread_exit(); 905} 906 907static int 908icl_soupcall_send(struct socket *so, void *arg, int waitflag) 909{ 910 struct icl_conn *ic; 911 912 ic = arg; 913 cv_signal(&ic->ic_send_cv); 914 return (SU_OK); 915} 916 917int 918icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags) 919{ 920 struct mbuf *mb, *newmb; 921 size_t copylen, off = 0; 922 923 KASSERT(len > 0, ("len == 0")); 924 925 newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR); 926 if (newmb == NULL) { 927 ICL_WARN("failed to allocate mbuf for %zd bytes", len); 928 return (ENOMEM); 929 } 930 931 for (mb = newmb; mb != NULL; mb = mb->m_next) { 932 copylen = min(M_TRAILINGSPACE(mb), len - off); 933 memcpy(mtod(mb, char *), (const char *)addr + off, copylen); 934 mb->m_len = copylen; 935 off += copylen; 936 } 937 KASSERT(off == len, ("%s: off != len", __func__)); 938 939 if (request->ip_data_mbuf == NULL) { 940 request->ip_data_mbuf = newmb; 941 request->ip_data_len = len; 942 } else { 943 m_cat(request->ip_data_mbuf, newmb); 944 request->ip_data_len += len; 945 } 946 947 return (0); 948} 949 950void 951icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len) 952{ 953 954 m_copydata(ip->ip_data_mbuf, off, len, addr); 955} 956 957void 958icl_pdu_queue(struct icl_pdu *ip) 959{ 960 struct icl_conn *ic; 961 962 ic = ip->ip_conn; 963 964 ICL_CONN_LOCK(ic); 965 if (ic->ic_disconnecting || ic->ic_socket == NULL) { 966 ICL_DEBUG("icl_pdu_queue on closed connection"); 967 ICL_CONN_UNLOCK(ic); 968 icl_pdu_free(ip); 969 return; 970 } 971 TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 972 ICL_CONN_UNLOCK(ic); 973 cv_signal(&ic->ic_send_cv); 974} 975 976struct icl_conn * 977icl_conn_new(void) 978{ 979 struct icl_conn *ic; 980 981 refcount_acquire(&icl_ncons); 982 983 ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO); 984 985 TAILQ_INIT(&ic->ic_to_send); 986 mtx_init(&ic->ic_lock, "icl_lock", NULL, MTX_DEF); 987 cv_init(&ic->ic_send_cv, "icl_tx"); 988 cv_init(&ic->ic_receive_cv, "icl_rx"); 989#ifdef DIAGNOSTIC 990 refcount_init(&ic->ic_outstanding_pdus, 0); 991#endif 992 ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH; 993 994 return (ic); 995} 996 997void 998icl_conn_free(struct icl_conn *ic) 999{ 1000 1001 mtx_destroy(&ic->ic_lock); 1002 cv_destroy(&ic->ic_send_cv); 1003 cv_destroy(&ic->ic_receive_cv); 1004 uma_zfree(icl_conn_zone, ic); 1005 refcount_release(&icl_ncons); 1006} 1007 1008static int 1009icl_conn_start(struct icl_conn *ic) 1010{ 1011 size_t bufsize; 1012 struct sockopt opt; 1013 int error, one = 1; 1014 1015 ICL_CONN_LOCK(ic); 1016 1017 /* 1018 * XXX: Ugly hack. 1019 */ 1020 if (ic->ic_socket == NULL) { 1021 ICL_CONN_UNLOCK(ic); 1022 return (EINVAL); 1023 } 1024 1025 ic->ic_receive_state = ICL_CONN_STATE_BHS; 1026 ic->ic_receive_len = sizeof(struct iscsi_bhs); 1027 ic->ic_disconnecting = false; 1028 1029 ICL_CONN_UNLOCK(ic); 1030 1031 /* 1032 * Use max available sockbuf size for sending. Do it manually 1033 * instead of sbreserve(9) to work around resource limits. 1034 * 1035 * XXX: This kind of sucks. On one hand, we don't currently support 1036 * sending a part of data segment; we always do it in one piece, 1037 * so we have to make sure it can fit in the socket buffer. 1038 * Once I've implemented partial send, we'll get rid of this 1039 * and use autoscaling. 1040 */ 1041 bufsize = (sizeof(struct iscsi_bhs) + 1042 ic->ic_max_data_segment_length) * 8; 1043 error = soreserve(ic->ic_socket, bufsize, bufsize); 1044 if (error != 0) { 1045 ICL_WARN("soreserve failed with error %d", error); 1046 icl_conn_close(ic); 1047 return (error); 1048 } 1049 1050 /* 1051 * Disable Nagle. 1052 */ 1053 bzero(&opt, sizeof(opt)); 1054 opt.sopt_dir = SOPT_SET; 1055 opt.sopt_level = IPPROTO_TCP; 1056 opt.sopt_name = TCP_NODELAY; 1057 opt.sopt_val = &one; 1058 opt.sopt_valsize = sizeof(one); 1059 error = sosetopt(ic->ic_socket, &opt); 1060 if (error != 0) { 1061 ICL_WARN("disabling TCP_NODELAY failed with error %d", error); 1062 icl_conn_close(ic); 1063 return (error); 1064 } 1065 1066 /* 1067 * Start threads. 1068 */ 1069 error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx"); 1070 if (error != 0) { 1071 ICL_WARN("kthread_add(9) failed with error %d", error); 1072 icl_conn_close(ic); 1073 return (error); 1074 } 1075 1076 error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx"); 1077 if (error != 0) { 1078 ICL_WARN("kthread_add(9) failed with error %d", error); 1079 icl_conn_close(ic); 1080 return (error); 1081 } 1082 1083 /* 1084 * Register socket upcall, to get notified about incoming PDUs 1085 * and free space to send outgoing ones. 1086 */ 1087 SOCKBUF_LOCK(&ic->ic_socket->so_snd); 1088 soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); 1089 SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); 1090 SOCKBUF_LOCK(&ic->ic_socket->so_rcv); 1091 soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); 1092 SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); 1093 1094 return (0); 1095} 1096 1097int 1098icl_conn_handoff(struct icl_conn *ic, int fd) 1099{ 1100 struct file *fp; 1101 struct socket *so; 1102 cap_rights_t rights; 1103 int error; 1104 1105 /* 1106 * Steal the socket from userland. 1107 */ 1108 error = fget(curthread, fd, 1109 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 1110 if (error != 0) 1111 return (error); 1112 if (fp->f_type != DTYPE_SOCKET) { 1113 fdrop(fp, curthread); 1114 return (EINVAL); 1115 } 1116 so = fp->f_data; 1117 if (so->so_type != SOCK_STREAM) { 1118 fdrop(fp, curthread); 1119 return (EINVAL); 1120 } 1121 1122 ICL_CONN_LOCK(ic); 1123 1124 if (ic->ic_socket != NULL) { 1125 ICL_CONN_UNLOCK(ic); 1126 fdrop(fp, curthread); 1127 return (EBUSY); 1128 } 1129 1130 ic->ic_socket = fp->f_data; 1131 fp->f_ops = &badfileops; 1132 fp->f_data = NULL; 1133 fdrop(fp, curthread); 1134 ICL_CONN_UNLOCK(ic); 1135 1136 error = icl_conn_start(ic); 1137 1138 return (error); 1139} 1140 1141void 1142icl_conn_shutdown(struct icl_conn *ic) 1143{ 1144 1145 ICL_CONN_LOCK(ic); 1146 if (ic->ic_socket == NULL) { 1147 ICL_CONN_UNLOCK(ic); 1148 return; 1149 } 1150 ICL_CONN_UNLOCK(ic); 1151 1152 soshutdown(ic->ic_socket, SHUT_RDWR); 1153} 1154 1155void 1156icl_conn_close(struct icl_conn *ic) 1157{ 1158 struct icl_pdu *pdu; 1159 1160 ICL_CONN_LOCK(ic); 1161 if (ic->ic_socket == NULL) { 1162 ICL_CONN_UNLOCK(ic); 1163 return; 1164 } 1165 1166 ic->ic_disconnecting = true; 1167 1168 /* 1169 * Wake up the threads, so they can properly terminate. 1170 */ 1171 cv_signal(&ic->ic_receive_cv); 1172 cv_signal(&ic->ic_send_cv); 1173 while (ic->ic_receive_running || ic->ic_send_running) { 1174 //ICL_DEBUG("waiting for send/receive threads to terminate"); 1175 ICL_CONN_UNLOCK(ic); 1176 cv_signal(&ic->ic_receive_cv); 1177 cv_signal(&ic->ic_send_cv); 1178 pause("icl_close", 1 * hz); 1179 ICL_CONN_LOCK(ic); 1180 } 1181 //ICL_DEBUG("send/receive threads terminated"); 1182 1183 soclose(ic->ic_socket); 1184 ic->ic_socket = NULL; 1185 1186 if (ic->ic_receive_pdu != NULL) { 1187 //ICL_DEBUG("freeing partially received PDU"); 1188 icl_pdu_free(ic->ic_receive_pdu); 1189 ic->ic_receive_pdu = NULL; 1190 } 1191 1192 /* 1193 * Remove any outstanding PDUs from the send queue. 1194 */ 1195 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 1196 pdu = TAILQ_FIRST(&ic->ic_to_send); 1197 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next); 1198 icl_pdu_free(pdu); 1199 } 1200 1201 KASSERT(TAILQ_EMPTY(&ic->ic_to_send), 1202 ("destroying session with non-empty send queue")); 1203 /* 1204 * XXX 1205 */ 1206#if 0 1207 KASSERT(ic->ic_outstanding_pdus == 0, 1208 ("destroying session with %d outstanding PDUs", 1209 ic->ic_outstanding_pdus)); 1210#endif 1211 ICL_CONN_UNLOCK(ic); 1212} 1213 1214bool 1215icl_conn_connected(struct icl_conn *ic) 1216{ 1217 1218 ICL_CONN_LOCK(ic); 1219 if (ic->ic_socket == NULL) { 1220 ICL_CONN_UNLOCK(ic); 1221 return (false); 1222 } 1223 if (ic->ic_socket->so_error != 0) { 1224 ICL_CONN_UNLOCK(ic); 1225 return (false); 1226 } 1227 ICL_CONN_UNLOCK(ic); 1228 return (true); 1229} 1230 1231#ifdef ICL_KERNEL_PROXY 1232int 1233icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so) 1234{ 1235 int error; 1236 1237 if (so->so_type != SOCK_STREAM) 1238 return (EINVAL); 1239 1240 ICL_CONN_LOCK(ic); 1241 if (ic->ic_socket != NULL) { 1242 ICL_CONN_UNLOCK(ic); 1243 return (EBUSY); 1244 } 1245 ic->ic_socket = so; 1246 ICL_CONN_UNLOCK(ic); 1247 1248 error = icl_conn_start(ic); 1249 1250 return (error); 1251} 1252#endif /* ICL_KERNEL_PROXY */ 1253 1254static int 1255icl_unload(void) 1256{ 1257 1258 if (icl_ncons != 0) 1259 return (EBUSY); 1260 1261 uma_zdestroy(icl_conn_zone); 1262 uma_zdestroy(icl_pdu_zone); 1263 1264 return (0); 1265} 1266 1267static void 1268icl_load(void) 1269{ 1270 1271 icl_conn_zone = uma_zcreate("icl_conn", 1272 sizeof(struct icl_conn), NULL, NULL, NULL, NULL, 1273 UMA_ALIGN_PTR, 0); 1274 icl_pdu_zone = uma_zcreate("icl_pdu", 1275 sizeof(struct icl_pdu), NULL, NULL, NULL, NULL, 1276 UMA_ALIGN_PTR, 0); 1277 1278 refcount_init(&icl_ncons, 0); 1279} 1280 1281static int 1282icl_modevent(module_t mod, int what, void *arg) 1283{ 1284 1285 switch (what) { 1286 case MOD_LOAD: 1287 icl_load(); 1288 return (0); 1289 case MOD_UNLOAD: 1290 return (icl_unload()); 1291 default: 1292 return (EINVAL); 1293 } 1294} 1295 1296moduledata_t icl_data = { 1297 "icl", 1298 icl_modevent, 1299 0 1300}; 1301 1302DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST); 1303MODULE_VERSION(icl, 1); 1304