icl_soft.c revision 264023
1/*- 2 * Copyright (c) 2012 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: head/sys/dev/iscsi/icl.c 264023 2014-04-01 21:47:22Z trasz $ 30 */ 31 32/* 33 * iSCSI Common Layer. It's used by both the initiator and target to send 34 * and receive iSCSI PDUs. 35 */ 36 37#include <sys/param.h> 38#include <sys/capsicum.h> 39#include <sys/condvar.h> 40#include <sys/conf.h> 41#include <sys/file.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/mbuf.h> 46#include <sys/mutex.h> 47#include <sys/module.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/systm.h> 52#include <sys/sx.h> 53#include <sys/uio.h> 54#include <vm/uma.h> 55#include <netinet/in.h> 56#include <netinet/tcp.h> 57 58#include "icl.h" 59#include "iscsi_proto.h" 60 61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); 62static int debug = 1; 63TUNABLE_INT("kern.icl.debug", &debug); 64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW, 65 &debug, 1, "Enable debug messages"); 66static int partial_receive_len = 1 * 1024; /* XXX: More? */ 67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len); 68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW, 69 &partial_receive_len, 1 * 1024, "Minimum read size for partially received " 70 "data segment"); 71 72static uma_zone_t icl_conn_zone; 73static uma_zone_t icl_pdu_zone; 74 75static volatile u_int icl_ncons; 76 77#define ICL_DEBUG(X, ...) \ 78 do { \ 79 if (debug > 1) \ 80 printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ 81 } while (0) 82 83#define ICL_WARN(X, ...) \ 84 do { \ 85 if (debug > 0) { \ 86 printf("WARNING: %s: " X "\n", \ 87 __func__, ## __VA_ARGS__); \ 88 } \ 89 } while (0) 90 91#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) 92#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) 93#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) 94#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) 95 96static void 97icl_conn_fail(struct icl_conn *ic) 98{ 99 if (ic->ic_socket == NULL) 100 return; 101 102 /* 103 * XXX 104 */ 105 ic->ic_socket->so_error = EDOOFUS; 106 (ic->ic_error)(ic); 107} 108 109static struct mbuf * 110icl_conn_receive(struct icl_conn *ic, size_t len) 111{ 112 struct uio uio; 113 struct socket *so; 114 struct mbuf *m; 115 int error, flags; 116 117 so = ic->ic_socket; 118 119 memset(&uio, 0, sizeof(uio)); 120 uio.uio_resid = len; 121 122 flags = MSG_DONTWAIT; 123 error = soreceive(so, NULL, &uio, &m, NULL, &flags); 124 if (error != 0) { 125 ICL_DEBUG("soreceive error %d", error); 126 return (NULL); 127 } 128 if (uio.uio_resid != 0) { 129 m_freem(m); 130 ICL_DEBUG("short read"); 131 return (NULL); 132 } 133 134 return (m); 135} 136 137static struct icl_pdu * 138icl_pdu_new(struct icl_conn *ic, int flags) 139{ 140 struct icl_pdu *ip; 141 142#ifdef DIAGNOSTIC 143 refcount_acquire(&ic->ic_outstanding_pdus); 144#endif 145 ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO); 146 if (ip == NULL) { 147 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 148#ifdef DIAGNOSTIC 149 refcount_release(&ic->ic_outstanding_pdus); 150#endif 151 return (NULL); 152 } 153 154 ip->ip_conn = ic; 155 156 return (ip); 157} 158 159void 160icl_pdu_free(struct icl_pdu *ip) 161{ 162 struct icl_conn *ic; 163 164 ic = ip->ip_conn; 165 166 m_freem(ip->ip_bhs_mbuf); 167 m_freem(ip->ip_ahs_mbuf); 168 m_freem(ip->ip_data_mbuf); 169 uma_zfree(icl_pdu_zone, ip); 170#ifdef DIAGNOSTIC 171 refcount_release(&ic->ic_outstanding_pdus); 172#endif 173} 174 175/* 176 * Allocate icl_pdu with empty BHS to fill up by the caller. 177 */ 178struct icl_pdu * 179icl_pdu_new_bhs(struct icl_conn *ic, int flags) 180{ 181 struct icl_pdu *ip; 182 183 ip = icl_pdu_new(ic, flags); 184 if (ip == NULL) 185 return (NULL); 186 187 ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs), 188 flags, MT_DATA, M_PKTHDR); 189 if (ip->ip_bhs_mbuf == NULL) { 190 ICL_WARN("failed to allocate %zd bytes", sizeof(*ip)); 191 icl_pdu_free(ip); 192 return (NULL); 193 } 194 ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); 195 memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); 196 ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); 197 198 return (ip); 199} 200 201static int 202icl_pdu_ahs_length(const struct icl_pdu *request) 203{ 204 205 return (request->ip_bhs->bhs_total_ahs_len * 4); 206} 207 208size_t 209icl_pdu_data_segment_length(const struct icl_pdu *request) 210{ 211 uint32_t len = 0; 212 213 len += request->ip_bhs->bhs_data_segment_len[0]; 214 len <<= 8; 215 len += request->ip_bhs->bhs_data_segment_len[1]; 216 len <<= 8; 217 len += request->ip_bhs->bhs_data_segment_len[2]; 218 219 return (len); 220} 221 222static void 223icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) 224{ 225 226 response->ip_bhs->bhs_data_segment_len[2] = len; 227 response->ip_bhs->bhs_data_segment_len[1] = len >> 8; 228 response->ip_bhs->bhs_data_segment_len[0] = len >> 16; 229} 230 231static size_t 232icl_pdu_padding(const struct icl_pdu *ip) 233{ 234 235 if ((ip->ip_data_len % 4) != 0) 236 return (4 - (ip->ip_data_len % 4)); 237 238 return (0); 239} 240 241static size_t 242icl_pdu_size(const struct icl_pdu *response) 243{ 244 size_t len; 245 246 KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); 247 248 len = sizeof(struct iscsi_bhs) + response->ip_data_len + 249 icl_pdu_padding(response); 250 if (response->ip_conn->ic_header_crc32c) 251 len += ISCSI_HEADER_DIGEST_SIZE; 252 if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) 253 len += ISCSI_DATA_DIGEST_SIZE; 254 255 return (len); 256} 257 258static int 259icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) 260{ 261 struct mbuf *m; 262 263 m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs)); 264 if (m == NULL) { 265 ICL_DEBUG("failed to receive BHS"); 266 return (-1); 267 } 268 269 request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs)); 270 if (request->ip_bhs_mbuf == NULL) { 271 ICL_WARN("m_pullup failed"); 272 return (-1); 273 } 274 request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *); 275 276 /* 277 * XXX: For architectures with strict alignment requirements 278 * we may need to allocate ip_bhs and copy the data into it. 279 * For some reason, though, not doing this doesn't seem 280 * to cause problems; tested on sparc64. 281 */ 282 283 *availablep -= sizeof(struct iscsi_bhs); 284 return (0); 285} 286 287static int 288icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) 289{ 290 291 request->ip_ahs_len = icl_pdu_ahs_length(request); 292 if (request->ip_ahs_len == 0) 293 return (0); 294 295 request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, 296 request->ip_ahs_len); 297 if (request->ip_ahs_mbuf == NULL) { 298 ICL_DEBUG("failed to receive AHS"); 299 return (-1); 300 } 301 302 *availablep -= request->ip_ahs_len; 303 return (0); 304} 305 306static uint32_t 307icl_mbuf_to_crc32c(const struct mbuf *m0) 308{ 309 uint32_t digest = 0xffffffff; 310 const struct mbuf *m; 311 312 for (m = m0; m != NULL; m = m->m_next) 313 digest = calculate_crc32c(digest, 314 mtod(m, const void *), m->m_len); 315 316 digest = digest ^ 0xffffffff; 317 318 return (digest); 319} 320 321static int 322icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) 323{ 324 struct mbuf *m; 325 uint32_t received_digest, valid_digest; 326 327 if (request->ip_conn->ic_header_crc32c == false) 328 return (0); 329 330 m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE); 331 if (m == NULL) { 332 ICL_DEBUG("failed to receive header digest"); 333 return (-1); 334 } 335 336 CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); 337 m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest); 338 m_freem(m); 339 340 *availablep -= ISCSI_HEADER_DIGEST_SIZE; 341 342 /* 343 * XXX: Handle AHS. 344 */ 345 valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 346 if (received_digest != valid_digest) { 347 ICL_WARN("header digest check failed; got 0x%x, " 348 "should be 0x%x", received_digest, valid_digest); 349 return (-1); 350 } 351 352 return (0); 353} 354 355/* 356 * Return the number of bytes that should be waiting in the receive socket 357 * before icl_pdu_receive_data_segment() gets called. 358 */ 359static size_t 360icl_pdu_data_segment_receive_len(const struct icl_pdu *request) 361{ 362 size_t len; 363 364 len = icl_pdu_data_segment_length(request); 365 if (len == 0) 366 return (0); 367 368 /* 369 * Account for the parts of data segment already read from 370 * the socket buffer. 371 */ 372 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 373 len -= request->ip_data_len; 374 375 /* 376 * Don't always wait for the full data segment to be delivered 377 * to the socket; this might badly affect performance due to 378 * TCP window scaling. 379 */ 380 if (len > partial_receive_len) { 381#if 0 382 ICL_DEBUG("need %zd bytes of data, limiting to %zd", 383 len, partial_receive_len)); 384#endif 385 len = partial_receive_len; 386 387 return (len); 388 } 389 390 /* 391 * Account for padding. Note that due to the way code is written, 392 * the icl_pdu_receive_data_segment() must always receive padding 393 * along with the last part of data segment, because it would be 394 * impossible to tell whether we've already received the full data 395 * segment including padding, or without it. 396 */ 397 if ((len % 4) != 0) 398 len += 4 - (len % 4); 399 400#if 0 401 ICL_DEBUG("need %zd bytes of data", len)); 402#endif 403 404 return (len); 405} 406 407static int 408icl_pdu_receive_data_segment(struct icl_pdu *request, 409 size_t *availablep, bool *more_neededp) 410{ 411 struct icl_conn *ic; 412 size_t len, padding = 0; 413 struct mbuf *m; 414 415 ic = request->ip_conn; 416 417 *more_neededp = false; 418 ic->ic_receive_len = 0; 419 420 len = icl_pdu_data_segment_length(request); 421 if (len == 0) 422 return (0); 423 424 if ((len % 4) != 0) 425 padding = 4 - (len % 4); 426 427 /* 428 * Account for already received parts of data segment. 429 */ 430 KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); 431 len -= request->ip_data_len; 432 433 if (len + padding > *availablep) { 434 /* 435 * Not enough data in the socket buffer. Receive as much 436 * as we can. Don't receive padding, since, obviously, it's 437 * not the end of data segment yet. 438 */ 439#if 0 440 ICL_DEBUG("limited from %zd to %zd", 441 len + padding, *availablep - padding)); 442#endif 443 len = *availablep - padding; 444 *more_neededp = true; 445 padding = 0; 446 } 447 448 /* 449 * Must not try to receive padding without at least one byte 450 * of actual data segment. 451 */ 452 if (len > 0) { 453 m = icl_conn_receive(request->ip_conn, len + padding); 454 if (m == NULL) { 455 ICL_DEBUG("failed to receive data segment"); 456 return (-1); 457 } 458 459 if (request->ip_data_mbuf == NULL) 460 request->ip_data_mbuf = m; 461 else 462 m_cat(request->ip_data_mbuf, m); 463 464 request->ip_data_len += len; 465 *availablep -= len + padding; 466 } else 467 ICL_DEBUG("len 0"); 468 469 if (*more_neededp) 470 ic->ic_receive_len = 471 icl_pdu_data_segment_receive_len(request); 472 473 return (0); 474} 475 476static int 477icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) 478{ 479 struct mbuf *m; 480 uint32_t received_digest, valid_digest; 481 482 if (request->ip_conn->ic_data_crc32c == false) 483 return (0); 484 485 if (request->ip_data_len == 0) 486 return (0); 487 488 m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE); 489 if (m == NULL) { 490 ICL_DEBUG("failed to receive data digest"); 491 return (-1); 492 } 493 494 CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); 495 m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest); 496 m_freem(m); 497 498 *availablep -= ISCSI_DATA_DIGEST_SIZE; 499 500 /* 501 * Note that ip_data_mbuf also contains padding; since digest 502 * calculation is supposed to include that, we iterate over 503 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. 504 */ 505 valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 506 if (received_digest != valid_digest) { 507 ICL_WARN("data digest check failed; got 0x%x, " 508 "should be 0x%x", received_digest, valid_digest); 509 return (-1); 510 } 511 512 return (0); 513} 514 515/* 516 * Somewhat contrary to the name, this attempts to receive only one 517 * "part" of PDU at a time; call it repeatedly until it returns non-NULL. 518 */ 519static struct icl_pdu * 520icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) 521{ 522 struct icl_pdu *request; 523 struct socket *so; 524 size_t len; 525 int error; 526 bool more_needed; 527 528 so = ic->ic_socket; 529 530 if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { 531 KASSERT(ic->ic_receive_pdu == NULL, 532 ("ic->ic_receive_pdu != NULL")); 533 request = icl_pdu_new(ic, M_NOWAIT); 534 if (request == NULL) { 535 ICL_DEBUG("failed to allocate PDU; " 536 "dropping connection"); 537 icl_conn_fail(ic); 538 return (NULL); 539 } 540 ic->ic_receive_pdu = request; 541 } else { 542 KASSERT(ic->ic_receive_pdu != NULL, 543 ("ic->ic_receive_pdu == NULL")); 544 request = ic->ic_receive_pdu; 545 } 546 547 if (*availablep < ic->ic_receive_len) { 548#if 0 549 ICL_DEBUG("not enough data; need %zd, " 550 "have %zd", ic->ic_receive_len, *availablep); 551#endif 552 return (NULL); 553 } 554 555 switch (ic->ic_receive_state) { 556 case ICL_CONN_STATE_BHS: 557 //ICL_DEBUG("receiving BHS"); 558 error = icl_pdu_receive_bhs(request, availablep); 559 if (error != 0) { 560 ICL_DEBUG("failed to receive BHS; " 561 "dropping connection"); 562 break; 563 } 564 565 /* 566 * We don't enforce any limit for AHS length; 567 * its length is stored in 8 bit field. 568 */ 569 570 len = icl_pdu_data_segment_length(request); 571 if (len > ic->ic_max_data_segment_length) { 572 ICL_WARN("received data segment " 573 "length %zd is larger than negotiated " 574 "MaxDataSegmentLength %zd; " 575 "dropping connection", 576 len, ic->ic_max_data_segment_length); 577 error = EINVAL; 578 break; 579 } 580 581 ic->ic_receive_state = ICL_CONN_STATE_AHS; 582 ic->ic_receive_len = icl_pdu_ahs_length(request); 583 break; 584 585 case ICL_CONN_STATE_AHS: 586 //ICL_DEBUG("receiving AHS"); 587 error = icl_pdu_receive_ahs(request, availablep); 588 if (error != 0) { 589 ICL_DEBUG("failed to receive AHS; " 590 "dropping connection"); 591 break; 592 } 593 ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; 594 if (ic->ic_header_crc32c == false) 595 ic->ic_receive_len = 0; 596 else 597 ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE; 598 break; 599 600 case ICL_CONN_STATE_HEADER_DIGEST: 601 //ICL_DEBUG("receiving header digest"); 602 error = icl_pdu_check_header_digest(request, availablep); 603 if (error != 0) { 604 ICL_DEBUG("header digest failed; " 605 "dropping connection"); 606 break; 607 } 608 609 ic->ic_receive_state = ICL_CONN_STATE_DATA; 610 ic->ic_receive_len = 611 icl_pdu_data_segment_receive_len(request); 612 break; 613 614 case ICL_CONN_STATE_DATA: 615 //ICL_DEBUG("receiving data segment"); 616 error = icl_pdu_receive_data_segment(request, availablep, 617 &more_needed); 618 if (error != 0) { 619 ICL_DEBUG("failed to receive data segment;" 620 "dropping connection"); 621 break; 622 } 623 624 if (more_needed) 625 break; 626 627 ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST; 628 if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) 629 ic->ic_receive_len = 0; 630 else 631 ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE; 632 break; 633 634 case ICL_CONN_STATE_DATA_DIGEST: 635 //ICL_DEBUG("receiving data digest"); 636 error = icl_pdu_check_data_digest(request, availablep); 637 if (error != 0) { 638 ICL_DEBUG("data digest failed; " 639 "dropping connection"); 640 break; 641 } 642 643 /* 644 * We've received complete PDU; reset the receive state machine 645 * and return the PDU. 646 */ 647 ic->ic_receive_state = ICL_CONN_STATE_BHS; 648 ic->ic_receive_len = sizeof(struct iscsi_bhs); 649 ic->ic_receive_pdu = NULL; 650 return (request); 651 652 default: 653 panic("invalid ic_receive_state %d\n", ic->ic_receive_state); 654 } 655 656 if (error != 0) { 657 icl_pdu_free(request); 658 icl_conn_fail(ic); 659 } 660 661 return (NULL); 662} 663 664static void 665icl_conn_receive_pdus(struct icl_conn *ic, size_t available) 666{ 667 struct icl_pdu *response; 668 struct socket *so; 669 670 so = ic->ic_socket; 671 672 /* 673 * This can never happen; we're careful to only mess with ic->ic_socket 674 * pointer when the send/receive threads are not running. 675 */ 676 KASSERT(so != NULL, ("NULL socket")); 677 678 for (;;) { 679 if (ic->ic_disconnecting) 680 return; 681 682 if (so->so_error != 0) { 683 ICL_DEBUG("connection error %d; " 684 "dropping connection", so->so_error); 685 icl_conn_fail(ic); 686 return; 687 } 688 689 /* 690 * Loop until we have a complete PDU or there is not enough 691 * data in the socket buffer. 692 */ 693 if (available < ic->ic_receive_len) { 694#if 0 695 ICL_DEBUG("not enough data; have %zd, " 696 "need %zd", available, 697 ic->ic_receive_len); 698#endif 699 return; 700 } 701 702 response = icl_conn_receive_pdu(ic, &available); 703 if (response == NULL) 704 continue; 705 706 if (response->ip_ahs_len > 0) { 707 ICL_WARN("received PDU with unsupported " 708 "AHS; opcode 0x%x; dropping connection", 709 response->ip_bhs->bhs_opcode); 710 icl_pdu_free(response); 711 icl_conn_fail(ic); 712 return; 713 } 714 715 (ic->ic_receive)(response); 716 } 717} 718 719static void 720icl_receive_thread(void *arg) 721{ 722 struct icl_conn *ic; 723 size_t available; 724 struct socket *so; 725 726 ic = arg; 727 so = ic->ic_socket; 728 729 ICL_CONN_LOCK(ic); 730 ic->ic_receive_running = true; 731 ICL_CONN_UNLOCK(ic); 732 733 for (;;) { 734 if (ic->ic_disconnecting) { 735 //ICL_DEBUG("terminating"); 736 break; 737 } 738 739 SOCKBUF_LOCK(&so->so_rcv); 740 available = so->so_rcv.sb_cc; 741 if (available < ic->ic_receive_len) { 742 so->so_rcv.sb_lowat = ic->ic_receive_len; 743 cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); 744 } 745 SOCKBUF_UNLOCK(&so->so_rcv); 746 747 icl_conn_receive_pdus(ic, available); 748 } 749 750 ICL_CONN_LOCK(ic); 751 ic->ic_receive_running = false; 752 ICL_CONN_UNLOCK(ic); 753 kthread_exit(); 754} 755 756static int 757icl_soupcall_receive(struct socket *so, void *arg, int waitflag) 758{ 759 struct icl_conn *ic; 760 761 ic = arg; 762 cv_signal(&ic->ic_receive_cv); 763 return (SU_OK); 764} 765 766static int 767icl_pdu_send(struct icl_pdu *request) 768{ 769 size_t padding, pdu_len; 770 uint32_t digest, zero = 0; 771 int error, ok; 772 struct socket *so; 773 struct icl_conn *ic; 774 775 ic = request->ip_conn; 776 so = request->ip_conn->ic_socket; 777 778 ICL_CONN_LOCK_ASSERT(ic); 779 780 icl_pdu_set_data_segment_length(request, request->ip_data_len); 781 782 pdu_len = icl_pdu_size(request); 783 784 if (ic->ic_header_crc32c) { 785 digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); 786 ok = m_append(request->ip_bhs_mbuf, sizeof(digest), 787 (void *)&digest); 788 if (ok != 1) { 789 ICL_WARN("failed to append header digest"); 790 return (1); 791 } 792 } 793 794 if (request->ip_data_len != 0) { 795 padding = icl_pdu_padding(request); 796 if (padding > 0) { 797 ok = m_append(request->ip_data_mbuf, padding, 798 (void *)&zero); 799 if (ok != 1) { 800 ICL_WARN("failed to append padding"); 801 return (1); 802 } 803 } 804 805 if (ic->ic_data_crc32c) { 806 digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); 807 808 ok = m_append(request->ip_data_mbuf, sizeof(digest), 809 (void *)&digest); 810 if (ok != 1) { 811 ICL_WARN("failed to append header digest"); 812 return (1); 813 } 814 } 815 816 m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); 817 request->ip_data_mbuf = NULL; 818 } 819 820 request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; 821 822 error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, 823 NULL, MSG_DONTWAIT, curthread); 824 request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */ 825 if (error != 0) { 826 ICL_DEBUG("sosend error %d", error); 827 return (error); 828 } 829 830 return (0); 831} 832 833static void 834icl_conn_send_pdus(struct icl_conn *ic) 835{ 836 struct icl_pdu *request; 837 struct socket *so; 838 size_t available, size; 839 int error; 840 841 ICL_CONN_LOCK_ASSERT(ic); 842 843 so = ic->ic_socket; 844 845 SOCKBUF_LOCK(&so->so_snd); 846 available = sbspace(&so->so_snd); 847 SOCKBUF_UNLOCK(&so->so_snd); 848 849 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 850 if (ic->ic_disconnecting) 851 return; 852 853 request = TAILQ_FIRST(&ic->ic_to_send); 854 size = icl_pdu_size(request); 855 if (available < size) { 856 /* 857 * Set the low watermark on the socket, 858 * to avoid waking up until there is enough 859 * space. 860 */ 861 SOCKBUF_LOCK(&so->so_snd); 862 so->so_snd.sb_lowat = size; 863 SOCKBUF_UNLOCK(&so->so_snd); 864#if 1 865 ICL_DEBUG("no space to send; " 866 "have %zd, need %zd", 867 available, size); 868#endif 869 return; 870 } 871 available -= size; 872 TAILQ_REMOVE(&ic->ic_to_send, request, ip_next); 873 error = icl_pdu_send(request); 874 if (error != 0) { 875 ICL_DEBUG("failed to send PDU; " 876 "dropping connection"); 877 icl_conn_fail(ic); 878 return; 879 } 880 icl_pdu_free(request); 881 } 882} 883 884static void 885icl_send_thread(void *arg) 886{ 887 struct icl_conn *ic; 888 889 ic = arg; 890 891 ICL_CONN_LOCK(ic); 892 ic->ic_send_running = true; 893 894 for (;;) { 895 if (ic->ic_disconnecting) { 896 //ICL_DEBUG("terminating"); 897 break; 898 } 899 icl_conn_send_pdus(ic); 900 cv_wait(&ic->ic_send_cv, ic->ic_lock); 901 } 902 903 ic->ic_send_running = false; 904 ICL_CONN_UNLOCK(ic); 905 kthread_exit(); 906} 907 908static int 909icl_soupcall_send(struct socket *so, void *arg, int waitflag) 910{ 911 struct icl_conn *ic; 912 913 ic = arg; 914 cv_signal(&ic->ic_send_cv); 915 return (SU_OK); 916} 917 918int 919icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags) 920{ 921 struct mbuf *mb, *newmb; 922 size_t copylen, off = 0; 923 924 KASSERT(len > 0, ("len == 0")); 925 926 newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR); 927 if (newmb == NULL) { 928 ICL_WARN("failed to allocate mbuf for %zd bytes", len); 929 return (ENOMEM); 930 } 931 932 for (mb = newmb; mb != NULL; mb = mb->m_next) { 933 copylen = min(M_TRAILINGSPACE(mb), len - off); 934 memcpy(mtod(mb, char *), (const char *)addr + off, copylen); 935 mb->m_len = copylen; 936 off += copylen; 937 } 938 KASSERT(off == len, ("%s: off != len", __func__)); 939 940 if (request->ip_data_mbuf == NULL) { 941 request->ip_data_mbuf = newmb; 942 request->ip_data_len = len; 943 } else { 944 m_cat(request->ip_data_mbuf, newmb); 945 request->ip_data_len += len; 946 } 947 948 return (0); 949} 950 951void 952icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len) 953{ 954 955 m_copydata(ip->ip_data_mbuf, off, len, addr); 956} 957 958void 959icl_pdu_queue(struct icl_pdu *ip) 960{ 961 struct icl_conn *ic; 962 963 ic = ip->ip_conn; 964 965 ICL_CONN_LOCK_ASSERT(ic); 966 967 if (ic->ic_disconnecting || ic->ic_socket == NULL) { 968 ICL_DEBUG("icl_pdu_queue on closed connection"); 969 icl_pdu_free(ip); 970 return; 971 } 972 TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next); 973 cv_signal(&ic->ic_send_cv); 974} 975 976struct icl_conn * 977icl_conn_new(const char *name, struct mtx *lock) 978{ 979 struct icl_conn *ic; 980 981 refcount_acquire(&icl_ncons); 982 983 ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO); 984 985 TAILQ_INIT(&ic->ic_to_send); 986 ic->ic_lock = lock; 987 cv_init(&ic->ic_send_cv, "icl_tx"); 988 cv_init(&ic->ic_receive_cv, "icl_rx"); 989#ifdef DIAGNOSTIC 990 refcount_init(&ic->ic_outstanding_pdus, 0); 991#endif 992 ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH; 993 ic->ic_name = name; 994 995 return (ic); 996} 997 998void 999icl_conn_free(struct icl_conn *ic) 1000{ 1001 1002 cv_destroy(&ic->ic_send_cv); 1003 cv_destroy(&ic->ic_receive_cv); 1004 uma_zfree(icl_conn_zone, ic); 1005 refcount_release(&icl_ncons); 1006} 1007 1008static int 1009icl_conn_start(struct icl_conn *ic) 1010{ 1011 size_t bufsize; 1012 struct sockopt opt; 1013 int error, one = 1; 1014 1015 ICL_CONN_LOCK(ic); 1016 1017 /* 1018 * XXX: Ugly hack. 1019 */ 1020 if (ic->ic_socket == NULL) { 1021 ICL_CONN_UNLOCK(ic); 1022 return (EINVAL); 1023 } 1024 1025 ic->ic_receive_state = ICL_CONN_STATE_BHS; 1026 ic->ic_receive_len = sizeof(struct iscsi_bhs); 1027 ic->ic_disconnecting = false; 1028 1029 ICL_CONN_UNLOCK(ic); 1030 1031 /* 1032 * Use max available sockbuf size for sending. Do it manually 1033 * instead of sbreserve(9) to work around resource limits. 1034 * 1035 * XXX: This kind of sucks. On one hand, we don't currently support 1036 * sending a part of data segment; we always do it in one piece, 1037 * so we have to make sure it can fit in the socket buffer. 1038 * Once I've implemented partial send, we'll get rid of this 1039 * and use autoscaling. 1040 */ 1041 bufsize = (sizeof(struct iscsi_bhs) + 1042 ic->ic_max_data_segment_length) * 8; 1043 error = soreserve(ic->ic_socket, bufsize, bufsize); 1044 if (error != 0) { 1045 ICL_WARN("soreserve failed with error %d", error); 1046 icl_conn_close(ic); 1047 return (error); 1048 } 1049 1050 /* 1051 * Disable Nagle. 1052 */ 1053 bzero(&opt, sizeof(opt)); 1054 opt.sopt_dir = SOPT_SET; 1055 opt.sopt_level = IPPROTO_TCP; 1056 opt.sopt_name = TCP_NODELAY; 1057 opt.sopt_val = &one; 1058 opt.sopt_valsize = sizeof(one); 1059 error = sosetopt(ic->ic_socket, &opt); 1060 if (error != 0) { 1061 ICL_WARN("disabling TCP_NODELAY failed with error %d", error); 1062 icl_conn_close(ic); 1063 return (error); 1064 } 1065 1066 /* 1067 * Start threads. 1068 */ 1069 error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx", 1070 ic->ic_name); 1071 if (error != 0) { 1072 ICL_WARN("kthread_add(9) failed with error %d", error); 1073 icl_conn_close(ic); 1074 return (error); 1075 } 1076 1077 error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx", 1078 ic->ic_name); 1079 if (error != 0) { 1080 ICL_WARN("kthread_add(9) failed with error %d", error); 1081 icl_conn_close(ic); 1082 return (error); 1083 } 1084 1085 /* 1086 * Register socket upcall, to get notified about incoming PDUs 1087 * and free space to send outgoing ones. 1088 */ 1089 SOCKBUF_LOCK(&ic->ic_socket->so_snd); 1090 soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); 1091 SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); 1092 SOCKBUF_LOCK(&ic->ic_socket->so_rcv); 1093 soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); 1094 SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); 1095 1096 return (0); 1097} 1098 1099int 1100icl_conn_handoff(struct icl_conn *ic, int fd) 1101{ 1102 struct file *fp; 1103 struct socket *so; 1104 cap_rights_t rights; 1105 int error; 1106 1107 ICL_CONN_LOCK_ASSERT_NOT(ic); 1108 1109 /* 1110 * Steal the socket from userland. 1111 */ 1112 error = fget(curthread, fd, 1113 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 1114 if (error != 0) 1115 return (error); 1116 if (fp->f_type != DTYPE_SOCKET) { 1117 fdrop(fp, curthread); 1118 return (EINVAL); 1119 } 1120 so = fp->f_data; 1121 if (so->so_type != SOCK_STREAM) { 1122 fdrop(fp, curthread); 1123 return (EINVAL); 1124 } 1125 1126 ICL_CONN_LOCK(ic); 1127 1128 if (ic->ic_socket != NULL) { 1129 ICL_CONN_UNLOCK(ic); 1130 fdrop(fp, curthread); 1131 return (EBUSY); 1132 } 1133 1134 ic->ic_socket = fp->f_data; 1135 fp->f_ops = &badfileops; 1136 fp->f_data = NULL; 1137 fdrop(fp, curthread); 1138 ICL_CONN_UNLOCK(ic); 1139 1140 error = icl_conn_start(ic); 1141 1142 return (error); 1143} 1144 1145void 1146icl_conn_shutdown(struct icl_conn *ic) 1147{ 1148 ICL_CONN_LOCK_ASSERT_NOT(ic); 1149 1150 ICL_CONN_LOCK(ic); 1151 if (ic->ic_socket == NULL) { 1152 ICL_CONN_UNLOCK(ic); 1153 return; 1154 } 1155 ICL_CONN_UNLOCK(ic); 1156 1157 soshutdown(ic->ic_socket, SHUT_RDWR); 1158} 1159 1160void 1161icl_conn_close(struct icl_conn *ic) 1162{ 1163 struct icl_pdu *pdu; 1164 1165 ICL_CONN_LOCK_ASSERT_NOT(ic); 1166 1167 ICL_CONN_LOCK(ic); 1168 if (ic->ic_socket == NULL) { 1169 ICL_CONN_UNLOCK(ic); 1170 return; 1171 } 1172 1173 ic->ic_disconnecting = true; 1174 1175 /* 1176 * Wake up the threads, so they can properly terminate. 1177 */ 1178 cv_signal(&ic->ic_receive_cv); 1179 cv_signal(&ic->ic_send_cv); 1180 while (ic->ic_receive_running || ic->ic_send_running) { 1181 //ICL_DEBUG("waiting for send/receive threads to terminate"); 1182 ICL_CONN_UNLOCK(ic); 1183 cv_signal(&ic->ic_receive_cv); 1184 cv_signal(&ic->ic_send_cv); 1185 pause("icl_close", 1 * hz); 1186 ICL_CONN_LOCK(ic); 1187 } 1188 //ICL_DEBUG("send/receive threads terminated"); 1189 1190 soclose(ic->ic_socket); 1191 ic->ic_socket = NULL; 1192 1193 if (ic->ic_receive_pdu != NULL) { 1194 //ICL_DEBUG("freeing partially received PDU"); 1195 icl_pdu_free(ic->ic_receive_pdu); 1196 ic->ic_receive_pdu = NULL; 1197 } 1198 1199 /* 1200 * Remove any outstanding PDUs from the send queue. 1201 */ 1202 while (!TAILQ_EMPTY(&ic->ic_to_send)) { 1203 pdu = TAILQ_FIRST(&ic->ic_to_send); 1204 TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next); 1205 icl_pdu_free(pdu); 1206 } 1207 1208 KASSERT(TAILQ_EMPTY(&ic->ic_to_send), 1209 ("destroying session with non-empty send queue")); 1210 /* 1211 * XXX 1212 */ 1213#if 0 1214 KASSERT(ic->ic_outstanding_pdus == 0, 1215 ("destroying session with %d outstanding PDUs", 1216 ic->ic_outstanding_pdus)); 1217#endif 1218 ICL_CONN_UNLOCK(ic); 1219} 1220 1221bool 1222icl_conn_connected(struct icl_conn *ic) 1223{ 1224 ICL_CONN_LOCK_ASSERT_NOT(ic); 1225 1226 ICL_CONN_LOCK(ic); 1227 if (ic->ic_socket == NULL) { 1228 ICL_CONN_UNLOCK(ic); 1229 return (false); 1230 } 1231 if (ic->ic_socket->so_error != 0) { 1232 ICL_CONN_UNLOCK(ic); 1233 return (false); 1234 } 1235 ICL_CONN_UNLOCK(ic); 1236 return (true); 1237} 1238 1239#ifdef ICL_KERNEL_PROXY 1240int 1241icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so) 1242{ 1243 int error; 1244 1245 ICL_CONN_LOCK_ASSERT_NOT(ic); 1246 1247 if (so->so_type != SOCK_STREAM) 1248 return (EINVAL); 1249 1250 ICL_CONN_LOCK(ic); 1251 if (ic->ic_socket != NULL) { 1252 ICL_CONN_UNLOCK(ic); 1253 return (EBUSY); 1254 } 1255 ic->ic_socket = so; 1256 ICL_CONN_UNLOCK(ic); 1257 1258 error = icl_conn_start(ic); 1259 1260 return (error); 1261} 1262#endif /* ICL_KERNEL_PROXY */ 1263 1264static int 1265icl_unload(void) 1266{ 1267 1268 if (icl_ncons != 0) 1269 return (EBUSY); 1270 1271 uma_zdestroy(icl_conn_zone); 1272 uma_zdestroy(icl_pdu_zone); 1273 1274 return (0); 1275} 1276 1277static void 1278icl_load(void) 1279{ 1280 1281 icl_conn_zone = uma_zcreate("icl_conn", 1282 sizeof(struct icl_conn), NULL, NULL, NULL, NULL, 1283 UMA_ALIGN_PTR, 0); 1284 icl_pdu_zone = uma_zcreate("icl_pdu", 1285 sizeof(struct icl_pdu), NULL, NULL, NULL, NULL, 1286 UMA_ALIGN_PTR, 0); 1287 1288 refcount_init(&icl_ncons, 0); 1289} 1290 1291static int 1292icl_modevent(module_t mod, int what, void *arg) 1293{ 1294 1295 switch (what) { 1296 case MOD_LOAD: 1297 icl_load(); 1298 return (0); 1299 case MOD_UNLOAD: 1300 return (icl_unload()); 1301 default: 1302 return (EINVAL); 1303 } 1304} 1305 1306moduledata_t icl_data = { 1307 "icl", 1308 icl_modevent, 1309 0 1310}; 1311 1312DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST); 1313MODULE_VERSION(icl, 1); 1314