1/* 2 * Copyright (c) 2009-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/systm.h> 30#include <sys/kernel.h> 31#include <sys/types.h> 32#include <sys/filedesc.h> 33#include <sys/file_internal.h> 34#include <sys/proc.h> 35#include <sys/socket.h> 36#include <sys/socketvar.h> 37#include <sys/errno.h> 38#include <sys/protosw.h> 39#include <sys/domain.h> 40#include <sys/mbuf.h> 41#include <sys/queue.h> 42 43#include <net/if.h> 44#include <net/route.h> 45 46#include <netinet/in.h> 47#include <netinet/in_var.h> 48#include <netinet/in_pcb.h> 49#include <netinet/ip.h> 50#include <netinet/ip_var.h> 51#include <netinet/ip6.h> 52#include <netinet6/ip6_var.h> 53#include <netinet/udp.h> 54#include <netinet/udp_var.h> 55#include <netinet/tcp.h> 56#include <netinet/tcp_var.h> 57#include <netinet/tcp_cc.h> 58#include <netinet/lro_ext.h> 59 60extern char *proc_name_address(void *p); 61 62static int tfp_count = 0; 63 64static TAILQ_HEAD(, tclass_for_proc) tfp_head = 65 TAILQ_HEAD_INITIALIZER(tfp_head); 66 67struct tclass_for_proc { 68 TAILQ_ENTRY(tclass_for_proc) tfp_link; 69 int tfp_class; 70 pid_t tfp_pid; 71 char tfp_pname[MAXCOMLEN + 1]; 72}; 73 74static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t); 75static int get_pid_tclass(struct so_tcdbg *); 76static int get_pname_tclass(struct so_tcdbg *); 77static int set_pid_tclass(struct so_tcdbg *); 78static int set_pname_tclass(struct so_tcdbg *); 79static int flush_pid_tclass(struct so_tcdbg *); 80static int purge_tclass_for_proc(void); 81static int flush_tclass_for_proc(void); 82int get_tclass_for_curr_proc(int *); 83 84static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ 85static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ 86static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ 87decl_lck_mtx_data(static, tclass_lock_data); 88static lck_mtx_t *tclass_lock = &tclass_lock_data; 89 90/* 91 * If there is no foreground activity on the interface for bg_switch_time 92 * seconds, the background connections can switch to foreground TCP 93 * congestion control. 94 */ 95#define TCP_BG_SWITCH_TIME 2 96 97/* 98 * Must be called with tclass_lock held 99 */ 100static struct tclass_for_proc * 101find_tfp_by_pid(pid_t pid) 102{ 103 struct tclass_for_proc *tfp; 104 105 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 106 if (tfp->tfp_pid == pid) 107 break; 108 } 109 return (tfp); 110} 111 112/* 113 * Must be called with tclass_lock held 114 */ 115static struct tclass_for_proc * 116find_tfp_by_pname(const char *pname) 117{ 118 struct tclass_for_proc *tfp; 119 120 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 121 if (strncmp(pname, tfp->tfp_pname, 122 sizeof (tfp->tfp_pname)) == 0) 123 break; 124 } 125 return (tfp); 126} 127 128__private_extern__ int 129get_tclass_for_curr_proc(int *sotc) 130{ 131 struct tclass_for_proc *tfp = NULL; 132 proc_t p = current_proc(); /* Not ref counted */ 133 pid_t pid = proc_pid(p); 134 char *pname = proc_name_address(p); 135 136 *sotc = -1; 137 138 lck_mtx_lock(tclass_lock); 139 140 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 141 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 && 142 strncmp(pname, tfp->tfp_pname, 143 sizeof (tfp->tfp_pname)) == 0)) { 144 *sotc = tfp->tfp_class; 145 break; 146 } 147 } 148 149 lck_mtx_unlock(tclass_lock); 150 151 return ((tfp == NULL) ? 0 : 1); 152} 153 154/* 155 * Purge entries with PIDs of exited processes 156 */ 157int 158purge_tclass_for_proc(void) 159{ 160 int error = 0; 161 struct tclass_for_proc *tfp, *tvar; 162 163 lck_mtx_lock(tclass_lock); 164 165 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { 166 proc_t p; 167 168 if (tfp->tfp_pid == -1) 169 continue; 170 if ((p = proc_find(tfp->tfp_pid)) == NULL) { 171 tfp_count--; 172 TAILQ_REMOVE(&tfp_head, tfp, tfp_link); 173 174 _FREE(tfp, M_TEMP); 175 } else { 176 proc_rele(p); 177 } 178 } 179 180 lck_mtx_unlock(tclass_lock); 181 182 return (error); 183} 184 185/* 186 * Remove one entry 187 * Must be called with tclass_lock held 188 */ 189static void 190free_tclass_for_proc(struct tclass_for_proc *tfp) 191{ 192 if (tfp == NULL) 193 return; 194 tfp_count--; 195 TAILQ_REMOVE(&tfp_head, tfp, tfp_link); 196 _FREE(tfp, M_TEMP); 197} 198 199/* 200 * Remove all entries 201 */ 202int 203flush_tclass_for_proc(void) 204{ 205 int error = 0; 206 struct tclass_for_proc *tfp, *tvar; 207 208 lck_mtx_lock(tclass_lock); 209 210 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { 211 free_tclass_for_proc(tfp); 212 } 213 214 lck_mtx_unlock(tclass_lock); 215 216 return (error); 217 218} 219 220/* 221 * Must be called with tclass_lock held 222 */ 223static struct tclass_for_proc * 224alloc_tclass_for_proc(pid_t pid, const char *pname) 225{ 226 struct tclass_for_proc *tfp; 227 228 if (pid == -1 && pname == NULL) 229 return (NULL); 230 231 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO); 232 if (tfp == NULL) 233 return (NULL); 234 235 tfp->tfp_pid = pid; 236 /* 237 * Add per pid entries before per proc name so we can find 238 * a specific instance of a process before the general name base entry. 239 */ 240 if (pid != -1) { 241 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link); 242 } else { 243 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname)); 244 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link); 245 } 246 247 tfp_count++; 248 249 return (tfp); 250} 251 252/* 253 * -1 for tclass means to remove the entry 254 */ 255int 256set_pid_tclass(struct so_tcdbg *so_tcdbg) 257{ 258 int error = EINVAL; 259 proc_t p = NULL; 260 struct filedesc *fdp; 261 struct fileproc *fp; 262 struct tclass_for_proc *tfp; 263 int i; 264 pid_t pid = so_tcdbg->so_tcdbg_pid; 265 int tclass = so_tcdbg->so_tcdbg_tclass; 266 267 p = proc_find(pid); 268 if (p == NULL) { 269 printf("%s proc_find(%d) failed\n", __func__, pid); 270 goto done; 271 } 272 273 /* Need a tfp */ 274 lck_mtx_lock(tclass_lock); 275 276 tfp = find_tfp_by_pid(pid); 277 if (tfp == NULL) { 278 tfp = alloc_tclass_for_proc(pid, NULL); 279 if (tfp == NULL) { 280 lck_mtx_unlock(tclass_lock); 281 error = ENOBUFS; 282 goto done; 283 } 284 } 285 tfp->tfp_class = tclass; 286 287 lck_mtx_unlock(tclass_lock); 288 289 if (tfp != NULL) { 290 proc_fdlock(p); 291 292 fdp = p->p_fd; 293 for (i = 0; i < fdp->fd_nfiles; i++) { 294 struct socket *so; 295 296 fp = fdp->fd_ofiles[i]; 297 if (fp == NULL || 298 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || 299 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) 300 continue; 301 302 so = (struct socket *)fp->f_fglob->fg_data; 303 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) 304 continue; 305 socket_lock(so, 1); 306 if (tclass != -1) { 307 error = so_set_traffic_class(so, tclass); 308 if (error != 0) { 309 printf("%s: so_set_traffic_class" 310 "(so=0x%llx, fd=%d, tclass=%d) " 311 "failed %d\n", __func__, 312 (uint64_t)VM_KERNEL_ADDRPERM(so), 313 i, tclass, error); 314 error = 0; 315 } 316 } 317 socket_unlock(so, 1); 318 } 319 320 proc_fdunlock(p); 321 } 322 323 error = 0; 324done: 325 if (p != NULL) 326 proc_rele(p); 327 328 return (error); 329} 330 331int 332set_pname_tclass(struct so_tcdbg *so_tcdbg) 333{ 334 int error = EINVAL; 335 struct tclass_for_proc *tfp; 336 337 lck_mtx_lock(tclass_lock); 338 339 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); 340 if (tfp == NULL) { 341 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname); 342 if (tfp == NULL) { 343 lck_mtx_unlock(tclass_lock); 344 error = ENOBUFS; 345 goto done; 346 } 347 } 348 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass; 349 350 lck_mtx_unlock(tclass_lock); 351 352 error = 0; 353done: 354 355 return (error); 356} 357 358static int 359flush_pid_tclass(struct so_tcdbg *so_tcdbg) 360{ 361 pid_t pid = so_tcdbg->so_tcdbg_pid; 362 int tclass = so_tcdbg->so_tcdbg_tclass; 363 struct filedesc *fdp; 364 int error = EINVAL; 365 proc_t p; 366 int i; 367 368 p = proc_find(pid); 369 if (p == PROC_NULL) { 370 printf("%s proc_find(%d) failed\n", __func__, pid); 371 goto done; 372 } 373 374 proc_fdlock(p); 375 fdp = p->p_fd; 376 for (i = 0; i < fdp->fd_nfiles; i++) { 377 struct socket *so; 378 struct fileproc *fp; 379 380 fp = fdp->fd_ofiles[i]; 381 if (fp == NULL || 382 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || 383 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) 384 continue; 385 386 so = (struct socket *)fp->f_fglob->fg_data; 387 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass, 388 sizeof (tclass)); 389 if (error != 0) { 390 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, " 391 "tclass=%d) failed %d\n", __func__, 392 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass, 393 error); 394 error = 0; 395 } 396 } 397 proc_fdunlock(p); 398 399 error = 0; 400done: 401 if (p != PROC_NULL) 402 proc_rele(p); 403 404 return (error); 405} 406 407int 408get_pid_tclass(struct so_tcdbg *so_tcdbg) 409{ 410 int error = EINVAL; 411 proc_t p = NULL; 412 struct tclass_for_proc *tfp; 413 pid_t pid = so_tcdbg->so_tcdbg_pid; 414 415 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ 416 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ 417 418 p = proc_find(pid); 419 if (p == NULL) { 420 printf("%s proc_find(%d) failed\n", __func__, pid); 421 goto done; 422 } 423 424 /* Need a tfp */ 425 lck_mtx_lock(tclass_lock); 426 427 tfp = find_tfp_by_pid(pid); 428 if (tfp != NULL) { 429 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; 430 error = 0; 431 } 432 lck_mtx_unlock(tclass_lock); 433done: 434 if (p != NULL) 435 proc_rele(p); 436 437 return (error); 438} 439 440int 441get_pname_tclass(struct so_tcdbg *so_tcdbg) 442{ 443 int error = EINVAL; 444 struct tclass_for_proc *tfp; 445 446 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ 447 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ 448 449 /* Need a tfp */ 450 lck_mtx_lock(tclass_lock); 451 452 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); 453 if (tfp != NULL) { 454 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; 455 error = 0; 456 } 457 lck_mtx_unlock(tclass_lock); 458 459 return (error); 460} 461 462static int 463delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg) 464{ 465 int error = EINVAL; 466 pid_t pid = so_tcdbg->so_tcdbg_pid; 467 struct tclass_for_proc *tfp = NULL; 468 469 lck_mtx_lock(tclass_lock); 470 471 if (pid != -1) 472 tfp = find_tfp_by_pid(pid); 473 else 474 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); 475 476 if (tfp != NULL) { 477 free_tclass_for_proc(tfp); 478 error = 0; 479 } 480 481 lck_mtx_unlock(tclass_lock); 482 483 return (error); 484} 485 486/* 487 * Setting options requires privileges 488 */ 489__private_extern__ int 490so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg) 491{ 492 int error = 0; 493 494 if ((so->so_state & SS_PRIV) == 0) 495 return (EPERM); 496 497 socket_unlock(so, 0); 498 499 switch (so_tcdbg->so_tcdbg_cmd) { 500 case SO_TCDBG_PID: 501 error = set_pid_tclass(so_tcdbg); 502 break; 503 504 case SO_TCDBG_PNAME: 505 error = set_pname_tclass(so_tcdbg); 506 break; 507 508 case SO_TCDBG_PURGE: 509 error = purge_tclass_for_proc(); 510 break; 511 512 case SO_TCDBG_FLUSH: 513 error = flush_tclass_for_proc(); 514 break; 515 516 case SO_TCDBG_DELETE: 517 error = delete_tclass_for_pid_pname(so_tcdbg); 518 break; 519 520 case SO_TCDBG_TCFLUSH_PID: 521 error = flush_pid_tclass(so_tcdbg); 522 break; 523 524 default: 525 error = EINVAL; 526 break; 527 } 528 529 socket_lock(so, 0); 530 531 return (error); 532} 533 534/* 535 * Not required to be privileged to get 536 */ 537__private_extern__ int 538sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) 539{ 540 int error = 0; 541 struct so_tcdbg so_tcdbg; 542 void *buf = NULL; 543 size_t len = sopt->sopt_valsize; 544 545 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), 546 sizeof (struct so_tcdbg)); 547 if (error != 0) 548 return (error); 549 550 sopt->sopt_valsize = len; 551 552 socket_unlock(so, 0); 553 554 switch (so_tcdbg.so_tcdbg_cmd) { 555 case SO_TCDBG_PID: 556 error = get_pid_tclass(&so_tcdbg); 557 break; 558 559 case SO_TCDBG_PNAME: 560 error = get_pname_tclass(&so_tcdbg); 561 break; 562 563 case SO_TCDBG_COUNT: 564 lck_mtx_lock(tclass_lock); 565 so_tcdbg.so_tcdbg_count = tfp_count; 566 lck_mtx_unlock(tclass_lock); 567 break; 568 569 case SO_TCDBG_LIST: { 570 struct tclass_for_proc *tfp; 571 int n, alloc_count; 572 struct so_tcdbg *ptr; 573 574 lck_mtx_lock(tclass_lock); 575 if ((alloc_count = tfp_count) == 0) { 576 lck_mtx_unlock(tclass_lock); 577 error = EINVAL; 578 break; 579 } 580 len = alloc_count * sizeof (struct so_tcdbg); 581 lck_mtx_unlock(tclass_lock); 582 583 buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO); 584 if (buf == NULL) { 585 error = ENOBUFS; 586 break; 587 } 588 589 lck_mtx_lock(tclass_lock); 590 n = 0; 591 ptr = (struct so_tcdbg *)buf; 592 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 593 if (++n > alloc_count) 594 break; 595 if (tfp->tfp_pid != -1) { 596 ptr->so_tcdbg_cmd = SO_TCDBG_PID; 597 ptr->so_tcdbg_pid = tfp->tfp_pid; 598 } else { 599 ptr->so_tcdbg_cmd = SO_TCDBG_PNAME; 600 ptr->so_tcdbg_pid = -1; 601 strlcpy(ptr->so_tcdbg_pname, 602 tfp->tfp_pname, 603 sizeof (ptr->so_tcdbg_pname)); 604 } 605 ptr->so_tcdbg_tclass = tfp->tfp_class; 606 ptr++; 607 } 608 609 lck_mtx_unlock(tclass_lock); 610 } 611 break; 612 613 default: 614 error = EINVAL; 615 break; 616 } 617 618 socket_lock(so, 0); 619 620 if (error == 0) { 621 if (buf == NULL) { 622 error = sooptcopyout(sopt, &so_tcdbg, 623 sizeof (struct so_tcdbg)); 624 } else { 625 error = sooptcopyout(sopt, buf, len); 626 _FREE(buf, M_TEMP); 627 } 628 } 629 return (error); 630} 631 632 633__private_extern__ int 634so_set_traffic_class(struct socket *so, int optval) 635{ 636 int error = 0; 637 638 if (optval < SO_TC_BE || optval > SO_TC_CTL) { 639 error = EINVAL; 640 } else { 641 switch (optval) { 642 case _SO_TC_BK: 643 optval = SO_TC_BK; 644 break; 645 case _SO_TC_VI: 646 optval = SO_TC_VI; 647 break; 648 case _SO_TC_VO: 649 optval = SO_TC_VO; 650 break; 651 default: 652 if (!SO_VALID_TC(optval)) 653 error = EINVAL; 654 break; 655 } 656 657 if (error == 0) { 658 int oldval = so->so_traffic_class; 659 660 VERIFY(SO_VALID_TC(optval)); 661 so->so_traffic_class = optval; 662 663 if ((SOCK_DOM(so) == PF_INET || 664 SOCK_DOM(so) == PF_INET6) && 665 SOCK_TYPE(so) == SOCK_STREAM) 666 set_tcp_stream_priority(so); 667 668 if ((SOCK_DOM(so) == PF_INET || 669 SOCK_DOM(so) == PF_INET6) && 670 optval != oldval && (optval == SO_TC_BK_SYS || 671 oldval == SO_TC_BK_SYS)) { 672 /* 673 * If the app switches from BK_SYS to something 674 * else, resume the socket if it was suspended. 675 */ 676 if (oldval == SO_TC_BK_SYS) 677 inp_reset_fc_state(so->so_pcb); 678 679 SOTHROTTLELOG(("throttle[%d]: so 0x%llx " 680 "[%d,%d] opportunistic %s\n", so->last_pid, 681 (uint64_t)VM_KERNEL_ADDRPERM(so), 682 SOCK_DOM(so), SOCK_TYPE(so), 683 (optval == SO_TC_BK_SYS) ? "ON" : "OFF")); 684 } 685 } 686 } 687 return (error); 688} 689 690__private_extern__ void 691so_set_default_traffic_class(struct socket *so) 692{ 693 int sotc = -1; 694 695 if (tfp_count > 0 && 696 (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) { 697 get_tclass_for_curr_proc(&sotc); 698 } 699 700 so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE; 701} 702 703__private_extern__ int 704so_set_opportunistic(struct socket *so, int optval) 705{ 706 return (so_set_traffic_class(so, (optval == 0) ? 707 SO_TC_BE : SO_TC_BK_SYS)); 708} 709 710__private_extern__ int 711so_get_opportunistic(struct socket *so) 712{ 713 return (so->so_traffic_class == SO_TC_BK_SYS); 714} 715 716__private_extern__ mbuf_svc_class_t 717mbuf_service_class_from_control(struct mbuf *control) 718{ 719 struct cmsghdr *cm; 720 mbuf_svc_class_t msc = MBUF_SC_UNSPEC; 721 722 for (cm = M_FIRST_CMSGHDR(control); cm != NULL; 723 cm = M_NXT_CMSGHDR(control, cm)) { 724 int tc; 725 726 if (cm->cmsg_len < sizeof (struct cmsghdr)) 727 break; 728 729 if (cm->cmsg_level != SOL_SOCKET || 730 cm->cmsg_type != SO_TRAFFIC_CLASS) 731 continue; 732 if (cm->cmsg_len != CMSG_LEN(sizeof (int))) 733 continue; 734 735 tc = *(int *)(void *)CMSG_DATA(cm); 736 msc = so_tc2msc(tc); 737 if (MBUF_VALID_SC(msc)) 738 break; 739 } 740 741 return (msc); 742} 743 744__private_extern__ int 745dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc) 746{ 747 int dscp_code; 748 749 switch (mtc) { 750 default: 751 case MBUF_TC_BE: 752 dscp_code = 0; 753 break; 754 case MBUF_TC_BK: 755 dscp_code = 0x08; 756 break; 757 case MBUF_TC_VI: 758 dscp_code = 0x20; 759 break; 760 case MBUF_TC_VO: 761 dscp_code = 0x30; 762 break; 763 } 764 765 return (dscp_code); 766} 767 768__private_extern__ void 769so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off) 770{ 771 uint32_t sotc = m_get_traffic_class(m); 772 773 if (sotc >= SO_TC_STATS_MAX) 774 sotc = SO_TC_BE; 775 776 so->so_tc_stats[sotc].rxpackets += 1; 777 so->so_tc_stats[sotc].rxbytes += 778 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; 779} 780 781__private_extern__ void 782set_tcp_stream_priority(struct socket *so) 783{ 784 struct inpcb *inp = sotoinpcb(so); 785 struct tcpcb *tp = intotcpcb(inp); 786 struct ifnet *outifp; 787 u_char old_cc = tp->tcp_cc_index; 788 int recvbg = IS_TCP_RECV_BG(so); 789 bool is_local, fg_active = false; 790 u_int32_t uptime; 791 792 VERIFY((SOCK_CHECK_DOM(so, PF_INET) 793 || SOCK_CHECK_DOM(so, PF_INET6)) 794 && SOCK_CHECK_TYPE(so, SOCK_STREAM) 795 && SOCK_CHECK_PROTO(so, IPPROTO_TCP)); 796 797 outifp = inp->inp_last_outifp; 798 uptime = net_uptime(); 799 800 /* 801 * If the socket was marked as a background socket or if the 802 * traffic class is set to background with traffic class socket 803 * option then make both send and recv side of the stream to be 804 * background. The variable sotcdb which can be set with sysctl 805 * is used to disable these settings for testing. 806 */ 807 if (soissrcbackground(so)) { 808 if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK)) 809 is_local = true; 810 else 811 is_local = false; 812 813 /* Check if there has been recent foreground activity */ 814 if ((outifp != NULL && 815 outifp->if_fg_sendts > 0 && 816 (int)(uptime - outifp->if_fg_sendts) <= 817 TCP_BG_SWITCH_TIME) || 818 net_io_policy_throttled) 819 fg_active = true; 820 821 /* 822 * If the interface that the connection is using is 823 * loopback, do not use background congestion 824 * control algorithm. 825 * 826 * If there has been recent foreground activity or if 827 * there was an indication that a foreground application 828 * is going to use networking (net_io_policy_throttled), 829 * switch the backgroung streams to use background 830 * congestion control algorithm. Otherwise, even background 831 * flows can move into foreground. 832 */ 833 if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || 834 is_local || !fg_active) { 835 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) 836 tcp_set_foreground_cc(so); 837 } else { 838 if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX) 839 tcp_set_background_cc(so); 840 } 841 842 /* Set receive side background flags */ 843 if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || 844 is_local || !fg_active) 845 tcp_clear_recv_bg(so); 846 else 847 tcp_set_recv_bg(so); 848 } else { 849 tcp_clear_recv_bg(so); 850 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) 851 tcp_set_foreground_cc(so); 852 } 853 854 if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) { 855 SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; " 856 "%s recv\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so), 857 SOCK_DOM(so), SOCK_TYPE(so), 858 (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ? 859 "background" : "foreground", 860 IS_TCP_RECV_BG(so) ? "background" : "foreground")); 861 } 862} 863 864/* 865 * Set traffic class to an IPv4 or IPv6 packet 866 * - mark the mbuf 867 * - set the DSCP code following the WMM mapping 868 */ 869__private_extern__ void 870set_packet_service_class(struct mbuf *m, struct socket *so, 871 mbuf_svc_class_t in_msc, u_int32_t flags) 872{ 873 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */ 874 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ 875 struct ip *ip = mtod(m, struct ip *); 876#if INET6 877 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); 878#endif /* INET6 */ 879 int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0; 880 881 if (!(m->m_flags & M_PKTHDR)) 882 return; 883 884 /* 885 * Here is the precedence: 886 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all 887 * 2) Traffic class passed via ancillary data to sendmsdg(2) 888 * 3) Traffic class socket option last 889 */ 890 if (in_msc != MBUF_SC_UNSPEC) { 891 if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL) 892 msc = in_msc; 893 } else { 894 VERIFY(SO_VALID_TC(so->so_traffic_class)); 895 msc = so_tc2msc(so->so_traffic_class); 896 /* Assert because tc must have been valid */ 897 VERIFY(MBUF_VALID_SC(msc)); 898 } 899 900 /* 901 * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority. 902 */ 903 if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc)) 904 msc = MBUF_SC_BK; 905 906 if (soissrcbackground(so)) 907 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND; 908 /* 909 * Set the traffic class in the mbuf packet header svc field 910 */ 911 if (sotcdb & SOTCDB_NO_MTC) 912 goto no_mbtc; 913 914 /* Elevate service class if the packet is a pure TCP ACK. 915 * We can do this only when the flow is not a background 916 * flow and the outgoing interface supports 917 * transmit-start model. 918 */ 919 if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK)) 920 msc = MBUF_SC_CTL; 921 922 (void) m_set_service_class(m, msc); 923 924 /* 925 * Set the privileged traffic auxiliary flag if applicable, 926 * or clear it. 927 */ 928 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) && 929 msc != MBUF_SC_UNSPEC) 930 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED; 931 else 932 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED; 933 934no_mbtc: 935 /* 936 * Quick exit when best effort 937 */ 938 if (msc == MBUF_SC_BE) 939 goto no_dscp; 940 941 /* 942 * The default behavior is for the networking stack to not set the 943 * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is 944 * cleared, set the DSCP code in IPv4 or IPv6 header only for local 945 * traffic, if it is not already set. <rdar://problem/11277343> 946 */ 947 if (sotcdb & SOTCDB_NO_DSCP) 948 goto no_dscp; 949 950 /* 951 * Test if a IP TOS or IPV6 TCLASS has already been set 952 * on the socket or the raw packet. 953 */ 954 if (!(sotcdb & SOTCDB_NO_DSCPTST)) { 955#if INET6 956 if (isipv6) { 957 if ((so->so_type == SOCK_RAW && 958 (ip6->ip6_flow & htonl(0xff << 20)) != 0) || 959 (inp->in6p_outputopts && 960 inp->in6p_outputopts->ip6po_tclass != -1)) 961 goto no_dscp; 962 } else 963#endif /* INET6 */ 964 if ((so->so_type == SOCK_RAW && 965 (inp->inp_flags & INP_HDRINCL)) || 966 inp->inp_ip_tos != 0) 967 goto no_dscp; 968 } 969 970 /* 971 * Test if destination is local 972 */ 973 if (!(sotcdb & SOTCDB_NO_LCLTST)) { 974 int islocal = 0; 975 struct rtentry *rt = inp->inp_route.ro_rt; 976 977 if (so->so_type == SOCK_STREAM) { 978 if (intotcpcb(inp)->t_flags & TF_LOCAL) 979 islocal = 1; 980 } else if (rt != NULL && 981 (rt->rt_gateway->sa_family == AF_LINK || 982 (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) { 983 if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT)) 984 islocal = 1; 985 } else 986#if INET6 987 if (isipv6 && in6addr_local(&ip6->ip6_dst)) { 988 islocal = 1; 989 } else 990#endif /* INET6 */ 991 if (inaddr_local(ip->ip_dst)) { 992 islocal = 1; 993 } 994 if (islocal == 0) 995 goto no_dscp; 996 } 997 998#if INET6 999 if (isipv6) 1000 ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass( 1001 m_get_traffic_class(m)) << 20); 1002 else 1003#endif /* INET6 */ 1004 ip->ip_tos |= dscp_code_from_mbuf_tclass( 1005 m_get_traffic_class(m)) << 2; 1006 1007no_dscp: 1008 /* 1009 * For TCP with background traffic class switch CC algo based on sysctl 1010 */ 1011 if (so->so_type == SOCK_STREAM) 1012 set_tcp_stream_priority(so); 1013 1014 so_tc_update_stats(m, so, msc); 1015} 1016 1017__private_extern__ void 1018so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc) 1019{ 1020 mbuf_traffic_class_t mtc; 1021 1022 /* 1023 * Assume socket and mbuf traffic class values are the same 1024 * Also assume the socket lock is held. Note that the stats 1025 * at the socket layer are reduced down to the legacy traffic 1026 * classes; we could/should potentially expand so_tc_stats[]. 1027 */ 1028 mtc = MBUF_SC2TC(msc); 1029 VERIFY(mtc < SO_TC_STATS_MAX); 1030 so->so_tc_stats[mtc].txpackets += 1; 1031 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len; 1032} 1033 1034__private_extern__ void 1035socket_tclass_init(void) 1036{ 1037 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX); 1038 1039 tclass_lck_grp_attr = lck_grp_attr_alloc_init(); 1040 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); 1041 tclass_lck_attr = lck_attr_alloc_init(); 1042 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr); 1043} 1044 1045__private_extern__ mbuf_svc_class_t 1046so_tc2msc(int tc) 1047{ 1048 mbuf_svc_class_t msc; 1049 1050 switch (tc) { 1051 case SO_TC_BK_SYS: 1052 msc = MBUF_SC_BK_SYS; 1053 break; 1054 case SO_TC_BK: 1055 case _SO_TC_BK: 1056 msc = MBUF_SC_BK; 1057 break; 1058 case SO_TC_BE: 1059 msc = MBUF_SC_BE; 1060 break; 1061 case SO_TC_RD: 1062 msc = MBUF_SC_RD; 1063 break; 1064 case SO_TC_OAM: 1065 msc = MBUF_SC_OAM; 1066 break; 1067 case SO_TC_AV: 1068 msc = MBUF_SC_AV; 1069 break; 1070 case SO_TC_RV: 1071 msc = MBUF_SC_RV; 1072 break; 1073 case SO_TC_VI: 1074 case _SO_TC_VI: 1075 msc = MBUF_SC_VI; 1076 break; 1077 case SO_TC_VO: 1078 case _SO_TC_VO: 1079 msc = MBUF_SC_VO; 1080 break; 1081 case SO_TC_CTL: 1082 msc = MBUF_SC_CTL; 1083 break; 1084 case SO_TC_ALL: 1085 default: 1086 msc = MBUF_SC_UNSPEC; 1087 break; 1088 } 1089 1090 return (msc); 1091} 1092 1093__private_extern__ int 1094so_svc2tc(mbuf_svc_class_t svc) 1095{ 1096 switch (svc) { 1097 case MBUF_SC_UNSPEC: 1098 return SO_TC_BE; 1099 case MBUF_SC_BK_SYS: 1100 return SO_TC_BK_SYS; 1101 case MBUF_SC_BK: 1102 return SO_TC_BK; 1103 case MBUF_SC_BE: 1104 return SO_TC_BE; 1105 case MBUF_SC_RD: 1106 return SO_TC_RD; 1107 case MBUF_SC_OAM: 1108 return SO_TC_OAM; 1109 case MBUF_SC_AV: 1110 return SO_TC_AV; 1111 case MBUF_SC_RV: 1112 return SO_TC_RV; 1113 case MBUF_SC_VI: 1114 return SO_TC_VI; 1115 case MBUF_SC_VO: 1116 return SO_TC_VO; 1117 case MBUF_SC_CTL: 1118 return SO_TC_CTL; 1119 default: 1120 return SO_TC_BE; 1121 } 1122} 1123 1124/* 1125 * LRO is turned on for AV streaming class. 1126 */ 1127void 1128so_set_lro(struct socket *so, int optval) 1129{ 1130 if (optval == SO_TC_AV) { 1131 so->so_flags |= SOF_USELRO; 1132 } else { 1133 if (so->so_flags & SOF_USELRO) { 1134 /* transition to non LRO class */ 1135 so->so_flags &= ~SOF_USELRO; 1136 struct inpcb *inp = sotoinpcb(so); 1137 struct tcpcb *tp = NULL; 1138 if (inp) { 1139 tp = intotcpcb(inp); 1140 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) { 1141 tcp_lro_remove_state(inp->inp_laddr, 1142 inp->inp_faddr, 1143 inp->inp_lport, 1144 inp->inp_fport); 1145 tp->t_flagsext &= ~TF_LRO_OFFLOADED; 1146 } 1147 } 1148 } 1149 } 1150} 1151 1152