1/* 2 * Copyright (c) 2009-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/systm.h> 30#include <sys/kernel.h> 31#include <sys/types.h> 32#include <sys/filedesc.h> 33#include <sys/file_internal.h> 34#include <sys/proc.h> 35#include <sys/socket.h> 36#include <sys/socketvar.h> 37#include <sys/errno.h> 38#include <sys/protosw.h> 39#include <sys/domain.h> 40#include <sys/mbuf.h> 41#include <sys/queue.h> 42 43#include <net/if.h> 44#include <net/route.h> 45 46#include <netinet/in.h> 47#include <netinet/in_var.h> 48#include <netinet/in_pcb.h> 49#include <netinet/ip.h> 50#include <netinet/ip_var.h> 51#include <netinet/ip6.h> 52#include <netinet6/ip6_var.h> 53#include <netinet/udp.h> 54#include <netinet/udp_var.h> 55#include <netinet/tcp.h> 56#include <netinet/tcp_var.h> 57#include <netinet/tcp_cc.h> 58#include <netinet/lro_ext.h> 59 60extern char *proc_name_address(void *p); 61 62static int tfp_count = 0; 63 64static TAILQ_HEAD(, tclass_for_proc) tfp_head = 65 TAILQ_HEAD_INITIALIZER(tfp_head); 66 67struct tclass_for_proc { 68 TAILQ_ENTRY(tclass_for_proc) tfp_link; 69 int tfp_class; 70 pid_t tfp_pid; 71 char tfp_pname[MAXCOMLEN + 1]; 72}; 73 74static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t); 75static int get_pid_tclass(struct so_tcdbg *); 76static int get_pname_tclass(struct so_tcdbg *); 77static int set_pid_tclass(struct so_tcdbg *); 78static int set_pname_tclass(struct so_tcdbg *); 79static int flush_pid_tclass(struct so_tcdbg *); 80static int purge_tclass_for_proc(void); 81static int flush_tclass_for_proc(void); 82int get_tclass_for_curr_proc(int *); 83 84static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ 85static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ 86static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ 87decl_lck_mtx_data(static, tclass_lock_data); 88static lck_mtx_t *tclass_lock = &tclass_lock_data; 89 90/* 91 * If there is no foreground activity on the interface for bg_switch_time 92 * seconds, the background connections can switch to foreground TCP 93 * congestion control. 94 */ 95#define TCP_BG_SWITCH_TIME 2 96 97/* 98 * Must be called with tclass_lock held 99 */ 100static struct tclass_for_proc * 101find_tfp_by_pid(pid_t pid) 102{ 103 struct tclass_for_proc *tfp; 104 105 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 106 if (tfp->tfp_pid == pid) 107 break; 108 } 109 return (tfp); 110} 111 112/* 113 * Must be called with tclass_lock held 114 */ 115static struct tclass_for_proc * 116find_tfp_by_pname(const char *pname) 117{ 118 struct tclass_for_proc *tfp; 119 120 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 121 if (strncmp(pname, tfp->tfp_pname, 122 sizeof (tfp->tfp_pname)) == 0) 123 break; 124 } 125 return (tfp); 126} 127 128__private_extern__ int 129get_tclass_for_curr_proc(int *sotc) 130{ 131 struct tclass_for_proc *tfp = NULL; 132 proc_t p = current_proc(); /* Not ref counted */ 133 pid_t pid = proc_pid(p); 134 char *pname = proc_name_address(p); 135 136 *sotc = -1; 137 138 lck_mtx_lock(tclass_lock); 139 140 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 141 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 && 142 strncmp(pname, tfp->tfp_pname, 143 sizeof (tfp->tfp_pname)) == 0)) { 144 *sotc = tfp->tfp_class; 145 break; 146 } 147 } 148 149 lck_mtx_unlock(tclass_lock); 150 151 return ((tfp == NULL) ? 0 : 1); 152} 153 154/* 155 * Purge entries with PIDs of exited processes 156 */ 157int 158purge_tclass_for_proc(void) 159{ 160 int error = 0; 161 struct tclass_for_proc *tfp, *tvar; 162 163 lck_mtx_lock(tclass_lock); 164 165 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { 166 proc_t p; 167 168 if (tfp->tfp_pid == -1) 169 continue; 170 if ((p = proc_find(tfp->tfp_pid)) == NULL) { 171 tfp_count--; 172 TAILQ_REMOVE(&tfp_head, tfp, tfp_link); 173 174 _FREE(tfp, M_TEMP); 175 } else { 176 proc_rele(p); 177 } 178 } 179 180 lck_mtx_unlock(tclass_lock); 181 182 return (error); 183} 184 185/* 186 * Remove one entry 187 * Must be called with tclass_lock held 188 */ 189static void 190free_tclass_for_proc(struct tclass_for_proc *tfp) 191{ 192 if (tfp == NULL) 193 return; 194 tfp_count--; 195 TAILQ_REMOVE(&tfp_head, tfp, tfp_link); 196 _FREE(tfp, M_TEMP); 197} 198 199/* 200 * Remove all entries 201 */ 202int 203flush_tclass_for_proc(void) 204{ 205 int error = 0; 206 struct tclass_for_proc *tfp, *tvar; 207 208 lck_mtx_lock(tclass_lock); 209 210 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { 211 free_tclass_for_proc(tfp); 212 } 213 214 lck_mtx_unlock(tclass_lock); 215 216 return (error); 217 218} 219 220/* 221 * Must be called with tclass_lock held 222 */ 223static struct tclass_for_proc * 224alloc_tclass_for_proc(pid_t pid, const char *pname) 225{ 226 struct tclass_for_proc *tfp; 227 228 if (pid == -1 && pname == NULL) 229 return (NULL); 230 231 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO); 232 if (tfp == NULL) 233 return (NULL); 234 235 tfp->tfp_pid = pid; 236 /* 237 * Add per pid entries before per proc name so we can find 238 * a specific instance of a process before the general name base entry. 239 */ 240 if (pid != -1) { 241 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link); 242 } else { 243 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname)); 244 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link); 245 } 246 247 tfp_count++; 248 249 return (tfp); 250} 251 252/* 253 * -1 for tclass means to remove the entry 254 */ 255int 256set_pid_tclass(struct so_tcdbg *so_tcdbg) 257{ 258 int error = EINVAL; 259 proc_t p = NULL; 260 struct filedesc *fdp; 261 struct fileproc *fp; 262 struct tclass_for_proc *tfp; 263 int i; 264 pid_t pid = so_tcdbg->so_tcdbg_pid; 265 int tclass = so_tcdbg->so_tcdbg_tclass; 266 267 p = proc_find(pid); 268 if (p == NULL) { 269 printf("%s proc_find(%d) failed\n", __func__, pid); 270 goto done; 271 } 272 273 /* Need a tfp */ 274 lck_mtx_lock(tclass_lock); 275 276 tfp = find_tfp_by_pid(pid); 277 if (tfp == NULL) { 278 tfp = alloc_tclass_for_proc(pid, NULL); 279 if (tfp == NULL) { 280 lck_mtx_unlock(tclass_lock); 281 error = ENOBUFS; 282 goto done; 283 } 284 } 285 tfp->tfp_class = tclass; 286 287 lck_mtx_unlock(tclass_lock); 288 289 if (tfp != NULL) { 290 proc_fdlock(p); 291 292 fdp = p->p_fd; 293 for (i = 0; i < fdp->fd_nfiles; i++) { 294 struct socket *so; 295 296 fp = fdp->fd_ofiles[i]; 297 if (fp == NULL || 298 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || 299 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) 300 continue; 301 302 so = (struct socket *)fp->f_fglob->fg_data; 303 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) 304 continue; 305 socket_lock(so, 1); 306 if (tclass != -1) { 307 error = so_set_traffic_class(so, tclass); 308 if (error != 0) { 309 printf("%s: so_set_traffic_class" 310 "(so=0x%llx, fd=%d, tclass=%d) " 311 "failed %d\n", __func__, 312 (uint64_t)VM_KERNEL_ADDRPERM(so), 313 i, tclass, error); 314 error = 0; 315 } 316 } 317 socket_unlock(so, 1); 318 } 319 320 proc_fdunlock(p); 321 } 322 323 error = 0; 324done: 325 if (p != NULL) 326 proc_rele(p); 327 328 return (error); 329} 330 331int 332set_pname_tclass(struct so_tcdbg *so_tcdbg) 333{ 334 int error = EINVAL; 335 struct tclass_for_proc *tfp; 336 337 lck_mtx_lock(tclass_lock); 338 339 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); 340 if (tfp == NULL) { 341 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname); 342 if (tfp == NULL) { 343 lck_mtx_unlock(tclass_lock); 344 error = ENOBUFS; 345 goto done; 346 } 347 } 348 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass; 349 350 lck_mtx_unlock(tclass_lock); 351 352 error = 0; 353done: 354 355 return (error); 356} 357 358static int 359flush_pid_tclass(struct so_tcdbg *so_tcdbg) 360{ 361 pid_t pid = so_tcdbg->so_tcdbg_pid; 362 int tclass = so_tcdbg->so_tcdbg_tclass; 363 struct filedesc *fdp; 364 int error = EINVAL; 365 proc_t p; 366 int i; 367 368 p = proc_find(pid); 369 if (p == PROC_NULL) { 370 printf("%s proc_find(%d) failed\n", __func__, pid); 371 goto done; 372 } 373 374 proc_fdlock(p); 375 fdp = p->p_fd; 376 for (i = 0; i < fdp->fd_nfiles; i++) { 377 struct socket *so; 378 struct fileproc *fp; 379 380 fp = fdp->fd_ofiles[i]; 381 if (fp == NULL || 382 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || 383 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) 384 continue; 385 386 so = (struct socket *)fp->f_fglob->fg_data; 387 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass, 388 sizeof (tclass)); 389 if (error != 0) { 390 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, " 391 "tclass=%d) failed %d\n", __func__, 392 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass, 393 error); 394 error = 0; 395 } 396 } 397 proc_fdunlock(p); 398 399 error = 0; 400done: 401 if (p != PROC_NULL) 402 proc_rele(p); 403 404 return (error); 405} 406 407int 408get_pid_tclass(struct so_tcdbg *so_tcdbg) 409{ 410 int error = EINVAL; 411 proc_t p = NULL; 412 struct tclass_for_proc *tfp; 413 pid_t pid = so_tcdbg->so_tcdbg_pid; 414 415 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ 416 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ 417 418 p = proc_find(pid); 419 if (p == NULL) { 420 printf("%s proc_find(%d) failed\n", __func__, pid); 421 goto done; 422 } 423 424 /* Need a tfp */ 425 lck_mtx_lock(tclass_lock); 426 427 tfp = find_tfp_by_pid(pid); 428 if (tfp != NULL) { 429 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; 430 error = 0; 431 } 432 lck_mtx_unlock(tclass_lock); 433done: 434 if (p != NULL) 435 proc_rele(p); 436 437 return (error); 438} 439 440int 441get_pname_tclass(struct so_tcdbg *so_tcdbg) 442{ 443 int error = EINVAL; 444 struct tclass_for_proc *tfp; 445 446 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ 447 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ 448 449 /* Need a tfp */ 450 lck_mtx_lock(tclass_lock); 451 452 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); 453 if (tfp != NULL) { 454 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; 455 error = 0; 456 } 457 lck_mtx_unlock(tclass_lock); 458 459 return (error); 460} 461 462static int 463delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg) 464{ 465 int error = EINVAL; 466 pid_t pid = so_tcdbg->so_tcdbg_pid; 467 struct tclass_for_proc *tfp = NULL; 468 469 lck_mtx_lock(tclass_lock); 470 471 if (pid != -1) 472 tfp = find_tfp_by_pid(pid); 473 else 474 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); 475 476 if (tfp != NULL) { 477 free_tclass_for_proc(tfp); 478 error = 0; 479 } 480 481 lck_mtx_unlock(tclass_lock); 482 483 return (error); 484} 485 486/* 487 * Setting options requires privileges 488 */ 489__private_extern__ int 490so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg) 491{ 492 int error = 0; 493 494 if ((so->so_state & SS_PRIV) == 0) 495 return (EPERM); 496 497 socket_unlock(so, 0); 498 499 switch (so_tcdbg->so_tcdbg_cmd) { 500 case SO_TCDBG_PID: 501 error = set_pid_tclass(so_tcdbg); 502 break; 503 504 case SO_TCDBG_PNAME: 505 error = set_pname_tclass(so_tcdbg); 506 break; 507 508 case SO_TCDBG_PURGE: 509 error = purge_tclass_for_proc(); 510 break; 511 512 case SO_TCDBG_FLUSH: 513 error = flush_tclass_for_proc(); 514 break; 515 516 case SO_TCDBG_DELETE: 517 error = delete_tclass_for_pid_pname(so_tcdbg); 518 break; 519 520 case SO_TCDBG_TCFLUSH_PID: 521 error = flush_pid_tclass(so_tcdbg); 522 break; 523 524 default: 525 error = EINVAL; 526 break; 527 } 528 529 socket_lock(so, 0); 530 531 return (error); 532} 533 534/* 535 * Not required to be privileged to get 536 */ 537__private_extern__ int 538sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) 539{ 540 int error = 0; 541 struct so_tcdbg so_tcdbg; 542 void *buf = NULL; 543 size_t len = sopt->sopt_valsize; 544 545 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), 546 sizeof (struct so_tcdbg)); 547 if (error != 0) 548 return (error); 549 550 sopt->sopt_valsize = len; 551 552 socket_unlock(so, 0); 553 554 switch (so_tcdbg.so_tcdbg_cmd) { 555 case SO_TCDBG_PID: 556 error = get_pid_tclass(&so_tcdbg); 557 break; 558 559 case SO_TCDBG_PNAME: 560 error = get_pname_tclass(&so_tcdbg); 561 break; 562 563 case SO_TCDBG_COUNT: 564 lck_mtx_lock(tclass_lock); 565 so_tcdbg.so_tcdbg_count = tfp_count; 566 lck_mtx_unlock(tclass_lock); 567 break; 568 569 case SO_TCDBG_LIST: { 570 struct tclass_for_proc *tfp; 571 int n, alloc_count; 572 struct so_tcdbg *ptr; 573 574 lck_mtx_lock(tclass_lock); 575 if ((alloc_count = tfp_count) == 0) { 576 lck_mtx_unlock(tclass_lock); 577 error = EINVAL; 578 break; 579 } 580 len = alloc_count * sizeof (struct so_tcdbg); 581 lck_mtx_unlock(tclass_lock); 582 583 buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO); 584 if (buf == NULL) { 585 error = ENOBUFS; 586 break; 587 } 588 589 lck_mtx_lock(tclass_lock); 590 n = 0; 591 ptr = (struct so_tcdbg *)buf; 592 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { 593 if (++n > alloc_count) 594 break; 595 if (tfp->tfp_pid != -1) { 596 ptr->so_tcdbg_cmd = SO_TCDBG_PID; 597 ptr->so_tcdbg_pid = tfp->tfp_pid; 598 } else { 599 ptr->so_tcdbg_cmd = SO_TCDBG_PNAME; 600 ptr->so_tcdbg_pid = -1; 601 strlcpy(ptr->so_tcdbg_pname, 602 tfp->tfp_pname, 603 sizeof (ptr->so_tcdbg_pname)); 604 } 605 ptr->so_tcdbg_tclass = tfp->tfp_class; 606 ptr++; 607 } 608 609 lck_mtx_unlock(tclass_lock); 610 } 611 break; 612 613 default: 614 error = EINVAL; 615 break; 616 } 617 618 socket_lock(so, 0); 619 620 if (error == 0) { 621 if (buf == NULL) { 622 error = sooptcopyout(sopt, &so_tcdbg, 623 sizeof (struct so_tcdbg)); 624 } else { 625 error = sooptcopyout(sopt, buf, len); 626 _FREE(buf, M_TEMP); 627 } 628 } 629 return (error); 630} 631 632 633__private_extern__ int 634so_set_traffic_class(struct socket *so, int optval) 635{ 636 int error = 0; 637 638 if (optval < SO_TC_BE || optval > SO_TC_CTL) { 639 error = EINVAL; 640 } else { 641 switch (optval) { 642 case _SO_TC_BK: 643 optval = SO_TC_BK; 644 break; 645 case _SO_TC_VI: 646 optval = SO_TC_VI; 647 break; 648 case _SO_TC_VO: 649 optval = SO_TC_VO; 650 break; 651 default: 652 if (!SO_VALID_TC(optval)) 653 error = EINVAL; 654 break; 655 } 656 657 if (error == 0) { 658 int oldval = so->so_traffic_class; 659 660 VERIFY(SO_VALID_TC(optval)); 661 so->so_traffic_class = optval; 662 663 if ((SOCK_DOM(so) == PF_INET || 664 SOCK_DOM(so) == PF_INET6) && 665 SOCK_TYPE(so) == SOCK_STREAM) 666 set_tcp_stream_priority(so); 667 668 if ((SOCK_DOM(so) == PF_INET || 669 SOCK_DOM(so) == PF_INET6) && 670 optval != oldval && (optval == SO_TC_BK_SYS || 671 oldval == SO_TC_BK_SYS)) { 672 /* 673 * If the app switches from BK_SYS to something 674 * else, resume the socket if it was suspended. 675 */ 676 if (oldval == SO_TC_BK_SYS) 677 inp_reset_fc_state(so->so_pcb); 678 679 SOTHROTTLELOG(("throttle[%d]: so 0x%llx " 680 "[%d,%d] opportunistic %s\n", so->last_pid, 681 (uint64_t)VM_KERNEL_ADDRPERM(so), 682 SOCK_DOM(so), SOCK_TYPE(so), 683 (optval == SO_TC_BK_SYS) ? "ON" : "OFF")); 684 } 685 } 686 } 687 return (error); 688} 689 690__private_extern__ void 691so_set_default_traffic_class(struct socket *so) 692{ 693 int sotc = -1; 694 695 if (tfp_count > 0 && 696 (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) { 697 get_tclass_for_curr_proc(&sotc); 698 } 699 700 so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE; 701} 702 703__private_extern__ int 704so_set_opportunistic(struct socket *so, int optval) 705{ 706 return (so_set_traffic_class(so, (optval == 0) ? 707 SO_TC_BE : SO_TC_BK_SYS)); 708} 709 710__private_extern__ int 711so_get_opportunistic(struct socket *so) 712{ 713 return (so->so_traffic_class == SO_TC_BK_SYS); 714} 715 716__private_extern__ mbuf_svc_class_t 717mbuf_service_class_from_control(struct mbuf *control) 718{ 719 struct cmsghdr *cm; 720 mbuf_svc_class_t msc = MBUF_SC_UNSPEC; 721 722 for (cm = M_FIRST_CMSGHDR(control); cm != NULL; 723 cm = M_NXT_CMSGHDR(control, cm)) { 724 int tc; 725 726 if (cm->cmsg_len < sizeof (struct cmsghdr)) 727 break; 728 729 if (cm->cmsg_level != SOL_SOCKET || 730 cm->cmsg_type != SO_TRAFFIC_CLASS) 731 continue; 732 if (cm->cmsg_len != CMSG_LEN(sizeof (int))) 733 continue; 734 735 tc = *(int *)(void *)CMSG_DATA(cm); 736 msc = so_tc2msc(tc); 737 if (MBUF_VALID_SC(msc)) 738 break; 739 } 740 741 return (msc); 742} 743 744__private_extern__ int 745dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc) 746{ 747 int dscp_code; 748 749 switch (mtc) { 750 default: 751 case MBUF_TC_BE: 752 dscp_code = 0; 753 break; 754 case MBUF_TC_BK: 755 dscp_code = 0x08; 756 break; 757 case MBUF_TC_VI: 758 dscp_code = 0x20; 759 break; 760 case MBUF_TC_VO: 761 dscp_code = 0x30; 762 break; 763 } 764 765 return (dscp_code); 766} 767 768__private_extern__ void 769so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off) 770{ 771 uint32_t sotc = m_get_traffic_class(m); 772 773 if (sotc >= SO_TC_STATS_MAX) 774 sotc = SO_TC_BE; 775 776 so->so_tc_stats[sotc].rxpackets += 1; 777 so->so_tc_stats[sotc].rxbytes += 778 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; 779} 780 781__private_extern__ void 782so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, uint32_t tc) 783{ 784 if (tc >= SO_TC_STATS_MAX) 785 tc = SO_TC_BE; 786 787 so->so_tc_stats[tc].rxpackets += pkts; 788 so->so_tc_stats[tc].rxbytes +=bytes; 789} 790__private_extern__ void 791set_tcp_stream_priority(struct socket *so) 792{ 793 struct inpcb *inp = sotoinpcb(so); 794 struct tcpcb *tp = intotcpcb(inp); 795 struct ifnet *outifp; 796 u_char old_cc = tp->tcp_cc_index; 797 int recvbg = IS_TCP_RECV_BG(so); 798 bool is_local, fg_active = false; 799 u_int32_t uptime; 800 801 VERIFY((SOCK_CHECK_DOM(so, PF_INET) 802 || SOCK_CHECK_DOM(so, PF_INET6)) 803 && SOCK_CHECK_TYPE(so, SOCK_STREAM) 804 && SOCK_CHECK_PROTO(so, IPPROTO_TCP)); 805 806 /* Return if the socket is in a terminal state */ 807 if (inp->inp_state == INPCB_STATE_DEAD) 808 return; 809 810 outifp = inp->inp_last_outifp; 811 uptime = net_uptime(); 812 813 /* 814 * If the socket was marked as a background socket or if the 815 * traffic class is set to background with traffic class socket 816 * option then make both send and recv side of the stream to be 817 * background. The variable sotcdb which can be set with sysctl 818 * is used to disable these settings for testing. 819 */ 820 if (soissrcbackground(so)) { 821 if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK)) 822 is_local = true; 823 else 824 is_local = false; 825 826 /* Check if there has been recent foreground activity */ 827 if ((outifp != NULL && 828 outifp->if_fg_sendts > 0 && 829 (int)(uptime - outifp->if_fg_sendts) <= 830 TCP_BG_SWITCH_TIME) || 831 net_io_policy_throttled) 832 fg_active = true; 833 834 /* 835 * If the interface that the connection is using is 836 * loopback, do not use background congestion 837 * control algorithm. 838 * 839 * If there has been recent foreground activity or if 840 * there was an indication that a foreground application 841 * is going to use networking (net_io_policy_throttled), 842 * switch the backgroung streams to use background 843 * congestion control algorithm. Otherwise, even background 844 * flows can move into foreground. 845 */ 846 if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || 847 is_local || !fg_active) { 848 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) 849 tcp_set_foreground_cc(so); 850 } else { 851 if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX) 852 tcp_set_background_cc(so); 853 } 854 855 /* Set receive side background flags */ 856 if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || 857 is_local || !fg_active) 858 tcp_clear_recv_bg(so); 859 else 860 tcp_set_recv_bg(so); 861 } else { 862 tcp_clear_recv_bg(so); 863 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) 864 tcp_set_foreground_cc(so); 865 } 866 867 if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) { 868 SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; " 869 "%s recv\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so), 870 SOCK_DOM(so), SOCK_TYPE(so), 871 (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ? 872 "background" : "foreground", 873 IS_TCP_RECV_BG(so) ? "background" : "foreground")); 874 } 875} 876 877/* 878 * Set traffic class to an IPv4 or IPv6 packet 879 * - mark the mbuf 880 * - set the DSCP code following the WMM mapping 881 */ 882__private_extern__ void 883set_packet_service_class(struct mbuf *m, struct socket *so, 884 mbuf_svc_class_t in_msc, u_int32_t flags) 885{ 886 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */ 887 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ 888 struct ip *ip = mtod(m, struct ip *); 889#if INET6 890 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); 891#endif /* INET6 */ 892 int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0; 893 894 if (!(m->m_flags & M_PKTHDR)) 895 return; 896 897 /* 898 * Here is the precedence: 899 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all 900 * 2) Traffic class passed via ancillary data to sendmsdg(2) 901 * 3) Traffic class socket option last 902 */ 903 if (in_msc != MBUF_SC_UNSPEC) { 904 if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL) 905 msc = in_msc; 906 } else { 907 VERIFY(SO_VALID_TC(so->so_traffic_class)); 908 msc = so_tc2msc(so->so_traffic_class); 909 /* Assert because tc must have been valid */ 910 VERIFY(MBUF_VALID_SC(msc)); 911 } 912 913 /* 914 * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority. 915 */ 916 if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc)) 917 msc = MBUF_SC_BK; 918 919 if (soissrcbackground(so)) 920 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND; 921 /* 922 * Set the traffic class in the mbuf packet header svc field 923 */ 924 if (sotcdb & SOTCDB_NO_MTC) 925 goto no_mbtc; 926 927 /* Elevate service class if the packet is a pure TCP ACK. 928 * We can do this only when the flow is not a background 929 * flow and the outgoing interface supports 930 * transmit-start model. 931 */ 932 if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK)) 933 msc = MBUF_SC_CTL; 934 935 (void) m_set_service_class(m, msc); 936 937 /* 938 * Set the privileged traffic auxiliary flag if applicable, 939 * or clear it. 940 */ 941 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) && 942 msc != MBUF_SC_UNSPEC) 943 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED; 944 else 945 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED; 946 947no_mbtc: 948 /* 949 * Quick exit when best effort 950 */ 951 if (msc == MBUF_SC_BE) 952 goto no_dscp; 953 954 /* 955 * The default behavior is for the networking stack to not set the 956 * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is 957 * cleared, set the DSCP code in IPv4 or IPv6 header only for local 958 * traffic, if it is not already set. <rdar://problem/11277343> 959 */ 960 if (sotcdb & SOTCDB_NO_DSCP) 961 goto no_dscp; 962 963 /* 964 * Test if a IP TOS or IPV6 TCLASS has already been set 965 * on the socket or the raw packet. 966 */ 967 if (!(sotcdb & SOTCDB_NO_DSCPTST)) { 968#if INET6 969 if (isipv6) { 970 if ((so->so_type == SOCK_RAW && 971 (ip6->ip6_flow & htonl(0xff << 20)) != 0) || 972 (inp->in6p_outputopts && 973 inp->in6p_outputopts->ip6po_tclass != -1)) 974 goto no_dscp; 975 } else 976#endif /* INET6 */ 977 if ((so->so_type == SOCK_RAW && 978 (inp->inp_flags & INP_HDRINCL)) || 979 inp->inp_ip_tos != 0) 980 goto no_dscp; 981 } 982 983 /* 984 * Test if destination is local 985 */ 986 if (!(sotcdb & SOTCDB_NO_LCLTST)) { 987 int islocal = 0; 988 struct rtentry *rt = inp->inp_route.ro_rt; 989 990 if (so->so_type == SOCK_STREAM) { 991 if (intotcpcb(inp)->t_flags & TF_LOCAL) 992 islocal = 1; 993 } else if (rt != NULL && 994 (rt->rt_gateway->sa_family == AF_LINK || 995 (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) { 996 if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT)) 997 islocal = 1; 998 } else 999#if INET6 1000 if (isipv6 && in6addr_local(&ip6->ip6_dst)) { 1001 islocal = 1; 1002 } else 1003#endif /* INET6 */ 1004 if (inaddr_local(ip->ip_dst)) { 1005 islocal = 1; 1006 } 1007 if (islocal == 0) 1008 goto no_dscp; 1009 } 1010 1011#if INET6 1012 if (isipv6) 1013 ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass( 1014 m_get_traffic_class(m)) << 20); 1015 else 1016#endif /* INET6 */ 1017 ip->ip_tos |= dscp_code_from_mbuf_tclass( 1018 m_get_traffic_class(m)) << 2; 1019 1020no_dscp: 1021 /* 1022 * For TCP with background traffic class switch CC algo based on sysctl 1023 */ 1024 if (so->so_type == SOCK_STREAM) 1025 set_tcp_stream_priority(so); 1026 1027 so_tc_update_stats(m, so, msc); 1028} 1029 1030__private_extern__ void 1031so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc) 1032{ 1033 mbuf_traffic_class_t mtc; 1034 1035 /* 1036 * Assume socket and mbuf traffic class values are the same 1037 * Also assume the socket lock is held. Note that the stats 1038 * at the socket layer are reduced down to the legacy traffic 1039 * classes; we could/should potentially expand so_tc_stats[]. 1040 */ 1041 mtc = MBUF_SC2TC(msc); 1042 VERIFY(mtc < SO_TC_STATS_MAX); 1043 so->so_tc_stats[mtc].txpackets += 1; 1044 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len; 1045} 1046 1047__private_extern__ void 1048socket_tclass_init(void) 1049{ 1050 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX); 1051 1052 tclass_lck_grp_attr = lck_grp_attr_alloc_init(); 1053 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); 1054 tclass_lck_attr = lck_attr_alloc_init(); 1055 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr); 1056} 1057 1058__private_extern__ mbuf_svc_class_t 1059so_tc2msc(int tc) 1060{ 1061 mbuf_svc_class_t msc; 1062 1063 switch (tc) { 1064 case SO_TC_BK_SYS: 1065 msc = MBUF_SC_BK_SYS; 1066 break; 1067 case SO_TC_BK: 1068 case _SO_TC_BK: 1069 msc = MBUF_SC_BK; 1070 break; 1071 case SO_TC_BE: 1072 msc = MBUF_SC_BE; 1073 break; 1074 case SO_TC_RD: 1075 msc = MBUF_SC_RD; 1076 break; 1077 case SO_TC_OAM: 1078 msc = MBUF_SC_OAM; 1079 break; 1080 case SO_TC_AV: 1081 msc = MBUF_SC_AV; 1082 break; 1083 case SO_TC_RV: 1084 msc = MBUF_SC_RV; 1085 break; 1086 case SO_TC_VI: 1087 case _SO_TC_VI: 1088 msc = MBUF_SC_VI; 1089 break; 1090 case SO_TC_VO: 1091 case _SO_TC_VO: 1092 msc = MBUF_SC_VO; 1093 break; 1094 case SO_TC_CTL: 1095 msc = MBUF_SC_CTL; 1096 break; 1097 case SO_TC_ALL: 1098 default: 1099 msc = MBUF_SC_UNSPEC; 1100 break; 1101 } 1102 1103 return (msc); 1104} 1105 1106__private_extern__ int 1107so_svc2tc(mbuf_svc_class_t svc) 1108{ 1109 switch (svc) { 1110 case MBUF_SC_UNSPEC: 1111 return SO_TC_BE; 1112 case MBUF_SC_BK_SYS: 1113 return SO_TC_BK_SYS; 1114 case MBUF_SC_BK: 1115 return SO_TC_BK; 1116 case MBUF_SC_BE: 1117 return SO_TC_BE; 1118 case MBUF_SC_RD: 1119 return SO_TC_RD; 1120 case MBUF_SC_OAM: 1121 return SO_TC_OAM; 1122 case MBUF_SC_AV: 1123 return SO_TC_AV; 1124 case MBUF_SC_RV: 1125 return SO_TC_RV; 1126 case MBUF_SC_VI: 1127 return SO_TC_VI; 1128 case MBUF_SC_VO: 1129 return SO_TC_VO; 1130 case MBUF_SC_CTL: 1131 return SO_TC_CTL; 1132 default: 1133 return SO_TC_BE; 1134 } 1135} 1136 1137/* 1138 * LRO is turned on for AV streaming class. 1139 */ 1140void 1141so_set_lro(struct socket *so, int optval) 1142{ 1143 if (optval == SO_TC_AV) { 1144 so->so_flags |= SOF_USELRO; 1145 } else { 1146 if (so->so_flags & SOF_USELRO) { 1147 /* transition to non LRO class */ 1148 so->so_flags &= ~SOF_USELRO; 1149 struct inpcb *inp = sotoinpcb(so); 1150 struct tcpcb *tp = NULL; 1151 if (inp) { 1152 tp = intotcpcb(inp); 1153 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) { 1154 tcp_lro_remove_state(inp->inp_laddr, 1155 inp->inp_faddr, 1156 inp->inp_lport, 1157 inp->inp_fport); 1158 tp->t_flagsext &= ~TF_LRO_OFFLOADED; 1159 } 1160 } 1161 } 1162 } 1163} 1164 1165