1/*
2 * Copyright (c) 2009-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/systm.h>
30#include <sys/kernel.h>
31#include <sys/types.h>
32#include <sys/filedesc.h>
33#include <sys/file_internal.h>
34#include <sys/proc.h>
35#include <sys/socket.h>
36#include <sys/socketvar.h>
37#include <sys/errno.h>
38#include <sys/protosw.h>
39#include <sys/domain.h>
40#include <sys/mbuf.h>
41#include <sys/queue.h>
42
43#include <net/if.h>
44#include <net/route.h>
45
46#include <netinet/in.h>
47#include <netinet/in_var.h>
48#include <netinet/in_pcb.h>
49#include <netinet/ip.h>
50#include <netinet/ip_var.h>
51#include <netinet/ip6.h>
52#include <netinet6/ip6_var.h>
53#include <netinet/udp.h>
54#include <netinet/udp_var.h>
55#include <netinet/tcp.h>
56#include <netinet/tcp_var.h>
57#include <netinet/tcp_cc.h>
58#include <netinet/lro_ext.h>
59
/*
 * Defined outside this file; used below to obtain the name of the
 * current process for matching against tfp_pname.
 */
extern char *proc_name_address(void *p);

/* Number of entries currently linked on tfp_head */
static int tfp_count = 0;

/*
 * List of per-process traffic class entries.  Entries keyed by PID are
 * inserted at the head and entries keyed by process name at the tail
 * (see alloc_tclass_for_proc()).
 */
static TAILQ_HEAD(, tclass_for_proc) tfp_head =
    TAILQ_HEAD_INITIALIZER(tfp_head);
66
/*
 * One traffic class entry, keyed either by PID or by process name.
 */
struct tclass_for_proc {
	/* Linkage on tfp_head */
	TAILQ_ENTRY(tclass_for_proc)	tfp_link;
	/* Traffic class handed out to matching processes */
	int	tfp_class;
	/* PID this entry applies to, or -1 for a name-based entry */
	pid_t	tfp_pid;
	/* Process name; only filled in for name-based entries */
	char	tfp_pname[MAXCOMLEN + 1];
};
73
/*
 * Forward declarations for functions defined later in this file.
 */
static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t);
static int get_pid_tclass(struct so_tcdbg *);
static int get_pname_tclass(struct so_tcdbg *);
static int set_pid_tclass(struct so_tcdbg *);
static int set_pname_tclass(struct so_tcdbg *);
static int flush_pid_tclass(struct so_tcdbg *);
static int purge_tclass_for_proc(void);
static int flush_tclass_for_proc(void);
/* Not static: the definition below is marked __private_extern__ */
int get_tclass_for_curr_proc(int *);
83
/*
 * Mutex taken around accesses to tfp_head in this file, together with the
 * lock group/attribute objects used to set it up in socket_tclass_init().
 */
static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
static lck_grp_t *tclass_lck_grp = NULL;	/* mutex group definition */
static lck_attr_t *tclass_lck_attr = NULL;	/* mutex attributes */
decl_lck_mtx_data(static, tclass_lock_data);
static lck_mtx_t *tclass_lock = &tclass_lock_data;
89
/*
 * If there has been no foreground activity on the interface for more than
 * TCP_BG_SWITCH_TIME seconds, background connections can switch to
 * foreground TCP congestion control (see set_tcp_stream_priority()).
 */
#define TCP_BG_SWITCH_TIME 2
96
97/*
98 * Must be called with tclass_lock held
99 */
100static struct tclass_for_proc *
101find_tfp_by_pid(pid_t pid)
102{
103	struct tclass_for_proc *tfp;
104
105	TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
106		if (tfp->tfp_pid == pid)
107			break;
108	}
109	return (tfp);
110}
111
112/*
113 * Must be called with tclass_lock held
114 */
115static struct tclass_for_proc *
116find_tfp_by_pname(const char *pname)
117{
118	struct tclass_for_proc *tfp;
119
120	TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
121		if (strncmp(pname, tfp->tfp_pname,
122		    sizeof (tfp->tfp_pname)) == 0)
123			break;
124	}
125	return (tfp);
126}
127
128__private_extern__ int
129get_tclass_for_curr_proc(int *sotc)
130{
131	struct tclass_for_proc *tfp = NULL;
132	proc_t p = current_proc();	/* Not ref counted */
133	pid_t pid = proc_pid(p);
134	char *pname = proc_name_address(p);
135
136	*sotc = -1;
137
138	lck_mtx_lock(tclass_lock);
139
140	TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
141		if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
142		    strncmp(pname, tfp->tfp_pname,
143		    sizeof (tfp->tfp_pname)) == 0)) {
144			*sotc = tfp->tfp_class;
145			break;
146		}
147	}
148
149	lck_mtx_unlock(tclass_lock);
150
151	return ((tfp == NULL) ? 0 : 1);
152}
153
154/*
155 * Purge entries with PIDs of exited processes
156 */
157int
158purge_tclass_for_proc(void)
159{
160	int error = 0;
161	struct tclass_for_proc *tfp, *tvar;
162
163	lck_mtx_lock(tclass_lock);
164
165	TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
166		proc_t p;
167
168		if (tfp->tfp_pid == -1)
169			continue;
170		if ((p = proc_find(tfp->tfp_pid)) == NULL) {
171			tfp_count--;
172			TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
173
174			_FREE(tfp, M_TEMP);
175		} else {
176			proc_rele(p);
177		}
178	}
179
180	lck_mtx_unlock(tclass_lock);
181
182	return (error);
183}
184
185/*
186 * Remove one entry
187 * Must be called with tclass_lock held
188 */
189static void
190free_tclass_for_proc(struct tclass_for_proc *tfp)
191{
192	if (tfp == NULL)
193		return;
194	tfp_count--;
195	TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
196	_FREE(tfp, M_TEMP);
197}
198
199/*
200 * Remove all entries
201 */
202int
203flush_tclass_for_proc(void)
204{
205	int error = 0;
206	struct tclass_for_proc *tfp, *tvar;
207
208	lck_mtx_lock(tclass_lock);
209
210	TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
211		free_tclass_for_proc(tfp);
212	}
213
214	lck_mtx_unlock(tclass_lock);
215
216	return (error);
217
218}
219
220/*
221 * Must be called with tclass_lock held
222 */
223static struct tclass_for_proc *
224alloc_tclass_for_proc(pid_t pid, const char *pname)
225{
226	struct tclass_for_proc *tfp;
227
228	if (pid == -1 && pname == NULL)
229		return (NULL);
230
231	tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
232	if (tfp == NULL)
233		return (NULL);
234
235	tfp->tfp_pid = pid;
236	/*
237	 * Add per pid entries before per proc name so we can find
238	 * a specific instance of a process before the general name base entry.
239	 */
240	if (pid != -1) {
241		TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
242	} else {
243		strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
244		TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
245	}
246
247	tfp_count++;
248
249	return (tfp);
250}
251
252/*
253 * -1 for tclass means to remove the entry
254 */
255int
256set_pid_tclass(struct so_tcdbg *so_tcdbg)
257{
258	int error = EINVAL;
259	proc_t p = NULL;
260	struct filedesc *fdp;
261	struct fileproc *fp;
262	struct tclass_for_proc *tfp;
263	int i;
264	pid_t pid = so_tcdbg->so_tcdbg_pid;
265	int tclass = so_tcdbg->so_tcdbg_tclass;
266
267	p = proc_find(pid);
268	if (p == NULL) {
269		printf("%s proc_find(%d) failed\n", __func__, pid);
270		goto done;
271	}
272
273	/* Need a tfp */
274	lck_mtx_lock(tclass_lock);
275
276	tfp = find_tfp_by_pid(pid);
277	if (tfp == NULL) {
278		tfp = alloc_tclass_for_proc(pid, NULL);
279		if (tfp == NULL) {
280			lck_mtx_unlock(tclass_lock);
281			error = ENOBUFS;
282			goto done;
283		}
284	}
285	tfp->tfp_class = tclass;
286
287	lck_mtx_unlock(tclass_lock);
288
289	if (tfp != NULL) {
290		proc_fdlock(p);
291
292		fdp = p->p_fd;
293		for (i = 0; i < fdp->fd_nfiles; i++) {
294			struct socket *so;
295
296			fp = fdp->fd_ofiles[i];
297			if (fp == NULL ||
298			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
299			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
300				continue;
301
302			so = (struct socket *)fp->f_fglob->fg_data;
303			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
304				continue;
305			socket_lock(so, 1);
306			if (tclass != -1) {
307				error = so_set_traffic_class(so, tclass);
308				if (error != 0) {
309					printf("%s: so_set_traffic_class"
310					    "(so=0x%llx, fd=%d, tclass=%d) "
311					    "failed %d\n", __func__,
312					    (uint64_t)VM_KERNEL_ADDRPERM(so),
313					    i, tclass, error);
314					error = 0;
315				}
316			}
317			socket_unlock(so, 1);
318		}
319
320		proc_fdunlock(p);
321	}
322
323	error = 0;
324done:
325	if (p != NULL)
326		proc_rele(p);
327
328	return (error);
329}
330
331int
332set_pname_tclass(struct so_tcdbg *so_tcdbg)
333{
334	int error = EINVAL;
335	struct tclass_for_proc *tfp;
336
337	lck_mtx_lock(tclass_lock);
338
339	tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
340	if (tfp == NULL) {
341		tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
342		if (tfp == NULL) {
343			lck_mtx_unlock(tclass_lock);
344			error = ENOBUFS;
345			goto done;
346		}
347	}
348	tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
349
350	lck_mtx_unlock(tclass_lock);
351
352	error = 0;
353done:
354
355	return (error);
356}
357
358static int
359flush_pid_tclass(struct so_tcdbg *so_tcdbg)
360{
361	pid_t pid = so_tcdbg->so_tcdbg_pid;
362	int tclass = so_tcdbg->so_tcdbg_tclass;
363	struct filedesc *fdp;
364	int error = EINVAL;
365	proc_t p;
366	int i;
367
368	p = proc_find(pid);
369	if (p == PROC_NULL) {
370		printf("%s proc_find(%d) failed\n", __func__, pid);
371		goto done;
372	}
373
374	proc_fdlock(p);
375	fdp = p->p_fd;
376	for (i = 0; i < fdp->fd_nfiles; i++) {
377		struct socket *so;
378		struct fileproc *fp;
379
380		fp = fdp->fd_ofiles[i];
381		if (fp == NULL ||
382		    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
383		    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
384			continue;
385
386		so = (struct socket *)fp->f_fglob->fg_data;
387		error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
388		    sizeof (tclass));
389		if (error != 0) {
390			printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
391			    "tclass=%d) failed %d\n", __func__,
392			    (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass,
393			    error);
394			error = 0;
395		}
396	}
397	proc_fdunlock(p);
398
399	error = 0;
400done:
401	if (p != PROC_NULL)
402		proc_rele(p);
403
404	return (error);
405}
406
407int
408get_pid_tclass(struct so_tcdbg *so_tcdbg)
409{
410	int error = EINVAL;
411	proc_t p = NULL;
412	struct tclass_for_proc *tfp;
413	pid_t pid = so_tcdbg->so_tcdbg_pid;
414
415	so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
416	so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
417
418	p = proc_find(pid);
419	if (p == NULL) {
420		printf("%s proc_find(%d) failed\n", __func__, pid);
421		goto done;
422	}
423
424	/* Need a tfp */
425	lck_mtx_lock(tclass_lock);
426
427	tfp = find_tfp_by_pid(pid);
428	if (tfp != NULL) {
429		so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
430		error = 0;
431	}
432	lck_mtx_unlock(tclass_lock);
433done:
434	if (p != NULL)
435		proc_rele(p);
436
437	return (error);
438}
439
440int
441get_pname_tclass(struct so_tcdbg *so_tcdbg)
442{
443	int error = EINVAL;
444	struct tclass_for_proc *tfp;
445
446	so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
447	so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
448
449	/* Need a tfp */
450	lck_mtx_lock(tclass_lock);
451
452	tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
453	if (tfp != NULL) {
454		so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
455		error = 0;
456	}
457	lck_mtx_unlock(tclass_lock);
458
459	return (error);
460}
461
462static int
463delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
464{
465	int error = EINVAL;
466	pid_t pid = so_tcdbg->so_tcdbg_pid;
467	struct tclass_for_proc *tfp = NULL;
468
469	lck_mtx_lock(tclass_lock);
470
471	if (pid != -1)
472		tfp = find_tfp_by_pid(pid);
473	else
474		tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
475
476	if (tfp != NULL) {
477		free_tclass_for_proc(tfp);
478		error = 0;
479	}
480
481	lck_mtx_unlock(tclass_lock);
482
483	return (error);
484}
485
486/*
487 * Setting options requires privileges
488 */
489__private_extern__ int
490so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
491{
492	int error = 0;
493
494	if ((so->so_state & SS_PRIV) == 0)
495		return (EPERM);
496
497	socket_unlock(so, 0);
498
499	switch (so_tcdbg->so_tcdbg_cmd) {
500		case SO_TCDBG_PID:
501			error = set_pid_tclass(so_tcdbg);
502			break;
503
504		case SO_TCDBG_PNAME:
505			error = set_pname_tclass(so_tcdbg);
506			break;
507
508		case SO_TCDBG_PURGE:
509			error = purge_tclass_for_proc();
510			break;
511
512		case SO_TCDBG_FLUSH:
513			error = flush_tclass_for_proc();
514			break;
515
516		case SO_TCDBG_DELETE:
517			error = delete_tclass_for_pid_pname(so_tcdbg);
518			break;
519
520		case SO_TCDBG_TCFLUSH_PID:
521			error = flush_pid_tclass(so_tcdbg);
522			break;
523
524		default:
525			error = EINVAL;
526			break;
527	}
528
529	socket_lock(so, 0);
530
531	return (error);
532}
533
534/*
535 * Not required to be privileged to get
536 */
537__private_extern__ int
538sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
539{
540	int error = 0;
541	struct so_tcdbg so_tcdbg;
542	void *buf = NULL;
543	size_t len = sopt->sopt_valsize;
544
545	error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
546	    sizeof (struct so_tcdbg));
547	if (error != 0)
548		return (error);
549
550	sopt->sopt_valsize = len;
551
552	socket_unlock(so, 0);
553
554	switch (so_tcdbg.so_tcdbg_cmd) {
555		case SO_TCDBG_PID:
556			error = get_pid_tclass(&so_tcdbg);
557			break;
558
559		case SO_TCDBG_PNAME:
560			error = get_pname_tclass(&so_tcdbg);
561			break;
562
563		case SO_TCDBG_COUNT:
564			lck_mtx_lock(tclass_lock);
565			so_tcdbg.so_tcdbg_count = tfp_count;
566			lck_mtx_unlock(tclass_lock);
567			break;
568
569		case SO_TCDBG_LIST: {
570			struct tclass_for_proc *tfp;
571			int n, alloc_count;
572			struct so_tcdbg *ptr;
573
574			lck_mtx_lock(tclass_lock);
575			if ((alloc_count = tfp_count) == 0) {
576				lck_mtx_unlock(tclass_lock);
577				error = EINVAL;
578				break;
579			}
580			len = alloc_count * sizeof (struct so_tcdbg);
581			lck_mtx_unlock(tclass_lock);
582
583			buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
584			if (buf == NULL) {
585				error = ENOBUFS;
586				break;
587			}
588
589			lck_mtx_lock(tclass_lock);
590			n = 0;
591			ptr = (struct so_tcdbg *)buf;
592			TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
593				if (++n > alloc_count)
594					break;
595				if (tfp->tfp_pid != -1) {
596					ptr->so_tcdbg_cmd = SO_TCDBG_PID;
597					ptr->so_tcdbg_pid = tfp->tfp_pid;
598				} else {
599					ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
600					ptr->so_tcdbg_pid = -1;
601					strlcpy(ptr->so_tcdbg_pname,
602					    tfp->tfp_pname,
603					    sizeof (ptr->so_tcdbg_pname));
604				}
605				ptr->so_tcdbg_tclass = tfp->tfp_class;
606				ptr++;
607			}
608
609			lck_mtx_unlock(tclass_lock);
610			}
611			break;
612
613		default:
614			error = EINVAL;
615			break;
616	}
617
618	socket_lock(so, 0);
619
620	if (error == 0) {
621		if (buf == NULL) {
622			error = sooptcopyout(sopt, &so_tcdbg,
623			    sizeof (struct so_tcdbg));
624		} else {
625			error = sooptcopyout(sopt, buf, len);
626			_FREE(buf, M_TEMP);
627		}
628	}
629	return (error);
630}
631
632
633__private_extern__ int
634so_set_traffic_class(struct socket *so, int optval)
635{
636	int error = 0;
637
638	if (optval < SO_TC_BE || optval > SO_TC_CTL) {
639		error = EINVAL;
640	} else {
641		switch (optval) {
642		case _SO_TC_BK:
643			optval = SO_TC_BK;
644			break;
645		case _SO_TC_VI:
646			optval = SO_TC_VI;
647			break;
648		case _SO_TC_VO:
649			optval = SO_TC_VO;
650			break;
651		default:
652			if (!SO_VALID_TC(optval))
653				error = EINVAL;
654			break;
655		}
656
657		if (error == 0) {
658			int oldval = so->so_traffic_class;
659
660			VERIFY(SO_VALID_TC(optval));
661			so->so_traffic_class = optval;
662
663			if ((SOCK_DOM(so) == PF_INET ||
664			    SOCK_DOM(so) == PF_INET6) &&
665			    SOCK_TYPE(so) == SOCK_STREAM)
666				set_tcp_stream_priority(so);
667
668			if ((SOCK_DOM(so) == PF_INET ||
669			    SOCK_DOM(so) == PF_INET6) &&
670			    optval != oldval && (optval == SO_TC_BK_SYS ||
671			    oldval == SO_TC_BK_SYS)) {
672				/*
673				 * If the app switches from BK_SYS to something
674				 * else, resume the socket if it was suspended.
675				 */
676				if (oldval == SO_TC_BK_SYS)
677					inp_reset_fc_state(so->so_pcb);
678
679				SOTHROTTLELOG(("throttle[%d]: so 0x%llx "
680				    "[%d,%d] opportunistic %s\n", so->last_pid,
681				    (uint64_t)VM_KERNEL_ADDRPERM(so),
682				    SOCK_DOM(so), SOCK_TYPE(so),
683				    (optval == SO_TC_BK_SYS) ? "ON" : "OFF"));
684			}
685		}
686	}
687	return (error);
688}
689
690__private_extern__ void
691so_set_default_traffic_class(struct socket *so)
692{
693	int sotc = -1;
694
695	if (tfp_count > 0 &&
696	    (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
697		get_tclass_for_curr_proc(&sotc);
698	}
699
700	so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE;
701}
702
703__private_extern__ int
704so_set_opportunistic(struct socket *so, int optval)
705{
706	return (so_set_traffic_class(so, (optval == 0) ?
707	    SO_TC_BE : SO_TC_BK_SYS));
708}
709
710__private_extern__ int
711so_get_opportunistic(struct socket *so)
712{
713	return (so->so_traffic_class == SO_TC_BK_SYS);
714}
715
716__private_extern__ mbuf_svc_class_t
717mbuf_service_class_from_control(struct mbuf *control)
718{
719	struct cmsghdr *cm;
720	mbuf_svc_class_t msc = MBUF_SC_UNSPEC;
721
722	for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
723	    cm = M_NXT_CMSGHDR(control, cm)) {
724		int tc;
725
726		if (cm->cmsg_len < sizeof (struct cmsghdr))
727			break;
728
729		if (cm->cmsg_level != SOL_SOCKET ||
730		    cm->cmsg_type != SO_TRAFFIC_CLASS)
731			continue;
732		if (cm->cmsg_len != CMSG_LEN(sizeof (int)))
733			continue;
734
735		tc = *(int *)(void *)CMSG_DATA(cm);
736		msc = so_tc2msc(tc);
737		if (MBUF_VALID_SC(msc))
738			break;
739	}
740
741	return (msc);
742}
743
744__private_extern__  int
745dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc)
746{
747	int dscp_code;
748
749	switch (mtc) {
750		default:
751		case MBUF_TC_BE:
752			dscp_code = 0;
753			break;
754		case MBUF_TC_BK:
755			dscp_code = 0x08;
756			break;
757		case MBUF_TC_VI:
758			dscp_code = 0x20;
759			break;
760		case MBUF_TC_VO:
761			dscp_code = 0x30;
762			break;
763	}
764
765	return (dscp_code);
766}
767
768__private_extern__ void
769so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
770{
771	uint32_t sotc = m_get_traffic_class(m);
772
773	if (sotc >= SO_TC_STATS_MAX)
774		sotc = SO_TC_BE;
775
776	so->so_tc_stats[sotc].rxpackets += 1;
777	so->so_tc_stats[sotc].rxbytes +=
778	    ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
779}
780
781__private_extern__ void
782so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, uint32_t tc)
783{
784	if (tc >= SO_TC_STATS_MAX)
785		tc = SO_TC_BE;
786
787	so->so_tc_stats[tc].rxpackets += pkts;
788	so->so_tc_stats[tc].rxbytes +=bytes;
789}
/*
 * Choose between foreground and background TCP behavior for a socket,
 * on both the send side (congestion control algorithm) and the receive
 * side (receive background flag).
 *
 * The socket must be a PF_INET/PF_INET6 SOCK_STREAM IPPROTO_TCP socket
 * (enforced with VERIFY).  Nothing is changed when the PCB is in
 * INPCB_STATE_DEAD.  A change on either side is logged.
 *
 * NOTE(review): tp->tcp_cc_index is read in the declarations, i.e. before
 * the VERIFY and the dead-state check are evaluated.
 */
__private_extern__ void
set_tcp_stream_priority(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct ifnet *outifp;
	/* Snapshot of the current settings, compared again at the end */
	u_char old_cc = tp->tcp_cc_index;
	int recvbg = IS_TCP_RECV_BG(so);
	bool is_local, fg_active = false;
	u_int32_t uptime;

	VERIFY((SOCK_CHECK_DOM(so, PF_INET)
	    || SOCK_CHECK_DOM(so, PF_INET6))
	    && SOCK_CHECK_TYPE(so, SOCK_STREAM)
	    && SOCK_CHECK_PROTO(so, IPPROTO_TCP));

	/* Return if the socket is in a terminal state */
	if (inp->inp_state == INPCB_STATE_DEAD)
		return;

	outifp = inp->inp_last_outifp;
	uptime = net_uptime();

	/*
	 * If the socket was marked as a background socket or if the
	 * traffic class is set to background with traffic class socket
	 * option then make both send and recv side of the stream to be
	 * background. The variable sotcdb which can be set with sysctl
	 * is used to disable these settings for testing.
	 */
	if (soissrcbackground(so)) {
		/* No known output interface is treated like loopback */
		if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK))
			is_local = true;
		else
			is_local = false;

		/* Check if there has been recent foreground activity */
		if ((outifp != NULL &&
		    outifp->if_fg_sendts > 0 &&
		    (int)(uptime - outifp->if_fg_sendts) <=
		    TCP_BG_SWITCH_TIME) ||
		    net_io_policy_throttled)
			fg_active = true;

		/*
		 * If the interface that the connection is using is
		 * loopback, do not use background congestion
		 * control algorithm.
		 *
		 * If there has been recent foreground activity or if
		 * there was an indication that a foreground application
		 * is going to use networking (net_io_policy_throttled),
		 * switch the background streams to use background
		 * congestion control algorithm. Otherwise, even background
		 * flows can move into foreground.
		 */
		if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 ||
			is_local || !fg_active) {
			if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
				tcp_set_foreground_cc(so);
		} else {
			if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
				tcp_set_background_cc(so);
		}

		/* Set receive side background flags */
		if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 ||
			is_local || !fg_active)
			tcp_clear_recv_bg(so);
		else
			tcp_set_recv_bg(so);
	} else {
		/* Not a background source: foreground on both sides */
		tcp_clear_recv_bg(so);
		if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
			tcp_set_foreground_cc(so);
	}

	/* Log only when the send or the receive side actually changed */
	if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
		SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
		   "%s recv\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
		   SOCK_DOM(so), SOCK_TYPE(so),
		   (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
		   "background" : "foreground",
		   IS_TCP_RECV_BG(so) ? "background" : "foreground"));
	}
}
876
/*
 * Set traffic class to an IPv4 or IPv6 packet
 * - mark the mbuf
 * - set the DSCP code following the WMM mapping
 *
 * m      - outgoing packet; nothing is done if it has no packet header
 * so     - socket the packet is sent on
 * in_msc - service class supplied by the caller, or MBUF_SC_UNSPEC to
 *          derive it from the socket's traffic class
 * flags  - PKT_SCF_IPV6 selects the IPv6 header layout; PKT_SCF_TCP_ACK
 *          marks the packet as a TCP ACK eligible for elevation
 *
 * Before returning, the TCP stream priority is re-evaluated for
 * SOCK_STREAM sockets and the socket's transmit statistics are updated.
 */
__private_extern__ void
set_packet_service_class(struct mbuf *m, struct socket *so,
    mbuf_svc_class_t in_msc, u_int32_t flags)
{
	mbuf_svc_class_t msc = MBUF_SC_BE;	   /* Best effort by default */
	struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
	struct ip *ip = mtod(m, struct ip *);
#if INET6
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
#endif /* INET6 */
	int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0;

	if (!(m->m_flags & M_PKTHDR))
		return;

	/*
	 * Here is the precedence:
	 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
	 * 2) Traffic class passed via ancillary data to sendmsg(2)
	 * 3) Traffic class socket option last
	 */
	if (in_msc != MBUF_SC_UNSPEC) {
		/* An out-of-range in_msc leaves the best effort default */
		if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL)
			msc = in_msc;
	} else {
		VERIFY(SO_VALID_TC(so->so_traffic_class));
		msc = so_tc2msc(so->so_traffic_class);
		/* Assert because tc must have been valid */
		VERIFY(MBUF_VALID_SC(msc));
	}

	/*
	 * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority.
	 */
	if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc))
		msc = MBUF_SC_BK;

	if (soissrcbackground(so))
		m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
	/*
	 * Set the traffic class in the mbuf packet header svc field
	 */
	if (sotcdb & SOTCDB_NO_MTC)
		goto no_mbtc;

	/* Elevate service class if the packet is a pure TCP ACK.
	 * We can do this only when the flow is not a background
	 * flow and the outgoing interface supports
	 * transmit-start model.
	 *
	 * NOTE(review): only the background test and PKT_SCF_TCP_ACK are
	 * checked here; presumably the caller sets that flag only when the
	 * interface condition holds -- confirm.
	 */
	if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK))
		msc = MBUF_SC_CTL;

	(void) m_set_service_class(m, msc);

	/*
	 * Set the privileged traffic auxiliary flag if applicable,
	 * or clear it.
	 */
	if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
	    msc != MBUF_SC_UNSPEC)
		m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
	else
		m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;

no_mbtc:
	/*
	 * Quick exit when best effort
	 */
	if (msc == MBUF_SC_BE)
		goto no_dscp;

	/*
	 * The default behavior is for the networking stack to not set the
	 * DSCP code, based on SOTCDB_NO_DSCP being set.  If the flag is
	 * cleared, set the DSCP code in IPv4 or IPv6 header only for local
	 * traffic, if it is not already set.  <rdar://problem/11277343>
	 */
	if (sotcdb & SOTCDB_NO_DSCP)
		goto no_dscp;

	/*
	 * Test if a IP TOS or IPV6 TCLASS has already been set
	 * on the socket or the raw packet.
	 */
	if (!(sotcdb & SOTCDB_NO_DSCPTST)) {
#if INET6
		if (isipv6) {
			if ((so->so_type == SOCK_RAW &&
			    (ip6->ip6_flow & htonl(0xff << 20)) != 0) ||
			    (inp->in6p_outputopts &&
			    inp->in6p_outputopts->ip6po_tclass != -1))
				goto no_dscp;
		} else
#endif /* INET6 */
		if ((so->so_type == SOCK_RAW &&
		    (inp->inp_flags & INP_HDRINCL)) ||
		    inp->inp_ip_tos != 0)
			goto no_dscp;
	}

	/*
	 * Test if destination is local
	 */
	if (!(sotcdb & SOTCDB_NO_LCLTST)) {
		int islocal = 0;
		struct rtentry *rt = inp->inp_route.ro_rt;

		if (so->so_type == SOCK_STREAM) {
			if (intotcpcb(inp)->t_flags & TF_LOCAL)
				islocal = 1;
		} else if (rt != NULL &&
		    (rt->rt_gateway->sa_family == AF_LINK ||
		    (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) {
			/* Point-to-point interfaces do not count as local */
			if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT))
				islocal = 1;
		} else
#if INET6
		if (isipv6 && in6addr_local(&ip6->ip6_dst)) {
			islocal = 1;
		} else
#endif /* INET6 */
		if (inaddr_local(ip->ip_dst)) {
			islocal = 1;
		}
		if (islocal == 0)
			goto no_dscp;
	}

	/*
	 * Write the DSCP code for the traffic class recorded in the mbuf
	 * into the IPv6 flow word or the IPv4 TOS byte.
	 */
#if INET6
	if (isipv6)
		ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass(
		    m_get_traffic_class(m)) << 20);
	else
#endif /* INET6 */
		ip->ip_tos |= dscp_code_from_mbuf_tclass(
		    m_get_traffic_class(m)) << 2;

no_dscp:
	/*
	 * For TCP with background traffic class switch CC algo based on sysctl
	 */
	if (so->so_type == SOCK_STREAM)
		set_tcp_stream_priority(so);

	so_tc_update_stats(m, so, msc);
}
1029
1030__private_extern__ void
1031so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
1032{
1033	mbuf_traffic_class_t mtc;
1034
1035	/*
1036	 * Assume socket and mbuf traffic class values are the same
1037	 * Also assume the socket lock is held.  Note that the stats
1038	 * at the socket layer are reduced down to the legacy traffic
1039	 * classes; we could/should potentially expand so_tc_stats[].
1040	 */
1041	mtc = MBUF_SC2TC(msc);
1042	VERIFY(mtc < SO_TC_STATS_MAX);
1043	so->so_tc_stats[mtc].txpackets += 1;
1044	so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
1045}
1046
1047__private_extern__ void
1048socket_tclass_init(void)
1049{
1050        _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
1051
1052	tclass_lck_grp_attr = lck_grp_attr_alloc_init();
1053	tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
1054	tclass_lck_attr = lck_attr_alloc_init();
1055	lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
1056}
1057
1058__private_extern__ mbuf_svc_class_t
1059so_tc2msc(int tc)
1060{
1061	mbuf_svc_class_t msc;
1062
1063	switch (tc) {
1064	case SO_TC_BK_SYS:
1065		msc = MBUF_SC_BK_SYS;
1066		break;
1067	case SO_TC_BK:
1068	case _SO_TC_BK:
1069		msc = MBUF_SC_BK;
1070		break;
1071	case SO_TC_BE:
1072		msc = MBUF_SC_BE;
1073		break;
1074	case SO_TC_RD:
1075		msc = MBUF_SC_RD;
1076		break;
1077	case SO_TC_OAM:
1078		msc = MBUF_SC_OAM;
1079		break;
1080	case SO_TC_AV:
1081		msc = MBUF_SC_AV;
1082		break;
1083	case SO_TC_RV:
1084		msc = MBUF_SC_RV;
1085		break;
1086	case SO_TC_VI:
1087	case _SO_TC_VI:
1088		msc = MBUF_SC_VI;
1089		break;
1090	case SO_TC_VO:
1091	case _SO_TC_VO:
1092		msc = MBUF_SC_VO;
1093		break;
1094	case SO_TC_CTL:
1095		msc = MBUF_SC_CTL;
1096		break;
1097	case SO_TC_ALL:
1098	default:
1099		msc = MBUF_SC_UNSPEC;
1100		break;
1101	}
1102
1103	return (msc);
1104}
1105
1106__private_extern__ int
1107so_svc2tc(mbuf_svc_class_t svc)
1108{
1109	switch (svc) {
1110	case MBUF_SC_UNSPEC:
1111		return SO_TC_BE;
1112	case MBUF_SC_BK_SYS:
1113		return SO_TC_BK_SYS;
1114	case MBUF_SC_BK:
1115		return SO_TC_BK;
1116	case MBUF_SC_BE:
1117		return SO_TC_BE;
1118	case MBUF_SC_RD:
1119		return SO_TC_RD;
1120	case MBUF_SC_OAM:
1121		return SO_TC_OAM;
1122	case MBUF_SC_AV:
1123		return SO_TC_AV;
1124	case MBUF_SC_RV:
1125		return SO_TC_RV;
1126	case MBUF_SC_VI:
1127		return SO_TC_VI;
1128	case MBUF_SC_VO:
1129		return SO_TC_VO;
1130	case MBUF_SC_CTL:
1131		return SO_TC_CTL;
1132	default:
1133		return SO_TC_BE;
1134	}
1135}
1136
1137/*
1138 * LRO is turned on for AV streaming class.
1139 */
1140void
1141so_set_lro(struct socket *so, int optval)
1142{
1143	if (optval == SO_TC_AV) {
1144		so->so_flags |= SOF_USELRO;
1145	} else {
1146		if (so->so_flags & SOF_USELRO) {
1147			/* transition to non LRO class */
1148			so->so_flags &= ~SOF_USELRO;
1149			struct inpcb *inp = sotoinpcb(so);
1150			struct tcpcb *tp = NULL;
1151			if (inp) {
1152				tp = intotcpcb(inp);
1153				if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) {
1154					tcp_lro_remove_state(inp->inp_laddr,
1155						inp->inp_faddr,
1156						inp->inp_lport,
1157						inp->inp_fport);
1158					tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1159				}
1160			}
1161		}
1162	}
1163}
1164
1165