1/*
2 * Copyright (c) 2013-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24/*
25 * THEORY OF OPERATION
26 *
27 * The socket content filter subsystem provides a way for user space agents to
28 * make filtering decisions based on the content of the data being sent and
29 * received by TCP/IP sockets.
30 *
31 * A content filter user space agents gets a copy of the data and the data is
32 * also kept in kernel buffer until the user space agents makes a pass or drop
33 * decision. This unidirectional flow of content avoids unnecessary data copies
34 * back to the kernel.
35 * *
36 * A user space filter agent opens a kernel control socket with the name
37 * CONTENT_FILTER_CONTROL_NAME to attach to the socket content filter subsystem.
38 * When connected, a "struct content_filter" is created and set as the
39 * "unitinfo" of the corresponding kernel control socket instance.
40 *
41 * The socket content filter subsystem exchanges messages with the user space
42 * filter agent until an ultimate pass or drop decision is made by the
43 * user space filter agent.
44 *
45 * It should be noted that messages about many TCP/IP sockets can be multiplexed
46 * over a single kernel control socket.
47 *
48 * Notes:
49 * - The current implementation is limited to TCP sockets.
50 * - The current implementation supports up to two simultaneous content filters
51 *   for the sake of simplicity of the implementation.
52 *
53 *
54 * NECP FILTER CONTROL UNIT
55 *
56 * A user space filter agent uses the Network Extension Control Policy (NECP)
57 * database specify which TCP/IP sockets needs to be filtered. The NECP
58 * criteria may be based on a variety of properties like user ID or proc UUID.
59 *
60 * The NECP "filter control unit" is used by the socket content filter subsystem
61 * to deliver the relevant TCP/IP content information to the appropriate
62 * user space filter agent via its kernel control socket instance.
63 * This works as follows:
64 *
65 * 1) The user space filter agent specifies an NECP filter control unit when
66 *    in adds its filtering rules to the NECP database.
67 *
68 * 2) The user space filter agent also sets its NECP filter control unit on the
69 *    content filter kernel control socket via the socket option
70 *    CFIL_OPT_NECP_CONTROL_UNIT.
71 *
72 * 3) The NECP database is consulted to find out if a given TCP/IP socket
73 *    needs to be subjected to content filtering and returns the corresponding
74 *    NECP filter control unit  -- the NECP filter control unit is actually
75 *    stored in the TCP/IP socket structure so the NECP lookup is really simple.
76 *
77 * 4) The NECP filter control unit is then used to find the corresponding
78 *    kernel control socket instance.
79 *
80 * Note: NECP currently supports a ingle filter control unit per TCP/IP socket
81 *       but this restriction may be soon lifted.
82 *
83 *
84 * THE MESSAGING PROTOCOL
85 *
86 * The socket content filter subsystem and a user space filter agent
87 * communicate over the kernel control socket via an asynchronous
88 * messaging protocol (this is not a request-response protocol).
89 * The socket content filter subsystem sends event messages to the user
90 * space filter agent about the TCP/IP sockets it is interested to filter.
91 * The user space filter agent sends action messages to either allow
92 * data to pass or to disallow the data flow (and drop the connection).
93 *
94 * All messages over a content filter kernel control socket share the same
95 * common header of type "struct cfil_msg_hdr". The message type tells if
96 * it's a event message "CFM_TYPE_EVENT" or a action message "CFM_TYPE_ACTION".
97 * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
98 * Note the message header length field may be padded for alignment and can
99 * be larger than the actual content of the message.
100 * The field "cfm_op" describe the kind of event or action.
101 *
102 * Here are the kinds of content filter events:
103 * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
104 * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
105 * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
106 * - CFM_OP_DATA_IN: A span of data is being or received on a TCP/IP socket
107 *
108 *
109 * EVENT MESSAGES
110 *
111 * The CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages contains a span of
112 * data that is being sent or received. The position of this span of data
113 * in the data flow is described by a set of start and end offsets. These
114 * are absolute 64 bits offsets. The first byte sent (or received) starts
115 * at offset 0 and ends at offset 1. The length of the content data
116 * is given by the difference between the end offset and the start offset.
117 *
118 * After a CFM_OP_SOCKET_ATTACHED is delivered, CFM_OP_DATA_OUT and
119 * CFM_OP_DATA_OUT events are not delivered until a CFM_OP_DATA_UPDATE
120 * action message is send by the user space filter agent.
121 *
122 * Note: absolute 64 bits offsets should be large enough for the foreseeable
123 * future.  A 64-bits counter will wrap after 468 years are 10 Gbit/sec:
124 *   2E64 / ((10E9 / 8) * 60 * 60 * 24 * 365.25) = 467.63
125 *
126 * They are two kinds of content filter actions:
127 * - CFM_OP_DATA_UPDATE: to update pass or peek offsets for each direction.
128 * - CFM_OP_DROP: to shutdown socket and disallow further data flow
129 *
130 *
131 * ACTION MESSAGES
132 *
133 * The CFM_OP_DATA_UPDATE action messages let the user space filter
134 * agent allow data to flow up to the specified pass offset -- there
135 * is a pass offset for outgoing data and  a pass offset for incoming data.
136 * When a new TCP/IP socket is attached to the content filter, each pass offset
137 * is initially set to 0 so not data is allowed to pass by default.
138 * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
139 * then the data flow becomes unrestricted.
140 *
141 * Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
142 * with a pass offset smaller than the pass offset of a previous
143 * CFM_OP_DATA_UPDATE message is silently ignored.
144 *
145 * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
146 * to tell the kernel how much data it wants to see by using the peek offsets.
147 * Just like pass offsets, there is a peek offset for each direction.
148 * When a new TCP/IP socket is attached to the content filter, each peek offset
149 * is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
150 * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
151 * with a greater than 0 peek offset is sent by the user space filter agent.
152 * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
153 * then the flow of update data events becomes unrestricted.
154 *
155 * Note that peek offsets cannot be smaller than the corresponding pass offset.
156 * Also a peek offsets cannot be smaller than the corresponding end offset
157 * of the last CFM_OP_DATA_OUT/CFM_OP_DATA_IN message dispatched. Trying
158 * to set a too small peek value is silently ignored.
159 *
160 *
161 * PER SOCKET "struct cfil_info"
162 *
163 * As soon as a TCP/IP socket gets attached to a content filter, a
164 * "struct cfil_info" is created to hold the content filtering state for this
165 * socket.
166 *
167 * The content filtering state is made of the following information
168 * for each direction:
169 * - The current pass offset;
170 * - The first and last offsets of the data pending, waiting for a filtering
171 *   decision;
172 * - The inject queue for data that passed the filters and that needs
173 *   to be re-injected;
174 * - A content filter specific state in a set of  "struct cfil_entry"
175 *
176 *
177 * CONTENT FILTER STATE "struct cfil_entry"
178 *
179 * The "struct cfil_entry" maintains the information most relevant to the
180 * message handling over a kernel control socket with a user space filter agent.
181 *
182 * The "struct cfil_entry" holds the NECP filter control unit that corresponds
183 * to the kernel control socket unit it corresponds to and also has a pointer
184 * to the corresponding "struct content_filter".
185 *
186 * For each direction, "struct cfil_entry" maintains the following information:
187 * - The pass offset
188 * - The peek offset
189 * - The offset of the last data peeked at by the filter
190 * - A queue of data that's waiting to be delivered to the  user space filter
191 *   agent on the kernel control socket
192 * - A queue of data for which event messages have been sent on the kernel
193 *   control socket and are pending for a filtering decision.
194 *
195 *
196 * CONTENT FILTER QUEUES
197 *
198 * Data that is being filtered is steered away from the TCP/IP socket buffer
199 * and instead will sit in one of three content filter queue until the data
200 * can be re-injected into the TCP/IP socket buffer.
201 *
202 * A content filter queue is represented by "struct cfil_queue" that contains
203 * a list of mbufs and the start and end offset of the data span of
204 * the list of mbufs.
205 *
206 * The data moves into the three content filter queues according to this
207 * sequence:
208 * a) The "cfe_ctl_q" of "struct cfil_entry"
209 * b) The "cfe_pending_q" of "struct cfil_entry"
210 * c) The "cfi_inject_q" of "struct cfil_info"
211 *
212 * Note: The seqyence (a),(b) may be repeated several times if there are more
213 * than one content filter attached to the TCP/IP socket.
214 *
215 * The "cfe_ctl_q" queue holds data than cannot be delivered to the
216 * kernel conntrol socket for two reasons:
217 * - The peek offset is less that the end offset of the mbuf data
218 * - The kernel control socket is flow controlled
219 *
220 * The "cfe_pending_q" queue holds data for which CFM_OP_DATA_OUT or
221 * CFM_OP_DATA_IN have been successfully dispatched to the kernel control
222 * socket and are waiting for a pass action message fromn the user space
223 * filter agent. An mbuf length must be fully allowed to pass to be removed
224 * from the cfe_pending_q.
225 *
226 * The "cfi_inject_q" queue holds data that has been fully allowed to pass
227 * by the user space filter agent and that needs to be re-injected into the
228 * TCP/IP socket.
229 *
230 *
231 * IMPACT ON FLOW CONTROL
232 *
233 * An essential aspect of the content filer subsystem is to minimize the
234 * impact on flow control of the TCP/IP sockets being filtered.
235 *
236 * The processing overhead of the content filtering may have an effect on
237 * flow control by adding noticeable delays and cannot be eliminated --
238 * care must be taken by the user space filter agent to minimize the
239 * processing delays.
240 *
241 * The amount of data being filtered is kept in buffers while waiting for
242 * a decision by the user space filter agent. This amount of data pending
243 * needs to be subtracted from the amount of data available in the
244 * corresponding TCP/IP socket buffer. This is done by modifying
245 * sbspace() and tcp_sbspace() to account for amount of data pending
246 * in the content filter.
247 *
248 *
249 * LOCKING STRATEGY
250 *
251 * The global state of content filter subsystem is protected by a single
252 * read-write lock "cfil_lck_rw". The data flow can be done with the
253 * cfil read-write lock held as shared so it can be re-entered from multiple
254 * threads.
255 *
256 * The per TCP/IP socket content filterstate -- "struct cfil_info" -- is
257 * protected by the socket lock.
258 *
259 * A TCP/IP socket lock cannot be taken while the cfil read-write lock
260 * is held. That's why we have some sequences where we drop the cfil read-write
261 * lock before taking the TCP/IP lock.
262 *
263 * It is also important to lock the TCP/IP socket buffer while the content
264 * filter is modifying the amount of pending data. Otherwise the calculations
265 * in sbspace() and tcp_sbspace()  could be wrong.
266 *
267 * The "cfil_lck_rw" protects "struct content_filter" and also the fields
268 * "cfe_link" and "cfe_filter" of "struct cfil_entry".
269 *
270 * Actually "cfe_link" and "cfe_filter" are protected by both by
271 * "cfil_lck_rw" and the socket lock: they may be modified only when
272 * "cfil_lck_rw" is exclusive and the socket is locked.
273 *
274 * To read the other fields of "struct content_filter" we have to take
275 * "cfil_lck_rw" in shared mode.
276 *
277 *
278 * LIMITATIONS
279 *
280 * - For TCP sockets only
281 *
282 * - Does not support TCP unordered messages
283 */
284
285/*
286 *	TO DO LIST
287 *
288 *	SOONER:
289 *
290 *	Deal with OOB
291 *
292 *	LATER:
293 *
294 *	If support datagram, enqueue control and address mbufs as well
295 */
296
297#include <sys/types.h>
298#include <sys/kern_control.h>
299#include <sys/queue.h>
300#include <sys/domain.h>
301#include <sys/protosw.h>
302#include <sys/syslog.h>
303
304#include <kern/locks.h>
305#include <kern/zalloc.h>
306#include <kern/debug.h>
307
308#include <net/content_filter.h>
309
310#include <netinet/in_pcb.h>
311#include <netinet/tcp.h>
312#include <netinet/tcp_var.h>
313
314#include <string.h>
315#include <libkern/libkern.h>
316
317
318#define	MAX_CONTENT_FILTER 2
319
320struct cfil_entry;
321
322/*
323 * The structure content_filter represents a user space content filter
324 * It's created and associated with a kernel control socket instance
325 */
326struct content_filter {
327	kern_ctl_ref		cf_kcref;
328	u_int32_t		cf_kcunit;
329	u_int32_t		cf_flags;
330
331	uint32_t		cf_necp_control_unit;
332
333	uint32_t		cf_sock_count;
334	TAILQ_HEAD(, cfil_entry) cf_sock_entries;
335};
336
337#define	CFF_ACTIVE		0x01
338#define	CFF_DETACHING		0x02
339#define	CFF_FLOW_CONTROLLED	0x04
340
341struct content_filter **content_filters = NULL;
342uint32_t cfil_active_count = 0;	/* Number of active content filters */
343uint32_t cfil_sock_attached_count = 0;	/* Number of sockets attachements */
344uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
345
346static kern_ctl_ref cfil_kctlref = NULL;
347
348static lck_grp_attr_t *cfil_lck_grp_attr = NULL;
349static lck_attr_t *cfil_lck_attr = NULL;
350static lck_grp_t *cfil_lck_grp = NULL;
351decl_lck_rw_data(static, cfil_lck_rw);
352
353#define	CFIL_RW_LCK_MAX 8
354
355int cfil_rw_nxt_lck = 0;
356void* cfil_rw_lock_history[CFIL_RW_LCK_MAX];
357
358int cfil_rw_nxt_unlck = 0;
359void* cfil_rw_unlock_history[CFIL_RW_LCK_MAX];
360
361#define	CONTENT_FILTER_ZONE_NAME	"content_filter"
362#define	CONTENT_FILTER_ZONE_MAX		10
363static struct zone *content_filter_zone = NULL;	/* zone for content_filter */
364
365
366#define	CFIL_INFO_ZONE_NAME	"cfil_info"
367#define	CFIL_INFO_ZONE_MAX	1024
368static struct zone *cfil_info_zone = NULL;	/* zone for cfil_info */
369
370MBUFQ_HEAD(cfil_mqhead);
371
372struct cfil_queue {
373	uint64_t		q_start; /* offset of first byte in queue */
374	uint64_t		q_end; /* offset of last byte in queue */
375	struct cfil_mqhead	q_mq;
376};
377
378/*
379 * struct cfil_entry
380 *
381 * The is one entry per content filter
382 */
383struct cfil_entry {
384	TAILQ_ENTRY(cfil_entry) cfe_link;
385	struct content_filter	*cfe_filter;
386
387	struct cfil_info	*cfe_cfil_info;
388	uint32_t		cfe_flags;
389	uint32_t		cfe_necp_control_unit;
390	struct timeval		cfe_last_event; /* To user space */
391	struct timeval		cfe_last_action; /* From user space */
392
393	struct cfe_buf {
394		/*
395		 * cfe_pending_q holds data that has been delivered to
396		 * the filter and for which we are waiting for an action
397		 */
398		struct cfil_queue	cfe_pending_q;
399		/*
400		 * This queue is for data that has not be delivered to
401		 * the content filter (new data, pass peek or flow control)
402		 */
403		struct cfil_queue	cfe_ctl_q;
404
405		uint64_t		cfe_pass_offset;
406		uint64_t		cfe_peek_offset;
407		uint64_t		cfe_peeked;
408	} cfe_snd, cfe_rcv;
409};
410
411#define	CFEF_CFIL_ATTACHED		0x0001	/* was attached to filter */
412#define	CFEF_SENT_SOCK_ATTACHED		0x0002	/* sock attach event was sent */
413#define	CFEF_DATA_START			0x0004	/* can send data event */
414#define	CFEF_FLOW_CONTROLLED		0x0008	/* wait for flow control lift */
415#define	CFEF_SENT_DISCONNECT_IN		0x0010	/* event was sent */
416#define	CFEF_SENT_DISCONNECT_OUT	0x0020	/* event was sent */
417#define	CFEF_SENT_SOCK_CLOSED		0x0040	/* closed event was sent */
418#define	CFEF_CFIL_DETACHED		0x0080	/* filter was detached */
419
420/*
421 * struct cfil_info
422 *
423 * There is a struct cfil_info per socket
424 */
425struct cfil_info {
426	TAILQ_ENTRY(cfil_info)	cfi_link;
427	struct socket		*cfi_so;
428	uint64_t		cfi_flags;
429	uint64_t		cfi_sock_id;
430
431	struct cfi_buf {
432		/*
433		 * cfi_pending_first and cfi_pending_last describe the total
434		 * amount of data outstanding for all the filters on
435		 * this socket and data in the flow queue
436		 * cfi_pending_mbcnt counts in sballoc() "chars of mbufs used"
437		 */
438		uint64_t		cfi_pending_first;
439		uint64_t		cfi_pending_last;
440		int			cfi_pending_mbcnt;
441		/*
442		 * cfi_pass_offset is the minimum of all the filters
443		 */
444		uint64_t		cfi_pass_offset;
445		/*
446		 * cfi_inject_q holds data that needs to be re-injected
447		 * into the socket after filtering and that can
448		 * be queued because of flow control
449		 */
450		struct cfil_queue	cfi_inject_q;
451	} cfi_snd, cfi_rcv;
452
453	struct cfil_entry	cfi_entries[MAX_CONTENT_FILTER];
454};
455
456#define	CFIF_DROP		0x0001	/* drop action applied */
457#define	CFIF_CLOSE_WAIT		0x0002	/* waiting for filter to close */
458#define	CFIF_SOCK_CLOSED	0x0004	/* socket is closed */
459#define	CFIF_RETRY_INJECT_IN	0x0010	/* inject in failed */
460#define	CFIF_RETRY_INJECT_OUT	0x0020	/* inject out failed */
461#define	CFIF_SHUT_WR		0x0040	/* shutdown write */
462#define	CFIF_SHUT_RD		0x0080	/* shutdown read */
463
464#define	CFI_MASK_GENCNT		0xFFFFFFFF00000000	/* upper 32 bits */
465#define	CFI_SHIFT_GENCNT	32
466#define	CFI_MASK_FLOWHASH	0x00000000FFFFFFFF	/* lower 32 bits */
467#define	CFI_SHIFT_FLOWHASH	0
468
469TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
470
471#define	CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
472#define	CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
473
474/*
475 * Statistics
476 */
477
478struct cfil_stats cfil_stats;
479
480/*
481 * For troubleshooting
482 */
483int cfil_log_level = LOG_ERR;
484int cfil_debug = 1;
485
486/*
487 * Sysctls for logs and statistics
488 */
489static int sysctl_cfil_filter_list(struct sysctl_oid *, void *, int,
490	struct sysctl_req *);
491static int sysctl_cfil_sock_list(struct sysctl_oid *, void *, int,
492	struct sysctl_req *);
493
494SYSCTL_NODE(_net, OID_AUTO, cfil, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "cfil");
495
496SYSCTL_INT(_net_cfil, OID_AUTO, log, CTLFLAG_RW|CTLFLAG_LOCKED,
497	&cfil_log_level, 0, "");
498
499SYSCTL_INT(_net_cfil, OID_AUTO, debug, CTLFLAG_RW|CTLFLAG_LOCKED,
500	&cfil_debug, 0, "");
501
502SYSCTL_UINT(_net_cfil, OID_AUTO, sock_attached_count, CTLFLAG_RD|CTLFLAG_LOCKED,
503	&cfil_sock_attached_count, 0, "");
504
505SYSCTL_UINT(_net_cfil, OID_AUTO, active_count, CTLFLAG_RD|CTLFLAG_LOCKED,
506	&cfil_active_count, 0, "");
507
508SYSCTL_UINT(_net_cfil, OID_AUTO, close_wait_timeout, CTLFLAG_RW|CTLFLAG_LOCKED,
509	&cfil_close_wait_timeout, 0, "");
510
511static int cfil_sbtrim = 1;
512SYSCTL_UINT(_net_cfil, OID_AUTO, sbtrim, CTLFLAG_RW|CTLFLAG_LOCKED,
513	&cfil_sbtrim, 0, "");
514
515SYSCTL_PROC(_net_cfil, OID_AUTO, filter_list, CTLFLAG_RD|CTLFLAG_LOCKED,
516	0, 0, sysctl_cfil_filter_list, "S,cfil_filter_stat",  "");
517
518SYSCTL_PROC(_net_cfil, OID_AUTO, sock_list, CTLFLAG_RD|CTLFLAG_LOCKED,
519	0, 0, sysctl_cfil_sock_list, "S,cfil_sock_stat",  "");
520
521SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_LOCKED,
522	&cfil_stats, cfil_stats, "");
523
524/*
525 * Forward declaration to appease the compiler
526 */
527static int cfil_action_data_pass(struct socket *, uint32_t, int,
528	uint64_t, uint64_t);
529static int cfil_action_drop(struct socket *, uint32_t);
530static int cfil_dispatch_closed_event(struct socket *, int);
531static int cfil_data_common(struct socket *, int, struct sockaddr *,
532	struct mbuf *, struct mbuf *, uint32_t);
533static int cfil_data_filter(struct socket *, uint32_t, int,
534	struct mbuf *, uint64_t);
535static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
536	struct in_addr, u_int16_t);
537static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
538	struct in6_addr *, u_int16_t);
539static int cfil_dispatch_attach_event(struct socket *, uint32_t);
540static void cfil_info_free(struct socket *, struct cfil_info *);
541static struct cfil_info * cfil_info_alloc(struct socket *);
542static int cfil_info_attach_unit(struct socket *, uint32_t);
543static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t);
544static int cfil_service_pending_queue(struct socket *, uint32_t, int);
545static int cfil_data_service_ctl_q(struct socket *, uint32_t, int);
546static void cfil_info_verify(struct cfil_info *);
547static int cfil_update_data_offsets(struct socket *, uint32_t, int,
548	uint64_t, uint64_t);
549static int cfil_acquire_sockbuf(struct socket *, int);
550static void cfil_release_sockbuf(struct socket *, int);
551static int cfil_filters_attached(struct socket *);
552
553static void cfil_rw_lock_exclusive(lck_rw_t *);
554static void cfil_rw_unlock_exclusive(lck_rw_t *);
555static void cfil_rw_lock_shared(lck_rw_t *);
556static void cfil_rw_unlock_shared(lck_rw_t *);
557static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
558static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
559
560static unsigned int cfil_data_length(struct mbuf *, int *);
561
562/*
563 * Content filter global read write lock
564 */
565
566static void
567cfil_rw_lock_exclusive(lck_rw_t *lck)
568{
569	void *lr_saved;
570
571	lr_saved = __builtin_return_address(0);
572
573	lck_rw_lock_exclusive(lck);
574
575	cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
576	cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
577}
578
579static void
580cfil_rw_unlock_exclusive(lck_rw_t *lck)
581{
582	void *lr_saved;
583
584	lr_saved = __builtin_return_address(0);
585
586	lck_rw_unlock_exclusive(lck);
587
588	cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
589	cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
590}
591
592static void
593cfil_rw_lock_shared(lck_rw_t *lck)
594{
595	void *lr_saved;
596
597	lr_saved = __builtin_return_address(0);
598
599	lck_rw_lock_shared(lck);
600
601	cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
602	cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
603}
604
605static void
606cfil_rw_unlock_shared(lck_rw_t *lck)
607{
608	void *lr_saved;
609
610	lr_saved = __builtin_return_address(0);
611
612	lck_rw_unlock_shared(lck);
613
614	cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
615	cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
616}
617
618static boolean_t
619cfil_rw_lock_shared_to_exclusive(lck_rw_t *lck)
620{
621	void *lr_saved;
622	boolean_t upgraded;
623
624	lr_saved = __builtin_return_address(0);
625
626	upgraded = lck_rw_lock_shared_to_exclusive(lck);
627	if (upgraded) {
628		cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
629		cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
630	}
631	return (upgraded);
632}
633
634static void
635cfil_rw_lock_exclusive_to_shared(lck_rw_t *lck)
636{
637	void *lr_saved;
638
639	lr_saved = __builtin_return_address(0);
640
641	lck_rw_lock_exclusive_to_shared(lck);
642
643	cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
644	cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
645}
646
647static void
648cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
649{
650	lck_rw_assert(lck,
651	    exclusive ? LCK_RW_ASSERT_EXCLUSIVE : LCK_RW_ASSERT_HELD);
652}
653
654static void
655socket_lock_assert_owned(struct socket *so)
656{
657	lck_mtx_t *mutex_held;
658
659	if (so->so_proto->pr_getlock != NULL)
660		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
661	else
662		mutex_held = so->so_proto->pr_domain->dom_mtx;
663
664	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
665}
666
667/*
668 * Return the number of bytes in the mbuf chain using the same
669 * method as m_length() or sballoc()
670 */
671static unsigned int
672cfil_data_length(struct mbuf *m, int *retmbcnt)
673{
674	struct mbuf *m0;
675	unsigned int pktlen;
676	int mbcnt;
677
678	if (retmbcnt == NULL)
679		return (m_length(m));
680
681	pktlen = 0;
682	mbcnt = 0;
683	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
684		pktlen += m0->m_len;
685		mbcnt += MSIZE;
686		if (m0->m_flags & M_EXT)
687			mbcnt += m0->m_ext.ext_size;
688	}
689	*retmbcnt = mbcnt;
690	return (pktlen);
691}
692
693/*
694 * Common mbuf queue utilities
695 */
696
697static inline void
698cfil_queue_init(struct cfil_queue *cfq)
699{
700	cfq->q_start = 0;
701	cfq->q_end = 0;
702	MBUFQ_INIT(&cfq->q_mq);
703}
704
705static inline uint64_t
706cfil_queue_drain(struct cfil_queue *cfq)
707{
708	uint64_t drained = cfq->q_start - cfq->q_end;
709	cfq->q_start = 0;
710	cfq->q_end = 0;
711	MBUFQ_DRAIN(&cfq->q_mq);
712
713	return (drained);
714}
715
716/* Return 1 when empty, 0 otherwise */
717static inline int
718cfil_queue_empty(struct cfil_queue *cfq)
719{
720	return (MBUFQ_EMPTY(&cfq->q_mq));
721}
722
723static inline uint64_t
724cfil_queue_offset_first(struct cfil_queue *cfq)
725{
726	return (cfq->q_start);
727}
728
729static inline uint64_t
730cfil_queue_offset_last(struct cfil_queue *cfq)
731{
732	return (cfq->q_end);
733}
734
735static inline uint64_t
736cfil_queue_len(struct cfil_queue *cfq)
737{
738	return (cfq->q_end - cfq->q_start);
739}
740
741/*
742 * Routines to verify some fundamental assumptions
743 */
744
745static void
746cfil_queue_verify(struct cfil_queue *cfq)
747{
748	mbuf_t m;
749	mbuf_t n;
750	uint64_t queuesize = 0;
751
752	/* Verify offset are ordered */
753	VERIFY(cfq->q_start <= cfq->q_end);
754
755	/*
756	 * When queue is empty, the offsets are equal otherwise the offsets
757	 * are different
758	 */
759	VERIFY((MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start == cfq->q_end) ||
760		(!MBUFQ_EMPTY(&cfq->q_mq) &&
761		cfq->q_start != cfq->q_end));
762
763	MBUFQ_FOREACH(m, &cfq->q_mq) {
764		size_t chainsize = 0;
765		unsigned int mlen = m_length(m);
766
767		if (m == (void *)M_TAG_FREE_PATTERN ||
768			m->m_next == (void *)M_TAG_FREE_PATTERN ||
769			m->m_nextpkt == (void *)M_TAG_FREE_PATTERN)
770			panic("%s - mq %p is free at %p", __func__,
771				&cfq->q_mq, m);
772		for (n = m; n != NULL; n = n->m_next) {
773			if (n->m_type != MT_DATA &&
774				n->m_type != MT_HEADER &&
775				n->m_type != MT_OOBDATA)
776			panic("%s - %p unsupported type %u", __func__,
777				n, n->m_type);
778			chainsize += n->m_len;
779		}
780		if (mlen != chainsize)
781			panic("%s - %p m_length() %u != chainsize %lu",
782				__func__, m, mlen, chainsize);
783		queuesize += chainsize;
784	}
785	if (queuesize != cfq->q_end - cfq->q_start)
786		panic("%s - %p queuesize %llu != offsetdiffs %llu", __func__,
787			m, queuesize, cfq->q_end - cfq->q_start);
788}
789
790static void
791cfil_queue_enqueue(struct cfil_queue *cfq, mbuf_t m, size_t len)
792{
793	CFIL_QUEUE_VERIFY(cfq);
794
795	MBUFQ_ENQUEUE(&cfq->q_mq, m);
796	cfq->q_end += len;
797
798	CFIL_QUEUE_VERIFY(cfq);
799}
800
801static void
802cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
803{
804	CFIL_QUEUE_VERIFY(cfq);
805
806	VERIFY(m_length(m) == len);
807
808	MBUFQ_REMOVE(&cfq->q_mq, m);
809	MBUFQ_NEXT(m) = NULL;
810	cfq->q_start += len;
811
812	CFIL_QUEUE_VERIFY(cfq);
813}
814
815static mbuf_t
816cfil_queue_first(struct cfil_queue *cfq)
817{
818	return (MBUFQ_FIRST(&cfq->q_mq));
819}
820
821static mbuf_t
822cfil_queue_next(struct cfil_queue *cfq, mbuf_t m)
823{
824#pragma unused(cfq)
825	return (MBUFQ_NEXT(m));
826}
827
828static void
829cfil_entry_buf_verify(struct cfe_buf *cfe_buf)
830{
831	CFIL_QUEUE_VERIFY(&cfe_buf->cfe_ctl_q);
832	CFIL_QUEUE_VERIFY(&cfe_buf->cfe_pending_q);
833
834	/* Verify the queues are ordered so that pending is before ctl */
835	VERIFY(cfe_buf->cfe_ctl_q.q_start >= cfe_buf->cfe_pending_q.q_end);
836
837	/* The peek offset cannot be less than the pass offset */
838	VERIFY(cfe_buf->cfe_peek_offset >= cfe_buf->cfe_pass_offset);
839
840	/* Make sure we've updated the offset we peeked at  */
841	VERIFY(cfe_buf->cfe_ctl_q.q_start <= cfe_buf->cfe_peeked);
842}
843
844static void
845cfil_entry_verify(struct cfil_entry *entry)
846{
847	cfil_entry_buf_verify(&entry->cfe_snd);
848	cfil_entry_buf_verify(&entry->cfe_rcv);
849}
850
851static void
852cfil_info_buf_verify(struct cfi_buf *cfi_buf)
853{
854	CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
855
856	VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
857	VERIFY(cfi_buf->cfi_pending_mbcnt >= 0);
858}
859
860static void
861cfil_info_verify(struct cfil_info *cfil_info)
862{
863	int i;
864
865	if (cfil_info == NULL)
866		return;
867
868	cfil_info_buf_verify(&cfil_info->cfi_snd);
869	cfil_info_buf_verify(&cfil_info->cfi_rcv);
870
871	for (i = 0; i < MAX_CONTENT_FILTER; i++)
872		cfil_entry_verify(&cfil_info->cfi_entries[i]);
873}
874
875static void
876verify_content_filter(struct content_filter *cfc)
877{
878	struct cfil_entry *entry;
879	uint32_t count = 0;
880
881	VERIFY(cfc->cf_sock_count >= 0);
882
883	TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
884		count++;
885		VERIFY(cfc == entry->cfe_filter);
886	}
887	VERIFY(count == cfc->cf_sock_count);
888}
889
890/*
891 * Kernel control socket callbacks
892 */
893static errno_t
894cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
895		void **unitinfo)
896{
897	errno_t	error = 0;
898	struct content_filter *cfc = NULL;
899
900	CFIL_LOG(LOG_NOTICE, "");
901
902	cfc = zalloc(content_filter_zone);
903	if (cfc == NULL) {
904		CFIL_LOG(LOG_ERR, "zalloc failed");
905		error = ENOMEM;
906		goto done;
907	}
908	bzero(cfc, sizeof(struct content_filter));
909
910	cfil_rw_lock_exclusive(&cfil_lck_rw);
911	if (content_filters == NULL) {
912		struct content_filter **tmp;
913
914		cfil_rw_unlock_exclusive(&cfil_lck_rw);
915
916		MALLOC(tmp,
917			struct content_filter **,
918			MAX_CONTENT_FILTER * sizeof(struct content_filter *),
919			M_TEMP,
920			M_WAITOK | M_ZERO);
921
922		cfil_rw_lock_exclusive(&cfil_lck_rw);
923
924		if (tmp == NULL && content_filters == NULL) {
925			error = ENOMEM;
926			cfil_rw_unlock_exclusive(&cfil_lck_rw);
927			goto done;
928		}
929		/* Another thread may have won the race */
930		if (content_filters != NULL)
931			FREE(tmp, M_TEMP);
932		else
933			content_filters = tmp;
934	}
935
936	if (sac->sc_unit == 0 || sac->sc_unit > MAX_CONTENT_FILTER) {
937		CFIL_LOG(LOG_ERR, "bad sc_unit %u", sac->sc_unit);
938		error = EINVAL;
939	} else if (content_filters[sac->sc_unit - 1] != NULL) {
940		CFIL_LOG(LOG_ERR, "sc_unit %u in use", sac->sc_unit);
941		error = EADDRINUSE;
942	} else {
943		/*
944		 * kernel control socket kcunit numbers start at 1
945		 */
946		content_filters[sac->sc_unit - 1] = cfc;
947
948		cfc->cf_kcref = kctlref;
949		cfc->cf_kcunit = sac->sc_unit;
950		TAILQ_INIT(&cfc->cf_sock_entries);
951
952		*unitinfo = cfc;
953		cfil_active_count++;
954	}
955	cfil_rw_unlock_exclusive(&cfil_lck_rw);
956done:
957	if (error != 0 && cfc != NULL)
958		zfree(content_filter_zone, cfc);
959
960	if (error == 0)
961		OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_ok);
962	else
963		OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_fail);
964
965	CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
966		error, cfil_active_count, sac->sc_unit);
967
968	return (error);
969}
970
971static errno_t
972cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
973{
974#pragma unused(kctlref)
975	errno_t	error = 0;
976	struct content_filter *cfc;
977	struct cfil_entry *entry;
978
979	CFIL_LOG(LOG_NOTICE, "");
980
981	if (content_filters == NULL) {
982		CFIL_LOG(LOG_ERR, "no content filter");
983		error = EINVAL;
984		goto done;
985	}
986	if (kcunit > MAX_CONTENT_FILTER) {
987		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
988			kcunit, MAX_CONTENT_FILTER);
989		error = EINVAL;
990		goto done;
991	}
992
993	cfc = (struct content_filter *)unitinfo;
994	if (cfc == NULL)
995		goto done;
996
997	cfil_rw_lock_exclusive(&cfil_lck_rw);
998	if (content_filters[kcunit - 1] != cfc || cfc->cf_kcunit != kcunit) {
999		CFIL_LOG(LOG_ERR, "bad unit info %u)",
1000			kcunit);
1001		cfil_rw_unlock_exclusive(&cfil_lck_rw);
1002		goto done;
1003	}
1004	cfc->cf_flags |= CFF_DETACHING;
1005	/*
1006	 * Remove all sockets from the filter
1007	 */
1008	while ((entry = TAILQ_FIRST(&cfc->cf_sock_entries)) != NULL) {
1009		cfil_rw_lock_assert_held(&cfil_lck_rw, 1);
1010
1011		verify_content_filter(cfc);
1012		/*
1013		 * Accept all outstanding data by pushing to next filter
1014		 * or back to socket
1015		 *
1016		 * TBD: Actually we should make sure all data has been pushed
1017		 * back to socket
1018		 */
1019		if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
1020			struct cfil_info *cfil_info = entry->cfe_cfil_info;
1021			struct socket *so = cfil_info->cfi_so;
1022
1023			/* Need to let data flow immediately */
1024			entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED |
1025				CFEF_DATA_START;
1026
1027			/*
1028			 * Respect locking hierarchy
1029			 */
1030			cfil_rw_unlock_exclusive(&cfil_lck_rw);
1031
1032			socket_lock(so, 1);
1033
1034			/*
1035			 * When cfe_filter is NULL the filter is detached
1036			 * and the entry has been removed from cf_sock_entries
1037			 */
1038			if (so->so_cfil == NULL || entry->cfe_filter == NULL) {
1039				cfil_rw_lock_exclusive(&cfil_lck_rw);
1040				goto release;
1041			}
1042			(void) cfil_action_data_pass(so, kcunit, 1,
1043					CFM_MAX_OFFSET,
1044					CFM_MAX_OFFSET);
1045
1046			(void) cfil_action_data_pass(so, kcunit, 0,
1047					CFM_MAX_OFFSET,
1048					CFM_MAX_OFFSET);
1049
1050			cfil_rw_lock_exclusive(&cfil_lck_rw);
1051
1052			/*
1053			 * Check again as the socket may have been unlocked
1054			 * when when calling cfil_acquire_sockbuf()
1055			 */
1056			if (so->so_cfil == NULL || entry->cfe_filter == NULL)
1057				goto release;
1058
1059			/* The filter is now detached */
1060			entry->cfe_flags |= CFEF_CFIL_DETACHED;
1061			CFIL_LOG(LOG_NOTICE, "so %llx detached %u",
1062				(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1063
1064			if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
1065			    cfil_filters_attached(so) == 0) {
1066				CFIL_LOG(LOG_NOTICE, "so %llx waking",
1067					(uint64_t)VM_KERNEL_ADDRPERM(so));
1068				wakeup((caddr_t)&so->so_cfil);
1069			}
1070
1071			/*
1072			 * Remove the filter entry from the content filter
1073			 * but leave the rest of the state intact as the queues
1074			 * may not be empty yet
1075			 */
1076			entry->cfe_filter = NULL;
1077			entry->cfe_necp_control_unit = 0;
1078
1079			TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1080			cfc->cf_sock_count--;
1081release:
1082			socket_unlock(so, 1);
1083		}
1084	}
1085	verify_content_filter(cfc);
1086
1087	VERIFY(cfc->cf_sock_count == 0);
1088
1089	/*
1090	 * Make filter inactive
1091	 */
1092	content_filters[kcunit - 1] = NULL;
1093	cfil_active_count--;
1094	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1095
1096	zfree(content_filter_zone, cfc);
1097done:
1098	if (error == 0)
1099		OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_ok);
1100	else
1101		OSIncrementAtomic(&cfil_stats.cfs_ctl_disconnect_fail);
1102
1103	CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
1104		error, cfil_active_count, kcunit);
1105
1106	return (error);
1107}
1108
1109/*
1110 * cfil_acquire_sockbuf()
1111 *
1112 * Prevent any other thread from acquiring the sockbuf
1113 * We use sb_cfil_thread as a semaphore to prevent other threads from
1114 * messing with the sockbuf -- see sblock()
1115 * Note: We do not set SB_LOCK here because the thread may check or modify
1116 * SB_LOCK several times until it calls cfil_release_sockbuf() -- currently
1117 * sblock(), sbunlock() or sodefunct()
1118 */
1119static int
1120cfil_acquire_sockbuf(struct socket *so, int outgoing)
1121{
1122	thread_t tp = current_thread();
1123	struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1124	lck_mtx_t *mutex_held;
1125	int error = 0;
1126
1127	/*
1128	 * Wait until no thread is holding the sockbuf and other content
1129	 * filter threads have released the sockbuf
1130	 */
1131	while ((sb->sb_flags & SB_LOCK) ||
1132		(sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)) {
1133		if (so->so_proto->pr_getlock != NULL)
1134			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1135		else
1136			mutex_held = so->so_proto->pr_domain->dom_mtx;
1137
1138		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1139
1140		sb->sb_wantlock++;
1141		VERIFY(sb->sb_wantlock != 0);
1142
1143		msleep(&sb->sb_flags, mutex_held, PSOCK, "cfil_acquire_sockbuf",
1144			NULL);
1145
1146		VERIFY(sb->sb_wantlock != 0);
1147		sb->sb_wantlock--;
1148	}
1149	/*
1150	 * Use reference count for repetitive calls on same thread
1151	 */
1152	if (sb->sb_cfil_refs == 0) {
1153		VERIFY(sb->sb_cfil_thread == NULL);
1154		VERIFY((sb->sb_flags & SB_LOCK) == 0);
1155
1156		sb->sb_cfil_thread = tp;
1157		sb->sb_flags |= SB_LOCK;
1158	}
1159	sb->sb_cfil_refs++;
1160
1161	/* We acquire the socket buffer when we need to cleanup */
1162	if (so->so_cfil == NULL) {
1163		CFIL_LOG(LOG_ERR, "so %llx cfil detached",
1164			(uint64_t)VM_KERNEL_ADDRPERM(so));
1165		error = 0;
1166	} else if (so->so_cfil->cfi_flags & CFIF_DROP) {
1167		CFIL_LOG(LOG_ERR, "so %llx drop set",
1168			(uint64_t)VM_KERNEL_ADDRPERM(so));
1169		error = EPIPE;
1170	}
1171
1172	return (error);
1173}
1174
1175static void
1176cfil_release_sockbuf(struct socket *so, int outgoing)
1177{
1178	struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
1179	thread_t tp = current_thread();
1180
1181	socket_lock_assert_owned(so);
1182
1183	if (sb->sb_cfil_thread != NULL && sb->sb_cfil_thread != tp)
1184		panic("%s sb_cfil_thread %p not current %p", __func__,
1185			sb->sb_cfil_thread, tp);
1186	/*
1187	 * Don't panic if we are defunct because SB_LOCK has
1188	 * been cleared by sodefunct()
1189	 */
1190	if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK))
1191		panic("%s SB_LOCK not set on %p", __func__,
1192			sb);
1193	/*
1194	 * We can unlock when the thread unwinds to the last reference
1195	 */
1196	sb->sb_cfil_refs--;
1197	if (sb->sb_cfil_refs == 0) {
1198		sb->sb_cfil_thread = NULL;
1199		sb->sb_flags &= ~SB_LOCK;
1200
1201		if (sb->sb_wantlock > 0)
1202			wakeup(&sb->sb_flags);
1203	}
1204}
1205
1206cfil_sock_id_t
1207cfil_sock_id_from_socket(struct socket *so)
1208{
1209	if ((so->so_flags & SOF_CONTENT_FILTER) && so->so_cfil)
1210		return (so->so_cfil->cfi_sock_id);
1211	else
1212		return (CFIL_SOCK_ID_NONE);
1213}
1214
1215static struct socket *
1216cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id)
1217{
1218	struct socket *so = NULL;
1219	u_int64_t gencnt = cfil_sock_id >> 32;
1220	u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff);
1221	struct inpcb *inp = NULL;
1222	struct inpcbinfo *pcbinfo = &tcbinfo;
1223
1224	lck_rw_lock_shared(pcbinfo->ipi_lock);
1225	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1226		if (inp->inp_state != INPCB_STATE_DEAD &&
1227			inp->inp_socket != NULL &&
1228			inp->inp_flowhash == flowhash &&
1229			(inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
1230			inp->inp_socket->so_cfil != NULL) {
1231			so = inp->inp_socket;
1232			break;
1233		}
1234	}
1235	lck_rw_done(pcbinfo->ipi_lock);
1236
1237	if (so == NULL) {
1238		OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
1239		CFIL_LOG(LOG_DEBUG,
1240			"no socket for sock_id %llx gencnt %llx flowhash %x",
1241			cfil_sock_id, gencnt, flowhash);
1242	}
1243
1244	return (so);
1245}
1246
1247static errno_t
1248cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
1249		int flags)
1250{
1251#pragma unused(kctlref, flags)
1252	errno_t	error = 0;
1253	struct cfil_msg_hdr *msghdr;
1254	struct content_filter *cfc = (struct content_filter *)unitinfo;
1255	struct socket *so;
1256	struct cfil_msg_action *action_msg;
1257	struct cfil_entry *entry;
1258
1259	CFIL_LOG(LOG_INFO, "");
1260
1261	if (content_filters == NULL) {
1262		CFIL_LOG(LOG_ERR, "no content filter");
1263		error = EINVAL;
1264		goto done;
1265	}
1266	if (kcunit > MAX_CONTENT_FILTER) {
1267		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1268			kcunit, MAX_CONTENT_FILTER);
1269		error = EINVAL;
1270		goto done;
1271	}
1272
1273	if (m_length(m) < sizeof(struct cfil_msg_hdr)) {
1274		CFIL_LOG(LOG_ERR, "too short %u", m_length(m));
1275		error = EINVAL;
1276		goto done;
1277	}
1278	msghdr = (struct cfil_msg_hdr *)mbuf_data(m);
1279	if (msghdr->cfm_version != CFM_VERSION_CURRENT) {
1280		CFIL_LOG(LOG_ERR, "bad version %u", msghdr->cfm_version);
1281		error = EINVAL;
1282		goto done;
1283	}
1284	if (msghdr->cfm_type != CFM_TYPE_ACTION) {
1285		CFIL_LOG(LOG_ERR, "bad type %u", msghdr->cfm_type);
1286		error = EINVAL;
1287		goto done;
1288	}
1289	/* Validate action operation */
1290	switch (msghdr->cfm_op) {
1291		case CFM_OP_DATA_UPDATE:
1292			OSIncrementAtomic(
1293				&cfil_stats.cfs_ctl_action_data_update);
1294			break;
1295		case CFM_OP_DROP:
1296			OSIncrementAtomic(&cfil_stats.cfs_ctl_action_drop);
1297			break;
1298		default:
1299			OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_op);
1300			CFIL_LOG(LOG_ERR, "bad op %u", msghdr->cfm_op);
1301			error = EINVAL;
1302			goto done;
1303		}
1304		if (msghdr->cfm_len != sizeof(struct cfil_msg_action)) {
1305			OSIncrementAtomic(&cfil_stats.cfs_ctl_action_bad_len);
1306				error = EINVAL;
1307				CFIL_LOG(LOG_ERR, "bad len: %u for op %u",
1308					msghdr->cfm_len,
1309					msghdr->cfm_op);
1310				goto done;
1311			}
1312	cfil_rw_lock_shared(&cfil_lck_rw);
1313	if (cfc != (void *)content_filters[kcunit - 1]) {
1314		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1315			kcunit);
1316		error = EINVAL;
1317		cfil_rw_unlock_shared(&cfil_lck_rw);
1318		goto done;
1319	}
1320
1321	so = cfil_socket_from_sock_id(msghdr->cfm_sock_id);
1322	if (so == NULL) {
1323		CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
1324			msghdr->cfm_sock_id);
1325		error = EINVAL;
1326		cfil_rw_unlock_shared(&cfil_lck_rw);
1327		goto done;
1328	}
1329	cfil_rw_unlock_shared(&cfil_lck_rw);
1330
1331	socket_lock(so, 1);
1332
1333	if (so->so_cfil == NULL) {
1334		CFIL_LOG(LOG_NOTICE, "so %llx not attached",
1335			(uint64_t)VM_KERNEL_ADDRPERM(so));
1336		error = EINVAL;
1337		goto unlock;
1338	} else if (so->so_cfil->cfi_flags & CFIF_DROP) {
1339		CFIL_LOG(LOG_NOTICE, "so %llx drop set",
1340			(uint64_t)VM_KERNEL_ADDRPERM(so));
1341		error = EINVAL;
1342		goto unlock;
1343	}
1344	entry = &so->so_cfil->cfi_entries[kcunit - 1];
1345	if (entry->cfe_filter == NULL) {
1346		CFIL_LOG(LOG_NOTICE, "so %llx no filter",
1347			(uint64_t)VM_KERNEL_ADDRPERM(so));
1348		error = EINVAL;
1349		goto unlock;
1350	}
1351
1352	if (entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED)
1353		entry->cfe_flags |= CFEF_DATA_START;
1354	else {
1355		CFIL_LOG(LOG_ERR,
1356			"so %llx attached not sent for %u",
1357			(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
1358		error = EINVAL;
1359		goto unlock;
1360	}
1361
1362	microuptime(&entry->cfe_last_action);
1363
1364	action_msg = (struct cfil_msg_action *)msghdr;
1365
1366	switch (msghdr->cfm_op) {
1367		case CFM_OP_DATA_UPDATE:
1368			if (action_msg->cfa_out_peek_offset != 0 ||
1369				action_msg->cfa_out_pass_offset != 0)
1370				error = cfil_action_data_pass(so, kcunit, 1,
1371					action_msg->cfa_out_pass_offset,
1372					action_msg->cfa_out_peek_offset);
1373			if (error == EJUSTRETURN)
1374				error = 0;
1375			if (error != 0)
1376				break;
1377			if (action_msg->cfa_in_peek_offset != 0 ||
1378				action_msg->cfa_in_pass_offset != 0)
1379				error = cfil_action_data_pass(so, kcunit, 0,
1380					action_msg->cfa_in_pass_offset,
1381					action_msg->cfa_in_peek_offset);
1382			if (error == EJUSTRETURN)
1383				error = 0;
1384			break;
1385
1386		case CFM_OP_DROP:
1387			error = cfil_action_drop(so, kcunit);
1388			break;
1389
1390		default:
1391			error = EINVAL;
1392			break;
1393	}
1394unlock:
1395	socket_unlock(so, 1);
1396done:
1397	mbuf_freem(m);
1398
1399	if (error == 0)
1400		OSIncrementAtomic(&cfil_stats.cfs_ctl_send_ok);
1401	else
1402		OSIncrementAtomic(&cfil_stats.cfs_ctl_send_bad);
1403
1404	return (error);
1405}
1406
1407static errno_t
1408cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1409		int opt, void *data, size_t *len)
1410{
1411#pragma unused(kctlref, opt)
1412	errno_t	error = 0;
1413	struct content_filter *cfc = (struct content_filter *)unitinfo;
1414
1415	CFIL_LOG(LOG_NOTICE, "");
1416
1417	cfil_rw_lock_shared(&cfil_lck_rw);
1418
1419	if (content_filters == NULL) {
1420		CFIL_LOG(LOG_ERR, "no content filter");
1421		error = EINVAL;
1422		goto done;
1423	}
1424	if (kcunit > MAX_CONTENT_FILTER) {
1425		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1426			kcunit, MAX_CONTENT_FILTER);
1427		error = EINVAL;
1428		goto done;
1429	}
1430	if (cfc != (void *)content_filters[kcunit - 1]) {
1431		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1432			kcunit);
1433		error = EINVAL;
1434		goto done;
1435	}
1436	switch (opt) {
1437		case CFIL_OPT_NECP_CONTROL_UNIT:
1438			if (*len < sizeof(uint32_t)) {
1439				CFIL_LOG(LOG_ERR, "len too small %lu", *len);
1440				error = EINVAL;
1441				goto done;
1442			}
1443			if (data != NULL)
1444				*(uint32_t *)data = cfc->cf_necp_control_unit;
1445			break;
1446		default:
1447			error = ENOPROTOOPT;
1448			break;
1449	}
1450done:
1451	cfil_rw_unlock_shared(&cfil_lck_rw);
1452
1453	return (error);
1454}
1455
1456static errno_t
1457cfil_ctl_setopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
1458		int opt, void *data, size_t len)
1459{
1460#pragma unused(kctlref, opt)
1461	errno_t	error = 0;
1462	struct content_filter *cfc = (struct content_filter *)unitinfo;
1463
1464	CFIL_LOG(LOG_NOTICE, "");
1465
1466	cfil_rw_lock_exclusive(&cfil_lck_rw);
1467
1468	if (content_filters == NULL) {
1469		CFIL_LOG(LOG_ERR, "no content filter");
1470		error = EINVAL;
1471		goto done;
1472	}
1473	if (kcunit > MAX_CONTENT_FILTER) {
1474		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1475			kcunit, MAX_CONTENT_FILTER);
1476		error = EINVAL;
1477		goto done;
1478	}
1479	if (cfc != (void *)content_filters[kcunit - 1]) {
1480		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1481			kcunit);
1482		error = EINVAL;
1483		goto done;
1484	}
1485	switch (opt) {
1486		case CFIL_OPT_NECP_CONTROL_UNIT:
1487			if (len < sizeof(uint32_t)) {
1488				CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1489					"len too small %lu", len);
1490				error = EINVAL;
1491				goto done;
1492			}
1493			if (cfc->cf_necp_control_unit != 0) {
1494				CFIL_LOG(LOG_ERR, "CFIL_OPT_NECP_CONTROL_UNIT "
1495					"already set %u",
1496					cfc->cf_necp_control_unit);
1497				error = EINVAL;
1498				goto done;
1499			}
1500			cfc->cf_necp_control_unit = *(uint32_t *)data;
1501			break;
1502		default:
1503			error = ENOPROTOOPT;
1504			break;
1505	}
1506done:
1507	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1508
1509	return (error);
1510}
1511
1512
1513static void
1514cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
1515{
1516#pragma unused(kctlref, flags)
1517	struct content_filter *cfc = (struct content_filter *)unitinfo;
1518	struct socket *so = NULL;
1519	int error;
1520	struct cfil_entry *entry;
1521
1522	CFIL_LOG(LOG_INFO, "");
1523
1524	if (content_filters == NULL) {
1525		CFIL_LOG(LOG_ERR, "no content filter");
1526		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1527		return;
1528	}
1529	if (kcunit > MAX_CONTENT_FILTER) {
1530		CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
1531			kcunit, MAX_CONTENT_FILTER);
1532		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1533		return;
1534	}
1535	cfil_rw_lock_shared(&cfil_lck_rw);
1536	if (cfc != (void *)content_filters[kcunit - 1]) {
1537		CFIL_LOG(LOG_ERR, "unitinfo does not match for kcunit %u",
1538			kcunit);
1539		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_bad);
1540		goto done;
1541	}
1542	/* Let's assume the flow control is lifted */
1543	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
1544		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
1545			cfil_rw_lock_exclusive(&cfil_lck_rw);
1546
1547	cfc->cf_flags &= ~CFF_FLOW_CONTROLLED;
1548
1549		cfil_rw_lock_exclusive_to_shared(&cfil_lck_rw);
1550		lck_rw_assert(&cfil_lck_rw, LCK_RW_ASSERT_SHARED);
1551	}
1552	/*
1553	 * Flow control will be raised again as soon as an entry cannot enqueue
1554	 * to the kernel control socket
1555	 */
1556	while ((cfc->cf_flags & CFF_FLOW_CONTROLLED) == 0) {
1557		verify_content_filter(cfc);
1558
1559		cfil_rw_lock_assert_held(&cfil_lck_rw, 0);
1560
1561		/* Find an entry that is flow controlled */
1562		TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
1563			if (entry->cfe_cfil_info == NULL ||
1564				entry->cfe_cfil_info->cfi_so == NULL)
1565				continue;
1566			if ((entry->cfe_flags & CFEF_FLOW_CONTROLLED) == 0)
1567				continue;
1568		}
1569		if (entry == NULL)
1570			break;
1571
1572		OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift);
1573
1574		so = entry->cfe_cfil_info->cfi_so;
1575
1576		cfil_rw_unlock_shared(&cfil_lck_rw);
1577		socket_lock(so, 1);
1578
1579		do {
1580			error = cfil_acquire_sockbuf(so, 1);
1581			if (error == 0)
1582				error = cfil_data_service_ctl_q(so, kcunit, 1);
1583			cfil_release_sockbuf(so, 1);
1584			if (error != 0)
1585				break;
1586
1587			error = cfil_acquire_sockbuf(so, 0);
1588			if (error == 0)
1589				error = cfil_data_service_ctl_q(so, kcunit, 0);
1590			cfil_release_sockbuf(so, 0);
1591		} while (0);
1592
1593		socket_lock_assert_owned(so);
1594		socket_unlock(so, 1);
1595
1596		cfil_rw_lock_shared(&cfil_lck_rw);
1597	}
1598done:
1599	cfil_rw_unlock_shared(&cfil_lck_rw);
1600}
1601
1602void
1603cfil_init(void)
1604{
1605	struct kern_ctl_reg kern_ctl;
1606	errno_t	error = 0;
1607	vm_size_t content_filter_size = 0;	/* size of content_filter */
1608	vm_size_t cfil_info_size = 0;	/* size of cfil_info */
1609
1610	CFIL_LOG(LOG_NOTICE, "");
1611
1612	/*
1613	 * Compile time verifications
1614	 */
1615	_CASSERT(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER);
1616	_CASSERT(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0);
1617	_CASSERT(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0);
1618	_CASSERT(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0);
1619
1620	/*
1621	 * Runtime time verifications
1622	 */
1623	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_enqueued,
1624		sizeof(uint32_t)));
1625	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_enqueued,
1626		sizeof(uint32_t)));
1627	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_in_peeked,
1628		sizeof(uint32_t)));
1629	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_ctl_q_out_peeked,
1630		sizeof(uint32_t)));
1631
1632	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_in_enqueued,
1633		sizeof(uint32_t)));
1634	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_pending_q_out_enqueued,
1635		sizeof(uint32_t)));
1636
1637	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_enqueued,
1638		sizeof(uint32_t)));
1639	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_enqueued,
1640		sizeof(uint32_t)));
1641	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_in_passed,
1642		sizeof(uint32_t)));
1643	VERIFY(IS_P2ALIGNED(&cfil_stats.cfs_inject_q_out_passed,
1644		sizeof(uint32_t)));
1645
1646	/*
1647	 * Zone for content filters kernel control sockets
1648	 */
1649	content_filter_size = sizeof(struct content_filter);
1650	content_filter_zone = zinit(content_filter_size,
1651				CONTENT_FILTER_ZONE_MAX * content_filter_size,
1652				0,
1653				CONTENT_FILTER_ZONE_NAME);
1654	if (content_filter_zone == NULL) {
1655		panic("%s: zinit(%s) failed", __func__,
1656			CONTENT_FILTER_ZONE_NAME);
1657		/* NOTREACHED */
1658	}
1659	zone_change(content_filter_zone, Z_CALLERACCT, FALSE);
1660	zone_change(content_filter_zone, Z_EXPAND, TRUE);
1661
1662	/*
1663	 * Zone for per socket content filters
1664	 */
1665	cfil_info_size = sizeof(struct cfil_info);
1666	cfil_info_zone = zinit(cfil_info_size,
1667				CFIL_INFO_ZONE_MAX * cfil_info_size,
1668				0,
1669				CFIL_INFO_ZONE_NAME);
1670	if (cfil_info_zone == NULL) {
1671		panic("%s: zinit(%s) failed", __func__, CFIL_INFO_ZONE_NAME);
1672		/* NOTREACHED */
1673	}
1674	zone_change(cfil_info_zone, Z_CALLERACCT, FALSE);
1675	zone_change(cfil_info_zone, Z_EXPAND, TRUE);
1676
1677	/*
1678	 * Allocate locks
1679	 */
1680	cfil_lck_grp_attr = lck_grp_attr_alloc_init();
1681	if (cfil_lck_grp_attr == NULL) {
1682		panic("%s: lck_grp_attr_alloc_init failed", __func__);
1683		/* NOTREACHED */
1684	}
1685	cfil_lck_grp = lck_grp_alloc_init("content filter",
1686					cfil_lck_grp_attr);
1687	if (cfil_lck_grp == NULL) {
1688		panic("%s: lck_grp_alloc_init failed", __func__);
1689		/* NOTREACHED */
1690	}
1691	cfil_lck_attr = lck_attr_alloc_init();
1692	if (cfil_lck_attr == NULL) {
1693		panic("%s: lck_attr_alloc_init failed", __func__);
1694		/* NOTREACHED */
1695	}
1696	lck_rw_init(&cfil_lck_rw, cfil_lck_grp, cfil_lck_attr);
1697
1698	TAILQ_INIT(&cfil_sock_head);
1699
1700	/*
1701	 * Register kernel control
1702	 */
1703	bzero(&kern_ctl, sizeof(kern_ctl));
1704	strlcpy(kern_ctl.ctl_name, CONTENT_FILTER_CONTROL_NAME,
1705		sizeof(kern_ctl.ctl_name));
1706	kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED | CTL_FLAG_REG_EXTENDED;
1707	kern_ctl.ctl_sendsize = 512 * 1024; /* enough? */
1708	kern_ctl.ctl_recvsize = 512 * 1024; /* enough? */
1709	kern_ctl.ctl_connect = cfil_ctl_connect;
1710	kern_ctl.ctl_disconnect = cfil_ctl_disconnect;
1711	kern_ctl.ctl_send = cfil_ctl_send;
1712	kern_ctl.ctl_getopt = cfil_ctl_getopt;
1713	kern_ctl.ctl_setopt = cfil_ctl_setopt;
1714	kern_ctl.ctl_rcvd = cfil_ctl_rcvd;
1715	error = ctl_register(&kern_ctl, &cfil_kctlref);
1716	if (error != 0) {
1717		CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error);
1718		return;
1719	}
1720}
1721
1722struct cfil_info *
1723cfil_info_alloc(struct socket *so)
1724{
1725	int kcunit;
1726	struct cfil_info *cfil_info = NULL;
1727	struct inpcb *inp = sotoinpcb(so);
1728
1729	CFIL_LOG(LOG_INFO, "");
1730
1731	socket_lock_assert_owned(so);
1732
1733	cfil_info = zalloc(cfil_info_zone);
1734	if (cfil_info == NULL)
1735		goto done;
1736	bzero(cfil_info, sizeof(struct cfil_info));
1737
1738	cfil_queue_init(&cfil_info->cfi_snd.cfi_inject_q);
1739	cfil_queue_init(&cfil_info->cfi_rcv.cfi_inject_q);
1740
1741	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
1742		struct cfil_entry *entry;
1743
1744		entry = &cfil_info->cfi_entries[kcunit - 1];
1745		entry->cfe_cfil_info = cfil_info;
1746
1747		/* Initialize the filter entry */
1748		entry->cfe_filter = NULL;
1749		entry->cfe_flags = 0;
1750		entry->cfe_necp_control_unit = 0;
1751		entry->cfe_snd.cfe_pass_offset = 0;
1752		entry->cfe_snd.cfe_peek_offset = 0;
1753		entry->cfe_snd.cfe_peeked = 0;
1754		entry->cfe_rcv.cfe_pass_offset = 0;
1755		entry->cfe_rcv.cfe_peek_offset = 0;
1756		entry->cfe_rcv.cfe_peeked = 0;
1757
1758		cfil_queue_init(&entry->cfe_snd.cfe_pending_q);
1759		cfil_queue_init(&entry->cfe_rcv.cfe_pending_q);
1760		cfil_queue_init(&entry->cfe_snd.cfe_ctl_q);
1761		cfil_queue_init(&entry->cfe_rcv.cfe_ctl_q);
1762	}
1763
1764	cfil_rw_lock_exclusive(&cfil_lck_rw);
1765
1766	so->so_cfil = cfil_info;
1767	cfil_info->cfi_so = so;
1768	/*
1769	 * Create a cfi_sock_id that's not the socket pointer!
1770	 */
1771	if (inp->inp_flowhash == 0)
1772		inp->inp_flowhash = inp_calc_flowhash(inp);
1773	cfil_info->cfi_sock_id =
1774		((so->so_gencnt << 32) | inp->inp_flowhash);
1775
1776	TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
1777
1778	cfil_sock_attached_count++;
1779
1780	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1781
1782done:
1783	if (cfil_info != NULL)
1784		OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_ok);
1785	else
1786		OSIncrementAtomic(&cfil_stats.cfs_cfi_alloc_fail);
1787
1788	return (cfil_info);
1789}
1790
1791int
1792cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit)
1793{
1794	int kcunit;
1795	struct cfil_info *cfil_info = so->so_cfil;
1796	int attached = 0;
1797
1798	CFIL_LOG(LOG_INFO, "");
1799
1800	socket_lock_assert_owned(so);
1801
1802	cfil_rw_lock_exclusive(&cfil_lck_rw);
1803
1804	for (kcunit = 1;
1805		content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
1806		kcunit++) {
1807		struct content_filter *cfc = content_filters[kcunit - 1];
1808		struct cfil_entry *entry;
1809
1810		if (cfc == NULL)
1811			continue;
1812		if (cfc->cf_necp_control_unit != filter_control_unit)
1813			continue;
1814
1815		entry = &cfil_info->cfi_entries[kcunit - 1];
1816
1817		entry->cfe_filter = cfc;
1818		entry->cfe_necp_control_unit = filter_control_unit;
1819		TAILQ_INSERT_TAIL(&cfc->cf_sock_entries, entry, cfe_link);
1820		cfc->cf_sock_count++;
1821		verify_content_filter(cfc);
1822		attached = 1;
1823		entry->cfe_flags |= CFEF_CFIL_ATTACHED;
1824		break;
1825	}
1826
1827	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1828
1829	return (attached);
1830}
1831
1832static void
1833cfil_info_free(struct socket *so, struct cfil_info *cfil_info)
1834{
1835	int kcunit;
1836	uint64_t in_drain = 0;
1837	uint64_t out_drained = 0;
1838
1839	so->so_cfil = NULL;
1840
1841	if (so->so_flags & SOF_CONTENT_FILTER) {
1842		so->so_flags &= ~SOF_CONTENT_FILTER;
1843		so->so_usecount--;
1844	}
1845	if (cfil_info == NULL)
1846		return;
1847
1848	CFIL_LOG(LOG_INFO, "");
1849
1850	cfil_rw_lock_exclusive(&cfil_lck_rw);
1851
1852	for (kcunit = 1;
1853		content_filters != NULL && kcunit <= MAX_CONTENT_FILTER;
1854		kcunit++) {
1855		struct cfil_entry *entry;
1856		struct content_filter *cfc;
1857
1858		entry = &cfil_info->cfi_entries[kcunit - 1];
1859
1860		/* Don't be silly and try to detach twice */
1861		if (entry->cfe_filter == NULL)
1862			continue;
1863
1864		cfc = content_filters[kcunit - 1];
1865
1866		VERIFY(cfc == entry->cfe_filter);
1867
1868		entry->cfe_filter = NULL;
1869		entry->cfe_necp_control_unit = 0;
1870		TAILQ_REMOVE(&cfc->cf_sock_entries, entry, cfe_link);
1871		cfc->cf_sock_count--;
1872
1873		verify_content_filter(cfc);
1874	}
1875	cfil_sock_attached_count--;
1876	TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link);
1877
1878	out_drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
1879	in_drain += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
1880
1881	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
1882		struct cfil_entry *entry;
1883
1884		entry = &cfil_info->cfi_entries[kcunit - 1];
1885		out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
1886		in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_pending_q);
1887		out_drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
1888		in_drain += cfil_queue_drain(&entry->cfe_rcv.cfe_ctl_q);
1889	}
1890	cfil_rw_unlock_exclusive(&cfil_lck_rw);
1891
1892	if (out_drained)
1893		OSIncrementAtomic(&cfil_stats.cfs_flush_out_free);
1894	if (in_drain)
1895		OSIncrementAtomic(&cfil_stats.cfs_flush_in_free);
1896
1897	zfree(cfil_info_zone, cfil_info);
1898}
1899
1900/*
1901 * Entry point from Sockets layer
1902 * The socket is locked.
1903 */
1904errno_t
1905cfil_sock_attach(struct socket *so)
1906{
1907	errno_t error = 0;
1908	uint32_t filter_control_unit;
1909
1910	socket_lock_assert_owned(so);
1911
1912	/* Limit ourselves to TCP */
1913	if ((so->so_proto->pr_domain->dom_family != PF_INET &&
1914		so->so_proto->pr_domain->dom_family != PF_INET6) ||
1915		so->so_proto->pr_type != SOCK_STREAM ||
1916		so->so_proto->pr_protocol != IPPROTO_TCP)
1917		goto done;
1918
1919	filter_control_unit = necp_socket_get_content_filter_control_unit(so);
1920	if (filter_control_unit == 0)
1921		goto done;
1922
1923	if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
1924		OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
1925		goto done;
1926	}
1927	if (cfil_active_count == 0) {
1928		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
1929		goto done;
1930	}
1931	if (so->so_cfil != NULL) {
1932		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already);
1933		CFIL_LOG(LOG_ERR, "already attached");
1934	} else {
1935		cfil_info_alloc(so);
1936		if (so->so_cfil == NULL) {
1937			error = ENOMEM;
1938			OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
1939			goto done;
1940		}
1941	}
1942	if (cfil_info_attach_unit(so, filter_control_unit) == 0) {
1943		CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
1944			filter_control_unit);
1945		OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
1946		goto done;
1947	}
1948	CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockid %llx",
1949		(uint64_t)VM_KERNEL_ADDRPERM(so),
1950		filter_control_unit, so->so_cfil->cfi_sock_id);
1951
1952	so->so_flags |= SOF_CONTENT_FILTER;
1953	OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
1954
1955	/* Hold a reference on the socket */
1956	so->so_usecount++;
1957
1958	error = cfil_dispatch_attach_event(so, filter_control_unit);
1959	/* We can recover from flow control or out of memory errors */
1960	if (error == ENOBUFS || error == ENOMEM)
1961		error = 0;
1962	else if (error != 0)
1963		goto done;
1964
1965	CFIL_INFO_VERIFY(so->so_cfil);
1966done:
1967	return (error);
1968}
1969
1970/*
1971 * Entry point from Sockets layer
1972 * The socket is locked.
1973 */
1974errno_t
1975cfil_sock_detach(struct socket *so)
1976{
1977	if (so->so_cfil) {
1978		cfil_info_free(so, so->so_cfil);
1979		OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
1980	}
1981	return (0);
1982}
1983
1984static int
1985cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
1986{
1987	errno_t error = 0;
1988	struct cfil_entry *entry = NULL;
1989	struct cfil_msg_sock_attached msg_attached;
1990	uint32_t kcunit;
1991	struct content_filter *cfc;
1992
1993	socket_lock_assert_owned(so);
1994
1995	cfil_rw_lock_shared(&cfil_lck_rw);
1996
1997	if (so->so_proto == NULL || so->so_proto->pr_domain == NULL) {
1998		error = EINVAL;
1999		goto done;
2000	}
2001	/*
2002	 * Find the matching filter unit
2003	 */
2004	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
2005		cfc = content_filters[kcunit - 1];
2006
2007		if (cfc == NULL)
2008			continue;
2009		if (cfc->cf_necp_control_unit != filter_control_unit)
2010			continue;
2011		entry = &so->so_cfil->cfi_entries[kcunit - 1];
2012		if (entry->cfe_filter == NULL)
2013			continue;
2014
2015		VERIFY(cfc == entry->cfe_filter);
2016
2017		break;
2018	}
2019
2020	if (entry == NULL || entry->cfe_filter == NULL)
2021		goto done;
2022
2023	if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED))
2024		goto done;
2025
2026	CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u kcunit %u",
2027		(uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, kcunit);
2028
2029	/* Would be wasteful to try when flow controlled */
2030	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2031		error = ENOBUFS;
2032		goto done;
2033	}
2034
2035	bzero(&msg_attached, sizeof(struct cfil_msg_sock_attached));
2036	msg_attached.cfs_msghdr.cfm_len = sizeof(struct cfil_msg_sock_attached);
2037	msg_attached.cfs_msghdr.cfm_version = CFM_VERSION_CURRENT;
2038	msg_attached.cfs_msghdr.cfm_type = CFM_TYPE_EVENT;
2039	msg_attached.cfs_msghdr.cfm_op = CFM_OP_SOCKET_ATTACHED;
2040	msg_attached.cfs_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2041
2042	msg_attached.cfs_sock_family = so->so_proto->pr_domain->dom_family;
2043	msg_attached.cfs_sock_type = so->so_proto->pr_type;
2044	msg_attached.cfs_sock_protocol = so->so_proto->pr_protocol;
2045	msg_attached.cfs_pid = so->last_pid;
2046	memcpy(msg_attached.cfs_uuid, so->last_uuid, sizeof(uuid_t));
2047	if (so->so_flags & SOF_DELEGATED) {
2048		msg_attached.cfs_e_pid = so->e_pid;
2049		memcpy(msg_attached.cfs_e_uuid, so->e_uuid, sizeof(uuid_t));
2050	} else {
2051		msg_attached.cfs_e_pid = so->last_pid;
2052		memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
2053	}
2054	error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2055				entry->cfe_filter->cf_kcunit,
2056				&msg_attached,
2057				sizeof(struct cfil_msg_sock_attached),
2058				CTL_DATA_EOR);
2059	if (error != 0) {
2060		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
2061		goto done;
2062	}
2063	microuptime(&entry->cfe_last_event);
2064	entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED;
2065	OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok);
2066done:
2067
2068	/* We can recover from flow control */
2069	if (error == ENOBUFS) {
2070		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2071		OSIncrementAtomic(&cfil_stats.cfs_attach_event_flow_control);
2072
2073		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2074			cfil_rw_lock_exclusive(&cfil_lck_rw);
2075
2076		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2077
2078		cfil_rw_unlock_exclusive(&cfil_lck_rw);
2079	} else {
2080		if (error != 0)
2081			OSIncrementAtomic(&cfil_stats.cfs_attach_event_fail);
2082
2083		cfil_rw_unlock_shared(&cfil_lck_rw);
2084	}
2085	return (error);
2086}
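
/*
 * Illustrative only -- how a user space agent might consume the attach
 * event sent above, assuming the cfil_msg_sock_attached layout from the
 * content filter header and ignoring short reads:
 *
 *	struct cfil_msg_sock_attached evt;
 *	ssize_t n = recv(fd, &evt, sizeof(evt), 0);
 *	if (n >= (ssize_t)sizeof(struct cfil_msg_hdr) &&
 *	    evt.cfs_msghdr.cfm_type == CFM_TYPE_EVENT &&
 *	    evt.cfs_msghdr.cfm_op == CFM_OP_SOCKET_ATTACHED) {
 *		uint64_t flow_id = evt.cfs_msghdr.cfm_sock_id;
 *		-- track the flow under flow_id for later data events
 *	}
 */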
2087
2088static int
2089cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
2090{
2091	errno_t error = 0;
2093	struct cfil_entry *entry;
2094	struct cfe_buf *entrybuf;
2095	struct cfil_msg_hdr msg_disconnected;
2096	struct content_filter *cfc;
2097
2098	socket_lock_assert_owned(so);
2099
2100	cfil_rw_lock_shared(&cfil_lck_rw);
2101
2102	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2103	if (outgoing)
2104		entrybuf = &entry->cfe_snd;
2105	else
2106		entrybuf = &entry->cfe_rcv;
2107
2108	cfc = entry->cfe_filter;
2109	if (cfc == NULL)
2110		goto done;
2111
2112	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2113		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2114
2115	/*
2116	 * Send the disconnection event once
2117	 */
2118	if ((outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) ||
2119		(!outgoing && (entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))) {
2120		CFIL_LOG(LOG_INFO, "so %llx disconnect already sent",
2121			(uint64_t)VM_KERNEL_ADDRPERM(so));
2122		goto done;
2123	}
2124
2125	/*
2126	 * We're not disconnected as long as some data is waiting
2127	 * to be delivered to the filter
2128	 */
2129	if (outgoing && cfil_queue_empty(&entrybuf->cfe_ctl_q) == 0) {
2130		CFIL_LOG(LOG_INFO, "so %llx control queue not empty",
2131			(uint64_t)VM_KERNEL_ADDRPERM(so));
2132		error = EBUSY;
2133		goto done;
2134	}
2135	/* Would be wasteful to try when flow controlled */
2136	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2137		error = ENOBUFS;
2138		goto done;
2139	}
2140
2141	bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr));
2142	msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr);
2143	msg_disconnected.cfm_version = CFM_VERSION_CURRENT;
2144	msg_disconnected.cfm_type = CFM_TYPE_EVENT;
2145	msg_disconnected.cfm_op = outgoing ? CFM_OP_DISCONNECT_OUT :
2146		CFM_OP_DISCONNECT_IN;
2147	msg_disconnected.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2148	error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2149				entry->cfe_filter->cf_kcunit,
2150				&msg_disconnected,
2151				sizeof(struct cfil_msg_hdr),
2152				CTL_DATA_EOR);
2153	if (error != 0) {
		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error);
2156		goto done;
2157	}
2158	microuptime(&entry->cfe_last_event);
2159
2160	/* Remember we have sent the disconnection message */
2161	if (outgoing) {
2162		entry->cfe_flags |= CFEF_SENT_DISCONNECT_OUT;
2163		OSIncrementAtomic(&cfil_stats.cfs_disconnect_out_event_ok);
2164	} else {
2165		entry->cfe_flags |= CFEF_SENT_DISCONNECT_IN;
2166		OSIncrementAtomic(&cfil_stats.cfs_disconnect_in_event_ok);
2167	}
2168done:
2169	if (error == ENOBUFS) {
2170		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2171		OSIncrementAtomic(
2172			&cfil_stats.cfs_disconnect_event_flow_control);
2173
2174		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2175			cfil_rw_lock_exclusive(&cfil_lck_rw);
2176
2177		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2178
2179		cfil_rw_unlock_exclusive(&cfil_lck_rw);
2180	} else {
2181		if (error != 0)
2182			OSIncrementAtomic(
2183				&cfil_stats.cfs_disconnect_event_fail);
2184
2185		cfil_rw_unlock_shared(&cfil_lck_rw);
2186	}
2187	return (error);
2188}
2189
2190int
2191cfil_dispatch_closed_event(struct socket *so, int kcunit)
2192{
2193	struct cfil_entry *entry;
2194	struct cfil_msg_hdr msg_closed;
2195	errno_t error = 0;
2196	struct content_filter *cfc;
2197
2198	socket_lock_assert_owned(so);
2199
2200	cfil_rw_lock_shared(&cfil_lck_rw);
2201
2202	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2203	cfc = entry->cfe_filter;
2204	if (cfc == NULL)
2205		goto done;
2206
2207	CFIL_LOG(LOG_INFO, "so %llx kcunit %d",
2208		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
2209
2210	/* Would be wasteful to try when flow controlled */
2211	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2212		error = ENOBUFS;
2213		goto done;
2214	}
2215	/*
2216	 * Send a single closed message per filter
2217	 */
2218	if ((entry->cfe_flags & CFEF_SENT_SOCK_CLOSED) != 0)
2219		goto done;
2220	if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
2221		goto done;
2222
2223	bzero(&msg_closed, sizeof(struct cfil_msg_hdr));
2224	msg_closed.cfm_len = sizeof(struct cfil_msg_hdr);
2225	msg_closed.cfm_version = CFM_VERSION_CURRENT;
2226	msg_closed.cfm_type = CFM_TYPE_EVENT;
2227	msg_closed.cfm_op = CFM_OP_SOCKET_CLOSED;
2228	msg_closed.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
2229	error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
2230				entry->cfe_filter->cf_kcunit,
2231				&msg_closed,
2232				sizeof(struct cfil_msg_hdr),
2233				CTL_DATA_EOR);
2234	if (error != 0) {
2235		CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d",
2236			error);
2237		goto done;
2238	}
2239	microuptime(&entry->cfe_last_event);
2240	entry->cfe_flags |= CFEF_SENT_SOCK_CLOSED;
2241	OSIncrementAtomic(&cfil_stats.cfs_closed_event_ok);
2242done:
2243	/* We can recover from flow control */
2244	if (error == ENOBUFS) {
2245		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2246		OSIncrementAtomic(&cfil_stats.cfs_closed_event_flow_control);
2247
2248		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2249			cfil_rw_lock_exclusive(&cfil_lck_rw);
2250
2251		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2252
2253		cfil_rw_unlock_exclusive(&cfil_lck_rw);
2254	} else {
2255		if (error != 0)
2256			OSIncrementAtomic(&cfil_stats.cfs_closed_event_fail);
2257
2258		cfil_rw_unlock_shared(&cfil_lck_rw);
2259	}
2260
2261	return (error);
2262}
2263
2264static void
2265fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2266	struct in6_addr *ip6, u_int16_t port)
2267{
2268	struct sockaddr_in6 *sin6 = &sin46->sin6;
2269
2270	sin6->sin6_family = AF_INET6;
2271	sin6->sin6_len = sizeof(*sin6);
2272	sin6->sin6_port = port;
2273	sin6->sin6_addr = *ip6;
2274	if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) {
2275		sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]);
2276		sin6->sin6_addr.s6_addr16[1] = 0;
2277	}
2278}
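
/*
 * Illustration: for a KAME scope-embedded link-local address such as
 * fe80:0004::1, the second 16-bit word (0x0004) carries the interface
 * scope. The code above moves it into sin6_scope_id and clears it from
 * the address, so user space sees fe80::1 with sin6_scope_id == 4.
 */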
2279
2280static void
2281fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
2282	struct in_addr ip, u_int16_t port)
2283{
2284	struct sockaddr_in *sin = &sin46->sin;
2285
2286	sin->sin_family = AF_INET;
2287	sin->sin_len = sizeof(*sin);
2288	sin->sin_port = port;
2289	sin->sin_addr.s_addr = ip.s_addr;
2290}
2291
2292static int
2293cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
2294	struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
2295{
2296	errno_t error = 0;
2297	struct mbuf *copy = NULL;
2298	struct mbuf *msg = NULL;
2299	unsigned int one = 1;
2300	struct cfil_msg_data_event *data_req;
2301	size_t hdrsize;
2302	struct inpcb *inp = (struct inpcb *)so->so_pcb;
2303	struct cfil_entry *entry;
2304	struct cfe_buf *entrybuf;
2305	struct content_filter *cfc;
2306
2307	cfil_rw_lock_shared(&cfil_lck_rw);
2308
2309	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2310	if (outgoing)
2311		entrybuf = &entry->cfe_snd;
2312	else
2313		entrybuf = &entry->cfe_rcv;
2314
2315	cfc = entry->cfe_filter;
2316	if (cfc == NULL)
2317		goto done;
2318
2319	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2320		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2321
2322	socket_lock_assert_owned(so);
2323
	/* Would be wasteful to try when flow controlled */
2325	if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
2326		error = ENOBUFS;
2327		goto done;
2328	}
2329
2330	/* Make a copy of the data to pass to kernel control socket */
2331	copy = m_copym_mode(data, copyoffset, copylen, M_DONTWAIT,
2332		M_COPYM_NOOP_HDR);
2333	if (copy == NULL) {
2334		CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2335		error = ENOMEM;
2336		goto done;
2337	}
2338
2339	/* We need an mbuf packet for the message header */
2340	hdrsize = sizeof(struct cfil_msg_data_event);
2341	error = mbuf_allocpacket(MBUF_DONTWAIT, hdrsize, &one, &msg);
2342	if (error != 0) {
2343		CFIL_LOG(LOG_ERR, "mbuf_allocpacket() failed");
2344		m_freem(copy);
		/*
		 * ENOBUFS is reserved to signal flow control, so report
		 * this allocation failure as ENOMEM
		 */
2348		error = ENOMEM;
2349		goto done;
2350	}
2351	mbuf_setlen(msg, hdrsize);
2352	mbuf_pkthdr_setlen(msg, hdrsize + copylen);
2353	msg->m_next = copy;
2354	data_req = (struct cfil_msg_data_event *)mbuf_data(msg);
2355	bzero(data_req, hdrsize);
2356	data_req->cfd_msghdr.cfm_len = hdrsize + copylen;
	data_req->cfd_msghdr.cfm_version = CFM_VERSION_CURRENT;
2358	data_req->cfd_msghdr.cfm_type = CFM_TYPE_EVENT;
2359	data_req->cfd_msghdr.cfm_op =
2360		outgoing ? CFM_OP_DATA_OUT : CFM_OP_DATA_IN;
2361	data_req->cfd_msghdr.cfm_sock_id =
2362		entry->cfe_cfil_info->cfi_sock_id;
2363	data_req->cfd_start_offset = entrybuf->cfe_peeked;
2364	data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
2365
	/*
	 * TBD:
	 * For non-connected sockets we need to copy the addresses
	 * from the passed parameters
	 */
2371	if (inp->inp_vflag & INP_IPV6) {
2372		if (outgoing) {
2373			fill_ip6_sockaddr_4_6(&data_req->cfc_src,
2374				&inp->in6p_laddr, inp->inp_lport);
2375			fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
2376				&inp->in6p_faddr, inp->inp_fport);
2377		} else {
2378			fill_ip6_sockaddr_4_6(&data_req->cfc_src,
2379				&inp->in6p_faddr, inp->inp_fport);
2380			fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
2381				&inp->in6p_laddr, inp->inp_lport);
2382		}
2383	} else if (inp->inp_vflag & INP_IPV4) {
2384		if (outgoing) {
2385			fill_ip_sockaddr_4_6(&data_req->cfc_src,
2386				inp->inp_laddr, inp->inp_lport);
2387			fill_ip_sockaddr_4_6(&data_req->cfc_dst,
2388				inp->inp_faddr, inp->inp_fport);
2389		} else {
2390			fill_ip_sockaddr_4_6(&data_req->cfc_src,
2391				inp->inp_faddr, inp->inp_fport);
2392			fill_ip_sockaddr_4_6(&data_req->cfc_dst,
2393				inp->inp_laddr, inp->inp_lport);
2394		}
2395	}
2396
2397	/* Pass the message to the content filter */
2398	error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref,
2399				entry->cfe_filter->cf_kcunit,
2400				msg, CTL_DATA_EOR);
2401	if (error != 0) {
2402		CFIL_LOG(LOG_ERR, "ctl_enqueuembuf() failed: %d", error);
2403		mbuf_freem(msg);
2404		goto done;
2405	}
2406	entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED;
2407	OSIncrementAtomic(&cfil_stats.cfs_data_event_ok);
2408done:
2409	if (error == ENOBUFS) {
2410		entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
2411		OSIncrementAtomic(
2412			&cfil_stats.cfs_data_event_flow_control);
2413
2414		if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw))
2415			cfil_rw_lock_exclusive(&cfil_lck_rw);
2416
2417		cfc->cf_flags |= CFF_FLOW_CONTROLLED;
2418
2419		cfil_rw_unlock_exclusive(&cfil_lck_rw);
2420	} else {
2421		if (error != 0)
2422			OSIncrementAtomic(&cfil_stats.cfs_data_event_fail);
2423
2424		cfil_rw_unlock_shared(&cfil_lck_rw);
2425	}
2426	return (error);
2427}
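
/*
 * Illustrative only -- the kind of action a user space agent would send
 * back after inspecting the data event above, where evt is the received
 * struct cfil_msg_data_event and assuming the cfil_msg_action layout from
 * the content filter header. Passing CFM_MAX_OFFSET for a pass offset
 * means "pass everything" for that direction; an offset of 0 leaves that
 * direction unchanged.
 *
 *	struct cfil_msg_action act;
 *	bzero(&act, sizeof(act));
 *	act.cfa_msghdr.cfm_len = sizeof(act);
 *	act.cfa_msghdr.cfm_version = CFM_VERSION_CURRENT;
 *	act.cfa_msghdr.cfm_type = CFM_TYPE_ACTION;
 *	act.cfa_msghdr.cfm_op = CFM_OP_DATA_UPDATE;
 *	act.cfa_msghdr.cfm_sock_id = evt.cfd_msghdr.cfm_sock_id;
 *	act.cfa_out_pass_offset = evt.cfd_end_offset;	-- allow what we saw
 *	act.cfa_out_peek_offset = 0;
 *	act.cfa_in_pass_offset = 0;
 *	act.cfa_in_peek_offset = 0;
 *	send(fd, &act, sizeof(act), 0);
 */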
2428
/*
 * Process the queue of data waiting to be delivered to the content filter
 */
2432static int
2433cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
2434{
2435	errno_t error = 0;
2436	struct mbuf *data, *tmp = NULL;
2437	unsigned int datalen = 0, copylen = 0, copyoffset = 0;
2438	struct cfil_entry *entry;
2439	struct cfe_buf *entrybuf;
2440	uint64_t currentoffset = 0;
2441
2442	if (so->so_cfil == NULL)
2443		return (0);
2444
2445	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2446		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2447
2448	socket_lock_assert_owned(so);
2449
2450	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2451	if (outgoing)
2452		entrybuf = &entry->cfe_snd;
2453	else
2454		entrybuf = &entry->cfe_rcv;
2455
2456	/* Send attached message if not yet done */
2457	if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
2458		error = cfil_dispatch_attach_event(so, kcunit);
2459		if (error != 0) {
2460			/* We can recover from flow control */
2461			if (error == ENOBUFS || error == ENOMEM)
2462				error = 0;
2463			goto done;
2464		}
2465	} else if ((entry->cfe_flags & CFEF_DATA_START) == 0) {
2466		OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started);
2467		goto done;
2468	}
2469	CFIL_LOG(LOG_DEBUG, "pass_offset %llu peeked %llu peek_offset %llu",
2470		entrybuf->cfe_pass_offset,
2471		entrybuf->cfe_peeked,
2472		entrybuf->cfe_peek_offset);
2473
2474	/* Move all data that can pass */
2475	while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL &&
2476		entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) {
2477		datalen = cfil_data_length(data, NULL);
2478		tmp = data;
2479
2480		if (entrybuf->cfe_ctl_q.q_start + datalen <=
2481			entrybuf->cfe_pass_offset) {
2482			/*
2483			 * The first mbuf can fully pass
2484			 */
2485			copylen = datalen;
2486		} else {
2487			/*
2488			 * The first mbuf can partially pass
2489			 */
2490			copylen = entrybuf->cfe_pass_offset -
2491				entrybuf->cfe_ctl_q.q_start;
2492		}
2493		VERIFY(copylen <= datalen);
2494
2495		CFIL_LOG(LOG_DEBUG,
2496			"%llx first %llu peeked %llu pass %llu peek %llu"
2497			"datalen %u copylen %u",
2498			(uint64_t)VM_KERNEL_ADDRPERM(tmp),
2499			entrybuf->cfe_ctl_q.q_start,
2500			entrybuf->cfe_peeked,
2501			entrybuf->cfe_pass_offset,
2502			entrybuf->cfe_peek_offset,
2503			datalen, copylen);
2504
2505		/*
2506		 * Data that passes has been peeked at explicitly or
2507		 * implicitly
2508		 */
2509		if (entrybuf->cfe_ctl_q.q_start + copylen >
2510			entrybuf->cfe_peeked)
2511			entrybuf->cfe_peeked =
2512				entrybuf->cfe_ctl_q.q_start + copylen;
2513		/*
2514		 * Stop on partial pass
2515		 */
2516		if (copylen < datalen)
2517			break;
2518
2519		/* All good, move full data from ctl queue to pending queue */
2520		cfil_queue_remove(&entrybuf->cfe_ctl_q, data, datalen);
2521
2522		cfil_queue_enqueue(&entrybuf->cfe_pending_q, data, datalen);
2523		if (outgoing)
2524			OSAddAtomic64(datalen,
2525				&cfil_stats.cfs_pending_q_out_enqueued);
2526		else
2527			OSAddAtomic64(datalen,
2528				&cfil_stats.cfs_pending_q_in_enqueued);
2529	}
2530	CFIL_INFO_VERIFY(so->so_cfil);
2531	if (tmp != NULL)
2532		CFIL_LOG(LOG_DEBUG,
2533			"%llx first %llu peeked %llu pass %llu peek %llu"
2534			"datalen %u copylen %u",
2535			(uint64_t)VM_KERNEL_ADDRPERM(tmp),
2536			entrybuf->cfe_ctl_q.q_start,
2537			entrybuf->cfe_peeked,
2538			entrybuf->cfe_pass_offset,
2539			entrybuf->cfe_peek_offset,
2540			datalen, copylen);
2541	tmp = NULL;
2542
2543	/* Now deal with remaining data the filter wants to peek at */
2544	for (data = cfil_queue_first(&entrybuf->cfe_ctl_q),
2545		currentoffset = entrybuf->cfe_ctl_q.q_start;
2546		data != NULL && currentoffset < entrybuf->cfe_peek_offset;
2547		data = cfil_queue_next(&entrybuf->cfe_ctl_q, data),
2548		currentoffset += datalen) {
2549		datalen = cfil_data_length(data, NULL);
2550		tmp = data;
2551
2552		/* We've already peeked at this mbuf */
2553		if (currentoffset + datalen <= entrybuf->cfe_peeked)
2554			continue;
2555		/*
2556		 * The data in the first mbuf may have been
2557		 * partially peeked at
2558		 */
2559		copyoffset = entrybuf->cfe_peeked - currentoffset;
2560		VERIFY(copyoffset < datalen);
2561		copylen = datalen - copyoffset;
2562		VERIFY(copylen <= datalen);
2563		/*
2564		 * Do not copy more than needed
2565		 */
2566		if (currentoffset + copyoffset + copylen >
2567			entrybuf->cfe_peek_offset) {
2568			copylen = entrybuf->cfe_peek_offset -
2569				(currentoffset + copyoffset);
2570		}
2571
2572		CFIL_LOG(LOG_DEBUG,
2573			"%llx current %llu peeked %llu pass %llu peek %llu"
2574			"datalen %u copylen %u copyoffset %u",
2575			(uint64_t)VM_KERNEL_ADDRPERM(tmp),
2576			currentoffset,
2577			entrybuf->cfe_peeked,
2578			entrybuf->cfe_pass_offset,
2579			entrybuf->cfe_peek_offset,
2580			datalen, copylen, copyoffset);
2581
2582		/*
2583		 * Stop if there is nothing more to peek at
2584		 */
2585		if (copylen == 0)
2586			break;
2587		/*
2588		 * Let the filter get a peek at this span of data
2589		 */
2590		error = cfil_dispatch_data_event(so, kcunit,
2591			outgoing, data, copyoffset, copylen);
2592		if (error != 0) {
2593			/* On error, leave data in ctl_q */
2594			break;
2595		}
2596		entrybuf->cfe_peeked += copylen;
2597		if (outgoing)
2598			OSAddAtomic64(copylen,
2599				&cfil_stats.cfs_ctl_q_out_peeked);
2600		else
2601			OSAddAtomic64(copylen,
2602				&cfil_stats.cfs_ctl_q_in_peeked);
2603
2604		/* Stop when data could not be fully peeked at */
2605		if (copylen + copyoffset < datalen)
2606			break;
2607	}
2608	CFIL_INFO_VERIFY(so->so_cfil);
2609	if (tmp != NULL)
2610		CFIL_LOG(LOG_DEBUG,
2611			"%llx first %llu peeked %llu pass %llu peek %llu"
2612			"datalen %u copylen %u copyoffset %u",
2613			(uint64_t)VM_KERNEL_ADDRPERM(tmp),
2614			currentoffset,
2615			entrybuf->cfe_peeked,
2616			entrybuf->cfe_pass_offset,
2617			entrybuf->cfe_peek_offset,
2618			datalen, copylen, copyoffset);
2619
2620	/*
2621	 * Process data that has passed the filter
2622	 */
2623	error = cfil_service_pending_queue(so, kcunit, outgoing);
2624	if (error != 0) {
2625		CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d",
2626			error);
2627		goto done;
2628	}
2629
2630	/*
2631	 * Dispatch disconnect events that could not be sent
2632	 */
2633	if (so->so_cfil == NULL)
2634		goto done;
2635	else if (outgoing) {
2636		if ((so->so_cfil->cfi_flags & CFIF_SHUT_WR) &&
2637		    !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT))
2638			cfil_dispatch_disconnect_event(so, kcunit, 1);
2639	} else {
2640		if ((so->so_cfil->cfi_flags & CFIF_SHUT_RD) &&
2641		    !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))
2642			cfil_dispatch_disconnect_event(so, kcunit, 0);
2643	}
2644
2645done:
2646	CFIL_LOG(LOG_DEBUG,
2647		"first %llu peeked %llu pass %llu peek %llu",
2648		entrybuf->cfe_ctl_q.q_start,
2649		entrybuf->cfe_peeked,
2650		entrybuf->cfe_pass_offset,
2651		entrybuf->cfe_peek_offset);
2652
2653	CFIL_INFO_VERIFY(so->so_cfil);
2654	return (error);
2655}
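
/*
 * Worked example with illustrative numbers: assume the ctl queue starts
 * at offset 1000 and holds two 500 byte mbufs, and the filter has set
 * cfe_pass_offset = 1500 and cfe_peek_offset = 1800. The first mbuf
 * (offsets 1000..1500) passes in full and moves to the pending queue,
 * with cfe_peeked advancing to 1500. From the second mbuf only bytes
 * 1500..1800 are copied to the filter as a data event; the rest stays
 * in the ctl queue until the filter raises its offsets again.
 */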
2656
2657/*
2658 * cfil_data_filter()
2659 *
2660 * Process data for a content filter installed on a socket
2661 */
2662int
2663cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
2664	struct mbuf *data, uint64_t datalen)
2665{
2666	errno_t error = 0;
2667	struct cfil_entry *entry;
2668	struct cfe_buf *entrybuf;
2669
2670	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2671		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2672
2673	socket_lock_assert_owned(so);
2674
2675	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2676	if (outgoing)
2677		entrybuf = &entry->cfe_snd;
2678	else
2679		entrybuf = &entry->cfe_rcv;
2680
2681	/* Are we attached to the filter? */
2682	if (entry->cfe_filter == NULL) {
2683		error = 0;
2684		goto done;
2685	}
2686
2687	/* Dispatch to filters */
2688	cfil_queue_enqueue(&entrybuf->cfe_ctl_q, data, datalen);
2689	if (outgoing)
2690		OSAddAtomic64(datalen,
2691			&cfil_stats.cfs_ctl_q_out_enqueued);
2692	else
2693		OSAddAtomic64(datalen,
2694			&cfil_stats.cfs_ctl_q_in_enqueued);
2695
2696	error = cfil_data_service_ctl_q(so, kcunit, outgoing);
2697	if (error != 0) {
2698		CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
2699			error);
2700	}
	/*
	 * We have to return EJUSTRETURN in all cases to avoid a double free
	 * by the socket layer
	 */
2705	error = EJUSTRETURN;
2706done:
2707	CFIL_INFO_VERIFY(so->so_cfil);
2708
2709	CFIL_LOG(LOG_INFO, "return %d", error);
2710	return (error);
2711}
2712
/*
 * cfil_service_inject_queue() re-injects data that passed the
 * content filters
 */
2717static int
2718cfil_service_inject_queue(struct socket *so, int outgoing)
2719{
2720	mbuf_t data;
2721	unsigned int datalen;
2722	int mbcnt;
2723	unsigned int copylen;
2724	errno_t error = 0;
2725	struct mbuf *copy = NULL;
2726	struct cfi_buf *cfi_buf;
2727	struct cfil_queue *inject_q;
2728	int need_rwakeup = 0;
2729
2730	if (so->so_cfil == NULL)
2731		return (0);
2732
2733	CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
2734		(uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
2735
2736	socket_lock_assert_owned(so);
2737
2738	if (outgoing) {
2739		cfi_buf = &so->so_cfil->cfi_snd;
2740		so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
2741	} else {
2742		cfi_buf = &so->so_cfil->cfi_rcv;
2743		so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
2744	}
2745	inject_q = &cfi_buf->cfi_inject_q;
2746
2747	while ((data = cfil_queue_first(inject_q)) != NULL) {
2748		datalen = cfil_data_length(data, &mbcnt);
2749
2750		CFIL_LOG(LOG_INFO, "data %llx datalen %u",
2751			(uint64_t)VM_KERNEL_ADDRPERM(data), datalen);
2752
2753		/* Make a copy in case of injection error */
2754		copy = m_copym_mode(data, 0, M_COPYALL, M_DONTWAIT,
2755			M_COPYM_COPY_HDR);
2756		if (copy == NULL) {
2757			CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
2758			error = ENOMEM;
2759			break;
2760		}
2761
2762		if ((copylen = m_length(copy)) != datalen)
2763			panic("%s so %p copylen %d != datalen %d",
2764				__func__, so, copylen, datalen);
2765
2766		if (outgoing) {
2767			socket_unlock(so, 0);
2768
			/*
			 * Set both DONTWAIT and NBIO flags as we really
			 * do not want to block
			 */
2773			error = sosend(so, NULL, NULL,
2774					copy, NULL,
2775					MSG_SKIPCFIL | MSG_DONTWAIT | MSG_NBIO);
2776
2777			socket_lock(so, 0);
2778
2779			if (error != 0) {
2780				CFIL_LOG(LOG_ERR, "sosend() failed %d",
2781					error);
2782			}
2783		} else {
2784			copy->m_flags |= M_SKIPCFIL;
2785
			/*
			 * NOTE:
			 * This works only because we support plain TCP.
			 * For UDP, RAWIP, MPTCP and message TCP we'll
			 * need to call the appropriate sbappendxxx()
			 * or fix sock_inject_data_in()
			 */
2793			if (sbappendstream(&so->so_rcv, copy))
2794				need_rwakeup = 1;
2795		}
2796
2797		/* Need to reassess if filter is still attached after unlock */
2798		if (so->so_cfil == NULL) {
2799			CFIL_LOG(LOG_ERR, "so %llx cfil detached",
2800				(uint64_t)VM_KERNEL_ADDRPERM(so));
2801			OSIncrementAtomic(&cfil_stats.cfs_inject_q_detached);
2802			error = 0;
2803			break;
2804		}
2805		if (error != 0)
2806			break;
2807
2808		/* Injection successful */
2809		cfil_queue_remove(inject_q, data, datalen);
2810		mbuf_freem(data);
2811
2812		cfi_buf->cfi_pending_first += datalen;
2813		cfi_buf->cfi_pending_mbcnt -= mbcnt;
2814		cfil_info_buf_verify(cfi_buf);
2815
2816		if (outgoing)
2817			OSAddAtomic64(datalen,
2818				&cfil_stats.cfs_inject_q_out_passed);
2819		else
2820			OSAddAtomic64(datalen,
2821				&cfil_stats.cfs_inject_q_in_passed);
2822	}
2823
	/* A single wakeup for several packets is more efficient */
2825	if (need_rwakeup)
2826		sorwakeup(so);
2827
2828	if (error != 0 && so->so_cfil) {
2829		if (error == ENOBUFS)
2830			OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs);
2831		if (error == ENOMEM)
2832			OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem);
2833
2834		if (outgoing) {
2835			so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_OUT;
2836			OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail);
2837		} else {
2838			so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_IN;
2839			OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail);
2840		}
2841	}
2842
2843	/*
2844	 * Notify
2845	 */
2846	if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_SHUT_WR)) {
2847		cfil_sock_notify_shutdown(so, SHUT_WR);
2848		if (cfil_sock_data_pending(&so->so_snd) == 0)
2849			soshutdownlock_final(so, SHUT_WR);
2850	}
2851	if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
2852		if (cfil_filters_attached(so) == 0) {
2853			CFIL_LOG(LOG_INFO, "so %llx waking",
2854				(uint64_t)VM_KERNEL_ADDRPERM(so));
2855			wakeup((caddr_t)&so->so_cfil);
2856		}
2857	}
2858
2859	CFIL_INFO_VERIFY(so->so_cfil);
2860
2861	return (error);
2862}
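
/*
 * Design note: re-injected outgoing data goes back through sosend() with
 * MSG_SKIPCFIL so that it bypasses the content filter hooks instead of
 * being queued a second time; incoming data is appended directly to the
 * receive buffer with M_SKIPCFIL set for the same reason.
 */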
2863
2864static int
2865cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
2866{
2867	uint64_t passlen, curlen;
2868	mbuf_t data;
2869	unsigned int datalen;
2870	errno_t error = 0;
2871	struct cfil_entry *entry;
2872	struct cfe_buf *entrybuf;
2873	struct cfil_queue *pending_q;
2874
2875	CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
2876		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
2877
2878	socket_lock_assert_owned(so);
2879
2880	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2881	if (outgoing)
2882		entrybuf = &entry->cfe_snd;
2883	else
2884		entrybuf = &entry->cfe_rcv;
2885
2886	pending_q = &entrybuf->cfe_pending_q;
2887
2888	passlen = entrybuf->cfe_pass_offset - pending_q->q_start;
2889
	/*
	 * Locate the chunks of data that we can pass to the next filter.
	 * A data chunk must be on mbuf boundaries
	 */
2894	curlen = 0;
2895	while ((data = cfil_queue_first(pending_q)) != NULL) {
2896		datalen = cfil_data_length(data, NULL);
2897
2898		CFIL_LOG(LOG_INFO,
2899			"data %llx datalen %u passlen %llu curlen %llu",
2900			(uint64_t)VM_KERNEL_ADDRPERM(data), datalen,
2901			passlen, curlen);
2902
2903		if (curlen + datalen > passlen)
2904			break;
2905
2906		cfil_queue_remove(pending_q, data, datalen);
2907
2908		curlen += datalen;
2909
2910		for (kcunit += 1;
2911			kcunit <= MAX_CONTENT_FILTER;
2912			kcunit++) {
2913			error = cfil_data_filter(so, kcunit, outgoing,
2914				data, datalen);
2915			/* 0 means passed so we can continue */
2916			if (error != 0)
2917				break;
2918		}
2919		/* When data has passed all filters, re-inject */
2920		if (error == 0) {
2921			if (outgoing) {
2922				cfil_queue_enqueue(
2923					&so->so_cfil->cfi_snd.cfi_inject_q,
2924					data, datalen);
2925				OSAddAtomic64(datalen,
2926					&cfil_stats.cfs_inject_q_out_enqueued);
2927			} else {
2928				cfil_queue_enqueue(
2929					&so->so_cfil->cfi_rcv.cfi_inject_q,
2930					data, datalen);
2931				OSAddAtomic64(datalen,
2932					&cfil_stats.cfs_inject_q_in_enqueued);
2933			}
2934		}
2935	}
2936
2937	CFIL_INFO_VERIFY(so->so_cfil);
2938
2939	return (error);
2940}
2941
2942int
2943cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing,
2944	uint64_t pass_offset, uint64_t peek_offset)
2945{
2946	errno_t error = 0;
2947	struct cfil_entry *entry;
2948	struct cfe_buf *entrybuf;
2949	int updated = 0;
2950
2951	CFIL_LOG(LOG_INFO, "pass %llu peek %llu", pass_offset, peek_offset);
2952
2953	socket_lock_assert_owned(so);
2954
2955	if (so->so_cfil == NULL) {
2956		CFIL_LOG(LOG_ERR, "so %llx cfil detached",
2957			(uint64_t)VM_KERNEL_ADDRPERM(so));
2958		error = 0;
2959		goto done;
2960	} else if (so->so_cfil->cfi_flags & CFIF_DROP) {
2961		CFIL_LOG(LOG_ERR, "so %llx drop set",
2962			(uint64_t)VM_KERNEL_ADDRPERM(so));
2963		error = EPIPE;
2964		goto done;
2965	}
2966
2967	entry = &so->so_cfil->cfi_entries[kcunit - 1];
2968	if (outgoing)
2969		entrybuf = &entry->cfe_snd;
2970	else
2971		entrybuf = &entry->cfe_rcv;
2972
2973	/* Record updated offsets for this content filter */
2974	if (pass_offset > entrybuf->cfe_pass_offset) {
2975		entrybuf->cfe_pass_offset = pass_offset;
2976
2977		if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset)
2978			entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
2979		updated = 1;
2980	} else {
2981		CFIL_LOG(LOG_INFO, "pass_offset %llu <= cfe_pass_offset %llu",
2982			pass_offset, entrybuf->cfe_pass_offset);
2983	}
2984	/* Filter does not want or need to see data that's allowed to pass */
2985	if (peek_offset > entrybuf->cfe_pass_offset &&
2986		peek_offset > entrybuf->cfe_peek_offset) {
2987		entrybuf->cfe_peek_offset = peek_offset;
2988		updated = 1;
2989	}
2990	/* Nothing to do */
2991	if (updated == 0)
2992		goto done;
2993
2994	/* Move data held in control queue to pending queue if needed */
2995	error = cfil_data_service_ctl_q(so, kcunit, outgoing);
2996	if (error != 0) {
2997		CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
2998			error);
2999		goto done;
3000	}
3001	error = EJUSTRETURN;
3002
3003done:
	/*
	 * The filter is effectively detached when it has passed everything
	 * from both sides (both pass offsets at CFM_MAX_OFFSET) or when the
	 * socket is closed and no more data is waiting to be delivered to
	 * the filter
	 */
3009	if (so->so_cfil != NULL &&
3010	    ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET &&
3011	    entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) ||
3012	    ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
3013	    cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) &&
3014	    cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) {
3015		entry->cfe_flags |= CFEF_CFIL_DETACHED;
3016		CFIL_LOG(LOG_INFO, "so %llx detached %u",
3017			(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3018		if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
3019		    cfil_filters_attached(so) == 0) {
3020			CFIL_LOG(LOG_INFO, "so %llx waking",
3021				(uint64_t)VM_KERNEL_ADDRPERM(so));
3022			wakeup((caddr_t)&so->so_cfil);
3023		}
3024	}
3025	CFIL_INFO_VERIFY(so->so_cfil);
3026	CFIL_LOG(LOG_INFO, "return %d", error);
3027	return (error);
3028}
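
/*
 * Note for filter agents (illustrative, assuming the cfil_msg_action
 * layout from the content filter header): a filter that is done with a
 * flow can effectively detach from it by sending a CFM_OP_DATA_UPDATE
 * action with both cfa_out_pass_offset and cfa_in_pass_offset set to
 * CFM_MAX_OFFSET; once both directions reach CFM_MAX_OFFSET the entry
 * is marked CFEF_CFIL_DETACHED above and any close waiter is woken up.
 */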
3029
3030/*
3031 * Update pass offset for socket when no data is pending
3032 */
3033static int
3034cfil_set_socket_pass_offset(struct socket *so, int outgoing)
3035{
3036	struct cfi_buf *cfi_buf;
3037	struct cfil_entry *entry;
3038	struct cfe_buf *entrybuf;
3039	uint32_t kcunit;
3040	uint64_t pass_offset = 0;
3041
3042	if (so->so_cfil == NULL)
3043		return (0);
3044
3045	CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
3046		(uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3047
3048	socket_lock_assert_owned(so);
3049
3050	if (outgoing)
3051		cfi_buf = &so->so_cfil->cfi_snd;
3052	else
3053		cfi_buf = &so->so_cfil->cfi_rcv;
3054
3055	if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) {
3056		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3057			entry = &so->so_cfil->cfi_entries[kcunit - 1];
3058
3059			/* Are we attached to a filter? */
3060			if (entry->cfe_filter == NULL)
3061				continue;
3062
3063			if (outgoing)
3064				entrybuf = &entry->cfe_snd;
3065			else
3066				entrybuf = &entry->cfe_rcv;
3067
3068			if (pass_offset == 0 ||
3069			    entrybuf->cfe_pass_offset < pass_offset)
3070				pass_offset = entrybuf->cfe_pass_offset;
3071		}
3072		cfi_buf->cfi_pass_offset = pass_offset;
3073	}
3074
3075	return (0);
3076}
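
/*
 * Design note: the per-socket cfi_pass_offset computed above is the
 * minimum of the attached filters' pass offsets, which lets the fast
 * path in cfil_data_common() skip per-filter processing for data that
 * every filter has already agreed to pass.
 */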
3077
3078int
3079cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing,
3080	uint64_t pass_offset, uint64_t peek_offset)
3081{
3082	errno_t error = 0;
3083
3084	CFIL_LOG(LOG_INFO, "");
3085
3086	socket_lock_assert_owned(so);
3087
3088	error = cfil_acquire_sockbuf(so, outgoing);
3089	if (error != 0) {
3090		CFIL_LOG(LOG_INFO, "so %llx %s dropped",
3091			(uint64_t)VM_KERNEL_ADDRPERM(so),
3092			outgoing ? "out" : "in");
3093		goto release;
3094	}
3095
3096	error = cfil_update_data_offsets(so, kcunit, outgoing,
3097		pass_offset, peek_offset);
3098
3099	cfil_service_inject_queue(so, outgoing);
3100
3101	cfil_set_socket_pass_offset(so, outgoing);
3102release:
3103	CFIL_INFO_VERIFY(so->so_cfil);
3104	cfil_release_sockbuf(so, outgoing);
3105
3106	return (error);
3107}
3108
3109
3110static void
3111cfil_flush_queues(struct socket *so)
3112{
3113	struct cfil_entry *entry;
3114	int kcunit;
3115	uint64_t drained;
3116
3117	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3118		goto done;
3119
3120	socket_lock_assert_owned(so);
3121
3122	/*
3123	 * Flush the output queues and ignore errors as long as
3124	 * we are attached
3125	 */
3126	(void) cfil_acquire_sockbuf(so, 1);
3127	if (so->so_cfil != NULL) {
3128		drained = 0;
3129		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3130			entry = &so->so_cfil->cfi_entries[kcunit - 1];
3131
3132			drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
3133			drained += cfil_queue_drain(
3134			    &entry->cfe_snd.cfe_pending_q);
3135		}
3136		drained += cfil_queue_drain(&so->so_cfil->cfi_snd.cfi_inject_q);
3137		if (drained) {
3138			if (so->so_cfil->cfi_flags & CFIF_DROP)
3139				OSIncrementAtomic(
3140					&cfil_stats.cfs_flush_out_drop);
3141			else
3142				OSIncrementAtomic(
3143					&cfil_stats.cfs_flush_out_close);
3144		}
3145	}
3146	cfil_release_sockbuf(so, 1);
3147
3148	/*
3149	 * Flush the input queues
3150	 */
3151	(void) cfil_acquire_sockbuf(so, 0);
3152	if (so->so_cfil != NULL) {
3153		drained = 0;
3154		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3155			entry = &so->so_cfil->cfi_entries[kcunit - 1];
3156
			drained += cfil_queue_drain(&entry->cfe_rcv.cfe_ctl_q);
			drained += cfil_queue_drain(
			    &entry->cfe_rcv.cfe_pending_q);
3161		}
3162		drained += cfil_queue_drain(&so->so_cfil->cfi_rcv.cfi_inject_q);
3163		if (drained) {
3164			if (so->so_cfil->cfi_flags & CFIF_DROP)
3165				OSIncrementAtomic(
3166					&cfil_stats.cfs_flush_in_drop);
3167			else
3168				OSIncrementAtomic(
3169					&cfil_stats.cfs_flush_in_close);
3170		}
3171	}
3172	cfil_release_sockbuf(so, 0);
3173done:
3174	CFIL_INFO_VERIFY(so->so_cfil);
3175}
3176
3177int
3178cfil_action_drop(struct socket *so, uint32_t kcunit)
3179{
3180	errno_t error = 0;
3181	struct cfil_entry *entry;
3182	struct proc *p;
3183
3184	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3185		goto done;
3186
3187	socket_lock_assert_owned(so);
3188
3189	entry = &so->so_cfil->cfi_entries[kcunit - 1];
3190
3191	/* Are we attached to the filter? */
3192	if (entry->cfe_filter == NULL)
3193		goto done;
3194
3195	so->so_cfil->cfi_flags |= CFIF_DROP;
3196
3197	p = current_proc();
3198
3199	/* Force the socket to be marked defunct */
3200	error = sosetdefunct(p, so,
3201		SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, 1);
3202
3203	/* Flush the socket buffer and disconnect */
3204	if (error == 0)
3205		error = sodefunct(p, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
3206
3207	/* The filter is done, mark as detached */
3208	entry->cfe_flags |= CFEF_CFIL_DETACHED;
3209	CFIL_LOG(LOG_INFO, "so %llx detached %u",
3210		(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
3211
3212	/* Pending data needs to go */
3213	cfil_flush_queues(so);
3214
3215	if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
3216		if (cfil_filters_attached(so) == 0) {
3217			CFIL_LOG(LOG_INFO, "so %llx waking",
3218				(uint64_t)VM_KERNEL_ADDRPERM(so));
3219			wakeup((caddr_t)&so->so_cfil);
3220		}
3221	}
3222done:
3223	return (error);
3224}
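
/*
 * Illustration: the drop path above is reached when the agent sends an
 * action message whose header carries cfm_op == CFM_OP_DROP and the
 * cfm_sock_id of the flow to be dropped; the exact message layout is
 * defined in the content filter header.
 */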
3225
3226static int
3227cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen)
3228{
3229	struct cfil_entry *entry;
3230	struct cfe_buf *entrybuf;
3231	uint32_t kcunit;
3232
3233	CFIL_LOG(LOG_INFO, "so %llx outgoing %d datalen %u",
3234		(uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen);
3235
3236	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3237		entry = &so->so_cfil->cfi_entries[kcunit - 1];
3238
3239		/* Are we attached to the filter? */
3240		if (entry->cfe_filter == NULL)
3241			continue;
3242
3243		if (outgoing)
3244			entrybuf = &entry->cfe_snd;
3245		else
3246			entrybuf = &entry->cfe_rcv;
3247
3248		entrybuf->cfe_ctl_q.q_start += datalen;
3249		entrybuf->cfe_pass_offset = entrybuf->cfe_ctl_q.q_start;
3250		entrybuf->cfe_peeked = entrybuf->cfe_ctl_q.q_start;
3251		if (entrybuf->cfe_peek_offset < entrybuf->cfe_pass_offset)
3252			entrybuf->cfe_peek_offset = entrybuf->cfe_pass_offset;
3253
3254		entrybuf->cfe_ctl_q.q_end += datalen;
3255
3256		entrybuf->cfe_pending_q.q_start += datalen;
3257		entrybuf->cfe_pending_q.q_end += datalen;
3258	}
3259	CFIL_INFO_VERIFY(so->so_cfil);
3260	return (0);
3261}
3262
3263int
3264cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to,
3265		struct mbuf *data, struct mbuf *control, uint32_t flags)
3266{
3267#pragma unused(to, control, flags)
3268	errno_t error = 0;
3269	unsigned int datalen;
3270	int mbcnt;
3271	int kcunit;
3272	struct cfi_buf *cfi_buf;
3273
3274	if (so->so_cfil == NULL) {
3275		CFIL_LOG(LOG_ERR, "so %llx cfil detached",
3276			(uint64_t)VM_KERNEL_ADDRPERM(so));
3277		error = 0;
3278		goto done;
3279	} else if (so->so_cfil->cfi_flags & CFIF_DROP) {
3280		CFIL_LOG(LOG_ERR, "so %llx drop set",
3281			(uint64_t)VM_KERNEL_ADDRPERM(so));
3282		error = EPIPE;
3283		goto done;
3284	}
3285
3286	datalen = cfil_data_length(data, &mbcnt);
3287
3288	CFIL_LOG(LOG_INFO, "so %llx %s m %llx len %u flags 0x%x nextpkt %llx",
3289		(uint64_t)VM_KERNEL_ADDRPERM(so),
3290		outgoing ? "out" : "in",
3291		(uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
3292		(uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt));
3293
3294	if (outgoing)
3295		cfi_buf = &so->so_cfil->cfi_snd;
3296	else
3297		cfi_buf = &so->so_cfil->cfi_rcv;
3298
3299	cfi_buf->cfi_pending_last += datalen;
3300	cfi_buf->cfi_pending_mbcnt += mbcnt;
3301	cfil_info_buf_verify(cfi_buf);
3302
3303	CFIL_LOG(LOG_INFO, "so %llx cfi_pending_last %llu cfi_pass_offset %llu",
3304		(uint64_t)VM_KERNEL_ADDRPERM(so),
3305		cfi_buf->cfi_pending_last,
3306		cfi_buf->cfi_pass_offset);
3307
3308	/* Fast path when below pass offset */
3309	if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) {
3310		cfil_update_entry_offsets(so, outgoing, datalen);
3311	} else {
3312		for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3313			error = cfil_data_filter(so, kcunit, outgoing, data,
3314				datalen);
3315			/* 0 means passed so continue with next filter */
3316			if (error != 0)
3317				break;
3318		}
3319	}
3320
3321	/* Move cursor if no filter claimed the data */
3322	if (error == 0) {
3323		cfi_buf->cfi_pending_first += datalen;
3324		cfi_buf->cfi_pending_mbcnt -= mbcnt;
3325		cfil_info_buf_verify(cfi_buf);
3326	}
3327done:
3328	CFIL_INFO_VERIFY(so->so_cfil);
3329
3330	return (error);
3331}
3332
3333/*
3334 * Callback from socket layer sosendxxx()
3335 */
3336int
3337cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
3338		struct mbuf *data, struct mbuf *control, uint32_t flags)
3339{
3340	int error = 0;
3341
3342	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3343		return (0);
3344
3345	socket_lock_assert_owned(so);
3346
3347	if (so->so_cfil->cfi_flags & CFIF_DROP) {
3348		CFIL_LOG(LOG_ERR, "so %llx drop set",
3349			(uint64_t)VM_KERNEL_ADDRPERM(so));
3350		return (EPIPE);
3351	}
3352	if (control != NULL) {
3353		CFIL_LOG(LOG_ERR, "so %llx control",
3354			(uint64_t)VM_KERNEL_ADDRPERM(so));
3355		OSIncrementAtomic(&cfil_stats.cfs_data_out_control);
3356	}
3357	if ((flags & MSG_OOB)) {
3358		CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
3359			(uint64_t)VM_KERNEL_ADDRPERM(so));
3360		OSIncrementAtomic(&cfil_stats.cfs_data_out_oob);
3361	}
3362	if ((so->so_snd.sb_flags & SB_LOCK) == 0)
3363		panic("so %p SB_LOCK not set", so);
3364
3365	if (so->so_snd.sb_cfil_thread != NULL)
3366		panic("%s sb_cfil_thread %p not NULL", __func__,
3367			so->so_snd.sb_cfil_thread);
3368
3369	error = cfil_data_common(so, 1, to, data, control, flags);
3370
3371	return (error);
3372}
3373
3374/*
3375 * Callback from socket layer sbappendxxx()
3376 */
3377int
3378cfil_sock_data_in(struct socket *so, struct sockaddr *from,
3379	struct mbuf *data, struct mbuf *control, uint32_t flags)
3380{
3381	int error = 0;
3382
3383	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3384		return (0);
3385
3386	socket_lock_assert_owned(so);
3387
3388	if (so->so_cfil->cfi_flags & CFIF_DROP) {
3389		CFIL_LOG(LOG_ERR, "so %llx drop set",
3390			(uint64_t)VM_KERNEL_ADDRPERM(so));
3391		return (EPIPE);
3392	}
3393	if (control != NULL) {
3394		CFIL_LOG(LOG_ERR, "so %llx control",
3395			(uint64_t)VM_KERNEL_ADDRPERM(so));
3396		OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
3397	}
3398	if (data->m_type == MT_OOBDATA) {
3399		CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
3400			(uint64_t)VM_KERNEL_ADDRPERM(so));
3401		OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
3402	}
3403	error = cfil_data_common(so, 0, from, data, control, flags);
3404
3405	return (error);
3406}
3407
/*
 * Callback from socket layer soshutdownxxx()
 *
 * We may delay the shutdown write if there is outgoing data still being
 * processed by the content filters.
 *
 * There is no point in delaying the shutdown read because the process
 * indicated that it does not want to read any more data.
 */
3416int
3417cfil_sock_shutdown(struct socket *so, int *how)
3418{
3419	int error = 0;
3420
3421	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3422		goto done;
3423
3424	socket_lock_assert_owned(so);
3425
3426	CFIL_LOG(LOG_INFO, "so %llx how %d",
3427		(uint64_t)VM_KERNEL_ADDRPERM(so), *how);
3428
3429	/*
3430	 * Check the state of the socket before the content filter
3431	 */
3432	if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
3433		/* read already shut down */
3434		error = ENOTCONN;
3435		goto done;
3436	}
3437	if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
3438		/* write already shut down */
3439		error = ENOTCONN;
3440		goto done;
3441	}
3442
3443	if ((so->so_cfil->cfi_flags & CFIF_DROP) != 0) {
3444		CFIL_LOG(LOG_ERR, "so %llx drop set",
3445			(uint64_t)VM_KERNEL_ADDRPERM(so));
3446		goto done;
3447	}
3448
3449	/*
3450	 * shutdown read: SHUT_RD or SHUT_RDWR
3451	 */
3452	if (*how != SHUT_WR) {
3453		if (so->so_cfil->cfi_flags & CFIF_SHUT_RD) {
3454			error = ENOTCONN;
3455			goto done;
3456		}
3457		so->so_cfil->cfi_flags |= CFIF_SHUT_RD;
3458		cfil_sock_notify_shutdown(so, SHUT_RD);
3459	}
3460	/*
3461	 * shutdown write: SHUT_WR or SHUT_RDWR
3462	 */
3463	if (*how != SHUT_RD) {
3464		if (so->so_cfil->cfi_flags & CFIF_SHUT_WR) {
3465			error = ENOTCONN;
3466			goto done;
3467		}
3468		so->so_cfil->cfi_flags |= CFIF_SHUT_WR;
3469		cfil_sock_notify_shutdown(so, SHUT_WR);
3470		/*
3471		 * When outgoing data is pending, we delay the shutdown at the
3472		 * protocol level until the content filters give the final
3473		 * verdict on the pending data.
3474		 */
3475		if (cfil_sock_data_pending(&so->so_snd) != 0) {
3476			/*
3477			 * When shutting down the read and write sides at once
3478			 * we can proceed to the final shutdown of the read
3479			 * side. Otherwise, we just return.
3480			 */
3481			if (*how == SHUT_WR) {
3482				error = EJUSTRETURN;
3483			} else if (*how == SHUT_RDWR) {
3484				*how = SHUT_RD;
3485			}
3486		}
3487	}
3488done:
3489	return (error);
3490}
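
/*
 * Worked example: an application calls shutdown(fd, SHUT_RDWR) while
 * some outgoing bytes still await a verdict. Both CFIF_SHUT_RD and
 * CFIF_SHUT_WR get set and the filters are notified, but *how is
 * downgraded to SHUT_RD so only the read side is shut down at the
 * protocol level for now; the write side is completed later from
 * cfil_service_inject_queue() once cfil_sock_data_pending() drops
 * to zero.
 */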
3491
3492/*
3493 * This is called when the socket is closed and there is no more
3494 * opportunity for filtering
3495 */
3496void
3497cfil_sock_is_closed(struct socket *so)
3498{
3499	errno_t error = 0;
3500	int kcunit;
3501
3502	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3503		return;
3504
3505	CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
3506
3507	socket_lock_assert_owned(so);
3508
3509	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3510		/* Let the filters know of the closing */
3511		error = cfil_dispatch_closed_event(so, kcunit);
3512	}
3513
3514	/* Last chance to push passed data out */
3515	error = cfil_acquire_sockbuf(so, 1);
3516	if (error == 0)
3517		cfil_service_inject_queue(so, 1);
3518	cfil_release_sockbuf(so, 1);
3519
3520	so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED;
3521
3522	/* Pending data needs to go */
3523	cfil_flush_queues(so);
3524
3525	CFIL_INFO_VERIFY(so->so_cfil);
3526}
3527
/*
 * This is called when the socket is disconnected so let the filters
 * know about the disconnection and that no more data will come
 *
 * The how parameter has the same values as soshutdown()
 */
3533 */
3534void
3535cfil_sock_notify_shutdown(struct socket *so, int how)
3536{
3537	errno_t error = 0;
3538	int kcunit;
3539
3540	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3541		return;
3542
3543	CFIL_LOG(LOG_INFO, "so %llx how %d",
3544		(uint64_t)VM_KERNEL_ADDRPERM(so), how);
3545
3546	socket_lock_assert_owned(so);
3547
3548	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3549		/* Disconnect incoming side */
3550		if (how != SHUT_WR)
3551			error = cfil_dispatch_disconnect_event(so, kcunit, 0);
3552		/* Disconnect outgoing side */
3553		if (how != SHUT_RD)
3554			error = cfil_dispatch_disconnect_event(so, kcunit, 1);
3555	}
3556}
3557
3558static int
3559cfil_filters_attached(struct socket *so)
3560{
3561	struct cfil_entry *entry;
3562	uint32_t kcunit;
3563	int attached = 0;
3564
3565	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3566		return (0);
3567
3568	socket_lock_assert_owned(so);
3569
3570	for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
3571		entry = &so->so_cfil->cfi_entries[kcunit - 1];
3572
3573		/* Are we attached to the filter? */
3574		if (entry->cfe_filter == NULL)
3575			continue;
3576		if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
3577			continue;
3578		if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0)
3579			continue;
3580		attached = 1;
3581		break;
3582	}
3583
3584	return (attached);
3585}
3586
/*
 * This is called when the socket is closed and we are waiting for
 * the filters to give the final pass or drop
 */
3591void
3592cfil_sock_close_wait(struct socket *so)
3593{
3594	lck_mtx_t *mutex_held;
3595	struct timespec ts;
3596	int error;
3597
3598	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3599		return;
3600
3601	CFIL_LOG(LOG_INFO, "so %llx", (uint64_t)VM_KERNEL_ADDRPERM(so));
3602
3603	if (so->so_proto->pr_getlock != NULL)
3604		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3605	else
3606		mutex_held = so->so_proto->pr_domain->dom_mtx;
3607	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3608
3609	while (cfil_filters_attached(so)) {
3610		/*
3611		 * Notify the filters we are going away so they can detach
3612		 */
3613		cfil_sock_notify_shutdown(so, SHUT_RDWR);
3614
		/*
		 * Make sure we still need to wait after the filters are
		 * notified of the disconnection
		 */
3619		if (cfil_filters_attached(so) == 0)
3620			break;
3621
3622		CFIL_LOG(LOG_INFO, "so %llx waiting",
3623			(uint64_t)VM_KERNEL_ADDRPERM(so));
3624
3625		ts.tv_sec = cfil_close_wait_timeout / 1000;
3626		ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
3627			NSEC_PER_USEC * 1000;
3628
3629		OSIncrementAtomic(&cfil_stats.cfs_close_wait);
3630		so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT;
3631		error = msleep((caddr_t)&so->so_cfil, mutex_held,
3632			PSOCK | PCATCH, "cfil_sock_close_wait", &ts);
3633		so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT;
3634
3635		CFIL_LOG(LOG_NOTICE, "so %llx timed out %d",
3636			(uint64_t)VM_KERNEL_ADDRPERM(so), (error != 0));
3637
3638		/*
3639		 * Force close in case of timeout
3640		 */
3641		if (error != 0) {
3642			OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
3643			break;
3644		}
3645	}
3646
3647}
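
/*
 * Illustration: cfil_close_wait_timeout is in milliseconds, so a value
 * of 1500 converts above to ts.tv_sec = 1 and
 * ts.tv_nsec = 500 * NSEC_PER_USEC * 1000 = 500000000, i.e. a
 * 1.5 second msleep() before the wait is abandoned.
 */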
3648
/*
 * Returns the number of bytes of data held by the content filters
 * for the given socket buffer
 */
3652int32_t
3653cfil_sock_data_pending(struct sockbuf *sb)
3654{
3655	struct socket *so = sb->sb_so;
3656	uint64_t pending = 0;
3657
3658	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) {
3659		struct cfi_buf *cfi_buf;
3660
3661		socket_lock_assert_owned(so);
3662
3663		if ((sb->sb_flags & SB_RECV) == 0)
3664			cfi_buf = &so->so_cfil->cfi_snd;
3665		else
3666			cfi_buf = &so->so_cfil->cfi_rcv;
3667
3668		pending = cfi_buf->cfi_pending_last -
3669			cfi_buf->cfi_pending_first;
3670
3671		/*
3672		 * If we are limited by the "chars of mbufs used" roughly
3673		 * adjust so we won't overcommit
3674		 */
3675		if (pending > (uint64_t)cfi_buf->cfi_pending_mbcnt)
3676			pending = cfi_buf->cfi_pending_mbcnt;
3677	}
3678
3679	VERIFY(pending < INT32_MAX);
3680
3681	return (int32_t)(pending);
3682}
3683
3684/*
3685 * Return the socket buffer space used by data being held by content filters
3686 * so processes won't clog the socket buffer
3687 */
3688int32_t
3689cfil_sock_data_space(struct sockbuf *sb)
3690{
3691	struct socket *so = sb->sb_so;
3692	uint64_t pending = 0;
3693
3694	if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL &&
3695		so->so_snd.sb_cfil_thread != current_thread()) {
3696		struct cfi_buf *cfi_buf;
3697
3698		socket_lock_assert_owned(so);
3699
3700		if ((sb->sb_flags & SB_RECV) == 0)
3701			cfi_buf = &so->so_cfil->cfi_snd;
3702		else
3703			cfi_buf = &so->so_cfil->cfi_rcv;
3704
3705		pending = cfi_buf->cfi_pending_last -
3706			cfi_buf->cfi_pending_first;
3707
3708		/*
3709		 * If we are limited by the "chars of mbufs used", roughly
3710		 * adjust the count so we won't overcommit
3711		 */
3712		if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending)
3713			pending = cfi_buf->cfi_pending_mbcnt;
3714	}
3715
3716	VERIFY(pending < INT32_MAX);
3717
3718	return (int32_t)(pending);
3719}
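/*
 * Note on the two functions above: cfil_sock_data_pending() caps the offset
 * span at cfi_pending_mbcnt (it reports the smaller of the two), while
 * cfil_sock_data_space() raises the result to at least cfi_pending_mbcnt
 * (it reports the larger), which keeps the space charged against the socket
 * buffer on the conservative side, in line with the header comment above.
 */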
3720
3721/*
3722 * A callback from the socket and protocol layer invoked when data becomes
3723 * available in the socket buffer, giving the content filter a chance
3724 * to re-inject data that was held back
3725 */
3726void
3727cfil_sock_buf_update(struct sockbuf *sb)
3728{
3729	int outgoing;
3730	int error;
3731	struct socket *so = sb->sb_so;
3732
3733	if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
3734		return;
3735
3736	if (!cfil_sbtrim)
3737		return;
3738
3739	socket_lock_assert_owned(so);
3740
3741	if ((sb->sb_flags & SB_RECV) == 0) {
3742		if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0)
3743			return;
3744		outgoing = 1;
3745		OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
3746	} else {
3747		if ((so->so_cfil->cfi_flags & CFIF_RETRY_INJECT_IN) == 0)
3748			return;
3749		outgoing = 0;
3750		OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
3751	}
3752
3753	CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
3754		(uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
3755
3756	error = cfil_acquire_sockbuf(so, outgoing);
3757	if (error == 0)
3758		cfil_service_inject_queue(so, outgoing);
3759	cfil_release_sockbuf(so, outgoing);
3760}
3761
3762int
3763sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2,
3764	struct sysctl_req *req)
3765{
3766#pragma unused(oidp, arg1, arg2)
3767	int error = 0;
3768	size_t len = 0;
3769	u_int32_t i;
3770
3771	/* Read only  */
3772	if (req->newptr != USER_ADDR_NULL)
3773		return (EPERM);
3774
3775	cfil_rw_lock_shared(&cfil_lck_rw);
3776
3777	for (i = 0; content_filters != NULL && i < MAX_CONTENT_FILTER; i++) {
3778		struct cfil_filter_stat filter_stat;
3779		struct content_filter *cfc = content_filters[i];
3780
3781		if (cfc == NULL)
3782			continue;
3783
3784		/* If just asking for the size */
3785		if (req->oldptr == USER_ADDR_NULL) {
3786			len += sizeof(struct cfil_filter_stat);
3787			continue;
3788		}
3789
3790		bzero(&filter_stat, sizeof(struct cfil_filter_stat));
3791		filter_stat.cfs_len = sizeof(struct cfil_filter_stat);
3792		filter_stat.cfs_filter_id = cfc->cf_kcunit;
3793		filter_stat.cfs_flags = cfc->cf_flags;
3794		filter_stat.cfs_sock_count = cfc->cf_sock_count;
3795		filter_stat.cfs_necp_control_unit = cfc->cf_necp_control_unit;
3796
3797		error = SYSCTL_OUT(req, &filter_stat,
3798			sizeof (struct cfil_filter_stat));
3799		if (error != 0)
3800			break;
3801	}
3802	/* If just asking for the size, report the length computed above */
3803	if (req->oldptr == USER_ADDR_NULL)
3804		req->oldidx = len;
3805
3806	cfil_rw_unlock_shared(&cfil_lck_rw);
3807
3808	return (error);
3809}
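/*
 * Illustrative user-space sketch (not part of the original sources): reading
 * the filter statistics through the two-pass protocol implemented above,
 * where a NULL old pointer only returns the required size.  The sysctl name
 * "net.cfil.filter_list" is an assumption for illustration -- the handler's
 * registration is not shown here -- and the block is not compiled.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

static int
example_dump_cfil_filter_list(void)
{
	size_t len = 0;
	void *buf;

	/* First pass: old pointer is NULL, the handler only reports the size */
	if (sysctlbyname("net.cfil.filter_list", NULL, &len, NULL, 0) != 0)
		return (-1);
	if (len == 0)
		return (0);	/* no content filter attached */

	buf = malloc(len);
	if (buf == NULL)
		return (-1);

	/* Second pass: fetch the array of cfil_filter_stat records */
	if (sysctlbyname("net.cfil.filter_list", buf, &len, NULL, 0) != 0) {
		free(buf);
		return (-1);
	}
	printf("filter list: %zu bytes of cfil_filter_stat records\n", len);
	free(buf);
	return (0);
}
#endif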
3810
3811static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
3812	struct sysctl_req *req)
3813{
3814#pragma unused(oidp, arg1, arg2)
3815	int error = 0;
3816	u_int32_t i;
3817	struct cfil_info *cfi;
3818
3819	/* Read only  */
3820	if (req->newptr != USER_ADDR_NULL)
3821		return (EPERM);
3822
3823	cfil_rw_lock_shared(&cfil_lck_rw);
3824
3825	/*
3826	 * If just asking for the size, estimate the space needed
3827	 */
3828	if (req->oldptr == USER_ADDR_NULL) {
3829		req->oldidx = cfil_sock_attached_count *
3830			sizeof(struct cfil_sock_stat);
3831		/* Bump the length by 1/8th in case new sockets get attached */
3832		req->oldidx += req->oldidx >> 3;
3833		goto done;
3834	}
3835
3836	TAILQ_FOREACH(cfi, &cfil_sock_head, cfi_link) {
3837		struct cfil_entry *entry;
3838		struct cfil_sock_stat stat;
3839		struct socket *so = cfi->cfi_so;
3840
3841		bzero(&stat, sizeof(struct cfil_sock_stat));
3842		stat.cfs_len = sizeof(struct cfil_sock_stat);
3843		stat.cfs_sock_id = cfi->cfi_sock_id;
3844		stat.cfs_flags = cfi->cfi_flags;
3845
3846		if (so != NULL) {
3847			stat.cfs_pid = so->last_pid;
3848			memcpy(stat.cfs_uuid, so->last_uuid,
3849				sizeof(uuid_t));
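			/*
			 * If the socket is delegated, report the effective
			 * pid/uuid of the delegate; otherwise the effective
			 * fields simply repeat the last pid/uuid.
			 */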
3850			if (so->so_flags & SOF_DELEGATED) {
3851				stat.cfs_e_pid = so->e_pid;
3852				memcpy(stat.cfs_e_uuid, so->e_uuid,
3853					sizeof(uuid_t));
3854			} else {
3855				stat.cfs_e_pid = so->last_pid;
3856				memcpy(stat.cfs_e_uuid, so->last_uuid,
3857					sizeof(uuid_t));
3858			}
3859		}
3860
3861		stat.cfs_snd.cbs_pending_first =
3862			cfi->cfi_snd.cfi_pending_first;
3863		stat.cfs_snd.cbs_pending_last =
3864			cfi->cfi_snd.cfi_pending_last;
3865		stat.cfs_snd.cbs_inject_q_len =
3866			cfil_queue_len(&cfi->cfi_snd.cfi_inject_q);
3867		stat.cfs_snd.cbs_pass_offset =
3868			cfi->cfi_snd.cfi_pass_offset;
3869
3870		stat.cfs_rcv.cbs_pending_first =
3871			cfi->cfi_rcv.cfi_pending_first;
3872		stat.cfs_rcv.cbs_pending_last =
3873			cfi->cfi_rcv.cfi_pending_last;
3874		stat.cfs_rcv.cbs_inject_q_len =
3875			cfil_queue_len(&cfi->cfi_rcv.cfi_inject_q);
3876		stat.cfs_rcv.cbs_pass_offset =
3877			cfi->cfi_rcv.cfi_pass_offset;
3878
3879		for (i = 0; i < MAX_CONTENT_FILTER; i++) {
3880			struct cfil_entry_stat *estat;
3881			struct cfe_buf *ebuf;
3882			struct cfe_buf_stat *sbuf;
3883
3884			entry = &cfi->cfi_entries[i];
3885
3886			estat = &stat.ces_entries[i];
3887
3888			estat->ces_len = sizeof(struct cfil_entry_stat);
3889			estat->ces_filter_id = entry->cfe_filter ?
3890				entry->cfe_filter->cf_kcunit : 0;
3891			estat->ces_flags = entry->cfe_flags;
3892			estat->ces_necp_control_unit =
3893				entry->cfe_necp_control_unit;
3894
3895			estat->ces_last_event.tv_sec =
3896				(int64_t)entry->cfe_last_event.tv_sec;
3897			estat->ces_last_event.tv_usec =
3898				(int64_t)entry->cfe_last_event.tv_usec;
3899
3900			estat->ces_last_action.tv_sec =
3901				(int64_t)entry->cfe_last_action.tv_sec;
3902			estat->ces_last_action.tv_usec =
3903				(int64_t)entry->cfe_last_action.tv_usec;
3904
3905			ebuf = &entry->cfe_snd;
3906			sbuf = &estat->ces_snd;
3907			sbuf->cbs_pending_first =
3908				cfil_queue_offset_first(&ebuf->cfe_pending_q);
3909			sbuf->cbs_pending_last =
3910				cfil_queue_offset_last(&ebuf->cfe_pending_q);
3911			sbuf->cbs_ctl_first =
3912				cfil_queue_offset_first(&ebuf->cfe_ctl_q);
3913			sbuf->cbs_ctl_last =
3914				cfil_queue_offset_last(&ebuf->cfe_ctl_q);
3915			sbuf->cbs_pass_offset = ebuf->cfe_pass_offset;
3916			sbuf->cbs_peek_offset = ebuf->cfe_peek_offset;
3917			sbuf->cbs_peeked = ebuf->cfe_peeked;
3918
3919			ebuf = &entry->cfe_rcv;
3920			sbuf = &estat->ces_rcv;
3921			sbuf->cbs_pending_first =
3922				cfil_queue_offset_first(&ebuf->cfe_pending_q);
3923			sbuf->cbs_pending_last =
3924				cfil_queue_offset_last(&ebuf->cfe_pending_q);
3925			sbuf->cbs_ctl_first =
3926				cfil_queue_offset_first(&ebuf->cfe_ctl_q);
3927			sbuf->cbs_ctl_last =
3928				cfil_queue_offset_last(&ebuf->cfe_ctl_q);
3929			sbuf->cbs_pass_offset = ebuf->cfe_pass_offset;
3930			sbuf->cbs_peek_offset = ebuf->cfe_peek_offset;
3931			sbuf->cbs_peeked = ebuf->cfe_peeked;
3932		}
3933		error = SYSCTL_OUT(req, &stat,
3934			sizeof (struct cfil_sock_stat));
3935		if (error != 0)
3936			break;
3937	}
3938done:
3939	cfil_rw_unlock_shared(&cfil_lck_rw);
3940
3941	return (error);
3942}
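/*
 * Illustrative user-space sketch (not part of the original sources): walking
 * the buffer returned by the handler above, stepping by the cfs_len field the
 * kernel stores in every record.  The sysctl name "net.cfil.sock_list" is an
 * assumption for illustration, struct cfil_sock_stat is taken to have the
 * layout copied out above, and the block is not compiled.
 */
#if 0
#include <stdio.h>

static void
example_walk_cfil_sock_list(const void *buf, size_t len)
{
	const char *p = buf;
	const char *end = p + len;

	while (p + sizeof(struct cfil_sock_stat) <= end) {
		const struct cfil_sock_stat *stat =
			(const struct cfil_sock_stat *)(const void *)p;

		if (stat->cfs_len < sizeof(struct cfil_sock_stat))
			break;	/* malformed record, stop */
		printf("sock id %llu\n",
			(unsigned long long)stat->cfs_sock_id);
		p += stat->cfs_len;	/* step by the embedded record length */
	}
}
#endif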
3943