1/*
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19/*
20 * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
21 *
22 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
23 *
24 * This file contains Original Code and/or Modifications of Original Code
25 * as defined in and that are subject to the Apple Public Source License
26 * Version 2.0 (the 'License'). You may not use this file except in
27 * compliance with the License. The rights granted to you under the License
28 * may not be used to create, or enable the creation or redistribution of,
29 * unlawful or unlicensed copies of an Apple operating system, or to
30 * circumvent, violate, or enable the circumvention or violation of, any
31 * terms of an Apple operating system software license agreement.
32 *
33 * Please obtain a copy of the License at
34 * http://www.opensource.apple.com/apsl/ and read it before using this file.
35 *
36 * The Original Code and all software distributed under the License are
37 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
38 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
39 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
40 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
41 * Please see the License for the specific language governing rights and
42 * limitations under the License.
43 *
44 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
45 */
46/*
47 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
48 * support for mandatory and extensible security protections.  This notice
49 * is included in support of clause 2.2 (b) of the Apple Public License,
50 * Version 2.0.
51 */
52
53/*
54 * This file contains a high-performance replacement for the socket-based
55 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
56 * all features of sockets, but does do everything that pipes normally
57 * do.
58 *
59 * Pipes are implemented as circular buffers. The following are the valid states of pipe operations:
60 *
61 *      _________________________________
62 * 1.  |_________________________________| r=w, c=0
63 *
64 *      _________________________________
65 * 2.  |__r:::::wc_______________________| r <= w , c > 0
66 *
67 *      _________________________________
68 * 3.  |::::wc_____r:::::::::::::::::::::| r>w , c > 0
69 *
70 *      _________________________________
71 * 4.  |:::::::wrc:::::::::::::::::::::::| w=r, c = Max size
72 *
73 *
74 *  Nomenclature:
75 *  a-z define the steps in a program flow
76 *  1-4 are the states as defined above
77 *  Action: is the file operation performed on the pipe
78 *
79 *  Current:None  Action: initialize with size M=200
80 *  a. State 1 ( r=0, w=0, c=0)
81 *
82 *  Current: a    Action: write(100) (w < M)
83 *  b. State 2 (r=0, w=100, c=100)
84 *
85 *  Current: b    Action: write(100) (w = M-w)
86 *  c. State 4 (r=0,w=0,c=200)
87 *
88 *  Current: b    Action: read(70)  ( r < c )
89 *  d. State 2 (r=70,w=100,c=30)
90 *
91 *  Current: d	  Action: write(75) ( w < (M-w))
92 *  e. State 2 (r=70,w=175,c=105)
93 *
94 *  Current: d    Action: write(110) ( w > (M-w))
95 *  f. State 3 (r=70,w=10,c=140)
96 *
97 *  Current: d	  Action: read(30) (r >= c )
98 *  g. State 1 (r=100,w=100,c=0)
99 *
100 */
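
/*
 * A minimal user-space sketch of the index arithmetic behind the states
 * above (hypothetical names, not part of this file): the read and write
 * offsets wrap modulo the buffer size M, and c tracks the bytes buffered.
 */
#if 0	/* illustrative sketch only -- not compiled */
struct mini_pipe {
	unsigned char	buf[200];	/* M = 200, as in the example above */
	unsigned int	in;		/* w: next offset to write */
	unsigned int	out;		/* r: next offset to read  */
	unsigned int	cnt;		/* c: bytes currently buffered */
};

static unsigned int
mini_pipe_write(struct mini_pipe *p, const unsigned char *src, unsigned int n)
{
	unsigned int i, M = sizeof(p->buf);

	for (i = 0; i < n && p->cnt < M; i++) {
		p->buf[p->in] = src[i];
		p->in = (p->in + 1) % M;	/* wrapping here is State 2 -> State 3 */
		p->cnt++;			/* c == M is State 4 (full) */
	}
	return i;				/* bytes actually written */
}

static unsigned int
mini_pipe_read(struct mini_pipe *p, unsigned char *dst, unsigned int n)
{
	unsigned int i, M = sizeof(p->buf);

	for (i = 0; i < n && p->cnt > 0; i++) {
		dst[i] = p->buf[p->out];
		p->out = (p->out + 1) % M;
		p->cnt--;			/* c == 0 is State 1 (empty) */
	}
	return i;				/* bytes actually read */
}
#endif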
101
102/*
103 * This code creates half-duplex pipe buffers to facilitate file-like
104 * operations on pipes. The initial buffer is very small, but it can
105 * dynamically grow to larger sizes based on usage. The buffer size is never
106 * reduced. The total amount of kernel memory used is governed by maxpipekva.
107 * If the dynamic expansion limit is reached, the writing thread is blocked
108 * until the pipe buffer empties enough to continue.
109 *
110 * In order to limit the resource use of pipes, a sysctl exists:
111 *
112 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
113 * address space available to us in pipe_map.
114 *
115 * Memory usage may be monitored through the sysctls
116 * kern.ipc.pipes and kern.ipc.pipekva.
117 *
118 */
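
/*
 * These OIDs are only registered when PIPE_SYSCTLS is compiled in (see
 * below).  Assuming it is, a user-space monitor could read them roughly
 * as follows (a sketch, not part of the kernel build):
 */
#if 0	/* user-space sketch, assuming the PIPE_SYSCTLS OIDs are registered */
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pipes = 0, kva = 0;
	size_t len;

	len = sizeof(pipes);
	if (sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0) != 0)
		return 1;
	len = sizeof(kva);
	if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) != 0)
		return 1;
	printf("%d pipes using %d bytes of pipe KVA\n", pipes, kva);
	return 0;
}
#endif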
119
120#include <sys/param.h>
121#include <sys/systm.h>
122#include <sys/filedesc.h>
123#include <sys/kernel.h>
124#include <sys/vnode.h>
125#include <sys/proc_internal.h>
126#include <sys/kauth.h>
127#include <sys/file_internal.h>
128#include <sys/stat.h>
129#include <sys/ioctl.h>
130#include <sys/fcntl.h>
131#include <sys/malloc.h>
132#include <sys/syslog.h>
133#include <sys/unistd.h>
134#include <sys/resourcevar.h>
135#include <sys/aio_kern.h>
136#include <sys/signalvar.h>
137#include <sys/pipe.h>
138#include <sys/sysproto.h>
139#include <sys/proc_info.h>
140
141#include <security/audit/audit.h>
142
143#include <sys/kdebug.h>
144
145#include <kern/zalloc.h>
146#include <kern/kalloc.h>
147#include <vm/vm_kern.h>
148#include <libkern/OSAtomic.h>
149
150#define f_flag f_fglob->fg_flag
151#define f_msgcount f_fglob->fg_msgcount
152#define f_cred f_fglob->fg_cred
153#define f_ops f_fglob->fg_ops
154#define f_offset f_fglob->fg_offset
155#define f_data f_fglob->fg_data
156
157/*
158 * interfaces to the outside world exported through file operations
159 */
160static int pipe_read(struct fileproc *fp, struct uio *uio,
161                int flags, vfs_context_t ctx);
162static int pipe_write(struct fileproc *fp, struct uio *uio,
163                int flags, vfs_context_t ctx);
164static int pipe_close(struct fileglob *fg, vfs_context_t ctx);
165static int pipe_select(struct fileproc *fp, int which, void * wql,
166		vfs_context_t ctx);
167static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
168		vfs_context_t ctx);
169static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
170		vfs_context_t ctx);
171static int pipe_drain(struct fileproc *fp,vfs_context_t ctx);
172
173static const struct fileops pipeops = {
174	DTYPE_PIPE,
175	pipe_read,
176	pipe_write,
177	pipe_ioctl,
178	pipe_select,
179	pipe_close,
180	pipe_kqfilter,
181	pipe_drain
182};
183
184static void	filt_pipedetach(struct knote *kn);
185static int	filt_piperead(struct knote *kn, long hint);
186static int	filt_pipewrite(struct knote *kn, long hint);
187
188static struct filterops pipe_rfiltops = {
189        .f_isfd = 1,
190        .f_detach = filt_pipedetach,
191        .f_event = filt_piperead,
192};
193
194static struct filterops pipe_wfiltops = {
195        .f_isfd = 1,
196        .f_detach = filt_pipedetach,
197        .f_event = filt_pipewrite,
198};
199
200static int nbigpipe;      /* for compatibility's sake; no longer used */
201static int amountpipes;   /* total number of pipes in system */
202static int amountpipekva; /* total memory used by pipes */
203
204int maxpipekva __attribute__((used)) = PIPE_KVAMAX;  /* allowing 16MB max. */
205
206#if PIPE_SYSCTLS
207SYSCTL_DECL(_kern_ipc);
208
209SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
210	   &maxpipekva, 0, "Pipe KVA limit");
211SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED,
212	   &maxpipekvawired, 0, "Pipe KVA wired limit");
213SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED,
214	   &amountpipes, 0, "Current # of pipes");
215SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED,
216	   &nbigpipe, 0, "Current # of big pipes");
217SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
218	   &amountpipekva, 0, "Pipe KVA usage");
219SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED,
220	   &amountpipekvawired, 0, "Pipe wired KVA usage");
221#endif
222
223static void pipeclose(struct pipe *cpipe);
224static void pipe_free_kmem(struct pipe *cpipe);
225static int pipe_create(struct pipe **cpipep);
226static int pipespace(struct pipe *cpipe, int size);
227static int choose_pipespace(unsigned long current, unsigned long expected);
228static int expand_pipespace(struct pipe *p, int target_size);
229static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe);
230static __inline int pipeio_lock(struct pipe *cpipe, int catch);
231static __inline void pipeio_unlock(struct pipe *cpipe);
232
233extern int postpipeevent(struct pipe *, int);
234extern void evpipefree(struct pipe *cpipe);
235
236static lck_grp_t	*pipe_mtx_grp;
237static lck_attr_t	*pipe_mtx_attr;
238static lck_grp_attr_t	*pipe_mtx_grp_attr;
239
240static zone_t pipe_zone;
241
242#define MAX_PIPESIZE(pipe)  		( MAX(PIPE_SIZE, (pipe)->pipe_buffer.size) )
243
244#define	PIPE_GARBAGE_AGE_LIMIT		5000	/* In milliseconds */
245#define PIPE_GARBAGE_QUEUE_LIMIT	32000
246
247struct pipe_garbage {
248	struct pipe		*pg_pipe;
249	struct pipe_garbage	*pg_next;
250	uint64_t		pg_timestamp;
251};
252
253static zone_t pipe_garbage_zone;
254static struct pipe_garbage *pipe_garbage_head = NULL;
255static struct pipe_garbage *pipe_garbage_tail = NULL;
256static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT;
257static int pipe_garbage_count = 0;
258static lck_mtx_t *pipe_garbage_lock;
259static void pipe_garbage_collect(struct pipe *cpipe);
260
261SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
262
263/* initial setup done at time of sysinit */
264void
265pipeinit(void)
266{
267	nbigpipe=0;
268	vm_size_t zone_size;
269
270	zone_size = 8192 * sizeof(struct pipe);
271        pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone");
272
273
274	/* allocate lock group attribute and group for pipe mutexes */
275	pipe_mtx_grp_attr = lck_grp_attr_alloc_init();
276	pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr);
277
278	/* allocate the lock attribute for pipe mutexes */
279	pipe_mtx_attr = lck_attr_alloc_init();
280
281	/*
282	 * Set up garbage collection for dead pipes
283	 */
284	zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) *
285	    sizeof(struct pipe_garbage);
286        pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage),
287	    zone_size, 4096, "pipe garbage zone");
288	pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr);
289
290}
291
292/* Bitmap for things to touch in pipe_touch() */
293#define	PIPE_ATIME	0x00000001	/* time of last access */
294#define	PIPE_MTIME	0x00000002	/* time of last modification */
295#define	PIPE_CTIME	0x00000004	/* time of last status change */
296
297static void
298pipe_touch(struct pipe *tpipe, int touch)
299{
300	struct timeval now;
301
302	microtime(&now);
303
304	if (touch & PIPE_ATIME) {
305		tpipe->st_atimespec.tv_sec  = now.tv_sec;
306		tpipe->st_atimespec.tv_nsec = now.tv_usec * 1000;
307	}
308
309	if (touch & PIPE_MTIME) {
310		tpipe->st_mtimespec.tv_sec  = now.tv_sec;
311		tpipe->st_mtimespec.tv_nsec = now.tv_usec * 1000;
312	}
313
314	if (touch & PIPE_CTIME) {
315		tpipe->st_ctimespec.tv_sec  = now.tv_sec;
316		tpipe->st_ctimespec.tv_nsec = now.tv_usec * 1000;
317	}
318}
319
320static const unsigned int pipesize_blocks[] = { 128, 256, 1024, 2048, 4096, 4096 * 2, PIPE_SIZE, PIPE_SIZE * 4 };
321
322/*
323 * finds the right size from the possible sizes in pipesize_blocks
324 * returns the block size that accommodates max(current, expected)
325 */
326static int
327choose_pipespace(unsigned long current, unsigned long expected)
328{
329	int i = sizeof(pipesize_blocks)/sizeof(unsigned int) -1;
330	unsigned long target;
331
332	if (expected > current)
333		target = expected;
334	else
335		target = current;
336
337	while (i > 0 && pipesize_blocks[i - 1] > target) {
338		i = i - 1;
339	}
340
341
342	return pipesize_blocks[i];
343}
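
/*
 * Worked example: with the pipesize_blocks table above, choose_pipespace(0, 300)
 * walks down from the largest block and returns 1024, the first block large
 * enough for the 300 bytes expected; choose_pipespace(0, 0) returns the
 * smallest block, 128.
 */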
344
345
346/*
347 * expand the size of the pipe while there is data to be read,
348 * and then free the old buffer once the current buffered
349 * data has been transferred to new storage.
350 * Required: PIPE_LOCK and io lock to be held by caller.
351 * returns 0 on success (or when no expansion is needed)
352 */
353static int
354expand_pipespace(struct pipe *p, int target_size)
355{
356	struct pipe tmp, oldpipe;
357	int error;
358	tmp.pipe_buffer.buffer = 0;
359
360	if (p->pipe_buffer.size >= (unsigned) target_size) {
361		return 0; /* the existing buffer is max size possible */
362	}
363
364	/* create enough space in the target */
365	error = pipespace(&tmp, target_size);
366	if (error != 0)
367		return (error);
368
369	oldpipe.pipe_buffer.buffer = p->pipe_buffer.buffer;
370	oldpipe.pipe_buffer.size = p->pipe_buffer.size;
371
372	memcpy(tmp.pipe_buffer.buffer, p->pipe_buffer.buffer, p->pipe_buffer.size);
373	if (p->pipe_buffer.cnt > 0 && p->pipe_buffer.in <= p->pipe_buffer.out ){
374		/* we are in State 3 and need extra copying for read to be consistent */
375		memcpy(&tmp.pipe_buffer.buffer[p->pipe_buffer.size], p->pipe_buffer.buffer, p->pipe_buffer.size);
376		p->pipe_buffer.in += p->pipe_buffer.size;
377	}
378
379	p->pipe_buffer.buffer = tmp.pipe_buffer.buffer;
380	p->pipe_buffer.size = tmp.pipe_buffer.size;
381
382
383	pipe_free_kmem(&oldpipe);
384	return 0;
385}
386
387/*
388 * The pipe system call for the DTYPE_PIPE type of pipes
389 *
390 * returns:
391 *  FREAD  | fd0 | -->[struct rpipe] --> |~~buffer~~| \
392 *                                                    (pipe_mutex)
393 *  FWRITE | fd1 | -->[struct wpipe] --X              /
394 */
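
/*
 * From user space the two descriptors map onto the diagram above: fd[0] is
 * the read end backed by rpipe's buffer, fd[1] is the write end.  A minimal
 * usage sketch (not part of the kernel build):
 */
#if 0	/* user-space usage sketch for the pipe() system call above */
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int fd[2];
	char buf[16];
	ssize_t n;

	if (pipe(fd) == -1)		/* fd[0]: FREAD end, fd[1]: FWRITE end */
		return 1;
	write(fd[1], "hello", 5);	/* data lands in the read side's buffer */
	n = read(fd[0], buf, sizeof(buf));
	printf("read %zd bytes: %.*s\n", n, (int)n, buf);
	close(fd[0]);
	close(fd[1]);
	return 0;
}
#endif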
395
396/* ARGSUSED */
397int
398pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
399{
400	struct fileproc *rf, *wf;
401	struct pipe *rpipe, *wpipe;
402	lck_mtx_t   *pmtx;
403	int fd, error;
404
405	if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL)
406	        return (ENOMEM);
407
408	rpipe = wpipe = NULL;
409	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
410	        error = ENFILE;
411		goto freepipes;
412	}
413        /*
414	 * allocate the space for the normal I/O direction up
415	 * front... we'll delay the allocation for the other
416	 * direction until a write actually occurs (most likely it won't)...
417         */
418	error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0));
419        if (error)
420	        goto freepipes;
421
422	TAILQ_INIT(&rpipe->pipe_evlist);
423	TAILQ_INIT(&wpipe->pipe_evlist);
424
425	error = falloc(p, &rf, &fd, vfs_context_current());
426	if (error) {
427	        goto freepipes;
428	}
429	retval[0] = fd;
430
431	/*
432	 * for now we'll create half-duplex pipes (refer to the returns section above).
433	 * this is what we've always supported...
434	 */
435	rf->f_flag = FREAD;
436	rf->f_data = (caddr_t)rpipe;
437	rf->f_ops = &pipeops;
438
439	error = falloc(p, &wf, &fd, vfs_context_current());
440	if (error) {
441		fp_free(p, retval[0], rf);
442	        goto freepipes;
443	}
444	wf->f_flag = FWRITE;
445	wf->f_data = (caddr_t)wpipe;
446	wf->f_ops = &pipeops;
447
448	rpipe->pipe_peer = wpipe;
449	wpipe->pipe_peer = rpipe;
450	/* both structures share the same mutex */
451	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
452
453	retval[1] = fd;
454#if CONFIG_MACF
455	/*
456	 * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX
457	 *
458	 * struct pipe represents a pipe endpoint.  The MAC label is shared
459	 * between the connected endpoints.  As a result mac_pipe_label_init() and
460	 * mac_pipe_label_associate() should only be called on one of the endpoints
461	 * after they have been connected.
462	 */
463	mac_pipe_label_init(rpipe);
464	mac_pipe_label_associate(kauth_cred_get(), rpipe);
465	wpipe->pipe_label = rpipe->pipe_label;
466#endif
467	proc_fdlock_spin(p);
468	procfdtbl_releasefd(p, retval[0], NULL);
469	procfdtbl_releasefd(p, retval[1], NULL);
470	fp_drop(p, retval[0], rf, 1);
471	fp_drop(p, retval[1], wf, 1);
472	proc_fdunlock(p);
473
474
475	return (0);
476
477freepipes:
478	pipeclose(rpipe);
479	pipeclose(wpipe);
480	lck_mtx_free(pmtx, pipe_mtx_grp);
481
482	return (error);
483}
484
485int
486pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
487{
488#if CONFIG_MACF
489        int error;
490#endif
491	int	pipe_size = 0;
492	int	pipe_count;
493	struct stat *sb = (struct stat *)0;	/* warning avoidance ; protected by isstat64 */
494	struct stat64 * sb64 = (struct stat64 *)0;  /* warning avoidance ; protected by isstat64 */
495
496	if (cpipe == NULL)
497	        return (EBADF);
498	PIPE_LOCK(cpipe);
499
500#if CONFIG_MACF
501	error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
502	if (error) {
503		PIPE_UNLOCK(cpipe);
504	        return (error);
505	}
506#endif
507	if (cpipe->pipe_buffer.buffer == 0) {
508	        /* must be stat'ing the write fd */
509	        if (cpipe->pipe_peer) {
510		        /* the peer still exists, use its info */
511		        pipe_size  = MAX_PIPESIZE(cpipe->pipe_peer);
512			pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
513		} else {
514			pipe_count = 0;
515		}
516	} else {
517	        pipe_size  = MAX_PIPESIZE(cpipe);
518		pipe_count = cpipe->pipe_buffer.cnt;
519	}
520	/*
521	 * since the peer's buffer is set up outside of the lock
522	 * we might catch it in a transient state
523	 */
524	if (pipe_size == 0)
525		pipe_size  = MAX(PIPE_SIZE, pipesize_blocks[0]);
526
527	if (isstat64 != 0) {
528		sb64 = (struct stat64 *)ub;
529
530		bzero(sb64, sizeof(*sb64));
531		sb64->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
532		sb64->st_blksize = pipe_size;
533		sb64->st_size = pipe_count;
534		sb64->st_blocks = (sb64->st_size + sb64->st_blksize - 1) / sb64->st_blksize;
535
536		sb64->st_uid = kauth_getuid();
537		sb64->st_gid = kauth_getgid();
538
539		sb64->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
540		sb64->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
541
542		sb64->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
543		sb64->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
544
545		sb64->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
546		sb64->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
547
548		/*
549		 * Return a relatively unique inode number based on the current
550		 * address of this pipe's struct pipe.  This number may be recycled
551		 * relatively quickly.
552		 */
553		sb64->st_ino = (ino64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
554	} else {
555		sb = (struct stat *)ub;
556
557		bzero(sb, sizeof(*sb));
558		sb->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
559		sb->st_blksize = pipe_size;
560		sb->st_size = pipe_count;
561		sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
562
563		sb->st_uid = kauth_getuid();
564		sb->st_gid = kauth_getgid();
565
566		sb->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
567		sb->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
568
569		sb->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
570		sb->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
571
572		sb->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
573		sb->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
574
575		/*
576		 * Return a relatively unique inode number based on the current
577		 * address of this pipe's struct pipe.  This number may be recycled
578		 * relatively quickly.
579		 */
580		sb->st_ino = (ino_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
581	}
582	PIPE_UNLOCK(cpipe);
583
584	/*
585	 * POSIX: Left as 0: st_dev, st_nlink, st_rdev, st_flags, st_gen,
586	 * st_uid, st_gid.
587	 *
588	 * XXX (st_dev) should be unique, but there is no device driver that
589	 * XXX is associated with pipes, since they are implemented via a
590	 * XXX struct fileops indirection rather than as FS objects.
591	 */
592	return (0);
593}
594
595
596/*
597 * Allocate kva for the pipe circular buffer; the space is pageable.
598 * This routine will 'realloc' the size of a pipe safely; if the allocation
599 * fails it will retain the old buffer.
600 * On failure it returns ENOMEM.
601 */
602static int
603pipespace(struct pipe *cpipe, int size)
604{
605	vm_offset_t buffer;
606
607	if (size <= 0)
608		return(EINVAL);
609
610	if ((buffer = (vm_offset_t)kalloc(size)) == 0 )
611		return(ENOMEM);
612
613	/* free old resources if we're resizing */
614	pipe_free_kmem(cpipe);
615	cpipe->pipe_buffer.buffer = (caddr_t)buffer;
616	cpipe->pipe_buffer.size = size;
617	cpipe->pipe_buffer.in = 0;
618	cpipe->pipe_buffer.out = 0;
619	cpipe->pipe_buffer.cnt = 0;
620
621	OSAddAtomic(1, &amountpipes);
622	OSAddAtomic(cpipe->pipe_buffer.size, &amountpipekva);
623
624	return (0);
625}
626
627/*
628 * initialize and allocate VM and memory for pipe
629 */
630static int
631pipe_create(struct pipe **cpipep)
632{
633	struct pipe *cpipe;
634	cpipe = (struct pipe *)zalloc(pipe_zone);
635
636	if ((*cpipep = cpipe) == NULL)
637		return (ENOMEM);
638
639	/*
640	 * protect so pipespace or pipeclose don't follow a junk pointer
641	 * if pipespace() fails.
642	 */
643	bzero(cpipe, sizeof *cpipe);
644
645	/* Initial times are all the time of creation of the pipe */
646	pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
647	return (0);
648}
649
650
651/*
652 * lock a pipe for I/O, blocking other access
653 */
654static inline int
655pipeio_lock(struct pipe *cpipe, int catch)
656{
657	int error;
658	while (cpipe->pipe_state & PIPE_LOCKFL) {
659		cpipe->pipe_state |= PIPE_LWANT;
660		error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO,
661			       "pipelk", 0);
662		if (error != 0)
663			return (error);
664	}
665	cpipe->pipe_state |= PIPE_LOCKFL;
666	return (0);
667}
668
669/*
670 * unlock a pipe I/O lock
671 */
672static inline void
673pipeio_unlock(struct pipe *cpipe)
674{
675	cpipe->pipe_state &= ~PIPE_LOCKFL;
676	if (cpipe->pipe_state & PIPE_LWANT) {
677		cpipe->pipe_state &= ~PIPE_LWANT;
678		wakeup(cpipe);
679	}
680}
681
682/*
683 * wake up anyone who's blocked in select
684 */
685static void
686pipeselwakeup(struct pipe *cpipe, struct pipe *spipe)
687{
688	if (cpipe->pipe_state & PIPE_SEL) {
689		cpipe->pipe_state &= ~PIPE_SEL;
690		selwakeup(&cpipe->pipe_sel);
691	}
692        if (cpipe->pipe_state & PIPE_KNOTE)
693	       KNOTE(&cpipe->pipe_sel.si_note, 1);
694
695	postpipeevent(cpipe, EV_RWBYTES);
696
697	if (spipe && (spipe->pipe_state & PIPE_ASYNC) && spipe->pipe_pgid) {
698	        if (spipe->pipe_pgid < 0)
699		        gsignal(-spipe->pipe_pgid, SIGIO);
700		else
701		        proc_signal(spipe->pipe_pgid, SIGIO);
702        }
703}
704
705/*
706 * Read n bytes from the buffer. Semantics are similar to file read.
707 * returns: 0 on success or an errno; the number of bytes read is reflected in the uio
708 */
709/* ARGSUSED */
710static int
711pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
712	__unused vfs_context_t ctx)
713{
714	struct pipe *rpipe = (struct pipe *)fp->f_data;
715	int error;
716	int nread = 0;
717	u_int size;
718
719	PIPE_LOCK(rpipe);
720	++rpipe->pipe_busy;
721
722	error = pipeio_lock(rpipe, 1);
723	if (error)
724		goto unlocked_error;
725
726#if CONFIG_MACF
727	error = mac_pipe_check_read(kauth_cred_get(), rpipe);
728	if (error)
729		goto locked_error;
730#endif
731
732
733	while (uio_resid(uio)) {
734		/*
735		 * normal pipe buffer receive
736		 */
737		if (rpipe->pipe_buffer.cnt > 0) {
738			/*
739			 * # bytes to read is min( bytes from read pointer until end of buffer,
740			 *                         total unread bytes,
741			 *                         user requested byte count)
742			 */
743			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
744			if (size > rpipe->pipe_buffer.cnt)
745				size = rpipe->pipe_buffer.cnt;
746			// LP64todo - fix this!
747			if (size > (u_int) uio_resid(uio))
748				size = (u_int) uio_resid(uio);
749
750			PIPE_UNLOCK(rpipe); /* we still hold io lock.*/
751			error = uiomove(
752			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
753			    size, uio);
754			PIPE_LOCK(rpipe);
755			if (error)
756				break;
757
758			rpipe->pipe_buffer.out += size;
759			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
760				rpipe->pipe_buffer.out = 0;
761
762			rpipe->pipe_buffer.cnt -= size;
763
764			/*
765			 * If there is no more to read in the pipe, reset
766			 * its pointers to the beginning.  This improves
767			 * cache hit stats.
768			 */
769			if (rpipe->pipe_buffer.cnt == 0) {
770				rpipe->pipe_buffer.in = 0;
771				rpipe->pipe_buffer.out = 0;
772			}
773			nread += size;
774		} else {
775			/*
776			 * detect EOF condition
777			 * read returns 0 on EOF, no need to set error
778			 */
779			if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
780				break;
781			}
782
783			/*
784			 * If the "write-side" has been blocked, wake it up now.
785			 */
786			if (rpipe->pipe_state & PIPE_WANTW) {
787				rpipe->pipe_state &= ~PIPE_WANTW;
788				wakeup(rpipe);
789			}
790
791			/*
792			 * Break if some data was read in previous iteration.
793			 */
794			if (nread > 0)
795				break;
796
797			/*
798			 * Unlock the pipe buffer for our remaining processing.
799			 * We will either break out with an error or we will
800			 * sleep and relock to loop.
801			 */
802			pipeio_unlock(rpipe);
803
804			/*
805			 * Handle non-blocking mode operation or
806			 * wait for more data.
807			 */
808			if (fp->f_flag & FNONBLOCK) {
809				error = EAGAIN;
810			} else {
811				rpipe->pipe_state |= PIPE_WANTR;
812				error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0);
813				if (error == 0)
814				        error = pipeio_lock(rpipe, 1);
815			}
816			if (error)
817				goto unlocked_error;
818		}
819	}
820#if CONFIG_MACF
821locked_error:
822#endif
823	pipeio_unlock(rpipe);
824
825unlocked_error:
826	--rpipe->pipe_busy;
827
828	/*
829	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
830	 */
831	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
832		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
833		wakeup(rpipe);
834	} else if (rpipe->pipe_buffer.cnt < rpipe->pipe_buffer.size) {
835		/*
836		 * Handle write blocking hysteresis.
837		 */
838		if (rpipe->pipe_state & PIPE_WANTW) {
839			rpipe->pipe_state &= ~PIPE_WANTW;
840			wakeup(rpipe);
841		}
842	}
843
844	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) > 0)
845		pipeselwakeup(rpipe, rpipe->pipe_peer);
846
847	/* update last read time */
848	pipe_touch(rpipe, PIPE_ATIME);
849
850	PIPE_UNLOCK(rpipe);
851
852	return (error);
853}
854
855/*
856 * perform a write of n bytes into the read side's buffer. Since
857 * pipes are unidirectional, a write is meant to be read by the other side only.
858 */
859static int
860pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
861	__unused vfs_context_t ctx)
862{
863	int error = 0;
864	int orig_resid;
865	int pipe_size;
866	int space;
867	struct pipe *wpipe, *rpipe;
868	// LP64todo - fix this!
869	orig_resid = uio_resid(uio);
870
871	rpipe = (struct pipe *)fp->f_data;
872
873	PIPE_LOCK(rpipe);
874	wpipe = rpipe->pipe_peer;
875
876	/*
877	 * detect loss of pipe read side, issue SIGPIPE if lost.
878	 */
879	if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
880		PIPE_UNLOCK(rpipe);
881		return (EPIPE);
882	}
883#if CONFIG_MACF
884	error = mac_pipe_check_write(kauth_cred_get(), wpipe);
885	if (error) {
886		PIPE_UNLOCK(rpipe);
887		return (error);
888	}
889#endif
890	++wpipe->pipe_busy;
891
892	pipe_size = 0;
893
894	/*
895	 * need to allocate some storage... we delay the allocation
896	 * until the first write on fd[0] to avoid allocating storage for both
897	 * 'pipe ends'... most pipes are half-duplex with the writes targeting
898	 * fd[1], so allocating space for both ends is a waste...
899	 */
900
901	if ( wpipe->pipe_buffer.buffer == 0 || (
902		(unsigned)orig_resid > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt &&
903		amountpipekva < maxpipekva ) ) {
904
905	        pipe_size = choose_pipespace(wpipe->pipe_buffer.size, wpipe->pipe_buffer.cnt + orig_resid);
906	}
907	if (pipe_size) {
908	        /*
909		 * need to do initial allocation or resizing of pipe
910		 * holding both structure and io locks.
911		 */
912		if ((error = pipeio_lock(wpipe, 1)) == 0) {
913			if (wpipe->pipe_buffer.cnt == 0)
914				error = pipespace(wpipe, pipe_size);
915			else
916				error = expand_pipespace(wpipe, pipe_size);
917
918			pipeio_unlock(wpipe);
919
920			/* allocation failed */
921			if (wpipe->pipe_buffer.buffer == 0)
922			        error = ENOMEM;
923		}
924		if (error) {
925		        /*
926			 * If an error occurred, unbusy and return, waking up any pending
927			 * readers.
928			 */
929		        --wpipe->pipe_busy;
930			if ((wpipe->pipe_busy == 0) &&
931			    (wpipe->pipe_state & PIPE_WANT)) {
932			        wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
933				wakeup(wpipe);
934			}
935			PIPE_UNLOCK(rpipe);
936			return(error);
937		}
938	}
939
940	while (uio_resid(uio)) {
941
942	retrywrite:
943		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
944
945		/* Writes of size <= PIPE_BUF must be atomic. */
946		if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF))
947			space = 0;
948
949		if (space > 0) {
950
951			if ((error = pipeio_lock(wpipe,1)) == 0) {
952				int size;	/* Transfer size */
953				int segsize;	/* first segment to transfer */
954
955				if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
956					pipeio_unlock(wpipe);
957				        error = EPIPE;
958					break;
959				}
960				/*
961				 * If a process blocked in pipeio_lock, our
962				 * value for space might be bad... the mutex
963				 * is dropped while we're blocked
964				 */
965				if (space > (int)(wpipe->pipe_buffer.size -
966				    wpipe->pipe_buffer.cnt)) {
967					pipeio_unlock(wpipe);
968					goto retrywrite;
969				}
970
971				/*
972				 * Transfer size is minimum of uio transfer
973				 * and free space in pipe buffer.
974				 */
975				// LP64todo - fix this!
976				if (space > uio_resid(uio))
977					size = uio_resid(uio);
978				else
979					size = space;
980				/*
981				 * First segment to transfer is minimum of
982				 * transfer size and contiguous space in
983				 * pipe buffer.  If first segment to transfer
984				 * is less than the transfer size, we've got
985				 * a wraparound in the buffer.
986				 */
987				segsize = wpipe->pipe_buffer.size -
988					wpipe->pipe_buffer.in;
989				if (segsize > size)
990					segsize = size;
991
992				/* Transfer first segment */
993
994				PIPE_UNLOCK(rpipe);
995				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
996						segsize, uio);
997				PIPE_LOCK(rpipe);
998
999				if (error == 0 && segsize < size) {
1000					/*
1001					 * Transfer remaining part now, to
1002					 * support atomic writes.  Wraparound
1003					 * happened. (State 3)
1004					 */
1005					if (wpipe->pipe_buffer.in + segsize !=
1006					    wpipe->pipe_buffer.size)
1007						panic("Expected pipe buffer "
1008						    "wraparound disappeared");
1009
1010					PIPE_UNLOCK(rpipe);
1011					error = uiomove(
1012					    &wpipe->pipe_buffer.buffer[0],
1013				    	    size - segsize, uio);
1014					PIPE_LOCK(rpipe);
1015				}
1016				/*
1017				 * readers never know to read until count is updated.
1018				 */
1019				if (error == 0) {
1020					wpipe->pipe_buffer.in += size;
1021					if (wpipe->pipe_buffer.in >
1022					    wpipe->pipe_buffer.size) {
1023						if (wpipe->pipe_buffer.in !=
1024						    size - segsize +
1025						    wpipe->pipe_buffer.size)
1026							panic("Expected "
1027							    "wraparound bad");
1028						wpipe->pipe_buffer.in = size -
1029						    segsize;
1030					}
1031
1032					wpipe->pipe_buffer.cnt += size;
1033					if (wpipe->pipe_buffer.cnt >
1034					    wpipe->pipe_buffer.size)
1035						panic("Pipe buffer overflow");
1036
1037				}
1038				pipeio_unlock(wpipe);
1039			}
1040			if (error)
1041				break;
1042
1043		} else {
1044			/*
1045			 * If the "read-side" has been blocked, wake it up now.
1046			 */
1047			if (wpipe->pipe_state & PIPE_WANTR) {
1048				wpipe->pipe_state &= ~PIPE_WANTR;
1049				wakeup(wpipe);
1050			}
1051			/*
1052			 * don't block on non-blocking I/O
1053			 * we'll do the pipeselwakeup on the way out
1054			 */
1055			if (fp->f_flag & FNONBLOCK) {
1056				error = EAGAIN;
1057				break;
1058			}
1059
1060			/*
1061			 * If read side wants to go away, we just issue a signal
1062			 * to ourselves.
1063			 */
1064			if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
1065				error = EPIPE;
1066				break;
1067			}
1068
1069			/*
1070			 * We have no more space and have something to offer,
1071			 * wake up select/poll.
1072			 */
1073			pipeselwakeup(wpipe, wpipe);
1074
1075			wpipe->pipe_state |= PIPE_WANTW;
1076
1077			error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipewr", 0);
1078
1079			if (error != 0)
1080				break;
1081		}
1082	}
1083	--wpipe->pipe_busy;
1084
1085	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1086		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1087		wakeup(wpipe);
1088	}
1089	if (wpipe->pipe_buffer.cnt > 0) {
1090		/*
1091		 * If there are any characters in the buffer, we wake up
1092		 * the reader if it was blocked waiting for data.
1093		 */
1094		if (wpipe->pipe_state & PIPE_WANTR) {
1095			wpipe->pipe_state &= ~PIPE_WANTR;
1096			wakeup(wpipe);
1097		}
1098		/*
1099		 * wake up thread blocked in select/poll or post the notification
1100		 */
1101		pipeselwakeup(wpipe, wpipe);
1102	}
1103
1104	/* Update modification, status change (# of bytes in pipe) times */
1105	pipe_touch(rpipe, PIPE_MTIME | PIPE_CTIME);
1106	pipe_touch(wpipe, PIPE_MTIME | PIPE_CTIME);
1107	PIPE_UNLOCK(rpipe);
1108
1109	return (error);
1110}
1111
1112/*
1113 * we implement a very minimal set of ioctls for compatibility with sockets.
1114 */
1115/* ARGSUSED 3 */
1116static int
1117pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
1118	__unused vfs_context_t ctx)
1119{
1120	struct pipe *mpipe = (struct pipe *)fp->f_data;
1121#if CONFIG_MACF
1122	int error;
1123#endif
1124
1125	PIPE_LOCK(mpipe);
1126
1127#if CONFIG_MACF
1128	error = mac_pipe_check_ioctl(kauth_cred_get(), mpipe, cmd);
1129	if (error) {
1130		PIPE_UNLOCK(mpipe);
1131
1132		return (error);
1133	}
1134#endif
1135
1136	switch (cmd) {
1137
1138	case FIONBIO:
1139		PIPE_UNLOCK(mpipe);
1140		return (0);
1141
1142	case FIOASYNC:
1143		if (*(int *)data) {
1144			mpipe->pipe_state |= PIPE_ASYNC;
1145		} else {
1146			mpipe->pipe_state &= ~PIPE_ASYNC;
1147		}
1148		PIPE_UNLOCK(mpipe);
1149		return (0);
1150
1151	case FIONREAD:
1152		*(int *)data = mpipe->pipe_buffer.cnt;
1153		PIPE_UNLOCK(mpipe);
1154		return (0);
1155
1156	case TIOCSPGRP:
1157		mpipe->pipe_pgid = *(int *)data;
1158
1159		PIPE_UNLOCK(mpipe);
1160		return (0);
1161
1162	case TIOCGPGRP:
1163		*(int *)data = mpipe->pipe_pgid;
1164
1165		PIPE_UNLOCK(mpipe);
1166		return (0);
1167
1168	}
1169	PIPE_UNLOCK(mpipe);
1170	return (ENOTTY);
1171}
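
/*
 * From user space, the handful of ioctls above can be exercised on a pipe
 * descriptor, e.g. FIONREAD to peek at the number of buffered bytes (a
 * sketch, not part of the kernel build):
 */
#if 0	/* user-space sketch exercising the pipe ioctls above */
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int fd[2], nready = 0;

	if (pipe(fd) == -1)
		return 1;
	write(fd[1], "abc", 3);
	if (ioctl(fd[0], FIONREAD, &nready) == 0)	/* -> pipe_buffer.cnt */
		printf("%d bytes ready\n", nready);	/* prints 3 */
	return 0;
}
#endif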
1172
1173
1174static int
1175pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
1176{
1177	struct pipe *rpipe = (struct pipe *)fp->f_data;
1178	struct pipe *wpipe;
1179	int    retnum = 0;
1180
1181	if (rpipe == NULL || rpipe == (struct pipe *)-1)
1182	        return (retnum);
1183
1184	PIPE_LOCK(rpipe);
1185
1186	wpipe = rpipe->pipe_peer;
1187
1188
1189#if CONFIG_MACF
1190	/*
1191	 * XXX We should use a per thread credential here; minimally, the
1192	 * XXX process credential should have a persistent reference on it
1193	 * XXX before being passed in here.
1194	 */
1195	if (mac_pipe_check_select(vfs_context_ucred(ctx), rpipe, which)) {
1196		PIPE_UNLOCK(rpipe);
1197		return (0);
1198	}
1199#endif
1200        switch (which) {
1201
1202        case FREAD:
1203		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1204		    (rpipe->pipe_buffer.cnt > 0) ||
1205		    (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1206
1207		        retnum = 1;
1208		} else {
1209		        rpipe->pipe_state |= PIPE_SEL;
1210		        selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1211		}
1212		break;
1213
1214        case FWRITE:
1215		if (wpipe)
1216			wpipe->pipe_state |= PIPE_WSELECT;
1217		if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
1218		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1219		     (MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
1220
1221		        retnum = 1;
1222		} else {
1223		        wpipe->pipe_state |= PIPE_SEL;
1224			selrecord(vfs_context_proc(ctx), &wpipe->pipe_sel, wql);
1225		}
1226		break;
1227        case 0:
1228	        rpipe->pipe_state |= PIPE_SEL;
1229		selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1230		break;
1231        }
1232	PIPE_UNLOCK(rpipe);
1233
1234        return (retnum);
1235}
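
/*
 * A user-space view of the readiness rules above: the read end selects
 * readable once data is buffered, and the write end selects writable while
 * at least PIPE_BUF bytes are free.  Sketch using poll():
 */
#if 0	/* user-space sketch: readiness as reported by pipe_select() above */
#include <poll.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int fd[2];
	struct pollfd pfd[2];

	if (pipe(fd) == -1)
		return 1;
	write(fd[1], "x", 1);

	pfd[0].fd = fd[0]; pfd[0].events = POLLIN;	/* FREAD case  */
	pfd[1].fd = fd[1]; pfd[1].events = POLLOUT;	/* FWRITE case */
	poll(pfd, 2, 0);

	/* read end ready (cnt > 0); write end ready (>= PIPE_BUF bytes free) */
	printf("POLLIN=%d POLLOUT=%d\n",
	    (pfd[0].revents & POLLIN) != 0, (pfd[1].revents & POLLOUT) != 0);
	return 0;
}
#endif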
1236
1237
1238/* ARGSUSED 1 */
1239static int
1240pipe_close(struct fileglob *fg, __unused vfs_context_t ctx)
1241{
1242        struct pipe *cpipe;
1243
1244	proc_fdlock_spin(vfs_context_proc(ctx));
1245	cpipe = (struct pipe *)fg->fg_data;
1246	fg->fg_data = NULL;
1247	proc_fdunlock(vfs_context_proc(ctx));
1248	if (cpipe)
1249	        pipeclose(cpipe);
1250
1251	return (0);
1252}
1253
1254static void
1255pipe_free_kmem(struct pipe *cpipe)
1256{
1257	if (cpipe->pipe_buffer.buffer != NULL) {
1258		OSAddAtomic(-(cpipe->pipe_buffer.size), &amountpipekva);
1259		OSAddAtomic(-1, &amountpipes);
1260		kfree((void *)cpipe->pipe_buffer.buffer,
1261			  cpipe->pipe_buffer.size);
1262		cpipe->pipe_buffer.buffer = NULL;
1263		cpipe->pipe_buffer.size = 0;
1264	}
1265}
1266
1267/*
1268 * shutdown the pipe
1269 */
1270static void
1271pipeclose(struct pipe *cpipe)
1272{
1273	struct pipe *ppipe;
1274
1275	if (cpipe == NULL)
1276		return;
1277	/* partially created pipes won't have a valid mutex. */
1278	if (PIPE_MTX(cpipe) != NULL)
1279		PIPE_LOCK(cpipe);
1280
1281
1282	/*
1283	 * If the other side is blocked, wake it up saying that
1284	 * we want to close it down.
1285	 */
1286	cpipe->pipe_state &= ~PIPE_DRAIN;
1287	cpipe->pipe_state |= PIPE_EOF;
1288	pipeselwakeup(cpipe, cpipe);
1289
1290	while (cpipe->pipe_busy) {
1291		cpipe->pipe_state |= PIPE_WANT;
1292
1293		wakeup(cpipe);
1294 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1295	}
1296
1297#if CONFIG_MACF
1298	/*
1299	 * Free the shared pipe label only after the two ends are disconnected.
1300	 */
1301	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
1302		mac_pipe_label_destroy(cpipe);
1303#endif
1304
1305	/*
1306	 * Disconnect from peer
1307	 */
1308	if ((ppipe = cpipe->pipe_peer) != NULL) {
1309
1310		ppipe->pipe_state &= ~(PIPE_DRAIN);
1311		ppipe->pipe_state |= PIPE_EOF;
1312
1313		pipeselwakeup(ppipe, ppipe);
1314		wakeup(ppipe);
1315
1316		if (cpipe->pipe_state & PIPE_KNOTE)
1317		        KNOTE(&ppipe->pipe_sel.si_note, 1);
1318
1319		postpipeevent(ppipe, EV_RCLOSED);
1320
1321		ppipe->pipe_peer = NULL;
1322	}
1323	evpipefree(cpipe);
1324
1325	/*
1326	 * free resources
1327	 */
1328	if (PIPE_MTX(cpipe) != NULL) {
1329		if (ppipe != NULL) {
1330			/*
1331			 * since the mutex is shared and the peer is still
1332			 * alive, we need to release the mutex, not free it
1333			 */
1334			PIPE_UNLOCK(cpipe);
1335		} else {
1336			/*
1337			 * peer is gone, so we're the sole party left with
1338			 * interest in this mutex... unlock and free it
1339			 */
1340			PIPE_UNLOCK(cpipe);
1341			lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp);
1342		}
1343	}
1344	pipe_free_kmem(cpipe);
1345	if (cpipe->pipe_state & PIPE_WSELECT) {
1346		pipe_garbage_collect(cpipe);
1347	} else {
1348		zfree(pipe_zone, cpipe);
1349		pipe_garbage_collect(NULL);
1350	}
1351
1352}
1353
1354/*ARGSUSED*/
1355static int
1356pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
1357{
1358	struct pipe *cpipe;
1359
1360	cpipe = (struct pipe *)kn->kn_fp->f_data;
1361
1362	PIPE_LOCK(cpipe);
1363#if CONFIG_MACF
1364	/*
1365	 * XXX We should use a per thread credential here; minimally, the
1366	 * XXX process credential should have a persistent reference on it
1367	 * XXX before being passed in here.
1368	 */
1369	if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) {
1370		PIPE_UNLOCK(cpipe);
1371		return (1);
1372	}
1373#endif
1374
1375	switch (kn->kn_filter) {
1376	case EVFILT_READ:
1377		kn->kn_fop = &pipe_rfiltops;
1378
1379		break;
1380	case EVFILT_WRITE:
1381		kn->kn_fop = &pipe_wfiltops;
1382
1383		if (cpipe->pipe_peer == NULL) {
1384			/*
1385			 * other end of pipe has been closed
1386			 */
1387		        PIPE_UNLOCK(cpipe);
1388			return (EPIPE);
1389		}
1390		if (cpipe->pipe_peer)
1391			cpipe = cpipe->pipe_peer;
1392		break;
1393	default:
1394	        PIPE_UNLOCK(cpipe);
1395		return (1);
1396	}
1397
1398	if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn))
1399	        cpipe->pipe_state |= PIPE_KNOTE;
1400
1401	PIPE_UNLOCK(cpipe);
1402	return (0);
1403}
1404
1405static void
1406filt_pipedetach(struct knote *kn)
1407{
1408	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1409
1410	PIPE_LOCK(cpipe);
1411
1412	if (kn->kn_filter == EVFILT_WRITE) {
1413	        if (cpipe->pipe_peer == NULL) {
1414		        PIPE_UNLOCK(cpipe);
1415			return;
1416		}
1417		cpipe = cpipe->pipe_peer;
1418	}
1419	if (cpipe->pipe_state & PIPE_KNOTE) {
1420	        if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn))
1421		        cpipe->pipe_state &= ~PIPE_KNOTE;
1422	}
1423	PIPE_UNLOCK(cpipe);
1424}
1425
1426/*ARGSUSED*/
1427static int
1428filt_piperead(struct knote *kn, long hint)
1429{
1430	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1431	struct pipe *wpipe;
1432	int    retval;
1433
1434	/*
1435	 * if hint == 0, then we've been called from the kevent
1436	 * world directly and do not currently hold the pipe mutex...
1437	 * if hint == 1, we're being called back via the KNOTE post
1438	 * we made in pipeselwakeup, and we already hold the mutex...
1439	 */
1440	if (hint == 0)
1441	        PIPE_LOCK(rpipe);
1442
1443	wpipe = rpipe->pipe_peer;
1444	kn->kn_data = rpipe->pipe_buffer.cnt;
1445	if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
1446	    (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1447		kn->kn_flags |= EV_EOF;
1448		retval = 1;
1449	} else {
1450		int64_t lowwat = 1;
1451		if (kn->kn_sfflags & NOTE_LOWAT) {
1452			if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe))
1453				lowwat = MAX_PIPESIZE(rpipe);
1454			else if (kn->kn_sdata > lowwat)
1455				lowwat = kn->kn_sdata;
1456		}
1457		retval = kn->kn_data >= lowwat;
1458	}
1459
1460	if (hint == 0)
1461	        PIPE_UNLOCK(rpipe);
1462
1463	return (retval);
1464}
1465
1466/*ARGSUSED*/
1467static int
1468filt_pipewrite(struct knote *kn, long hint)
1469{
1470	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1471	struct pipe *wpipe;
1472
1473	/*
1474	 * if hint == 0, then we've been called from the kevent
1475	 * world directly and do not currently hold the pipe mutex...
1476	 * if hint == 1, we're being called back via the KNOTE post
1477	 * we made in pipeselwakeup, and we already hold the mutex...
1478	 */
1479	if (hint == 0)
1480	        PIPE_LOCK(rpipe);
1481
1482	wpipe = rpipe->pipe_peer;
1483
1484	if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1485		kn->kn_data = 0;
1486		kn->kn_flags |= EV_EOF;
1487
1488		if (hint == 0)
1489		        PIPE_UNLOCK(rpipe);
1490		return (1);
1491	}
1492	kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt;
1493
1494	int64_t lowwat = PIPE_BUF;
1495	if (kn->kn_sfflags & NOTE_LOWAT) {
1496		if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe))
1497			lowwat = MAX_PIPESIZE(wpipe);
1498		else if (kn->kn_sdata > lowwat)
1499			lowwat = kn->kn_sdata;
1500	}
1501
1502	if (hint == 0)
1503	        PIPE_UNLOCK(rpipe);
1504
1505	return (kn->kn_data >= lowwat);
1506}
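
/*
 * The filters above back EVFILT_READ/EVFILT_WRITE kevents on pipe
 * descriptors; kn_sdata is the kevent data field, so a low-water mark can
 * be requested from user space roughly like this (a sketch):
 */
#if 0	/* user-space sketch for the pipe knote filters above */
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int fd[2], kq, n;
	struct kevent kev, out;
	struct timespec ts = { 0, 0 };

	if (pipe(fd) == -1 || (kq = kqueue()) == -1)
		return 1;

	/* only fire once at least 64 bytes are buffered (NOTE_LOWAT -> lowwat) */
	EV_SET(&kev, fd[0], EVFILT_READ, EV_ADD, NOTE_LOWAT, 64, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	write(fd[1], "0123456789", 10);		/* still below the low-water mark */
	n = kevent(kq, NULL, 0, &out, 1, &ts);
	printf("events ready: %d\n", n);	/* prints 0 */
	return 0;
}
#endif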
1507
1508int
1509fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo)
1510{
1511#if CONFIG_MACF
1512        int error;
1513#endif
1514	struct timeval now;
1515	struct vinfo_stat * ub;
1516	int pipe_size = 0;
1517	int pipe_count;
1518
1519	if (cpipe == NULL)
1520	        return (EBADF);
1521	PIPE_LOCK(cpipe);
1522
1523#if CONFIG_MACF
1524	error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
1525	if (error) {
1526		PIPE_UNLOCK(cpipe);
1527	        return (error);
1528	}
1529#endif
1530	if (cpipe->pipe_buffer.buffer == 0) {
1531	        /*
1532		 * must be stat'ing the write fd
1533		 */
1534	        if (cpipe->pipe_peer) {
1535		        /*
1536			 * the peer still exists, use its info
1537			 */
1538		        pipe_size  = MAX_PIPESIZE(cpipe->pipe_peer);
1539			pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
1540		} else {
1541			pipe_count = 0;
1542		}
1543	} else {
1544	        pipe_size  = MAX_PIPESIZE(cpipe);
1545		pipe_count = cpipe->pipe_buffer.cnt;
1546	}
1547	/*
1548	 * since the peer's buffer is set up outside of the lock
1549	 * we might catch it in a transient state
1550	 */
1551	if (pipe_size == 0)
1552		pipe_size  = PIPE_SIZE;
1553
1554	ub = &pinfo->pipe_stat;
1555
1556	bzero(ub, sizeof(*ub));
1557	ub->vst_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
1558	ub->vst_blksize = pipe_size;
1559	ub->vst_size = pipe_count;
1560	if (ub->vst_blksize != 0)
1561		ub->vst_blocks = (ub->vst_size + ub->vst_blksize - 1) / ub->vst_blksize;
1562	ub->vst_nlink = 1;
1563
1564	ub->vst_uid = kauth_getuid();
1565	ub->vst_gid = kauth_getgid();
1566
1567	microtime(&now);
1568	ub->vst_atime  = now.tv_sec;
1569	ub->vst_atimensec = now.tv_usec * 1000;
1570
1571	ub->vst_mtime  = now.tv_sec;
1572	ub->vst_mtimensec = now.tv_usec * 1000;
1573
1574	ub->vst_ctime  = now.tv_sec;
1575	ub->vst_ctimensec = now.tv_usec * 1000;
1576
1577	/*
1578	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen, st_uid, st_gid.
1579	 * XXX (st_dev, st_ino) should be unique.
1580	 */
1581
1582	pinfo->pipe_handle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
1583	pinfo->pipe_peerhandle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(cpipe->pipe_peer));
1584	pinfo->pipe_status = cpipe->pipe_state;
1585
1586	PIPE_UNLOCK(cpipe);
1587
1588	return (0);
1589}
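
/*
 * fill_pipeinfo() backs the PROC_PIDFDPIPEINFO flavor of the proc_info
 * interface.  Assuming the usual libproc declarations, the same data can be
 * retrieved from user space roughly as follows (a sketch):
 */
#if 0	/* user-space sketch for the proc_info path served by fill_pipeinfo() */
#include <libproc.h>
#include <sys/proc_info.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int fd[2];
	struct pipe_fdinfo pfi;

	if (pipe(fd) == -1)
		return 1;
	write(fd[1], "abc", 3);

	if (proc_pidfdinfo(getpid(), fd[0], PROC_PIDFDPIPEINFO,
	    &pfi, sizeof(pfi)) <= 0)
		return 1;
	printf("handle=%llx peer=%llx status=0x%x\n",
	    (unsigned long long)pfi.pipeinfo.pipe_handle,
	    (unsigned long long)pfi.pipeinfo.pipe_peerhandle,
	    (unsigned int)pfi.pipeinfo.pipe_status);
	return 0;
}
#endif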
1590
1591
1592static int
1593pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
1594{
1595
1596	/* Note: fdlock already held */
1597	struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data);
1598
1599	if (cpipe) {
1600		PIPE_LOCK(cpipe);
1601		cpipe->pipe_state |= PIPE_DRAIN;
1602		cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1603		wakeup(cpipe);
1604
1605		/* Must wake up peer: a writer sleeps on the read side */
1606		if ((ppipe = cpipe->pipe_peer)) {
1607			ppipe->pipe_state |= PIPE_DRAIN;
1608			ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1609			wakeup(ppipe);
1610		}
1611
1612		PIPE_UNLOCK(cpipe);
1613		return 0;
1614	}
1615
1616	return 1;
1617}
1618
1619
1620/*
1621 * When a thread sets a write-select on a pipe, it creates an implicit,
1622 * untracked dependency between that thread and the peer of the pipe
1623 * on which the select is set.  If the peer pipe is closed and freed
1624 * before the select()ing thread wakes up, the system will panic as
1625 * it attempts to unwind the dangling select().  To avoid that panic,
1626 * we notice whenever a dangerous select() is set on a pipe, and
1627 * defer the final deletion of the pipe until those select()s are all
1628 * resolved.  Since we can't currently detect exactly when that
1629 * resolution happens, we use a simple garbage collection queue to
1630 * reap the at-risk pipes 'later'.
1631 */
1632static void
1633pipe_garbage_collect(struct pipe *cpipe)
1634{
1635	uint64_t old, now;
1636	struct pipe_garbage *pgp;
1637
1638	/* Convert msecs to nsecs and then to abstime */
1639	old = pipe_garbage_age_limit * 1000000;
1640	nanoseconds_to_absolutetime(old, &old);
1641
1642	lck_mtx_lock(pipe_garbage_lock);
1643
1644	/* Free anything that's been on the queue for <mumble> seconds */
1645	now = mach_absolute_time();
1646	old = now - old;
1647	while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) {
1648		pipe_garbage_head = pgp->pg_next;
1649		if (pipe_garbage_head == NULL)
1650			pipe_garbage_tail = NULL;
1651		pipe_garbage_count--;
1652		zfree(pipe_zone, pgp->pg_pipe);
1653		zfree(pipe_garbage_zone, pgp);
1654	}
1655
1656	/* Add the new pipe (if any) to the tail of the garbage queue */
1657	if (cpipe) {
1658		cpipe->pipe_state = PIPE_DEAD;
1659		pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone);
1660		if (pgp == NULL) {
1661			/*
1662			 * We're too low on memory to garbage collect the
1663			 * pipe.  Freeing it runs the risk of panicking the
1664			 * system.  All we can do is leak it and leave
1665			 * a breadcrumb behind.  The good news, such as it
1666			 * is, is that this will probably never happen.
1667			 * We will probably hit the panic below first.
1668			 */
1669			printf("Leaking pipe %p - no room left in the queue",
1670			    cpipe);
1671			lck_mtx_unlock(pipe_garbage_lock);
1672			return;
1673		}
1674
1675		pgp->pg_pipe = cpipe;
1676		pgp->pg_timestamp = now;
1677		pgp->pg_next = NULL;
1678
1679		if (pipe_garbage_tail)
1680			pipe_garbage_tail->pg_next = pgp;
1681		pipe_garbage_tail = pgp;
1682		if (pipe_garbage_head == NULL)
1683			pipe_garbage_head = pipe_garbage_tail;
1684
1685		if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT)
1686			panic("Length of pipe garbage queue exceeded %d",
1687			    PIPE_GARBAGE_QUEUE_LIMIT);
1688	}
1689	lck_mtx_unlock(pipe_garbage_lock);
1690}
1691
1692