1/*
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19/*
20 * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
21 *
22 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
23 *
24 * This file contains Original Code and/or Modifications of Original Code
25 * as defined in and that are subject to the Apple Public Source License
26 * Version 2.0 (the 'License'). You may not use this file except in
27 * compliance with the License. The rights granted to you under the License
28 * may not be used to create, or enable the creation or redistribution of,
29 * unlawful or unlicensed copies of an Apple operating system, or to
30 * circumvent, violate, or enable the circumvention or violation of, any
31 * terms of an Apple operating system software license agreement.
32 *
33 * Please obtain a copy of the License at
34 * http://www.opensource.apple.com/apsl/ and read it before using this file.
35 *
36 * The Original Code and all software distributed under the License are
37 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
38 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
39 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
40 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
41 * Please see the License for the specific language governing rights and
42 * limitations under the License.
43 *
44 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
45 */
46/*
47 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
48 * support for mandatory and extensible security protections.  This notice
49 * is included in support of clause 2.2 (b) of the Apple Public License,
50 * Version 2.0.
51 */
52
53/*
54 * This file contains a high-performance replacement for the socket-based
55 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
56 * all features of sockets, but does do everything that pipes normally
57 * do.
58 *
 * Pipes are implemented as circular buffers.  The following are the valid states of pipe operations:
60 *
61 *      _________________________________
62 * 1.  |_________________________________| r=w, c=0
63 *
64 *      _________________________________
65 * 2.  |__r:::::wc_______________________| r <= w , c > 0
66 *
67 *      _________________________________
68 * 3.  |::::wc_____r:::::::::::::::::::::| r>w , c > 0
69 *
70 *      _________________________________
71 * 4.  |:::::::wrc:::::::::::::::::::::::| w=r, c = Max size
72 *
73 *
 *  Nomenclature:
 *  a-g label the steps in a program flow
 *  1-4 are the states as defined above
 *  Action: the file operation performed on the pipe
 *
 *  Current: None  Action: initialize with size M=200
 *  a. State 1 (r=0, w=0, c=0)
 *
 *  Current: a    Action: write(100) (w < M)
 *  b. State 2 (r=0, w=100, c=100)
 *
 *  Current: b    Action: write(100) (w = M-w)
 *  c. State 4 (r=0, w=0, c=200)
 *
 *  Current: b    Action: read(70)  (r < c)
 *  d. State 2 (r=70, w=100, c=30)
 *
 *  Current: d    Action: write(75) (w < (M-w))
 *  e. State 2 (r=70, w=175, c=105)
 *
 *  Current: d    Action: write(110) (w > (M-w))
 *  f. State 3 (r=70, w=10, c=140)
 *
 *  Current: d    Action: read(30) (r >= c)
 *  g. State 1 (r=100, w=100, c=0)
99 *
100 */
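
/*
 * Illustrative only: a minimal user-space sketch (not kernel code, and the
 * helper name is made up) that walks the buffer through steps a, b and d
 * above, assuming the standard POSIX pipe(2)/write(2)/read(2) interfaces and
 * a kernel buffer large enough that no wraparound occurs (the size actually
 * chosen is up to choose_pipespace() below).  Error handling is omitted for
 * brevity.
 *
 *    #include <unistd.h>
 *
 *    int
 *    pipe_state_walk(void)
 *    {
 *        int fds[2];
 *        char buf[100] = { 0 };
 *
 *        if (pipe(fds) == -1)              // a. State 1: r=0, w=0, c=0
 *            return -1;
 *        (void)write(fds[1], buf, 100);    // b. State 2: r=0, w=100, c=100
 *        (void)read(fds[0], buf, 70);      // d. State 2: r=70, w=100, c=30
 *        close(fds[0]);
 *        close(fds[1]);
 *        return 0;
 *    }
 */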
101
102/*
 * This code creates half-duplex pipe buffers to facilitate file-like
 * operations on pipes.  The initial buffer is very small, but it can
 * dynamically grow to larger sizes based on usage.  The buffer size is never
 * reduced.  The total amount of kernel memory used is governed by maxpipekva.
 * If that expansion limit is reached, the writing thread is blocked
 * until the pipe buffer empties enough to continue.
 *
 * In order to limit the resource use of pipes, the following sysctl exists:
111 *
112 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
113 * address space available to us in pipe_map.
114 *
115 * Memory usage may be monitored through the sysctls
116 * kern.ipc.pipes, kern.ipc.pipekva.
117 *
118 */
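
/*
 * Illustrative only: the usage counters mentioned above can be sampled from
 * user space.  A minimal sketch (the helper name is made up), assuming the
 * standard sysctlbyname(3) interface; note that these sysctls are only
 * registered when PIPE_SYSCTLS is defined (see below), so the lookups may
 * simply fail with ENOENT:
 *
 *    #include <sys/types.h>
 *    #include <sys/sysctl.h>
 *    #include <stdio.h>
 *
 *    static void
 *    show_pipe_usage(void)
 *    {
 *        int pipes = 0, kva = 0;
 *        size_t len = sizeof(int);
 *
 *        (void)sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0);
 *        len = sizeof(int);
 *        (void)sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0);
 *        printf("%d pipes using %d bytes of kernel VA\n", pipes, kva);
 *    }
 */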
119
120#include <sys/param.h>
121#include <sys/systm.h>
122#include <sys/filedesc.h>
123#include <sys/kernel.h>
124#include <sys/vnode.h>
125#include <sys/proc_internal.h>
126#include <sys/kauth.h>
127#include <sys/file_internal.h>
128#include <sys/stat.h>
129#include <sys/ioctl.h>
130#include <sys/fcntl.h>
131#include <sys/malloc.h>
132#include <sys/syslog.h>
133#include <sys/unistd.h>
134#include <sys/resourcevar.h>
135#include <sys/aio_kern.h>
136#include <sys/signalvar.h>
137#include <sys/pipe.h>
138#include <sys/sysproto.h>
139#include <sys/proc_info.h>
140
141#include <security/audit/audit.h>
142
143#include <sys/kdebug.h>
144
145#include <kern/zalloc.h>
146#include <kern/kalloc.h>
147#include <vm/vm_kern.h>
148#include <libkern/OSAtomic.h>
149
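/*
 * Convenience shorthands: map the f_* names used below onto the corresponding
 * fields of the fileglob hanging off a fileproc.
 */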
150#define f_flag f_fglob->fg_flag
151#define f_type f_fglob->fg_type
152#define f_msgcount f_fglob->fg_msgcount
153#define f_cred f_fglob->fg_cred
154#define f_ops f_fglob->fg_ops
155#define f_offset f_fglob->fg_offset
156#define f_data f_fglob->fg_data
157
158/*
159 * interfaces to the outside world exported through file operations
160 */
161static int pipe_read(struct fileproc *fp, struct uio *uio,
162                int flags, vfs_context_t ctx);
163static int pipe_write(struct fileproc *fp, struct uio *uio,
164                int flags, vfs_context_t ctx);
165static int pipe_close(struct fileglob *fg, vfs_context_t ctx);
166static int pipe_select(struct fileproc *fp, int which, void * wql,
167		vfs_context_t ctx);
168static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
169		vfs_context_t ctx);
170static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
171		vfs_context_t ctx);
static int pipe_drain(struct fileproc *fp, vfs_context_t ctx);
173
174struct  fileops pipeops =
175  { pipe_read,
176    pipe_write,
177    pipe_ioctl,
178    pipe_select,
179    pipe_close,
180    pipe_kqfilter,
181    pipe_drain };
182
183static void	filt_pipedetach(struct knote *kn);
184static int	filt_piperead(struct knote *kn, long hint);
185static int	filt_pipewrite(struct knote *kn, long hint);
186
187static struct filterops pipe_rfiltops = {
188        .f_isfd = 1,
189        .f_detach = filt_pipedetach,
190        .f_event = filt_piperead,
191};
192
193static struct filterops pipe_wfiltops = {
194        .f_isfd = 1,
195        .f_detach = filt_pipedetach,
196        .f_event = filt_pipewrite,
197};
198
static int nbigpipe;      /* retained for compatibility's sake; no longer used */
200static int amountpipes;   /* total number of pipes in system */
201static int amountpipekva; /* total memory used by pipes */
202
203int maxpipekva = PIPE_KVAMAX;  /* allowing 16MB max. */
204
205#if PIPE_SYSCTLS
206SYSCTL_DECL(_kern_ipc);
207
208SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
209	   &maxpipekva, 0, "Pipe KVA limit");
210SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED,
211	   &maxpipekvawired, 0, "Pipe KVA wired limit");
212SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED,
213	   &amountpipes, 0, "Current # of pipes");
214SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED,
215	   &nbigpipe, 0, "Current # of big pipes");
216SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
217	   &amountpipekva, 0, "Pipe KVA usage");
218SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED,
219	   &amountpipekvawired, 0, "Pipe wired KVA usage");
220#endif
221
222static void pipeclose(struct pipe *cpipe);
223static void pipe_free_kmem(struct pipe *cpipe);
224static int pipe_create(struct pipe **cpipep);
225static int pipespace(struct pipe *cpipe, int size);
226static int choose_pipespace(unsigned long current, unsigned long expected);
227static int expand_pipespace(struct pipe *p, int target_size);
228static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe);
229static __inline int pipeio_lock(struct pipe *cpipe, int catch);
230static __inline void pipeio_unlock(struct pipe *cpipe);
231
232extern int postpipeevent(struct pipe *, int);
233extern void evpipefree(struct pipe *cpipe);
234
235static lck_grp_t	*pipe_mtx_grp;
236static lck_attr_t	*pipe_mtx_attr;
237static lck_grp_attr_t	*pipe_mtx_grp_attr;
238
239static zone_t pipe_zone;
240
241#define MAX_PIPESIZE(pipe)  		( MAX(PIPE_SIZE, (pipe)->pipe_buffer.size) )
242
243#define	PIPE_GARBAGE_AGE_LIMIT		5000	/* In milliseconds */
244#define PIPE_GARBAGE_QUEUE_LIMIT	32000
245
246struct pipe_garbage {
247	struct pipe		*pg_pipe;
248	struct pipe_garbage	*pg_next;
249	uint64_t		pg_timestamp;
250};
251
252static zone_t pipe_garbage_zone;
253static struct pipe_garbage *pipe_garbage_head = NULL;
254static struct pipe_garbage *pipe_garbage_tail = NULL;
255static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT;
256static int pipe_garbage_count = 0;
257static lck_mtx_t *pipe_garbage_lock;
258static void pipe_garbage_collect(struct pipe *cpipe);
259
260SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
261
262/* initial setup done at time of sysinit */
263void
264pipeinit(void)
265{
	vm_size_t zone_size;

	nbigpipe = 0;

	zone_size = 8192 * sizeof(struct pipe);
	pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone");
271
272
273	/* allocate lock group attribute and group for pipe mutexes */
274	pipe_mtx_grp_attr = lck_grp_attr_alloc_init();
275	pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr);
276
277	/* allocate the lock attribute for pipe mutexes */
278	pipe_mtx_attr = lck_attr_alloc_init();
279
280	/*
281	 * Set up garbage collection for dead pipes
282	 */
283	zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) *
284	    sizeof(struct pipe_garbage);
285        pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage),
286	    zone_size, 4096, "pipe garbage zone");
287	pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr);
288
289}
290
291/* Bitmap for things to touch in pipe_touch() */
292#define	PIPE_ATIME	0x00000001	/* time of last access */
293#define	PIPE_MTIME	0x00000002	/* time of last modification */
294#define	PIPE_CTIME	0x00000004	/* time of last status change */
295
296static void
297pipe_touch(struct pipe *tpipe, int touch)
298{
299	struct timeval now;
300
301	microtime(&now);
302
303	if (touch & PIPE_ATIME) {
304		tpipe->st_atimespec.tv_sec  = now.tv_sec;
305		tpipe->st_atimespec.tv_nsec = now.tv_usec * 1000;
306	}
307
308	if (touch & PIPE_MTIME) {
309		tpipe->st_mtimespec.tv_sec  = now.tv_sec;
310		tpipe->st_mtimespec.tv_nsec = now.tv_usec * 1000;
311	}
312
313	if (touch & PIPE_CTIME) {
314		tpipe->st_ctimespec.tv_sec  = now.tv_sec;
315		tpipe->st_ctimespec.tv_nsec = now.tv_usec * 1000;
316	}
317}
318
static const unsigned int pipesize_blocks[] = {128, 256, 1024, 2048, PAGE_SIZE, PAGE_SIZE * 2, PIPE_SIZE, PIPE_SIZE * 4};
320
/*
 * Finds the right size from the possible sizes in pipesize_blocks:
 * returns the entry that accommodates max(current, expected).
 */
static int
choose_pipespace(unsigned long current, unsigned long expected)
{
	int i = sizeof(pipesize_blocks) / sizeof(unsigned int) - 1;
	unsigned long target;

	if (expected > current)
		target = expected;
	else
		target = current;

	while (i > 0 && pipesize_blocks[i - 1] > target) {
		i--;
	}

	return pipesize_blocks[i];
}
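
/*
 * For example, with the table above choose_pipespace(0, 200) settles on the
 * 256-byte block, while a request larger than the biggest entry is clamped
 * to the last entry (PIPE_SIZE * 4).
 */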
343
344
/*
 * Expand the size of the pipe while there is data to be read,
 * and then free the old buffer once the currently buffered
 * data has been transferred to the new storage.
 * Required: PIPE_LOCK and the io lock to be held by the caller.
 * Returns 0 on success or if no expansion is needed; otherwise an error.
 */
352static int
353expand_pipespace(struct pipe *p, int target_size)
354{
355	struct pipe tmp, oldpipe;
356	int error;
357	tmp.pipe_buffer.buffer = 0;
358
359	if (p->pipe_buffer.size >= (unsigned) target_size) {
360		return 0; /* the existing buffer is max size possible */
361	}
362
363	/* create enough space in the target */
364	error = pipespace(&tmp, target_size);
365	if (error != 0)
366		return (error);
367
368	oldpipe.pipe_buffer.buffer = p->pipe_buffer.buffer;
369	oldpipe.pipe_buffer.size = p->pipe_buffer.size;
370
371	memcpy(tmp.pipe_buffer.buffer, p->pipe_buffer.buffer, p->pipe_buffer.size);
	if (p->pipe_buffer.cnt > 0 && p->pipe_buffer.in <= p->pipe_buffer.out) {
		/*
		 * We are in State 3 (or the buffer is completely full): the
		 * unread data wraps around the end of the old buffer.  Lay a
		 * second copy of the old buffer right after the first one and
		 * push the write index past it, so the unread region becomes
		 * one contiguous run for the reader.
		 */
		memcpy(&tmp.pipe_buffer.buffer[p->pipe_buffer.size], p->pipe_buffer.buffer, p->pipe_buffer.size);
		p->pipe_buffer.in += p->pipe_buffer.size;
	}
377
378	p->pipe_buffer.buffer = tmp.pipe_buffer.buffer;
379	p->pipe_buffer.size = tmp.pipe_buffer.size;
380
381
382	pipe_free_kmem(&oldpipe);
383	return 0;
384}
385
386/*
387 * The pipe system call for the DTYPE_PIPE type of pipes
388 *
389 * returns:
390 *  FREAD  | fd0 | -->[struct rpipe] --> |~~buffer~~| \
391 *                                                    (pipe_mutex)
392 *  FWRITE | fd1 | -->[struct wpipe] --X              /
393 */
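
/*
 * On success, retval[0] carries the read end (FREAD) and retval[1] the write
 * end (FWRITE); only the read end's buffer is allocated up front.
 */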
394
395/* ARGSUSED */
396int
397pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
398{
399	struct fileproc *rf, *wf;
400	struct pipe *rpipe, *wpipe;
401	lck_mtx_t   *pmtx;
402	int fd, error;
403
404	if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL)
405	        return (ENOMEM);
406
407	rpipe = wpipe = NULL;
408	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
409	        error = ENFILE;
410		goto freepipes;
411	}
412        /*
413	 * allocate the space for the normal I/O direction up
414	 * front... we'll delay the allocation for the other
415	 * direction until a write actually occurs (most likely it won't)...
416         */
417	error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0));
418        if (error)
419	        goto freepipes;
420
421	TAILQ_INIT(&rpipe->pipe_evlist);
422	TAILQ_INIT(&wpipe->pipe_evlist);
423
424	error = falloc(p, &rf, &fd, vfs_context_current());
425	if (error) {
426	        goto freepipes;
427	}
428	retval[0] = fd;
429
430	/*
	 * for now we'll create half-duplex pipes (see the "returns" diagram above).
432	 * this is what we've always supported..
433	 */
434	rf->f_flag = FREAD;
435	rf->f_type = DTYPE_PIPE;
436	rf->f_data = (caddr_t)rpipe;
437	rf->f_ops = &pipeops;
438
439	error = falloc(p, &wf, &fd, vfs_context_current());
440	if (error) {
441		fp_free(p, retval[0], rf);
442	        goto freepipes;
443	}
444	wf->f_flag = FWRITE;
445	wf->f_type = DTYPE_PIPE;
446	wf->f_data = (caddr_t)wpipe;
447	wf->f_ops = &pipeops;
448
449	rpipe->pipe_peer = wpipe;
450	wpipe->pipe_peer = rpipe;
451	/* both structures share the same mutex */
452	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
453
454	retval[1] = fd;
455#if CONFIG_MACF
456	/*
457	 * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX
458	 *
459	 * struct pipe represents a pipe endpoint.  The MAC label is shared
460	 * between the connected endpoints.  As a result mac_pipe_label_init() and
461	 * mac_pipe_label_associate() should only be called on one of the endpoints
462	 * after they have been connected.
463	 */
464	mac_pipe_label_init(rpipe);
465	mac_pipe_label_associate(kauth_cred_get(), rpipe);
466	wpipe->pipe_label = rpipe->pipe_label;
467#endif
468	proc_fdlock_spin(p);
469	procfdtbl_releasefd(p, retval[0], NULL);
470	procfdtbl_releasefd(p, retval[1], NULL);
471	fp_drop(p, retval[0], rf, 1);
472	fp_drop(p, retval[1], wf, 1);
473	proc_fdunlock(p);
474
475
476	return (0);
477
478freepipes:
479	pipeclose(rpipe);
480	pipeclose(wpipe);
481	lck_mtx_free(pmtx, pipe_mtx_grp);
482
483	return (error);
484}
485
486int
487pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
488{
489#if CONFIG_MACF
490        int error;
491#endif
492	int	pipe_size = 0;
493	int	pipe_count;
494	struct stat *sb = (struct stat *)0;	/* warning avoidance ; protected by isstat64 */
495	struct stat64 * sb64 = (struct stat64 *)0;  /* warning avoidance ; protected by isstat64 */
496
497	if (cpipe == NULL)
498	        return (EBADF);
499	PIPE_LOCK(cpipe);
500
501#if CONFIG_MACF
502	error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
503	if (error) {
504		PIPE_UNLOCK(cpipe);
505	        return (error);
506	}
507#endif
508	if (cpipe->pipe_buffer.buffer == 0) {
509	        /* must be stat'ing the write fd */
510	        if (cpipe->pipe_peer) {
		        /* the peer still exists, use its info */
512		        pipe_size  = MAX_PIPESIZE(cpipe->pipe_peer);
513			pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
514		} else {
515			pipe_count = 0;
516		}
517	} else {
518	        pipe_size  = MAX_PIPESIZE(cpipe);
519		pipe_count = cpipe->pipe_buffer.cnt;
520	}
	/*
	 * since the peer's buffer is set up outside of the lock
	 * we might catch it in a transient state
	 */
525	if (pipe_size == 0)
526		pipe_size  = MAX(PIPE_SIZE, pipesize_blocks[0]);
527
528	if (isstat64 != 0) {
529		sb64 = (struct stat64 *)ub;
530
531		bzero(sb64, sizeof(*sb64));
532		sb64->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
533		sb64->st_blksize = pipe_size;
534		sb64->st_size = pipe_count;
535		sb64->st_blocks = (sb64->st_size + sb64->st_blksize - 1) / sb64->st_blksize;
536
537		sb64->st_uid = kauth_getuid();
538		sb64->st_gid = kauth_getgid();
539
540		sb64->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
541		sb64->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
542
543		sb64->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
544		sb64->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
545
546		sb64->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
547		sb64->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
548
549		/*
550	 	* Return a relatively unique inode number based on the current
551	 	* address of this pipe's struct pipe.  This number may be recycled
552	 	* relatively quickly.
553	 	*/
554		sb64->st_ino = (ino64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
555	} else {
556		sb = (struct stat *)ub;
557
558		bzero(sb, sizeof(*sb));
559		sb->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
560		sb->st_blksize = pipe_size;
561		sb->st_size = pipe_count;
562		sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
563
564		sb->st_uid = kauth_getuid();
565		sb->st_gid = kauth_getgid();
566
567		sb->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
568		sb->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
569
570		sb->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
571		sb->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
572
573		sb->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
574		sb->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
575
576		/*
577	 	* Return a relatively unique inode number based on the current
578	 	* address of this pipe's struct pipe.  This number may be recycled
579	 	* relatively quickly.
580	 	*/
581		sb->st_ino = (ino_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
582	}
583	PIPE_UNLOCK(cpipe);
584
585	/*
586	 * POSIX: Left as 0: st_dev, st_nlink, st_rdev, st_flags, st_gen,
587	 * st_uid, st_gid.
588	 *
589	 * XXX (st_dev) should be unique, but there is no device driver that
590	 * XXX is associated with pipes, since they are implemented via a
591	 * XXX struct fileops indirection rather than as FS objects.
592	 */
593	return (0);
594}
595
596
/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if the allocation
 * fails it will retain the old buffer and return ENOMEM.
 */
603static int
604pipespace(struct pipe *cpipe, int size)
605{
606	vm_offset_t buffer;
607
608	if (size <= 0)
609		return(EINVAL);
610
611	if ((buffer = (vm_offset_t)kalloc(size)) == 0 )
612		return(ENOMEM);
613
614	/* free old resources if we're resizing */
615	pipe_free_kmem(cpipe);
616	cpipe->pipe_buffer.buffer = (caddr_t)buffer;
617	cpipe->pipe_buffer.size = size;
618	cpipe->pipe_buffer.in = 0;
619	cpipe->pipe_buffer.out = 0;
620	cpipe->pipe_buffer.cnt = 0;
621
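	/*
	 * These counters back the kern.ipc.pipes and kern.ipc.pipekva sysctls
	 * declared above (when PIPE_SYSCTLS is defined).
	 */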
622	OSAddAtomic(1, &amountpipes);
623	OSAddAtomic(cpipe->pipe_buffer.size, &amountpipekva);
624
625	return (0);
626}
627
628/*
629 * initialize and allocate VM and memory for pipe
630 */
631static int
632pipe_create(struct pipe **cpipep)
633{
634	struct pipe *cpipe;
635	cpipe = (struct pipe *)zalloc(pipe_zone);
636
637	if ((*cpipep = cpipe) == NULL)
638		return (ENOMEM);
639
640	/*
641	 * protect so pipespace or pipeclose don't follow a junk pointer
642	 * if pipespace() fails.
643	 */
644	bzero(cpipe, sizeof *cpipe);
645
646	/* Initial times are all the time of creation of the pipe */
647	pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
648	return (0);
649}
650
651
652/*
653 * lock a pipe for I/O, blocking other access
654 */
655static inline int
656pipeio_lock(struct pipe *cpipe, int catch)
657{
658	int error;
659	while (cpipe->pipe_state & PIPE_LOCKFL) {
660		cpipe->pipe_state |= PIPE_LWANT;
661		error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO,
662			       "pipelk", 0);
663		if (error != 0)
664			return (error);
665	}
666	cpipe->pipe_state |= PIPE_LOCKFL;
667	return (0);
668}
669
670/*
671 * unlock a pipe I/O lock
672 */
673static inline void
674pipeio_unlock(struct pipe *cpipe)
675{
676	cpipe->pipe_state &= ~PIPE_LOCKFL;
677	if (cpipe->pipe_state & PIPE_LWANT) {
678		cpipe->pipe_state &= ~PIPE_LWANT;
679		wakeup(cpipe);
680	}
681}
682
/*
 * wake up anyone who's blocked in select
 */
686static void
687pipeselwakeup(struct pipe *cpipe, struct pipe *spipe)
688{
689	if (cpipe->pipe_state & PIPE_SEL) {
690		cpipe->pipe_state &= ~PIPE_SEL;
691		selwakeup(&cpipe->pipe_sel);
692	}
693        if (cpipe->pipe_state & PIPE_KNOTE)
694	       KNOTE(&cpipe->pipe_sel.si_note, 1);
695
696	postpipeevent(cpipe, EV_RWBYTES);
697
698	if (spipe && (spipe->pipe_state & PIPE_ASYNC) && spipe->pipe_pgid) {
699	        if (spipe->pipe_pgid < 0)
700		        gsignal(-spipe->pipe_pgid, SIGIO);
701		else
702		        proc_signal(spipe->pipe_pgid, SIGIO);
703        }
704}
705
/*
 * Read n bytes from the buffer.  Semantics are similar to a file read.
 * Returns 0 on success or an errno value; the number of bytes read is
 * reflected in the uio.
 */
710/* ARGSUSED */
711static int
712pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
713	__unused vfs_context_t ctx)
714{
715	struct pipe *rpipe = (struct pipe *)fp->f_data;
716	int error;
717	int nread = 0;
718	u_int size;
719
720	PIPE_LOCK(rpipe);
721	++rpipe->pipe_busy;
722
723	error = pipeio_lock(rpipe, 1);
724	if (error)
725		goto unlocked_error;
726
727#if CONFIG_MACF
728	error = mac_pipe_check_read(kauth_cred_get(), rpipe);
729	if (error)
730		goto locked_error;
731#endif
732
733
734	while (uio_resid(uio)) {
735		/*
736		 * normal pipe buffer receive
737		 */
738		if (rpipe->pipe_buffer.cnt > 0) {
739			/*
740			 * # bytes to read is min( bytes from read pointer until end of buffer,
741			 *                         total unread bytes,
742			 *                         user requested byte count)
743			 */
744			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
745			if (size > rpipe->pipe_buffer.cnt)
746				size = rpipe->pipe_buffer.cnt;
747			// LP64todo - fix this!
748			if (size > (u_int) uio_resid(uio))
749				size = (u_int) uio_resid(uio);
750
751			PIPE_UNLOCK(rpipe); /* we still hold io lock.*/
752			error = uiomove(
753			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
754			    size, uio);
755			PIPE_LOCK(rpipe);
756			if (error)
757				break;
758
759			rpipe->pipe_buffer.out += size;
760			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
761				rpipe->pipe_buffer.out = 0;
762
763			rpipe->pipe_buffer.cnt -= size;
764
765			/*
766			 * If there is no more to read in the pipe, reset
767			 * its pointers to the beginning.  This improves
768			 * cache hit stats.
769			 */
770			if (rpipe->pipe_buffer.cnt == 0) {
771				rpipe->pipe_buffer.in = 0;
772				rpipe->pipe_buffer.out = 0;
773			}
774			nread += size;
775		} else {
776			/*
777			 * detect EOF condition
778			 * read returns 0 on EOF, no need to set error
779			 */
780			if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
781				break;
782			}
783
784			/*
785			 * If the "write-side" has been blocked, wake it up now.
786			 */
787			if (rpipe->pipe_state & PIPE_WANTW) {
788				rpipe->pipe_state &= ~PIPE_WANTW;
789				wakeup(rpipe);
790			}
791
792			/*
793			 * Break if some data was read in previous iteration.
794			 */
795			if (nread > 0)
796				break;
797
798			/*
799			 * Unlock the pipe buffer for our remaining processing.
800			 * We will either break out with an error or we will
801			 * sleep and relock to loop.
802			 */
803			pipeio_unlock(rpipe);
804
805			/*
806			 * Handle non-blocking mode operation or
807			 * wait for more data.
808			 */
809			if (fp->f_flag & FNONBLOCK) {
810				error = EAGAIN;
811			} else {
812				rpipe->pipe_state |= PIPE_WANTR;
813				error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0);
814				if (error == 0)
815				        error = pipeio_lock(rpipe, 1);
816			}
817			if (error)
818				goto unlocked_error;
819		}
820	}
821#if CONFIG_MACF
822locked_error:
823#endif
824	pipeio_unlock(rpipe);
825
826unlocked_error:
827	--rpipe->pipe_busy;
828
829	/*
830	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
831	 */
832	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
833		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
834		wakeup(rpipe);
835	} else if (rpipe->pipe_buffer.cnt < rpipe->pipe_buffer.size) {
836		/*
837		 * Handle write blocking hysteresis.
838		 */
839		if (rpipe->pipe_state & PIPE_WANTW) {
840			rpipe->pipe_state &= ~PIPE_WANTW;
841			wakeup(rpipe);
842		}
843	}
844
845	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) > 0)
846		pipeselwakeup(rpipe, rpipe->pipe_peer);
847
848	/* update last read time */
849	pipe_touch(rpipe, PIPE_ATIME);
850
851	PIPE_UNLOCK(rpipe);
852
853	return (error);
854}
855
/*
 * Perform a write of n bytes into the read side's buffer.  Since
 * pipes are unidirectional, a write is meant to be read by the other side only.
 */
860static int
861pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
862	__unused vfs_context_t ctx)
863{
	int error = 0;
	int orig_resid;
	int pipe_size;
	int space;
	struct pipe *wpipe, *rpipe;

	// LP64todo - fix this!
	orig_resid = uio_resid(uio);
871
872	rpipe = (struct pipe *)fp->f_data;
873
874	PIPE_LOCK(rpipe);
875	wpipe = rpipe->pipe_peer;
876
877	/*
878	 * detect loss of pipe read side, issue SIGPIPE if lost.
879	 */
880	if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
881		PIPE_UNLOCK(rpipe);
882		return (EPIPE);
883	}
884#if CONFIG_MACF
885	error = mac_pipe_check_write(kauth_cred_get(), wpipe);
886	if (error) {
887		PIPE_UNLOCK(rpipe);
888		return (error);
889	}
890#endif
891	++wpipe->pipe_busy;
892
893	pipe_size = 0;
894
895	/*
896	 * need to allocate some storage... we delay the allocation
897	 * until the first write on fd[0] to avoid allocating storage for both
898	 * 'pipe ends'... most pipes are half-duplex with the writes targeting
899	 * fd[1], so allocating space for both ends is a waste...
900	 */
901
902	if ( wpipe->pipe_buffer.buffer == 0 || (
903		(unsigned)orig_resid > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt &&
904		amountpipekva < maxpipekva ) ) {
905
906	        pipe_size = choose_pipespace(wpipe->pipe_buffer.size, wpipe->pipe_buffer.cnt + orig_resid);
907	}
908	if (pipe_size) {
909	        /*
910		 * need to do initial allocation or resizing of pipe
911		 * holding both structure and io locks.
912		 */
913		if ((error = pipeio_lock(wpipe, 1)) == 0) {
914			if (wpipe->pipe_buffer.cnt == 0)
915				error = pipespace(wpipe, pipe_size);
916			else
917				error = expand_pipespace(wpipe, pipe_size);
918
919			pipeio_unlock(wpipe);
920
921			/* allocation failed */
922			if (wpipe->pipe_buffer.buffer == 0)
923			        error = ENOMEM;
924		}
925		if (error) {
926		        /*
927			 * If an error occurred unbusy and return, waking up any pending
928			 * readers.
929			 */
930		        --wpipe->pipe_busy;
931			if ((wpipe->pipe_busy == 0) &&
932			    (wpipe->pipe_state & PIPE_WANT)) {
933			        wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
934				wakeup(wpipe);
935			}
936			PIPE_UNLOCK(rpipe);
937			return(error);
938		}
939	}
940
941	while (uio_resid(uio)) {
942
943	retrywrite:
944		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
945
946		/* Writes of size <= PIPE_BUF must be atomic. */
947		if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF))
948			space = 0;
949
950		if (space > 0) {
951
952			if ((error = pipeio_lock(wpipe,1)) == 0) {
953				int size;	/* Transfer size */
954				int segsize;	/* first segment to transfer */
955
956				if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
957					pipeio_unlock(wpipe);
958				        error = EPIPE;
959					break;
960				}
961				/*
962				 * If a process blocked in pipeio_lock, our
963				 * value for space might be bad... the mutex
964				 * is dropped while we're blocked
965				 */
966				if (space > (int)(wpipe->pipe_buffer.size -
967				    wpipe->pipe_buffer.cnt)) {
968					pipeio_unlock(wpipe);
969					goto retrywrite;
970				}
971
972				/*
973				 * Transfer size is minimum of uio transfer
974				 * and free space in pipe buffer.
975				 */
976				// LP64todo - fix this!
977				if (space > uio_resid(uio))
978					size = uio_resid(uio);
979				else
980					size = space;
981				/*
982				 * First segment to transfer is minimum of
983				 * transfer size and contiguous space in
984				 * pipe buffer.  If first segment to transfer
985				 * is less than the transfer size, we've got
986				 * a wraparound in the buffer.
987				 */
988				segsize = wpipe->pipe_buffer.size -
989					wpipe->pipe_buffer.in;
990				if (segsize > size)
991					segsize = size;
992
993				/* Transfer first segment */
994
995				PIPE_UNLOCK(rpipe);
996				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
997						segsize, uio);
998				PIPE_LOCK(rpipe);
999
1000				if (error == 0 && segsize < size) {
1001					/*
1002					 * Transfer remaining part now, to
1003					 * support atomic writes.  Wraparound
1004					 * happened. (State 3)
1005					 */
1006					if (wpipe->pipe_buffer.in + segsize !=
1007					    wpipe->pipe_buffer.size)
1008						panic("Expected pipe buffer "
1009						    "wraparound disappeared");
1010
1011					PIPE_UNLOCK(rpipe);
1012					error = uiomove(
1013					    &wpipe->pipe_buffer.buffer[0],
1014				    	    size - segsize, uio);
1015					PIPE_LOCK(rpipe);
1016				}
1017				/*
1018				 * readers never know to read until count is updated.
1019				 */
1020				if (error == 0) {
1021					wpipe->pipe_buffer.in += size;
1022					if (wpipe->pipe_buffer.in >
1023					    wpipe->pipe_buffer.size) {
1024						if (wpipe->pipe_buffer.in !=
1025						    size - segsize +
1026						    wpipe->pipe_buffer.size)
1027							panic("Expected "
1028							    "wraparound bad");
1029						wpipe->pipe_buffer.in = size -
1030						    segsize;
1031					}
1032
1033					wpipe->pipe_buffer.cnt += size;
1034					if (wpipe->pipe_buffer.cnt >
1035					    wpipe->pipe_buffer.size)
1036						panic("Pipe buffer overflow");
1037
1038				}
1039				pipeio_unlock(wpipe);
1040			}
1041			if (error)
1042				break;
1043
1044		} else {
1045			/*
1046			 * If the "read-side" has been blocked, wake it up now.
1047			 */
1048			if (wpipe->pipe_state & PIPE_WANTR) {
1049				wpipe->pipe_state &= ~PIPE_WANTR;
1050				wakeup(wpipe);
1051			}
1052			/*
1053			 * don't block on non-blocking I/O
1054			 * we'll do the pipeselwakeup on the way out
1055			 */
1056			if (fp->f_flag & FNONBLOCK) {
1057				error = EAGAIN;
1058				break;
1059			}
1060
1061			/*
1062			 * If read side wants to go away, we just issue a signal
1063			 * to ourselves.
1064			 */
1065			if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
1066				error = EPIPE;
1067				break;
1068			}
1069
1070			/*
1071			 * We have no more space and have something to offer,
1072			 * wake up select/poll.
1073			 */
1074			pipeselwakeup(wpipe, wpipe);
1075
1076			wpipe->pipe_state |= PIPE_WANTW;
1077
1078			error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipewr", 0);
1079
1080			if (error != 0)
1081				break;
1082		}
1083	}
1084	--wpipe->pipe_busy;
1085
1086	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1087		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1088		wakeup(wpipe);
1089	}
1090	if (wpipe->pipe_buffer.cnt > 0) {
1091		/*
1092		 * If there are any characters in the buffer, we wake up
1093		 * the reader if it was blocked waiting for data.
1094		 */
1095		if (wpipe->pipe_state & PIPE_WANTR) {
1096			wpipe->pipe_state &= ~PIPE_WANTR;
1097			wakeup(wpipe);
1098		}
1099		/*
1100		 * wake up thread blocked in select/poll or post the notification
1101		 */
1102		pipeselwakeup(wpipe, wpipe);
1103	}
1104
1105	/* Update modification, status change (# of bytes in pipe) times */
1106	pipe_touch(rpipe, PIPE_MTIME | PIPE_CTIME);
1107	pipe_touch(wpipe, PIPE_MTIME | PIPE_CTIME);
1108	PIPE_UNLOCK(rpipe);
1109
1110	return (error);
1111}
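
/*
 * Illustrative only: from user space the EAGAIN path above is reached by
 * marking the write end non-blocking; writes of at most PIPE_BUF bytes are
 * still atomic (see the check at the top of the loop above).  A minimal
 * sketch (the helper name is made up), assuming the standard fcntl(2) and
 * write(2) interfaces, error handling omitted:
 *
 *    #include <fcntl.h>
 *    #include <unistd.h>
 *    #include <errno.h>
 *
 *    static ssize_t
 *    write_nonblocking(int wfd, const void *buf, size_t len)
 *    {
 *        ssize_t n;
 *
 *        (void)fcntl(wfd, F_SETFL, fcntl(wfd, F_GETFL, 0) | O_NONBLOCK);
 *        n = write(wfd, buf, len);
 *        if (n == -1 && errno == EAGAIN) {
 *            // pipe buffer is full; retry later or wait via select/kqueue
 *        }
 *        return n;
 *    }
 */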
1112
1113/*
1114 * we implement a very minimal set of ioctls for compatibility with sockets.
1115 */
1116/* ARGSUSED 3 */
1117static int
1118pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
1119	__unused vfs_context_t ctx)
1120{
1121	struct pipe *mpipe = (struct pipe *)fp->f_data;
1122#if CONFIG_MACF
1123	int error;
1124#endif
1125
1126	PIPE_LOCK(mpipe);
1127
1128#if CONFIG_MACF
1129	error = mac_pipe_check_ioctl(kauth_cred_get(), mpipe, cmd);
1130	if (error) {
1131		PIPE_UNLOCK(mpipe);
1132
1133		return (error);
1134	}
1135#endif
1136
1137	switch (cmd) {
1138
1139	case FIONBIO:
1140		PIPE_UNLOCK(mpipe);
1141		return (0);
1142
1143	case FIOASYNC:
1144		if (*(int *)data) {
1145			mpipe->pipe_state |= PIPE_ASYNC;
1146		} else {
1147			mpipe->pipe_state &= ~PIPE_ASYNC;
1148		}
1149		PIPE_UNLOCK(mpipe);
1150		return (0);
1151
1152	case FIONREAD:
1153		*(int *)data = mpipe->pipe_buffer.cnt;
1154		PIPE_UNLOCK(mpipe);
1155		return (0);
1156
1157	case TIOCSPGRP:
1158		mpipe->pipe_pgid = *(int *)data;
1159
1160		PIPE_UNLOCK(mpipe);
1161		return (0);
1162
1163	case TIOCGPGRP:
1164		*(int *)data = mpipe->pipe_pgid;
1165
1166		PIPE_UNLOCK(mpipe);
1167		return (0);
1168
1169	}
1170	PIPE_UNLOCK(mpipe);
1171	return (ENOTTY);
1172}
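
/*
 * Illustrative only: of the ioctls above, FIONREAD is the one typically used
 * from user space to ask how many bytes a read would currently find.  A
 * minimal sketch (the helper name is made up), assuming the standard
 * ioctl(2) interface:
 *
 *    #include <sys/ioctl.h>
 *    #include <sys/filio.h>
 *
 *    static int
 *    bytes_readable(int rfd)
 *    {
 *        int n = 0;
 *
 *        if (ioctl(rfd, FIONREAD, &n) == -1)
 *            return -1;
 *        return n;    // mirrors mpipe->pipe_buffer.cnt above
 *    }
 */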
1173
1174
1175static int
1176pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
1177{
1178	struct pipe *rpipe = (struct pipe *)fp->f_data;
1179	struct pipe *wpipe;
1180	int    retnum = 0;
1181
1182	if (rpipe == NULL || rpipe == (struct pipe *)-1)
1183	        return (retnum);
1184
1185	PIPE_LOCK(rpipe);
1186
1187	wpipe = rpipe->pipe_peer;
1188
1189
1190#if CONFIG_MACF
1191	/*
1192	 * XXX We should use a per thread credential here; minimally, the
1193	 * XXX process credential should have a persistent reference on it
1194	 * XXX before being passed in here.
1195	 */
1196	if (mac_pipe_check_select(vfs_context_ucred(ctx), rpipe, which)) {
1197		PIPE_UNLOCK(rpipe);
1198		return (0);
1199	}
1200#endif
1201        switch (which) {
1202
1203        case FREAD:
1204		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1205		    (rpipe->pipe_buffer.cnt > 0) ||
1206		    (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1207
1208		        retnum = 1;
1209		} else {
1210		        rpipe->pipe_state |= PIPE_SEL;
1211		        selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1212		}
1213		break;
1214
1215        case FWRITE:
1216		if (wpipe)
1217			wpipe->pipe_state |= PIPE_WSELECT;
1218		if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
1219		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1220		     (MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt) > 0)) {
1221
1222		        retnum = 1;
1223		} else {
1224		        wpipe->pipe_state |= PIPE_SEL;
1225			selrecord(vfs_context_proc(ctx), &wpipe->pipe_sel, wql);
1226		}
1227		break;
1228        case 0:
1229	        rpipe->pipe_state |= PIPE_SEL;
1230		selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1231		break;
1232        }
1233	PIPE_UNLOCK(rpipe);
1234
1235        return (retnum);
1236}
1237
1238
1239/* ARGSUSED 1 */
1240static int
1241pipe_close(struct fileglob *fg, __unused vfs_context_t ctx)
1242{
1243        struct pipe *cpipe;
1244
1245	proc_fdlock_spin(vfs_context_proc(ctx));
1246	cpipe = (struct pipe *)fg->fg_data;
1247	fg->fg_data = NULL;
1248	proc_fdunlock(vfs_context_proc(ctx));
1249	if (cpipe)
1250	        pipeclose(cpipe);
1251
1252	return (0);
1253}
1254
1255static void
1256pipe_free_kmem(struct pipe *cpipe)
1257{
1258	if (cpipe->pipe_buffer.buffer != NULL) {
1259		OSAddAtomic(-(cpipe->pipe_buffer.size), &amountpipekva);
1260		OSAddAtomic(-1, &amountpipes);
1261		kfree((void *)cpipe->pipe_buffer.buffer,
1262			  cpipe->pipe_buffer.size);
1263		cpipe->pipe_buffer.buffer = NULL;
1264		cpipe->pipe_buffer.size = 0;
1265	}
1266}
1267
1268/*
1269 * shutdown the pipe
1270 */
1271static void
1272pipeclose(struct pipe *cpipe)
1273{
1274	struct pipe *ppipe;
1275
1276	if (cpipe == NULL)
1277		return;
1278	/* partially created pipes won't have a valid mutex. */
1279	if (PIPE_MTX(cpipe) != NULL)
1280		PIPE_LOCK(cpipe);
1281
1282
1283	/*
1284	 * If the other side is blocked, wake it up saying that
1285	 * we want to close it down.
1286	 */
1287	cpipe->pipe_state &= ~PIPE_DRAIN;
1288	cpipe->pipe_state |= PIPE_EOF;
1289	pipeselwakeup(cpipe, cpipe);
1290
1291	while (cpipe->pipe_busy) {
1292		cpipe->pipe_state |= PIPE_WANT;
1293
1294		wakeup(cpipe);
1295 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1296	}
1297
1298#if CONFIG_MACF
1299	/*
1300	 * Free the shared pipe label only after the two ends are disconnected.
1301	 */
1302	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
1303		mac_pipe_label_destroy(cpipe);
1304#endif
1305
1306	/*
1307	 * Disconnect from peer
1308	 */
1309	if ((ppipe = cpipe->pipe_peer) != NULL) {
1310
1311		ppipe->pipe_state &= ~(PIPE_DRAIN);
1312		ppipe->pipe_state |= PIPE_EOF;
1313
1314		pipeselwakeup(ppipe, ppipe);
1315		wakeup(ppipe);
1316
1317		if (cpipe->pipe_state & PIPE_KNOTE)
1318		        KNOTE(&ppipe->pipe_sel.si_note, 1);
1319
1320		postpipeevent(ppipe, EV_RCLOSED);
1321
1322		ppipe->pipe_peer = NULL;
1323	}
1324	evpipefree(cpipe);
1325
1326	/*
1327	 * free resources
1328	 */
1329	if (PIPE_MTX(cpipe) != NULL) {
1330	        if (ppipe != NULL) {
1331		        /*
1332			 * since the mutex is shared and the peer is still
1333			 * alive, we need to release the mutex, not free it
1334			 */
1335		        PIPE_UNLOCK(cpipe);
1336		} else {
1337		        /*
1338			 * peer is gone, so we're the sole party left with
1339			 * interest in this mutex... we can just free it
1340			 */
1341			lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp);
1342		}
1343	}
1344	pipe_free_kmem(cpipe);
1345	if (cpipe->pipe_state & PIPE_WSELECT) {
1346		pipe_garbage_collect(cpipe);
1347	} else {
1348		zfree(pipe_zone, cpipe);
1349		pipe_garbage_collect(NULL);
1350	}
1351
1352}
1353
1354/*ARGSUSED*/
1355static int
1356pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
1357{
1358	struct pipe *cpipe;
1359
1360	cpipe = (struct pipe *)kn->kn_fp->f_data;
1361
1362	PIPE_LOCK(cpipe);
1363#if CONFIG_MACF
1364	/*
1365	 * XXX We should use a per thread credential here; minimally, the
1366	 * XXX process credential should have a persistent reference on it
1367	 * XXX before being passed in here.
1368	 */
1369	if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) {
1370		PIPE_UNLOCK(cpipe);
1371		return (1);
1372	}
1373#endif
1374
1375	switch (kn->kn_filter) {
1376	case EVFILT_READ:
1377		kn->kn_fop = &pipe_rfiltops;
1378
1379		break;
1380	case EVFILT_WRITE:
1381		kn->kn_fop = &pipe_wfiltops;
1382
1383		if (cpipe->pipe_peer == NULL) {
1384			/*
1385			 * other end of pipe has been closed
1386			 */
1387		        PIPE_UNLOCK(cpipe);
1388			return (EPIPE);
1389		}
		if (cpipe->pipe_peer)
			cpipe = cpipe->pipe_peer;
		break;
1393	default:
1394	        PIPE_UNLOCK(cpipe);
1395		return (1);
1396	}
1397
1398	if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn))
1399	        cpipe->pipe_state |= PIPE_KNOTE;
1400
1401	PIPE_UNLOCK(cpipe);
1402	return (0);
1403}
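
/*
 * Illustrative only: a user-space sketch (the helper name is made up) of
 * attaching the EVFILT_READ filter implemented here to a pipe's read end,
 * assuming the standard kqueue(2)/kevent(2) interfaces.  NOTE_LOWAT and the
 * low-water value correspond to the kn_sfflags/kn_sdata handling in
 * filt_piperead() below.  Error handling omitted:
 *
 *    #include <sys/event.h>
 *    #include <sys/time.h>
 *    #include <unistd.h>
 *
 *    static long
 *    wait_for_pipe_data(int rfd, int lowat)
 *    {
 *        int kq = kqueue();
 *        struct kevent kev;
 *
 *        EV_SET(&kev, rfd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
 *        (void)kevent(kq, &kev, 1, NULL, 0, NULL);    // register the knote
 *        (void)kevent(kq, NULL, 0, &kev, 1, NULL);    // block until readable
 *        close(kq);
 *        return (long)kev.data;    // bytes available (kn_data above)
 *    }
 */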
1404
1405static void
1406filt_pipedetach(struct knote *kn)
1407{
1408	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1409
1410	PIPE_LOCK(cpipe);
1411
1412	if (kn->kn_filter == EVFILT_WRITE) {
1413	        if (cpipe->pipe_peer == NULL) {
1414		        PIPE_UNLOCK(cpipe);
1415			return;
1416		}
1417		cpipe = cpipe->pipe_peer;
1418	}
1419	if (cpipe->pipe_state & PIPE_KNOTE) {
1420	        if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn))
1421		        cpipe->pipe_state &= ~PIPE_KNOTE;
1422	}
1423	PIPE_UNLOCK(cpipe);
1424}
1425
1426/*ARGSUSED*/
1427static int
1428filt_piperead(struct knote *kn, long hint)
1429{
1430	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1431	struct pipe *wpipe;
1432	int    retval;
1433
1434	/*
1435	 * if hint == 0, then we've been called from the kevent
1436	 * world directly and do not currently hold the pipe mutex...
1437	 * if hint == 1, we're being called back via the KNOTE post
1438	 * we made in pipeselwakeup, and we already hold the mutex...
1439	 */
1440	if (hint == 0)
1441	        PIPE_LOCK(rpipe);
1442
1443	wpipe = rpipe->pipe_peer;
1444	kn->kn_data = rpipe->pipe_buffer.cnt;
1445	if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
1446	    (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1447		kn->kn_flags |= EV_EOF;
1448		retval = 1;
1449	} else {
1450		int64_t lowwat = 1;
1451		if (kn->kn_sfflags & NOTE_LOWAT) {
1452			if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe))
1453				lowwat = MAX_PIPESIZE(rpipe);
1454			else if (kn->kn_sdata > lowwat)
1455				lowwat = kn->kn_sdata;
1456		}
1457		retval = kn->kn_data >= lowwat;
1458	}
1459
1460	if (hint == 0)
1461	        PIPE_UNLOCK(rpipe);
1462
1463	return (retval);
1464}
1465
1466/*ARGSUSED*/
1467static int
1468filt_pipewrite(struct knote *kn, long hint)
1469{
1470	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1471	struct pipe *wpipe;
1472
1473	/*
1474	 * if hint == 0, then we've been called from the kevent
1475	 * world directly and do not currently hold the pipe mutex...
1476	 * if hint == 1, we're being called back via the KNOTE post
1477	 * we made in pipeselwakeup, and we already hold the mutex...
1478	 */
1479	if (hint == 0)
1480	        PIPE_LOCK(rpipe);
1481
1482	wpipe = rpipe->pipe_peer;
1483
1484	if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1485		kn->kn_data = 0;
1486		kn->kn_flags |= EV_EOF;
1487
1488		if (hint == 0)
1489		        PIPE_UNLOCK(rpipe);
1490		return (1);
1491	}
1492	kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt;
1493
1494	int64_t lowwat = PIPE_BUF;
1495	if (kn->kn_sfflags & NOTE_LOWAT) {
1496		if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe))
1497			lowwat = MAX_PIPESIZE(wpipe);
1498		else if (kn->kn_sdata > lowwat)
1499			lowwat = kn->kn_sdata;
1500	}
1501
1502	if (hint == 0)
1503	        PIPE_UNLOCK(rpipe);
1504
1505	return (kn->kn_data >= lowwat);
1506}
1507
1508int
1509fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo)
1510{
1511#if CONFIG_MACF
1512        int error;
1513#endif
1514	struct timeval now;
1515	struct vinfo_stat * ub;
1516	int pipe_size = 0;
1517	int pipe_count;
1518
1519	if (cpipe == NULL)
1520	        return (EBADF);
1521	PIPE_LOCK(cpipe);
1522
1523#if CONFIG_MACF
1524	error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
1525	if (error) {
1526		PIPE_UNLOCK(cpipe);
1527	        return (error);
1528	}
1529#endif
1530	if (cpipe->pipe_buffer.buffer == 0) {
1531	        /*
1532		 * must be stat'ing the write fd
1533		 */
1534	        if (cpipe->pipe_peer) {
1535		        /*
1536			 * the peer still exists, use it's info
			 * the peer still exists, use its info
1538		        pipe_size  = MAX_PIPESIZE(cpipe->pipe_peer);
1539			pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
1540		} else {
1541			pipe_count = 0;
1542		}
1543	} else {
1544	        pipe_size  = MAX_PIPESIZE(cpipe);
1545		pipe_count = cpipe->pipe_buffer.cnt;
1546	}
1547	/*
	 * since the peer's buffer is set up outside of the lock
	 * we might catch it in a transient state
1550	 */
1551	if (pipe_size == 0)
1552		pipe_size  = PIPE_SIZE;
1553
1554	ub = &pinfo->pipe_stat;
1555
1556	bzero(ub, sizeof(*ub));
1557	ub->vst_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
1558	ub->vst_blksize = pipe_size;
1559	ub->vst_size = pipe_count;
1560	if (ub->vst_blksize != 0)
1561		ub->vst_blocks = (ub->vst_size + ub->vst_blksize - 1) / ub->vst_blksize;
1562	ub->vst_nlink = 1;
1563
1564	ub->vst_uid = kauth_getuid();
1565	ub->vst_gid = kauth_getgid();
1566
1567	microtime(&now);
1568	ub->vst_atime  = now.tv_sec;
1569	ub->vst_atimensec = now.tv_usec * 1000;
1570
1571	ub->vst_mtime  = now.tv_sec;
1572	ub->vst_mtimensec = now.tv_usec * 1000;
1573
1574	ub->vst_ctime  = now.tv_sec;
1575	ub->vst_ctimensec = now.tv_usec * 1000;
1576
1577	/*
1578	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen, st_uid, st_gid.
1579	 * XXX (st_dev, st_ino) should be unique.
1580	 */
1581
1582	pinfo->pipe_handle = (uint64_t)((uintptr_t)cpipe);
1583	pinfo->pipe_peerhandle = (uint64_t)((uintptr_t)(cpipe->pipe_peer));
1584	pinfo->pipe_status = cpipe->pipe_state;
1585
1586	PIPE_UNLOCK(cpipe);
1587
1588	return (0);
1589}
1590
1591
1592static int
1593pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
1594{
1595
1596	/* Note: fdlock already held */
1597	struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data);
1598
1599	if (cpipe) {
1600		PIPE_LOCK(cpipe);
1601		cpipe->pipe_state |= PIPE_DRAIN;
1602		cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1603		wakeup(cpipe);
1604
1605		/* Must wake up peer: a writer sleeps on the read side */
1606		if ((ppipe = cpipe->pipe_peer)) {
1607			ppipe->pipe_state |= PIPE_DRAIN;
1608			ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1609			wakeup(ppipe);
1610		}
1611
1612		PIPE_UNLOCK(cpipe);
1613		return 0;
1614	}
1615
1616	return 1;
1617}
1618
1619
/*
 * When a thread sets a write-select on a pipe, it creates an implicit,
 * untracked dependency between that thread and the peer of the pipe
 * on which the select is set.  If the peer pipe is closed and freed
 * before the select()ing thread wakes up, the system will panic as
 * it attempts to unwind the dangling select().  To avoid that panic,
 * we notice whenever a dangerous select() is set on a pipe, and
 * defer the final deletion of the pipe until all such select()s are
 * resolved.  Since we can't currently detect exactly when that
 * resolution happens, we use a simple garbage collection queue to
 * reap the at-risk pipes 'later'.
 */
1632static void
1633pipe_garbage_collect(struct pipe *cpipe)
1634{
1635	uint64_t old, now;
1636	struct pipe_garbage *pgp;
1637
1638	/* Convert msecs to nsecs and then to abstime */
1639	old = pipe_garbage_age_limit * 1000000;
1640	nanoseconds_to_absolutetime(old, &old);
1641
1642	lck_mtx_lock(pipe_garbage_lock);
1643
	/* Free anything that's been on the queue longer than the age limit */
1645	now = mach_absolute_time();
1646	old = now - old;
1647	while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) {
1648		pipe_garbage_head = pgp->pg_next;
1649		if (pipe_garbage_head == NULL)
1650			pipe_garbage_tail = NULL;
1651		pipe_garbage_count--;
1652		zfree(pipe_zone, pgp->pg_pipe);
1653		zfree(pipe_garbage_zone, pgp);
1654	}
1655
1656	/* Add the new pipe (if any) to the tail of the garbage queue */
1657	if (cpipe) {
1658		cpipe->pipe_state = PIPE_DEAD;
1659		pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone);
1660		if (pgp == NULL) {
1661			/*
			 * We're too low on memory to garbage collect the
			 * pipe.  Freeing it runs the risk of panicking the
			 * system.  All we can do is leak it and leave
1665			 * a breadcrumb behind.  The good news, such as it
1666			 * is, is that this will probably never happen.
1667			 * We will probably hit the panic below first.
1668			 */
			printf("Leaking pipe %p - no room left in the queue\n",
			    cpipe);
1671			lck_mtx_unlock(pipe_garbage_lock);
1672			return;
1673		}
1674
1675		pgp->pg_pipe = cpipe;
1676		pgp->pg_timestamp = now;
1677		pgp->pg_next = NULL;
1678
1679		if (pipe_garbage_tail)
1680			pipe_garbage_tail->pg_next = pgp;
1681		pipe_garbage_tail = pgp;
1682		if (pipe_garbage_head == NULL)
1683			pipe_garbage_head = pipe_garbage_tail;
1684
1685		if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT)
1686			panic("Length of pipe garbage queue exceeded %d",
1687			    PIPE_GARBAGE_QUEUE_LIMIT);
1688	}
1689	lck_mtx_unlock(pipe_garbage_lock);
1690}
1691
1692