sys_pipe.c revision 184849
1139804Simp/*-
213675Sdyson * Copyright (c) 1996 John S. Dyson
313675Sdyson * All rights reserved.
413675Sdyson *
513675Sdyson * Redistribution and use in source and binary forms, with or without
613675Sdyson * modification, are permitted provided that the following conditions
713675Sdyson * are met:
813675Sdyson * 1. Redistributions of source code must retain the above copyright
913675Sdyson *    notice immediately at the beginning of the file, without modification,
1013675Sdyson *    this list of conditions, and the following disclaimer.
1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright
1213675Sdyson *    notice, this list of conditions and the following disclaimer in the
1313675Sdyson *    documentation and/or other materials provided with the distribution.
1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author
1513675Sdyson *    John S. Dyson.
1614037Sdyson * 4. Modifications may be freely made to this file if the above conditions
1713675Sdyson *    are met.
1813675Sdyson */
1913675Sdyson
2013675Sdyson/*
2113675Sdyson * This file contains a high-performance replacement for the socket-based
2213675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2313675Sdyson * all features of sockets, but does do everything that pipes normally
2413675Sdyson * do.
2513675Sdyson */
2613675Sdyson
2713907Sdyson/*
2813907Sdyson * This code has two modes of operation, a small write mode and a large
2913907Sdyson * write mode.  The small write mode acts like conventional pipes with
3013907Sdyson * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
3113907Sdyson * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
3213907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3313907Sdyson * the receiving process can copy it directly from the pages in the sending
3413907Sdyson * process.
3513907Sdyson *
3613907Sdyson * If the sending process receives a signal, it is possible that it will
3713913Sdyson * go away, and certainly its address space can change, because control
3813907Sdyson * is returned back to the user-mode side.  In that case, the pipe code
3913907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable
4013907Sdyson * kernel buffer, and the receiving process will grab the data from the
4113907Sdyson * pageable kernel buffer.  Since signals don't happen all that often,
4213907Sdyson * the copy operation is normally eliminated.
4313907Sdyson *
4413907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4513907Sdyson * happen for small transfers so that the system will not spend all of
46118764Ssilby * its time context switching.
47117325Ssilby *
48118764Ssilby * In order to limit the resource use of pipes, two sysctls exist:
49117325Ssilby *
50118764Ssilby * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51133790Ssilby * address space available to us in pipe_map. This value is normally
52133790Ssilby * autotuned, but may also be loader tuned.
53117325Ssilby *
54133790Ssilby * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
55133790Ssilby * memory in use by pipes.
56117325Ssilby *
57133790Ssilby * Based on how large pipekva is relative to maxpipekva, the following
58133790Ssilby * will happen:
59117325Ssilby *
60133790Ssilby * 0% - 50%:
61133790Ssilby *     New pipes are given 16K of memory backing, pipes may dynamically
62133790Ssilby *     grow to as large as 64K where needed.
63133790Ssilby * 50% - 75%:
64133790Ssilby *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
65133790Ssilby *     existing pipes may NOT grow.
66133790Ssilby * 75% - 100%:
67133790Ssilby *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
68133790Ssilby *     existing pipes will be shrunk down to 4K whenever possible.
69133049Ssilby *
70133790Ssilby * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
71133790Ssilby * that is set,  the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
72133790Ssilby * resize which MUST occur for reverse-direction pipes when they are
73133790Ssilby * first used.
74133790Ssilby *
75133790Ssilby * Additional information about the current state of pipes may be obtained
76133790Ssilby * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
77133790Ssilby * and kern.ipc.piperesizefail.
78133790Ssilby *
79133049Ssilby * Locking rules:  There are two locks present here:  A mutex, used via
80133049Ssilby * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
81133049Ssilby * the flag, as mutexes can not persist over uiomove.  The mutex
82133049Ssilby * exists only to guard access to the flag, and is not in itself a
83133790Ssilby * locking mechanism.  Also note that there is only a single mutex for
84133790Ssilby * both directions of a pipe.
85133049Ssilby *
86133049Ssilby * As pipelock() may have to sleep before it can acquire the flag, it
87133049Ssilby * is important to reread all data after a call to pipelock(); everything
88133049Ssilby * in the structure may have changed.
8913907Sdyson */
9013907Sdyson
91116182Sobrien#include <sys/cdefs.h>
92116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 184849 2008-11-11 14:55:59Z ed $");
93116182Sobrien
94101768Srwatson#include "opt_mac.h"
95101768Srwatson
9613675Sdyson#include <sys/param.h>
9713675Sdyson#include <sys/systm.h>
9824131Sbde#include <sys/fcntl.h>
9913675Sdyson#include <sys/file.h>
10013675Sdyson#include <sys/filedesc.h>
10124206Sbde#include <sys/filio.h>
10291372Salfred#include <sys/kernel.h>
10376166Smarkm#include <sys/lock.h>
10476827Salfred#include <sys/mutex.h>
10524206Sbde#include <sys/ttycom.h>
10613675Sdyson#include <sys/stat.h>
10791968Salfred#include <sys/malloc.h>
10829356Speter#include <sys/poll.h>
10970834Swollman#include <sys/selinfo.h>
11013675Sdyson#include <sys/signalvar.h>
111184849Sed#include <sys/syscallsubr.h>
112117325Ssilby#include <sys/sysctl.h>
11313675Sdyson#include <sys/sysproto.h>
11413675Sdyson#include <sys/pipe.h>
11576166Smarkm#include <sys/proc.h>
11655112Sbde#include <sys/vnode.h>
11734924Sbde#include <sys/uio.h>
11859288Sjlemon#include <sys/event.h>
11913675Sdyson
120163606Srwatson#include <security/mac/mac_framework.h>
121163606Srwatson
12213675Sdyson#include <vm/vm.h>
12313675Sdyson#include <vm/vm_param.h>
12413675Sdyson#include <vm/vm_object.h>
12513675Sdyson#include <vm/vm_kern.h>
12613675Sdyson#include <vm/vm_extern.h>
12713675Sdyson#include <vm/pmap.h>
12813675Sdyson#include <vm/vm_map.h>
12913907Sdyson#include <vm/vm_page.h>
13092751Sjeff#include <vm/uma.h>
13113675Sdyson
13214037Sdyson/*
13314037Sdyson * Use this define if you want to disable *fancy* VM things.  Expect an
13414037Sdyson * approx 30% decrease in transfer rate.  This could be useful for
13514037Sdyson * NetBSD or OpenBSD.
13614037Sdyson */
13714037Sdyson/* #define PIPE_NODIRECT */
13814037Sdyson
13914037Sdyson/*
14014037Sdyson * interfaces to the outside world
14114037Sdyson */
/*
 * Forward declarations of the per-file-descriptor operations that
 * implement the generic file interface on top of pipes.
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_truncate_t	pipe_truncate;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;

/* Operations vector installed on every pipe descriptor by finit(). */
static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_truncate = pipe_truncate,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_flags = DFLAG_PASSABLE
};
16213675Sdyson
static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

/* kqueue filter operations for the read and write sides of a pipe. */
static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };
17159288Sjlemon
17213675Sdyson/*
17313675Sdyson * Default pipe buffer size(s), this can be kind-of large now because pipe
17413675Sdyson * space is pageable.  The pipe code will try to maintain locality of
17513675Sdyson * reference for performance reasons, so small amounts of outstanding I/O
17613675Sdyson * will not wipe the cache.
17713675Sdyson */
/* Hysteresis thresholds, as fractions of the current buffer size. */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static int amountpipekva;		/* KVA currently used by pipe buffers */
static int pipefragretry;		/* retries forced by KVA fragmentation */
static int pipeallocfail;		/* initial buffer allocation failures */
static int piperesizefail;		/* buffer resize failures */
static int piperesizeallowed = 1;	/* nonzero if resizing is permitted */

SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	  &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	  &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	  &piperesizeallowed, 0, "Pipe resizing allowed");
199117325Ssilby
/* Forward declarations for the local helpers defined below. */
static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, int backing);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

/* UMA zone from which all pipe pairs are allocated. */
static uma_zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
22391372Salfred
/*
 * Create the UMA zone backing all pipe pairs.  Registered via SYSINIT
 * to run once during VFS initialization.
 */
static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}
23391372Salfred
/*
 * UMA constructor for a pipe pair: zero both endpoints, give them
 * identical timestamps, and cross-link them.  Runs on every
 * allocation from pipe_zone.  Always succeeds (returns 0).
 */
static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	/* Cross-link the endpoints and point both back at the pair. */
	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = PIPE_ACTIVE;
	wpipe->pipe_present = PIPE_ACTIVE;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	return (0);
}
281125293Srwatson
282132987Sgreenstatic int
283132987Sgreenpipe_zone_init(void *mem, int size, int flags)
284125293Srwatson{
285125293Srwatson	struct pipepair *pp;
286125293Srwatson
287125293Srwatson	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
288125293Srwatson
289125293Srwatson	pp = (struct pipepair *)mem;
290125293Srwatson
291125293Srwatson	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
292132987Sgreen	return (0);
293125293Srwatson}
294125293Srwatson
295125293Srwatsonstatic void
296125293Srwatsonpipe_zone_fini(void *mem, int size)
297125293Srwatson{
298125293Srwatson	struct pipepair *pp;
299125293Srwatson
300125293Srwatson	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
301125293Srwatson
302125293Srwatson	pp = (struct pipepair *)mem;
303125293Srwatson
304125293Srwatson	mtx_destroy(&pp->pp_mtx);
305125293Srwatson}
306125293Srwatson
/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
 *
 * On success, fildes[0] holds the descriptor for the "read" endpoint and
 * fildes[1] the "write" endpoint; both are opened FREAD|FWRITE.
 */
int
kern_pipe(struct thread *td, int fildes[2])
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_pipe_init() and mac_pipe_create() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_pipe_init(pp);
	mac_pipe_create(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	/* Knotes on both endpoints are protected by the shared pipe mutex. */
	knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL,
	    NULL);
	knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL,
	    NULL);

	/* Only the forward direction pipe is backed by default */
	if ((error = pipe_create(rpipe, 1)) != 0 ||
	    (error = pipe_create(wpipe, 0)) != 0) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	fildes[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
	error = falloc(td, &wf, &fd);
	if (error) {
		fdclose(fdp, rf, fildes[0], td);
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc(). */
	finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
	fdrop(wf, td);
	fildes[1] = fd;
	fdrop(rf, td);

	return (0);
}
38113675Sdyson
382184849Sed/* ARGSUSED */
383184849Sedint
384184849Sedpipe(struct thread *td, struct pipe_args *uap)
385184849Sed{
386184849Sed	int error;
387184849Sed	int fildes[2];
388184849Sed
389184849Sed	error = kern_pipe(td, fildes);
390184849Sed	if (error)
391184849Sed		return (error);
392184849Sed
393184849Sed	td->td_retval[0] = fildes[0];
394184849Sed	td->td_retval[1] = fildes[1];
395184849Sed
396184849Sed	return (0);
397184849Sed}
398184849Sed
/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 *
 * Must be called without the pipe mutex held (the vm_map_find() call
 * can block); the caller serializes via the long-term pipe lock.
 */
static int
pipespace_new(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error, cnt, firstseg;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
		("pipespace: resize of direct writes not allowed"));
retry:
	/* Never shrink below the amount of data currently buffered. */
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	error = vm_map_find(pipe_map, NULL, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		/*
		 * A first-time allocation may just have hit map
		 * fragmentation; retry once with the small size
		 * before giving up.
		 */
		if ((cpipe->pipe_buffer.buffer == NULL) &&
			(size > SMALL_PIPE_SIZE)) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			/* Rate-limited console diagnostic. */
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			/*
			 * Buffered data wraps around the end of the old
			 * buffer: copy the tail segment first, then the
			 * head, so the new buffer starts contiguous.
			 */
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
					cpipe->pipe_buffer.in);
		} else {
			/* Data is contiguous; one copy suffices. */
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, cnt);
		}
	}
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}
46913688Sdyson
47013907Sdyson/*
471132579Srwatson * Wrapper for pipespace_new() that performs locking assertions.
472132579Srwatson */
473132579Srwatsonstatic int
474132579Srwatsonpipespace(cpipe, size)
475132579Srwatson	struct pipe *cpipe;
476132579Srwatson	int size;
477132579Srwatson{
478132579Srwatson
479133049Ssilby	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
480133049Ssilby		("Unlocked pipe passed to pipespace"));
481132579Srwatson	return (pipespace_new(cpipe, size));
482132579Srwatson}
483132579Srwatson
484132579Srwatson/*
48513675Sdyson * lock a pipe for I/O, blocking other access
48613675Sdyson */
48713675Sdysonstatic __inline int
48813907Sdysonpipelock(cpipe, catch)
48913675Sdyson	struct pipe *cpipe;
49013907Sdyson	int catch;
49113675Sdyson{
49213776Sdyson	int error;
49376364Salfred
49491362Salfred	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
49591362Salfred	while (cpipe->pipe_state & PIPE_LOCKFL) {
49613675Sdyson		cpipe->pipe_state |= PIPE_LWANT;
49791362Salfred		error = msleep(cpipe, PIPE_MTX(cpipe),
49891362Salfred		    catch ? (PRIBIO | PCATCH) : PRIBIO,
49976760Salfred		    "pipelk", 0);
500124394Sdes		if (error != 0)
50176760Salfred			return (error);
50213675Sdyson	}
50391362Salfred	cpipe->pipe_state |= PIPE_LOCKFL;
50476760Salfred	return (0);
50513675Sdyson}
50613675Sdyson
50713675Sdyson/*
50813675Sdyson * unlock a pipe I/O lock
50913675Sdyson */
51013675Sdysonstatic __inline void
51113675Sdysonpipeunlock(cpipe)
51213675Sdyson	struct pipe *cpipe;
51313675Sdyson{
51476364Salfred
51591362Salfred	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
516133049Ssilby	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
517133049Ssilby		("Unlocked pipe passed to pipeunlock"));
51891362Salfred	cpipe->pipe_state &= ~PIPE_LOCKFL;
51913675Sdyson	if (cpipe->pipe_state & PIPE_LWANT) {
52013675Sdyson		cpipe->pipe_state &= ~PIPE_LWANT;
52114177Sdyson		wakeup(cpipe);
52213675Sdyson	}
52313675Sdyson}
52413675Sdyson
/*
 * Notify everyone waiting for activity on this pipe endpoint:
 * select/poll sleepers, the SIGIO recipient (if PIPE_ASYNC), and any
 * registered knotes.  The pipe mutex must be held.
 */
static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
		/* Only clear PIPE_SEL once no thread is still selecting. */
		if (!SEL_WAITING(&cpipe->pipe_sel))
			cpipe->pipe_state &= ~PIPE_SEL;
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}
54014037Sdyson
541126131Sgreen/*
542126131Sgreen * Initialize and allocate VM and memory for pipe.  The structure
543126131Sgreen * will start out zero'd from the ctor, so we just manage the kmem.
544126131Sgreen */
545126131Sgreenstatic int
546133790Ssilbypipe_create(pipe, backing)
547126131Sgreen	struct pipe *pipe;
548133790Ssilby	int backing;
549126131Sgreen{
550126131Sgreen	int error;
551126131Sgreen
552133790Ssilby	if (backing) {
553133790Ssilby		if (amountpipekva > maxpipekva / 2)
554133790Ssilby			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
555133790Ssilby		else
556133790Ssilby			error = pipespace_new(pipe, PIPE_SIZE);
557133790Ssilby	} else {
558133790Ssilby		/* If we're not backing this pipe, no need to do anything. */
559133790Ssilby		error = 0;
560133790Ssilby	}
561132579Srwatson	return (error);
562126131Sgreen}
563126131Sgreen
/*
 * fo_read for pipes.  Copies data out of the kernel buffer (or, for
 * direct writes, straight from the writer's wired pages) into uio.
 * Returns 0 on success or EOF (uio untouched data count signals EOF),
 * EAGAIN for a non-blocking empty pipe, or an error from msleep()/
 * uiomove().  Sleeps are interruptible (PCATCH).
 */
/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	/*
	 * Under KVA pressure (>75% of maxpipekva used), shrink a large
	 * but mostly-empty buffer back down to SMALL_PIPE_SIZE, if
	 * resizing is administratively allowed.
	 */
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
			(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
			(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
			(piperesizeallowed == 1)) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			/* Copy at most one contiguous segment per pass. */
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			/* Drop the mutex across uiomove(); it may fault. */
			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			/* Copy straight from the writer's wired pages. */
			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_map.ms,
			    rpipe->pipe_map.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				/* Direct write fully consumed; release writer. */
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/* Notify pollers/kqueue once at least PIPE_BUF bytes are free. */
	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}
73713675Sdyson
73814037Sdyson#ifndef PIPE_NODIRECT
73913907Sdyson/*
74013907Sdyson * Map the sending processes' buffer into kernel space and wire it.
74113907Sdyson * This is similar to a physical write operation.
74213907Sdyson */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
		("Clone attempt on non-direct write pipe!"));

	/* Cap the mapping at the pipe's buffer size. */
	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queue() and vm_page_unlock_queue()
		 * should not be performed outside of this loop.
		 */
	race:
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			/* Fault failed: unhold everything gathered so far. */
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		/*
		 * The page may be unmapped again between the fault and
		 * the extract; if so, loop back and fault it in again.
		 */
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
		    VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

/*
 * set up the control block
 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

/*
 * and update the uio data
 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}
80413907Sdyson
80513907Sdyson/*
80613907Sdyson * unmap and unwire the process buffer
80713907Sdyson */
80813907Sdysonstatic void
80913907Sdysonpipe_destroy_write_buffer(wpipe)
81076760Salfred	struct pipe *wpipe;
81113907Sdyson{
81213907Sdyson	int i;
81376364Salfred
814127501Salc	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
81599899Salc	vm_page_lock_queues();
816117325Ssilby	for (i = 0; i < wpipe->pipe_map.npages; i++) {
817118757Salc		vm_page_unhold(wpipe->pipe_map.ms[i]);
818117325Ssilby	}
81999899Salc	vm_page_unlock_queues();
82091653Stanimura	wpipe->pipe_map.npages = 0;
82113907Sdyson}
82213907Sdyson
82313907Sdyson/*
82413907Sdyson * In the case of a signal, the writing process might go away.  This
82513907Sdyson * code copies the data into the circular buffer so that the source
82613907Sdyson * pages can be freed without loss of data.
82713907Sdyson */
82813907Sdysonstatic void
82913907Sdysonpipe_clone_write_buffer(wpipe)
83076364Salfred	struct pipe *wpipe;
83113907Sdyson{
832127501Salc	struct uio uio;
833127501Salc	struct iovec iov;
83413907Sdyson	int size;
83513907Sdyson	int pos;
83613907Sdyson
83791362Salfred	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
83813907Sdyson	size = wpipe->pipe_map.cnt;
83913907Sdyson	pos = wpipe->pipe_map.pos;
84013907Sdyson
84113907Sdyson	wpipe->pipe_buffer.in = size;
84213907Sdyson	wpipe->pipe_buffer.out = 0;
84313907Sdyson	wpipe->pipe_buffer.cnt = size;
84413907Sdyson	wpipe->pipe_state &= ~PIPE_DIRECTW;
84513907Sdyson
846119811Salc	PIPE_UNLOCK(wpipe);
847127501Salc	iov.iov_base = wpipe->pipe_buffer.buffer;
848127501Salc	iov.iov_len = size;
849127501Salc	uio.uio_iov = &iov;
850127501Salc	uio.uio_iovcnt = 1;
851127501Salc	uio.uio_offset = 0;
852127501Salc	uio.uio_resid = size;
853127501Salc	uio.uio_segflg = UIO_SYSSPACE;
854127501Salc	uio.uio_rw = UIO_READ;
855127501Salc	uio.uio_td = curthread;
856127501Salc	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
857127501Salc	PIPE_LOCK(wpipe);
85813907Sdyson	pipe_destroy_write_buffer(wpipe);
85913907Sdyson}
86013907Sdyson
86113907Sdyson/*
86213907Sdyson * This implements the pipe buffer write mechanism.  Note that only
86313907Sdyson * a direct write OR a normal pipe write can be pending at any given time.
86413907Sdyson * If there are any characters in the pipe buffer, the direct write will
86513907Sdyson * be deferred until the receiving process grabs all of the bytes from
86613907Sdyson * the pipe buffer.  Then the direct mapping write is set-up.
86713907Sdyson */
86813907Sdysonstatic int
86913907Sdysonpipe_direct_write(wpipe, uio)
87013907Sdyson	struct pipe *wpipe;
87113907Sdyson	struct uio *uio;
87213907Sdyson{
87313907Sdyson	int error;
87476364Salfred
87513951Sdysonretry:
87691362Salfred	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
877133049Ssilby	error = pipelock(wpipe, 1);
878133049Ssilby	if (wpipe->pipe_state & PIPE_EOF)
879133049Ssilby		error = EPIPE;
880133049Ssilby	if (error) {
881133049Ssilby		pipeunlock(wpipe);
882133049Ssilby		goto error1;
883133049Ssilby	}
88413907Sdyson	while (wpipe->pipe_state & PIPE_DIRECTW) {
88576760Salfred		if (wpipe->pipe_state & PIPE_WANTR) {
88613951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
88713951Sdyson			wakeup(wpipe);
88813951Sdyson		}
889173750Sdumbbell		pipeselwakeup(wpipe);
89013992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
891133049Ssilby		pipeunlock(wpipe);
89291362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe),
89391362Salfred		    PRIBIO | PCATCH, "pipdww", 0);
89414802Sdyson		if (error)
89513907Sdyson			goto error1;
896133049Ssilby		else
897133049Ssilby			goto retry;
89813907Sdyson	}
89913907Sdyson	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
90013951Sdyson	if (wpipe->pipe_buffer.cnt > 0) {
90176760Salfred		if (wpipe->pipe_state & PIPE_WANTR) {
90213951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
90313951Sdyson			wakeup(wpipe);
90413951Sdyson		}
905173750Sdumbbell		pipeselwakeup(wpipe);
90613992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
907133049Ssilby		pipeunlock(wpipe);
90891362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe),
90991362Salfred		    PRIBIO | PCATCH, "pipdwc", 0);
91014802Sdyson		if (error)
91113907Sdyson			goto error1;
912133049Ssilby		else
913133049Ssilby			goto retry;
91413907Sdyson	}
91513907Sdyson
91613951Sdyson	wpipe->pipe_state |= PIPE_DIRECTW;
91713951Sdyson
918119872Salc	PIPE_UNLOCK(wpipe);
91913907Sdyson	error = pipe_build_write_buffer(wpipe, uio);
920119872Salc	PIPE_LOCK(wpipe);
92113907Sdyson	if (error) {
92213907Sdyson		wpipe->pipe_state &= ~PIPE_DIRECTW;
923133049Ssilby		pipeunlock(wpipe);
92413907Sdyson		goto error1;
92513907Sdyson	}
92613907Sdyson
92713907Sdyson	error = 0;
92813907Sdyson	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
92913907Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
93013907Sdyson			pipe_destroy_write_buffer(wpipe);
931112981Shsu			pipeselwakeup(wpipe);
93213907Sdyson			pipeunlock(wpipe);
93314802Sdyson			error = EPIPE;
93414802Sdyson			goto error1;
93513907Sdyson		}
93613992Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
93713992Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
93813992Sdyson			wakeup(wpipe);
93913992Sdyson		}
94014037Sdyson		pipeselwakeup(wpipe);
941133049Ssilby		pipeunlock(wpipe);
94291362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
94391362Salfred		    "pipdwt", 0);
944133049Ssilby		pipelock(wpipe, 0);
94513907Sdyson	}
94613907Sdyson
947126131Sgreen	if (wpipe->pipe_state & PIPE_EOF)
948126131Sgreen		error = EPIPE;
94913907Sdyson	if (wpipe->pipe_state & PIPE_DIRECTW) {
95013907Sdyson		/*
95113907Sdyson		 * this bit of trickery substitutes a kernel buffer for
95213907Sdyson		 * the process that might be going away.
95313907Sdyson		 */
95413907Sdyson		pipe_clone_write_buffer(wpipe);
95513907Sdyson	} else {
95613907Sdyson		pipe_destroy_write_buffer(wpipe);
95713907Sdyson	}
95813907Sdyson	pipeunlock(wpipe);
95976760Salfred	return (error);
96013907Sdyson
96113907Sdysonerror1:
96213907Sdyson	wakeup(wpipe);
96376760Salfred	return (error);
96413907Sdyson}
96514037Sdyson#endif
966124394Sdes
96716960Sdysonstatic int
968101941Srwatsonpipe_write(fp, uio, active_cred, flags, td)
96916960Sdyson	struct file *fp;
97013907Sdyson	struct uio *uio;
971101941Srwatson	struct ucred *active_cred;
97283366Sjulian	struct thread *td;
97345311Sdt	int flags;
97413907Sdyson{
97513675Sdyson	int error = 0;
976133790Ssilby	int desiredsize, orig_resid;
97716960Sdyson	struct pipe *wpipe, *rpipe;
97816960Sdyson
979109153Sdillon	rpipe = fp->f_data;
98016960Sdyson	wpipe = rpipe->pipe_peer;
98116960Sdyson
98291395Salfred	PIPE_LOCK(rpipe);
983133049Ssilby	error = pipelock(wpipe, 1);
984133049Ssilby	if (error) {
985133049Ssilby		PIPE_UNLOCK(rpipe);
986133049Ssilby		return (error);
987133049Ssilby	}
98813675Sdyson	/*
98913675Sdyson	 * detect loss of pipe read side, issue SIGPIPE if lost.
99013675Sdyson	 */
991179243Skib	if (wpipe->pipe_present != PIPE_ACTIVE ||
992179243Skib	    (wpipe->pipe_state & PIPE_EOF)) {
993133049Ssilby		pipeunlock(wpipe);
99491395Salfred		PIPE_UNLOCK(rpipe);
99576760Salfred		return (EPIPE);
99613675Sdyson	}
997101768Srwatson#ifdef MAC
998172930Srwatson	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
999101768Srwatson	if (error) {
1000133049Ssilby		pipeunlock(wpipe);
1001101768Srwatson		PIPE_UNLOCK(rpipe);
1002101768Srwatson		return (error);
1003101768Srwatson	}
1004101768Srwatson#endif
100577676Sdillon	++wpipe->pipe_busy;
100613675Sdyson
1007133790Ssilby	/* Choose a larger size if it's advantageous */
1008133790Ssilby	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
1009133790Ssilby	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
1010133790Ssilby		if (piperesizeallowed != 1)
1011133790Ssilby			break;
1012133790Ssilby		if (amountpipekva > maxpipekva / 2)
1013133790Ssilby			break;
1014133790Ssilby		if (desiredsize == BIG_PIPE_SIZE)
1015133790Ssilby			break;
1016133790Ssilby		desiredsize = desiredsize * 2;
1017133790Ssilby	}
101817163Sdyson
1019133790Ssilby	/* Choose a smaller size if we're in a OOM situation */
1020133790Ssilby	if ((amountpipekva > (3 * maxpipekva) / 4) &&
1021133790Ssilby		(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
1022133790Ssilby		(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
1023133790Ssilby		(piperesizeallowed == 1))
1024133790Ssilby		desiredsize = SMALL_PIPE_SIZE;
1025133790Ssilby
1026133790Ssilby	/* Resize if the above determined that a new size was necessary */
1027133790Ssilby	if ((desiredsize != wpipe->pipe_buffer.size) &&
1028133790Ssilby		((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
1029133049Ssilby		PIPE_UNLOCK(wpipe);
1030133790Ssilby		pipespace(wpipe, desiredsize);
1031133049Ssilby		PIPE_LOCK(wpipe);
103213907Sdyson	}
1033133790Ssilby	if (wpipe->pipe_buffer.size == 0) {
1034133790Ssilby		/*
1035133790Ssilby		 * This can only happen for reverse direction use of pipes
1036133790Ssilby		 * in a complete OOM situation.
1037133790Ssilby		 */
1038133790Ssilby		error = ENOMEM;
1039133790Ssilby		--wpipe->pipe_busy;
1040133790Ssilby		pipeunlock(wpipe);
1041133790Ssilby		PIPE_UNLOCK(wpipe);
1042133790Ssilby		return (error);
1043133790Ssilby	}
104477676Sdillon
1045133049Ssilby	pipeunlock(wpipe);
1046124394Sdes
104713913Sdyson	orig_resid = uio->uio_resid;
104877676Sdillon
104913675Sdyson	while (uio->uio_resid) {
105013907Sdyson		int space;
105176760Salfred
1052133049Ssilby		pipelock(wpipe, 0);
1053133049Ssilby		if (wpipe->pipe_state & PIPE_EOF) {
1054133049Ssilby			pipeunlock(wpipe);
1055133049Ssilby			error = EPIPE;
1056133049Ssilby			break;
1057133049Ssilby		}
105814037Sdyson#ifndef PIPE_NODIRECT
105913907Sdyson		/*
106013907Sdyson		 * If the transfer is large, we can gain performance if
106113907Sdyson		 * we do process-to-process copies directly.
106216416Sdyson		 * If the write is non-blocking, we don't use the
106316416Sdyson		 * direct write mechanism.
106458505Sdillon		 *
106558505Sdillon		 * The direct write mechanism will detect the reader going
106658505Sdillon		 * away on us.
106713907Sdyson		 */
1068165347Spjd		if (uio->uio_segflg == UIO_USERSPACE &&
1069165347Spjd		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
1070165347Spjd		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
1071127501Salc		    (fp->f_flag & FNONBLOCK) == 0) {
1072133049Ssilby			pipeunlock(wpipe);
1073105009Salfred			error = pipe_direct_write(wpipe, uio);
107476760Salfred			if (error)
107513907Sdyson				break;
107613907Sdyson			continue;
107791362Salfred		}
107814037Sdyson#endif
107913907Sdyson
108013907Sdyson		/*
108113907Sdyson		 * Pipe buffered writes cannot be coincidental with
108213907Sdyson		 * direct writes.  We wait until the currently executing
108313907Sdyson		 * direct write is completed before we start filling the
108458505Sdillon		 * pipe buffer.  We break out if a signal occurs or the
108558505Sdillon		 * reader goes away.
108613907Sdyson		 */
1087133049Ssilby		if (wpipe->pipe_state & PIPE_DIRECTW) {
108813992Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
108913992Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
109013992Sdyson				wakeup(wpipe);
109113992Sdyson			}
1092173750Sdumbbell			pipeselwakeup(wpipe);
1093173750Sdumbbell			wpipe->pipe_state |= PIPE_WANTW;
1094133049Ssilby			pipeunlock(wpipe);
109591395Salfred			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
109691362Salfred			    "pipbww", 0);
109713907Sdyson			if (error)
109813907Sdyson				break;
1099133049Ssilby			else
1100133049Ssilby				continue;
110113907Sdyson		}
110213907Sdyson
110313907Sdyson		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
110414644Sdyson
110514644Sdyson		/* Writes of size <= PIPE_BUF must be atomic. */
110613913Sdyson		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
110713913Sdyson			space = 0;
110813907Sdyson
1109118230Spb		if (space > 0) {
1110133049Ssilby			int size;	/* Transfer size */
1111133049Ssilby			int segsize;	/* first segment to transfer */
111276760Salfred
1113133049Ssilby			/*
1114133049Ssilby			 * Transfer size is minimum of uio transfer
1115133049Ssilby			 * and free space in pipe buffer.
1116133049Ssilby			 */
1117133049Ssilby			if (space > uio->uio_resid)
1118133049Ssilby				size = uio->uio_resid;
1119133049Ssilby			else
1120133049Ssilby				size = space;
1121133049Ssilby			/*
1122133049Ssilby			 * First segment to transfer is minimum of
1123133049Ssilby			 * transfer size and contiguous space in
1124133049Ssilby			 * pipe buffer.  If first segment to transfer
1125133049Ssilby			 * is less than the transfer size, we've got
1126133049Ssilby			 * a wraparound in the buffer.
1127133049Ssilby			 */
1128133049Ssilby			segsize = wpipe->pipe_buffer.size -
1129133049Ssilby				wpipe->pipe_buffer.in;
1130133049Ssilby			if (segsize > size)
1131133049Ssilby				segsize = size;
113254534Stegge
1133133049Ssilby			/* Transfer first segment */
1134133049Ssilby
1135133049Ssilby			PIPE_UNLOCK(rpipe);
1136133049Ssilby			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1137133049Ssilby					segsize, uio);
1138133049Ssilby			PIPE_LOCK(rpipe);
1139133049Ssilby
1140133049Ssilby			if (error == 0 && segsize < size) {
1141133049Ssilby				KASSERT(wpipe->pipe_buffer.in + segsize ==
1142133049Ssilby					wpipe->pipe_buffer.size,
1143133049Ssilby					("Pipe buffer wraparound disappeared"));
114454534Stegge				/*
1145133049Ssilby				 * Transfer remaining part now, to
1146133049Ssilby				 * support atomic writes.  Wraparound
1147133049Ssilby				 * happened.
114854534Stegge				 */
1149124394Sdes
115091395Salfred				PIPE_UNLOCK(rpipe);
1151133049Ssilby				error = uiomove(
1152133049Ssilby				    &wpipe->pipe_buffer.buffer[0],
1153133049Ssilby				    size - segsize, uio);
115491395Salfred				PIPE_LOCK(rpipe);
1155133049Ssilby			}
1156133049Ssilby			if (error == 0) {
1157133049Ssilby				wpipe->pipe_buffer.in += size;
1158133049Ssilby				if (wpipe->pipe_buffer.in >=
1159133049Ssilby				    wpipe->pipe_buffer.size) {
1160133049Ssilby					KASSERT(wpipe->pipe_buffer.in ==
1161133049Ssilby						size - segsize +
1162133049Ssilby						wpipe->pipe_buffer.size,
1163133049Ssilby						("Expected wraparound bad"));
1164133049Ssilby					wpipe->pipe_buffer.in = size - segsize;
116554534Stegge				}
1166124394Sdes
1167133049Ssilby				wpipe->pipe_buffer.cnt += size;
1168133049Ssilby				KASSERT(wpipe->pipe_buffer.cnt <=
1169133049Ssilby					wpipe->pipe_buffer.size,
1170133049Ssilby					("Pipe buffer overflow"));
117113675Sdyson			}
1172133049Ssilby			pipeunlock(wpipe);
1173153484Sdelphij			if (error != 0)
1174153484Sdelphij				break;
117513675Sdyson		} else {
117613675Sdyson			/*
117713675Sdyson			 * If the "read-side" has been blocked, wake it up now.
117813675Sdyson			 */
117913675Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
118013675Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
118113675Sdyson				wakeup(wpipe);
118213675Sdyson			}
118314037Sdyson
118413675Sdyson			/*
118513675Sdyson			 * don't block on non-blocking I/O
118613675Sdyson			 */
118716960Sdyson			if (fp->f_flag & FNONBLOCK) {
118813907Sdyson				error = EAGAIN;
1189133049Ssilby				pipeunlock(wpipe);
119013675Sdyson				break;
119113675Sdyson			}
119213907Sdyson
119314037Sdyson			/*
119414037Sdyson			 * We have no more space and have something to offer,
119529356Speter			 * wake up select/poll.
119614037Sdyson			 */
119714037Sdyson			pipeselwakeup(wpipe);
119814037Sdyson
119913675Sdyson			wpipe->pipe_state |= PIPE_WANTW;
1200133049Ssilby			pipeunlock(wpipe);
120191395Salfred			error = msleep(wpipe, PIPE_MTX(rpipe),
120291362Salfred			    PRIBIO | PCATCH, "pipewr", 0);
120376760Salfred			if (error != 0)
120413675Sdyson				break;
120513675Sdyson		}
120613675Sdyson	}
120713675Sdyson
1208133049Ssilby	pipelock(wpipe, 0);
120914644Sdyson	--wpipe->pipe_busy;
121077676Sdillon
121176760Salfred	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
121276760Salfred		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
121313675Sdyson		wakeup(wpipe);
121413675Sdyson	} else if (wpipe->pipe_buffer.cnt > 0) {
121513675Sdyson		/*
121613675Sdyson		 * If we have put any characters in the buffer, we wake up
121713675Sdyson		 * the reader.
121813675Sdyson		 */
121913675Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
122013675Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
122113675Sdyson			wakeup(wpipe);
122213675Sdyson		}
122313675Sdyson	}
122413909Sdyson
122513909Sdyson	/*
122613909Sdyson	 * Don't return EPIPE if I/O was successful
122713909Sdyson	 */
122813907Sdyson	if ((wpipe->pipe_buffer.cnt == 0) &&
122977676Sdillon	    (uio->uio_resid == 0) &&
123077676Sdillon	    (error == EPIPE)) {
123113907Sdyson		error = 0;
123277676Sdillon	}
123313913Sdyson
123424101Sbde	if (error == 0)
123555112Sbde		vfs_timestamp(&wpipe->pipe_mtime);
123624101Sbde
123714037Sdyson	/*
123814037Sdyson	 * We have something to offer,
123929356Speter	 * wake up select/poll.
124014037Sdyson	 */
124114177Sdyson	if (wpipe->pipe_buffer.cnt)
124214037Sdyson		pipeselwakeup(wpipe);
124313907Sdyson
1244133049Ssilby	pipeunlock(wpipe);
124591395Salfred	PIPE_UNLOCK(rpipe);
124676760Salfred	return (error);
124713675Sdyson}
124813675Sdyson
1249175140Sjhb/* ARGSUSED */
1250175140Sjhbstatic int
1251175140Sjhbpipe_truncate(fp, length, active_cred, td)
1252175140Sjhb	struct file *fp;
1253175140Sjhb	off_t length;
1254175140Sjhb	struct ucred *active_cred;
1255175140Sjhb	struct thread *td;
1256175140Sjhb{
1257175140Sjhb
1258175140Sjhb	return (EINVAL);
1259175140Sjhb}
1260175140Sjhb
126113675Sdyson/*
126213675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets.
126313675Sdyson */
1264104094Sphkstatic int
1265102003Srwatsonpipe_ioctl(fp, cmd, data, active_cred, td)
126613675Sdyson	struct file *fp;
126736735Sdfr	u_long cmd;
126899009Salfred	void *data;
1269102003Srwatson	struct ucred *active_cred;
127083366Sjulian	struct thread *td;
127113675Sdyson{
1272109153Sdillon	struct pipe *mpipe = fp->f_data;
1273101768Srwatson	int error;
127413675Sdyson
1275104269Srwatson	PIPE_LOCK(mpipe);
1276104269Srwatson
1277104269Srwatson#ifdef MAC
1278172930Srwatson	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1279121970Srwatson	if (error) {
1280121970Srwatson		PIPE_UNLOCK(mpipe);
1281101768Srwatson		return (error);
1282121970Srwatson	}
1283101768Srwatson#endif
1284101768Srwatson
1285137752Sphk	error = 0;
128613675Sdyson	switch (cmd) {
128713675Sdyson
128813675Sdyson	case FIONBIO:
1289137752Sphk		break;
129013675Sdyson
129113675Sdyson	case FIOASYNC:
129213675Sdyson		if (*(int *)data) {
129313675Sdyson			mpipe->pipe_state |= PIPE_ASYNC;
129413675Sdyson		} else {
129513675Sdyson			mpipe->pipe_state &= ~PIPE_ASYNC;
129613675Sdyson		}
1297137752Sphk		break;
129813675Sdyson
129913675Sdyson	case FIONREAD:
130014037Sdyson		if (mpipe->pipe_state & PIPE_DIRECTW)
130114037Sdyson			*(int *)data = mpipe->pipe_map.cnt;
130214037Sdyson		else
130314037Sdyson			*(int *)data = mpipe->pipe_buffer.cnt;
1304137752Sphk		break;
130513675Sdyson
130641086Struckman	case FIOSETOWN:
1307138032Srwatson		PIPE_UNLOCK(mpipe);
1308137752Sphk		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
1309138032Srwatson		goto out_unlocked;
131041086Struckman
131141086Struckman	case FIOGETOWN:
1312104393Struckman		*(int *)data = fgetown(&mpipe->pipe_sigio);
1313137752Sphk		break;
131413675Sdyson
131541086Struckman	/* This is deprecated, FIOSETOWN should be used instead. */
131641086Struckman	case TIOCSPGRP:
1317138032Srwatson		PIPE_UNLOCK(mpipe);
1318137752Sphk		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1319138032Srwatson		goto out_unlocked;
132041086Struckman
132141086Struckman	/* This is deprecated, FIOGETOWN should be used instead. */
132218863Sdyson	case TIOCGPGRP:
1323104393Struckman		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1324137752Sphk		break;
132513675Sdyson
1326137752Sphk	default:
1327137752Sphk		error = ENOTTY;
1328137764Sphk		break;
132913675Sdyson	}
1330104269Srwatson	PIPE_UNLOCK(mpipe);
1331138032Srwatsonout_unlocked:
1332137752Sphk	return (error);
133313675Sdyson}
133413675Sdyson
1335104094Sphkstatic int
1336101983Srwatsonpipe_poll(fp, events, active_cred, td)
133713675Sdyson	struct file *fp;
133829356Speter	int events;
1339101983Srwatson	struct ucred *active_cred;
134083366Sjulian	struct thread *td;
134113675Sdyson{
1342109153Sdillon	struct pipe *rpipe = fp->f_data;
134313675Sdyson	struct pipe *wpipe;
134429356Speter	int revents = 0;
1345101768Srwatson#ifdef MAC
1346101768Srwatson	int error;
1347101768Srwatson#endif
134813675Sdyson
134913675Sdyson	wpipe = rpipe->pipe_peer;
135091362Salfred	PIPE_LOCK(rpipe);
1351101768Srwatson#ifdef MAC
1352172930Srwatson	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
1353101768Srwatson	if (error)
1354101768Srwatson		goto locked_error;
1355101768Srwatson#endif
135629356Speter	if (events & (POLLIN | POLLRDNORM))
135729356Speter		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
135829356Speter		    (rpipe->pipe_buffer.cnt > 0) ||
135929356Speter		    (rpipe->pipe_state & PIPE_EOF))
136029356Speter			revents |= events & (POLLIN | POLLRDNORM);
136113675Sdyson
136229356Speter	if (events & (POLLOUT | POLLWRNORM))
1363179243Skib		if (wpipe->pipe_present != PIPE_ACTIVE ||
1364179243Skib		    (wpipe->pipe_state & PIPE_EOF) ||
136543311Sdillon		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
136643311Sdillon		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
136729356Speter			revents |= events & (POLLOUT | POLLWRNORM);
136813675Sdyson
136929356Speter	if ((rpipe->pipe_state & PIPE_EOF) ||
1370179243Skib	    wpipe->pipe_present != PIPE_ACTIVE ||
137129356Speter	    (wpipe->pipe_state & PIPE_EOF))
137229356Speter		revents |= POLLHUP;
137329356Speter
137429356Speter	if (revents == 0) {
137529356Speter		if (events & (POLLIN | POLLRDNORM)) {
137683805Sjhb			selrecord(td, &rpipe->pipe_sel);
1377174647Sjeff			if (SEL_WAITING(&rpipe->pipe_sel))
1378174647Sjeff				rpipe->pipe_state |= PIPE_SEL;
137913675Sdyson		}
138013675Sdyson
138129356Speter		if (events & (POLLOUT | POLLWRNORM)) {
138283805Sjhb			selrecord(td, &wpipe->pipe_sel);
1383174647Sjeff			if (SEL_WAITING(&wpipe->pipe_sel))
1384174647Sjeff				wpipe->pipe_state |= PIPE_SEL;
138513907Sdyson		}
138613675Sdyson	}
1387101768Srwatson#ifdef MAC
1388101768Srwatsonlocked_error:
1389101768Srwatson#endif
139091362Salfred	PIPE_UNLOCK(rpipe);
139129356Speter
139229356Speter	return (revents);
139313675Sdyson}
139413675Sdyson
139598989Salfred/*
139698989Salfred * We shouldn't need locks here as we're doing a read and this should
139798989Salfred * be a natural race.
139898989Salfred */
139952983Speterstatic int
1400101983Srwatsonpipe_stat(fp, ub, active_cred, td)
140152983Speter	struct file *fp;
140252983Speter	struct stat *ub;
1403101983Srwatson	struct ucred *active_cred;
140483366Sjulian	struct thread *td;
140513675Sdyson{
1406109153Sdillon	struct pipe *pipe = fp->f_data;
1407101768Srwatson#ifdef MAC
1408101768Srwatson	int error;
140952983Speter
1410104269Srwatson	PIPE_LOCK(pipe);
1411172930Srwatson	error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
1412104269Srwatson	PIPE_UNLOCK(pipe);
1413101768Srwatson	if (error)
1414101768Srwatson		return (error);
1415101768Srwatson#endif
1416100527Salfred	bzero(ub, sizeof(*ub));
141717124Sbde	ub->st_mode = S_IFIFO;
1418133790Ssilby	ub->st_blksize = PAGE_SIZE;
1419132436Ssilby	if (pipe->pipe_state & PIPE_DIRECTW)
1420132436Ssilby		ub->st_size = pipe->pipe_map.cnt;
1421132436Ssilby	else
1422132436Ssilby		ub->st_size = pipe->pipe_buffer.cnt;
142313675Sdyson	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
142434901Sphk	ub->st_atimespec = pipe->pipe_atime;
142534901Sphk	ub->st_mtimespec = pipe->pipe_mtime;
142634901Sphk	ub->st_ctimespec = pipe->pipe_ctime;
142760404Schris	ub->st_uid = fp->f_cred->cr_uid;
142860404Schris	ub->st_gid = fp->f_cred->cr_gid;
142917124Sbde	/*
143060404Schris	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
143117124Sbde	 * XXX (st_dev, st_ino) should be unique.
143217124Sbde	 */
143376760Salfred	return (0);
143413675Sdyson}
143513675Sdyson
143613675Sdyson/* ARGSUSED */
143713675Sdysonstatic int
143883366Sjulianpipe_close(fp, td)
143913675Sdyson	struct file *fp;
144083366Sjulian	struct thread *td;
144113675Sdyson{
1442109153Sdillon	struct pipe *cpipe = fp->f_data;
144316322Sgpalmer
144449413Sgreen	fp->f_ops = &badfileops;
1445109153Sdillon	fp->f_data = NULL;
144696122Salfred	funsetown(&cpipe->pipe_sigio);
144713675Sdyson	pipeclose(cpipe);
144876760Salfred	return (0);
144913675Sdyson}
145013675Sdyson
145176364Salfredstatic void
145276364Salfredpipe_free_kmem(cpipe)
145376364Salfred	struct pipe *cpipe;
145476364Salfred{
145591412Salfred
1456125293Srwatson	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1457125293Srwatson	    ("pipe_free_kmem: pipe mutex locked"));
145876364Salfred
145976364Salfred	if (cpipe->pipe_buffer.buffer != NULL) {
1460110816Salc		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1461118764Ssilby		vm_map_remove(pipe_map,
1462118764Ssilby		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1463118764Ssilby		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
146476364Salfred		cpipe->pipe_buffer.buffer = NULL;
146576364Salfred	}
146676364Salfred#ifndef PIPE_NODIRECT
1467127501Salc	{
146876364Salfred		cpipe->pipe_map.cnt = 0;
146976364Salfred		cpipe->pipe_map.pos = 0;
147076364Salfred		cpipe->pipe_map.npages = 0;
147176364Salfred	}
147276364Salfred#endif
147376364Salfred}
147476364Salfred
147513675Sdyson/*
147613675Sdyson * shutdown the pipe
147713675Sdyson */
147813675Sdysonstatic void
147913675Sdysonpipeclose(cpipe)
148013675Sdyson	struct pipe *cpipe;
148113675Sdyson{
1482125293Srwatson	struct pipepair *pp;
148313907Sdyson	struct pipe *ppipe;
148476364Salfred
1485125293Srwatson	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
148691968Salfred
1487125293Srwatson	PIPE_LOCK(cpipe);
1488133049Ssilby	pipelock(cpipe, 0);
1489125293Srwatson	pp = cpipe->pipe_pair;
149091968Salfred
149191968Salfred	pipeselwakeup(cpipe);
149213907Sdyson
149391968Salfred	/*
149491968Salfred	 * If the other side is blocked, wake it up saying that
149591968Salfred	 * we want to close it down.
149691968Salfred	 */
1497126131Sgreen	cpipe->pipe_state |= PIPE_EOF;
149891968Salfred	while (cpipe->pipe_busy) {
149991968Salfred		wakeup(cpipe);
1500126131Sgreen		cpipe->pipe_state |= PIPE_WANT;
1501133049Ssilby		pipeunlock(cpipe);
150291968Salfred		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1503133049Ssilby		pipelock(cpipe, 0);
150491968Salfred	}
150513675Sdyson
1506101768Srwatson
150791968Salfred	/*
1508125293Srwatson	 * Disconnect from peer, if any.
150991968Salfred	 */
1510125293Srwatson	ppipe = cpipe->pipe_peer;
1511179243Skib	if (ppipe->pipe_present == PIPE_ACTIVE) {
151291968Salfred		pipeselwakeup(ppipe);
151313907Sdyson
151491968Salfred		ppipe->pipe_state |= PIPE_EOF;
151591968Salfred		wakeup(ppipe);
1516133741Sjmg		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
151791968Salfred	}
1518125293Srwatson
151991968Salfred	/*
1520125293Srwatson	 * Mark this endpoint as free.  Release kmem resources.  We
1521125293Srwatson	 * don't mark this endpoint as unused until we've finished
1522125293Srwatson	 * doing that, or the pipe might disappear out from under
1523125293Srwatson	 * us.
152491968Salfred	 */
1525125293Srwatson	PIPE_UNLOCK(cpipe);
1526125293Srwatson	pipe_free_kmem(cpipe);
1527125293Srwatson	PIPE_LOCK(cpipe);
1528179243Skib	cpipe->pipe_present = PIPE_CLOSING;
1529126131Sgreen	pipeunlock(cpipe);
1530179243Skib
1531179243Skib	/*
1532179243Skib	 * knlist_clear() may sleep dropping the PIPE_MTX. Set the
1533179243Skib	 * PIPE_FINALIZED, that allows other end to free the
1534179243Skib	 * pipe_pair, only after the knotes are completely dismantled.
1535179243Skib	 */
1536133741Sjmg	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1537179243Skib	cpipe->pipe_present = PIPE_FINALIZED;
1538133741Sjmg	knlist_destroy(&cpipe->pipe_sel.si_note);
1539125293Srwatson
1540125293Srwatson	/*
1541125293Srwatson	 * If both endpoints are now closed, release the memory for the
1542125293Srwatson	 * pipe pair.  If not, unlock.
1543125293Srwatson	 */
1544179243Skib	if (ppipe->pipe_present == PIPE_FINALIZED) {
154591968Salfred		PIPE_UNLOCK(cpipe);
1546125293Srwatson#ifdef MAC
1547172930Srwatson		mac_pipe_destroy(pp);
1548125293Srwatson#endif
1549125293Srwatson		uma_zfree(pipe_zone, cpipe->pipe_pair);
1550125293Srwatson	} else
1551125293Srwatson		PIPE_UNLOCK(cpipe);
155213675Sdyson}
155359288Sjlemon
155472521Sjlemon/*ARGSUSED*/
155559288Sjlemonstatic int
155672521Sjlemonpipe_kqfilter(struct file *fp, struct knote *kn)
155759288Sjlemon{
155889306Salfred	struct pipe *cpipe;
155959288Sjlemon
1560109153Sdillon	cpipe = kn->kn_fp->f_data;
1561126131Sgreen	PIPE_LOCK(cpipe);
156272521Sjlemon	switch (kn->kn_filter) {
156372521Sjlemon	case EVFILT_READ:
156472521Sjlemon		kn->kn_fop = &pipe_rfiltops;
156572521Sjlemon		break;
156672521Sjlemon	case EVFILT_WRITE:
156772521Sjlemon		kn->kn_fop = &pipe_wfiltops;
1568179243Skib		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
1569101382Sdes			/* other end of pipe has been closed */
1570126131Sgreen			PIPE_UNLOCK(cpipe);
1571118929Sjmg			return (EPIPE);
1572126131Sgreen		}
1573126131Sgreen		cpipe = cpipe->pipe_peer;
157472521Sjlemon		break;
157572521Sjlemon	default:
1576126131Sgreen		PIPE_UNLOCK(cpipe);
1577133741Sjmg		return (EINVAL);
157872521Sjlemon	}
157978292Sjlemon
1580133741Sjmg	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
158191372Salfred	PIPE_UNLOCK(cpipe);
158259288Sjlemon	return (0);
158359288Sjlemon}
158459288Sjlemon
158559288Sjlemonstatic void
158659288Sjlemonfilt_pipedetach(struct knote *kn)
158759288Sjlemon{
1588121018Sjmg	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
158959288Sjlemon
1590126131Sgreen	PIPE_LOCK(cpipe);
1591179242Skib	if (kn->kn_filter == EVFILT_WRITE)
1592121018Sjmg		cpipe = cpipe->pipe_peer;
1593133741Sjmg	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
159491372Salfred	PIPE_UNLOCK(cpipe);
159559288Sjlemon}
159659288Sjlemon
159759288Sjlemon/*ARGSUSED*/
159859288Sjlemonstatic int
159959288Sjlemonfilt_piperead(struct knote *kn, long hint)
160059288Sjlemon{
1601109153Sdillon	struct pipe *rpipe = kn->kn_fp->f_data;
160259288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
1603133741Sjmg	int ret;
160459288Sjlemon
160591372Salfred	PIPE_LOCK(rpipe);
160659288Sjlemon	kn->kn_data = rpipe->pipe_buffer.cnt;
160759288Sjlemon	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
160859288Sjlemon		kn->kn_data = rpipe->pipe_map.cnt;
160959288Sjlemon
161059288Sjlemon	if ((rpipe->pipe_state & PIPE_EOF) ||
1611179243Skib	    wpipe->pipe_present != PIPE_ACTIVE ||
1612179243Skib	    (wpipe->pipe_state & PIPE_EOF)) {
161391372Salfred		kn->kn_flags |= EV_EOF;
161491372Salfred		PIPE_UNLOCK(rpipe);
161559288Sjlemon		return (1);
161659288Sjlemon	}
1617133741Sjmg	ret = kn->kn_data > 0;
161891372Salfred	PIPE_UNLOCK(rpipe);
1619133741Sjmg	return ret;
162059288Sjlemon}
162159288Sjlemon
162259288Sjlemon/*ARGSUSED*/
162359288Sjlemonstatic int
162459288Sjlemonfilt_pipewrite(struct knote *kn, long hint)
162559288Sjlemon{
1626109153Sdillon	struct pipe *rpipe = kn->kn_fp->f_data;
162759288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
162859288Sjlemon
162991372Salfred	PIPE_LOCK(rpipe);
1630179243Skib	if (wpipe->pipe_present != PIPE_ACTIVE ||
1631179243Skib	    (wpipe->pipe_state & PIPE_EOF)) {
163259288Sjlemon		kn->kn_data = 0;
1633124394Sdes		kn->kn_flags |= EV_EOF;
163491372Salfred		PIPE_UNLOCK(rpipe);
163559288Sjlemon		return (1);
163659288Sjlemon	}
163759288Sjlemon	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
163865855Sjlemon	if (wpipe->pipe_state & PIPE_DIRECTW)
163959288Sjlemon		kn->kn_data = 0;
164059288Sjlemon
164191372Salfred	PIPE_UNLOCK(rpipe);
164259288Sjlemon	return (kn->kn_data >= PIPE_BUF);
164359288Sjlemon}
1644