/* sys_pipe.c -- FreeBSD revision 224914 */
/*-
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */
1999026Sjulian
/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the sending process pins the underlying pages in
 * memory, and the receiving process copies directly from these pinned pages
 * in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map. This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */
90111028Sjeff
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 224914 2011-08-16 20:07:47Z kib $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */
13699026Sjulian
13799026Sjulian/*
13899026Sjulian * interfaces to the outside world
13999026Sjulian */
14099026Sjulianstatic fo_rdwr_t	pipe_read;
141103216Sjulianstatic fo_rdwr_t	pipe_write;
142113339Sjulianstatic fo_truncate_t	pipe_truncate;
14399026Sjulianstatic fo_ioctl_t	pipe_ioctl;
14499026Sjulianstatic fo_poll_t	pipe_poll;
14599026Sjulianstatic fo_kqfilter_t	pipe_kqfilter;
14699026Sjulianstatic fo_stat_t	pipe_stat;
14799026Sjulianstatic fo_close_t	pipe_close;
14899026Sjulian
14999026Sjulianstatic struct fileops pipeops = {
15099026Sjulian	.fo_read = pipe_read,
15199026Sjulian	.fo_write = pipe_write,
15299026Sjulian	.fo_truncate = pipe_truncate,
15399026Sjulian	.fo_ioctl = pipe_ioctl,
15499026Sjulian	.fo_poll = pipe_poll,
15599026Sjulian	.fo_kqfilter = pipe_kqfilter,
15699026Sjulian	.fo_stat = pipe_stat,
15799026Sjulian	.fo_close = pipe_close,
158103216Sjulian	.fo_chmod = invfo_chmod,
159103216Sjulian	.fo_chown = invfo_chown,
160103216Sjulian	.fo_flags = DFLAG_PASSABLE
16199026Sjulian};
16299026Sjulian
16399026Sjulianstatic void	filt_pipedetach(struct knote *kn);
16499026Sjulianstatic int	filt_piperead(struct knote *kn, long hint);
16599026Sjulianstatic int	filt_pipewrite(struct knote *kn, long hint);
16699026Sjulian
16799026Sjulianstatic struct filterops pipe_rfiltops = {
168103216Sjulian	.f_isfd = 1,
16999026Sjulian	.f_detach = filt_pipedetach,
17099026Sjulian	.f_event = filt_piperead
17199026Sjulian};
17299026Sjulianstatic struct filterops pipe_wfiltops = {
17399026Sjulian	.f_isfd = 1,
17499026Sjulian	.f_detach = filt_pipedetach,
17599026Sjulian	.f_event = filt_pipewrite
17699026Sjulian};
17799026Sjulian
17899026Sjulian/*
17999026Sjulian * Default pipe buffer size(s), this can be kind-of large now because pipe
18099026Sjulian * space is pageable.  The pipe code will try to maintain locality of
18199026Sjulian * reference for performance reasons, so small amounts of outstanding I/O
18299026Sjulian * will not wipe the cache.
18399026Sjulian */
18499026Sjulian#define MINPIPESIZE (PIPE_SIZE/3)
18599026Sjulian#define MAXPIPESIZE (2*PIPE_SIZE/3)
186103312Sjulian
187104354Sscottlstatic long amountpipekva;
188103312Sjulianstatic int pipefragretry;
18999026Sjulianstatic int pipeallocfail;
190107126Sjeffstatic int piperesizefail;
19199026Sjulianstatic int piperesizeallowed = 1;
19299026Sjulian
19399026SjulianSYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
19499026Sjulian	   &maxpipekva, 0, "Pipe KVA limit");
19599026SjulianSYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
19699026Sjulian	   &amountpipekva, 0, "Pipe KVA usage");
19799026SjulianSYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
19899026Sjulian	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
19999026SjulianSYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
20099026Sjulian	  &pipeallocfail, 0, "Pipe allocation failures");
20199026SjulianSYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
20299026Sjulian	  &piperesizefail, 0, "Pipe resize failures");
20399026SjulianSYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
204111028Sjeff	  &piperesizeallowed, 0, "Pipe resizing allowed");
205107126Sjeff
206107126Sjeffstatic void pipeinit(void *dummy __unused);
207107126Sjeffstatic void pipeclose(struct pipe *cpipe);
208107126Sjeffstatic void pipe_free_kmem(struct pipe *cpipe);
209107126Sjeffstatic int pipe_create(struct pipe *pipe, int backing);
210107126Sjeffstatic __inline int pipelock(struct pipe *cpipe, int catch);
211107126Sjeffstatic __inline void pipeunlock(struct pipe *cpipe);
21299026Sjulianstatic __inline void pipeselwakeup(struct pipe *cpipe);
213107126Sjeff#ifndef PIPE_NODIRECT
214107126Sjeffstatic int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
215107126Sjeffstatic void pipe_destroy_write_buffer(struct pipe *wpipe);
216111028Sjeffstatic int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
217107126Sjeffstatic void pipe_clone_write_buffer(struct pipe *wpipe);
218107126Sjeff#endif
219107126Sjeffstatic int pipespace(struct pipe *cpipe, int size);
220107126Sjeffstatic int pipespace_new(struct pipe *cpipe, int size);
221107126Sjeff
222107126Sjeffstatic int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
223107126Sjeffstatic int	pipe_zone_init(void *mem, int size, int flags);
224107126Sjeffstatic void	pipe_zone_fini(void *mem, int size);
225107126Sjeff
226107126Sjeffstatic uma_zone_t pipe_zone;
227107126Sjeff
228107126SjeffSYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
229105854Sjulian
230111028Sjeffstatic void
231105854Sjulianpipeinit(void *dummy __unused)
232105854Sjulian{
233105854Sjulian
234105854Sjulian	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
235105854Sjulian	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
236105854Sjulian	    UMA_ALIGN_PTR, 0);
237105854Sjulian	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
238105854Sjulian}
239111028Sjeff
240105854Sjulianstatic int
241105854Sjulianpipe_zone_ctor(void *mem, int size, void *arg, int flags)
242105854Sjulian{
243111028Sjeff	struct pipepair *pp;
244111028Sjeff	struct pipe *rpipe, *wpipe;
245105854Sjulian
246105854Sjulian	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
247105854Sjulian
248105854Sjulian	pp = (struct pipepair *)mem;
249105854Sjulian
250105854Sjulian	/*
251105854Sjulian	 * We zero both pipe endpoints to make sure all the kmem pointers
252105854Sjulian	 * are NULL, flag fields are zero'd, etc.  We timestamp both
253105854Sjulian	 * endpoints with the same time.
254105854Sjulian	 */
255111028Sjeff	rpipe = &pp->pp_rpipe;
256111028Sjeff	bzero(rpipe, sizeof(*rpipe));
257111028Sjeff	vfs_timestamp(&rpipe->pipe_ctime);
258105854Sjulian	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
259111028Sjeff
260111028Sjeff	wpipe = &pp->pp_wpipe;
261105854Sjulian	bzero(wpipe, sizeof(*wpipe));
262105854Sjulian	wpipe->pipe_ctime = rpipe->pipe_ctime;
263105854Sjulian	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
264105854Sjulian
265105854Sjulian	rpipe->pipe_peer = wpipe;
266105854Sjulian	rpipe->pipe_pair = pp;
267105854Sjulian	wpipe->pipe_peer = rpipe;
268105854Sjulian	wpipe->pipe_pair = pp;
269105854Sjulian
270105854Sjulian	/*
271105854Sjulian	 * Mark both endpoints as present; they will later get free'd
272105854Sjulian	 * one at a time.  When both are free'd, then the whole pair
273105854Sjulian	 * is released.
274105854Sjulian	 */
275111028Sjeff	rpipe->pipe_present = PIPE_ACTIVE;
276111028Sjeff	wpipe->pipe_present = PIPE_ACTIVE;
277111028Sjeff
278111028Sjeff	/*
279111028Sjeff	 * Eventually, the MAC Framework may initialize the label
280111028Sjeff	 * in ctor or init, but for now we do it elswhere to avoid
281111028Sjeff	 * blocking in ctor or init.
282105854Sjulian	 */
283111028Sjeff	pp->pp_label = NULL;
284111028Sjeff
285111028Sjeff	return (0);
286111028Sjeff}
287111028Sjeff
288111028Sjeffstatic int
289105854Sjulianpipe_zone_init(void *mem, int size, int flags)
290105854Sjulian{
291105854Sjulian	struct pipepair *pp;
292105854Sjulian
293105854Sjulian	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
294105854Sjulian
295105854Sjulian	pp = (struct pipepair *)mem;
296105854Sjulian
297105854Sjulian	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
298105854Sjulian	return (0);
299111028Sjeff}
300111028Sjeff
301111028Sjeffstatic void
302111028Sjeffpipe_zone_fini(void *mem, int size)
303105854Sjulian{
304105854Sjulian	struct pipepair *pp;
305105854Sjulian
306105854Sjulian	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
307105854Sjulian
308105854Sjulian	pp = (struct pipepair *)mem;
309105854Sjulian
310105854Sjulian	mtx_destroy(&pp->pp_mtx);
311105854Sjulian}
312111028Sjeff
313111028Sjeff/*
314111028Sjeff * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
315111028Sjeff * the zone pick up the pieces via pipeclose().
316111028Sjeff */
317111125Sdavidxuint
318111028Sjeffkern_pipe(struct thread *td, int fildes[2])
319111028Sjeff{
320111028Sjeff	struct filedesc *fdp = td->td_proc->p_fd;
321111028Sjeff	struct file *rf, *wf;
322111028Sjeff	struct pipepair *pp;
323111028Sjeff	struct pipe *rpipe, *wpipe;
324111028Sjeff	int fd, error;
325111028Sjeff
326111028Sjeff	pp = uma_zalloc(pipe_zone, M_WAITOK);
327111028Sjeff#ifdef MAC
328111028Sjeff	/*
329111028Sjeff	 * The MAC label is shared between the connected endpoints.  As a
330111028Sjeff	 * result mac_pipe_init() and mac_pipe_create() are called once
331111028Sjeff	 * for the pair, and not on the endpoints.
332111028Sjeff	 */
333111028Sjeff	mac_pipe_init(pp);
334111028Sjeff	mac_pipe_create(td->td_ucred, pp);
335111028Sjeff#endif
336111028Sjeff	rpipe = &pp->pp_rpipe;
337111028Sjeff	wpipe = &pp->pp_wpipe;
338111028Sjeff
339111028Sjeff	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
340111028Sjeff	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
341111028Sjeff
342111028Sjeff	/* Only the forward direction pipe is backed by default */
343111028Sjeff	if ((error = pipe_create(rpipe, 1)) != 0 ||
344111028Sjeff	    (error = pipe_create(wpipe, 0)) != 0) {
345111028Sjeff		pipeclose(rpipe);
346111028Sjeff		pipeclose(wpipe);
347111028Sjeff		return (error);
348111028Sjeff	}
349111028Sjeff
350111028Sjeff	rpipe->pipe_state |= PIPE_DIRECTOK;
351111028Sjeff	wpipe->pipe_state |= PIPE_DIRECTOK;
352111028Sjeff
353111028Sjeff	error = falloc(td, &rf, &fd, 0);
354111028Sjeff	if (error) {
355111028Sjeff		pipeclose(rpipe);
356111028Sjeff		pipeclose(wpipe);
357111028Sjeff		return (error);
358111028Sjeff	}
359111028Sjeff	/* An extra reference on `rf' has been held for us by falloc(). */
360111028Sjeff	fildes[0] = fd;
361111028Sjeff
36299026Sjulian	/*
363111028Sjeff	 * Warning: once we've gotten past allocation of the fd for the
364111028Sjeff	 * read-side, we can only drop the read side via fdrop() in order
365105854Sjulian	 * to avoid races against processes which manage to dup() the read
366105854Sjulian	 * side while we are blocked trying to allocate the write side.
367105854Sjulian	 */
368111028Sjeff	finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
369105854Sjulian	error = falloc(td, &wf, &fd, 0);
370105854Sjulian	if (error) {
371105854Sjulian		fdclose(fdp, rf, fildes[0], td);
372105854Sjulian		fdrop(rf, td);
373105854Sjulian		/* rpipe has been closed by fdrop(). */
374105854Sjulian		pipeclose(wpipe);
375105854Sjulian		return (error);
376105854Sjulian	}
377105854Sjulian	/* An extra reference on `wf' has been held for us by falloc(). */
378105854Sjulian	finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
379105854Sjulian	fdrop(wf, td);
380105854Sjulian	fildes[1] = fd;
381105854Sjulian	fdrop(rf, td);
382111028Sjeff
383111028Sjeff	return (0);
384111028Sjeff}
385111028Sjeff
386111028Sjeff/* ARGSUSED */
387105854Sjulianint
388105854Sjulianpipe(struct thread *td, struct pipe_args *uap)
389105854Sjulian{
390106180Sdavidxu	int error;
391106180Sdavidxu	int fildes[2];
392105854Sjulian
393106242Sdavidxu	error = kern_pipe(td, fildes);
394111585Sjulian	if (error)
395106242Sdavidxu		return (error);
396106180Sdavidxu
397106180Sdavidxu	td->td_retval[0] = fildes[0];
398106180Sdavidxu	td->td_retval[1] = fildes[1];
399106180Sdavidxu
400106180Sdavidxu	return (0);
401106180Sdavidxu}
402106180Sdavidxu
403106180Sdavidxu/*
404106180Sdavidxu * Allocate kva for pipe circular buffer, the space is pageable
405111028Sjeff * This routine will 'realloc' the size of a pipe safely, if it fails
406106180Sdavidxu * it will retain the old buffer.
407106182Sdavidxu * If it fails it will return ENOMEM.
408106180Sdavidxu */
409106180Sdavidxustatic int
410106180Sdavidxupipespace_new(cpipe, size)
411106182Sdavidxu	struct pipe *cpipe;
412105854Sjulian	int size;
413105854Sjulian{
414111028Sjeff	caddr_t buffer;
415111028Sjeff	int error, cnt, firstseg;
416111028Sjeff	static int curfail = 0;
417111028Sjeff	static struct timeval lastfail;
418111028Sjeff
419105854Sjulian	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
420105854Sjulian	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
421105854Sjulian		("pipespace: resize of direct writes not allowed"));
422105854Sjulianretry:
423105854Sjulian	cnt = cpipe->pipe_buffer.cnt;
424108640Sdavidxu	if (cnt > size)
425105854Sjulian		size = cnt;
426105854Sjulian
427113793Sdavidxu	size = round_page(size);
428106182Sdavidxu	buffer = (caddr_t) vm_map_min(pipe_map);
429105854Sjulian
430111028Sjeff	error = vm_map_find(pipe_map, NULL, 0,
431105854Sjulian		(vm_offset_t *) &buffer, size, 1,
432105854Sjulian		VM_PROT_ALL, VM_PROT_ALL, 0);
433111028Sjeff	if (error != KERN_SUCCESS) {
434105854Sjulian		if ((cpipe->pipe_buffer.buffer == NULL) &&
435105854Sjulian			(size > SMALL_PIPE_SIZE)) {
436105854Sjulian			size = SMALL_PIPE_SIZE;
437105854Sjulian			pipefragretry++;
438108640Sdavidxu			goto retry;
439111028Sjeff		}
440108640Sdavidxu		if (cpipe->pipe_buffer.buffer == NULL) {
441111028Sjeff			pipeallocfail++;
442111585Sjulian			if (ppsratecheck(&lastfail, &curfail, 1))
443105854Sjulian				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
444105854Sjulian		} else {
445105854Sjulian			piperesizefail++;
446111028Sjeff		}
447111028Sjeff		return (ENOMEM);
448111028Sjeff	}
449111028Sjeff
450112071Sdavidxu	/* copy data, then free old resources if we're resizing */
451105854Sjulian	if (cnt > 0) {
452105854Sjulian		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
453105854Sjulian			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
454106182Sdavidxu			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
455105854Sjulian				buffer, firstseg);
456105854Sjulian			if ((cnt - firstseg) > 0)
457107719Sjulian				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
458108338Sjulian					cpipe->pipe_buffer.in);
459111028Sjeff		} else {
460107719Sjulian			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
461111028Sjeff				buffer, cnt);
462111028Sjeff		}
463111169Sdavidxu	}
464111028Sjeff	pipe_free_kmem(cpipe);
465111028Sjeff	cpipe->pipe_buffer.buffer = buffer;
466105854Sjulian	cpipe->pipe_buffer.size = size;
467111028Sjeff	cpipe->pipe_buffer.in = cnt;
468105854Sjulian	cpipe->pipe_buffer.out = 0;
469105854Sjulian	cpipe->pipe_buffer.cnt = cnt;
470107719Sjulian	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
471111169Sdavidxu	return (0);
472111169Sdavidxu}
473111169Sdavidxu
474105854Sjulian/*
475105854Sjulian * Wrapper for pipespace_new() that performs locking assertions.
476107719Sjulian */
477113793Sdavidxustatic int
478107719Sjulianpipespace(cpipe, size)
479111169Sdavidxu	struct pipe *cpipe;
480111169Sdavidxu	int size;
481111169Sdavidxu{
482111169Sdavidxu
483111169Sdavidxu	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
484111169Sdavidxu		("Unlocked pipe passed to pipespace"));
485111169Sdavidxu	return (pipespace_new(cpipe, size));
486108613Sjulian}
487108338Sjulian
488111028Sjeff/*
489112888Sjeff * lock a pipe for I/O, blocking other access
490111042Sdavidxu */
491111042Sdavidxustatic __inline int
492112888Sjeffpipelock(cpipe, catch)
493111169Sdavidxu	struct pipe *cpipe;
494111169Sdavidxu	int catch;
495111169Sdavidxu{
496111169Sdavidxu	int error;
497111028Sjeff
498111169Sdavidxu	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
499111169Sdavidxu	while (cpipe->pipe_state & PIPE_LOCKFL) {
500111028Sjeff		cpipe->pipe_state |= PIPE_LWANT;
501110190Sjulian		error = msleep(cpipe, PIPE_MTX(cpipe),
502111169Sdavidxu		    catch ? (PRIBIO | PCATCH) : PRIBIO,
503111169Sdavidxu		    "pipelk", 0);
504111169Sdavidxu		if (error != 0)
505111169Sdavidxu			return (error);
506111169Sdavidxu	}
507111169Sdavidxu	cpipe->pipe_state |= PIPE_LOCKFL;
508111169Sdavidxu	return (0);
509111169Sdavidxu}
510111169Sdavidxu
511105854Sjulian/*
512111169Sdavidxu * unlock a pipe I/O lock
513107719Sjulian */
514105854Sjulianstatic __inline void
515105854Sjulianpipeunlock(cpipe)
516105854Sjulian	struct pipe *cpipe;
517105854Sjulian{
518105854Sjulian
519105854Sjulian	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
520105854Sjulian	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
521105854Sjulian		("Unlocked pipe passed to pipeunlock"));
522105854Sjulian	cpipe->pipe_state &= ~PIPE_LOCKFL;
523105854Sjulian	if (cpipe->pipe_state & PIPE_LWANT) {
524111028Sjeff		cpipe->pipe_state &= ~PIPE_LWANT;
525108338Sjulian		wakeup(cpipe);
526105854Sjulian	}
527105854Sjulian}
528108338Sjulian
529111028Sjeffstatic __inline void
530105854Sjulianpipeselwakeup(cpipe)
531111585Sjulian	struct pipe *cpipe;
532111028Sjeff{
533111028Sjeff
534108613Sjulian	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
535105854Sjulian	if (cpipe->pipe_state & PIPE_SEL) {
536105854Sjulian		selwakeuppri(&cpipe->pipe_sel, PSOCK);
537111028Sjeff		if (!SEL_WAITING(&cpipe->pipe_sel))
538111207Sdavidxu			cpipe->pipe_state &= ~PIPE_SEL;
539111028Sjeff	}
540108613Sjulian	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
541111028Sjeff		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
542108338Sjulian	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
543105854Sjulian}
544105854Sjulian
545105854Sjulian/*
546111028Sjeff * Initialize and allocate VM and memory for pipe.  The structure
547111028Sjeff * will start out zero'd from the ctor, so we just manage the kmem.
548111028Sjeff */
549111028Sjeffstatic int
550111028Sjeffpipe_create(pipe, backing)
551108338Sjulian	struct pipe *pipe;
552111028Sjeff	int backing;
553105854Sjulian{
554111028Sjeff	int error;
555111028Sjeff
556111028Sjeff	if (backing) {
557111028Sjeff		if (amountpipekva > maxpipekva / 2)
558111028Sjeff			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
559111028Sjeff		else
560111028Sjeff			error = pipespace_new(pipe, PIPE_SIZE);
561111028Sjeff	} else {
562108613Sjulian		/* If we're not backing this pipe, no need to do anything. */
563105854Sjulian		error = 0;
564111028Sjeff	}
565108338Sjulian	return (error);
566108613Sjulian}
567105854Sjulian
568111028Sjeff/* ARGSUSED */
569108338Sjulianstatic int
570105854Sjulianpipe_read(fp, uio, active_cred, flags, td)
571105854Sjulian	struct file *fp;
572105854Sjulian	struct uio *uio;
573105854Sjulian	struct ucred *active_cred;
574111028Sjeff	struct thread *td;
575105854Sjulian	int flags;
576105854Sjulian{
577105854Sjulian	struct pipe *rpipe = fp->f_data;
578105854Sjulian	int error;
579105854Sjulian	int nread = 0;
580105854Sjulian	u_int size;
581105854Sjulian
582105854Sjulian	PIPE_LOCK(rpipe);
583105854Sjulian	++rpipe->pipe_busy;
584105854Sjulian	error = pipelock(rpipe, 1);
585105854Sjulian	if (error)
586105854Sjulian		goto unlocked_error;
587105854Sjulian
588111028Sjeff#ifdef MAC
589111028Sjeff	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
590105854Sjulian	if (error)
591105854Sjulian		goto locked_error;
592105854Sjulian#endif
593105854Sjulian	if (amountpipekva > (3 * maxpipekva) / 4) {
594105854Sjulian		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
595111028Sjeff			(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
596111028Sjeff			(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
597111028Sjeff			(piperesizeallowed == 1)) {
598111028Sjeff			PIPE_UNLOCK(rpipe);
599111028Sjeff			pipespace(rpipe, SMALL_PIPE_SIZE);
600111028Sjeff			PIPE_LOCK(rpipe);
601111028Sjeff		}
602111028Sjeff	}
603111028Sjeff
604111028Sjeff	while (uio->uio_resid) {
605112078Sdavidxu		/*
606111585Sjulian		 * normal pipe buffer receive
607112078Sdavidxu		 */
608105854Sjulian		if (rpipe->pipe_buffer.cnt > 0) {
609105854Sjulian			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
610111028Sjeff			if (size > rpipe->pipe_buffer.cnt)
611107006Sdavidxu				size = rpipe->pipe_buffer.cnt;
612107006Sdavidxu			if (size > (u_int) uio->uio_resid)
613105854Sjulian				size = (u_int) uio->uio_resid;
614105854Sjulian
615105854Sjulian			PIPE_UNLOCK(rpipe);
616111028Sjeff			error = uiomove(
617105854Sjulian			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
618105854Sjulian			    size, uio);
619105854Sjulian			PIPE_LOCK(rpipe);
620111028Sjeff			if (error)
621105854Sjulian				break;
622105854Sjulian
623111028Sjeff			rpipe->pipe_buffer.out += size;
624111028Sjeff			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
625111028Sjeff				rpipe->pipe_buffer.out = 0;
626111677Sdavidxu
627111028Sjeff			rpipe->pipe_buffer.cnt -= size;
628111028Sjeff
629111677Sdavidxu			/*
630111028Sjeff			 * If there is no more to read in the pipe, reset
631105854Sjulian			 * its pointers to the beginning.  This improves
632111028Sjeff			 * cache hit stats.
633111028Sjeff			 */
634111028Sjeff			if (rpipe->pipe_buffer.cnt == 0) {
635111028Sjeff				rpipe->pipe_buffer.in = 0;
636111028Sjeff				rpipe->pipe_buffer.out = 0;
637111028Sjeff			}
638111028Sjeff			nread += size;
639111028Sjeff#ifndef PIPE_NODIRECT
640111028Sjeff		/*
641111028Sjeff		 * Direct copy, bypassing a kernel buffer.
642111028Sjeff		 */
643111028Sjeff		} else if ((size = rpipe->pipe_map.cnt) &&
644111028Sjeff			   (rpipe->pipe_state & PIPE_DIRECTW)) {
645111028Sjeff			if (size > (u_int) uio->uio_resid)
646111028Sjeff				size = (u_int) uio->uio_resid;
647111028Sjeff
648111028Sjeff			PIPE_UNLOCK(rpipe);
649111028Sjeff			error = uiomove_fromphys(rpipe->pipe_map.ms,
650111028Sjeff			    rpipe->pipe_map.pos, size, uio);
651111028Sjeff			PIPE_LOCK(rpipe);
652111028Sjeff			if (error)
653111028Sjeff				break;
654111028Sjeff			nread += size;
655111028Sjeff			rpipe->pipe_map.pos += size;
656111028Sjeff			rpipe->pipe_map.cnt -= size;
657105854Sjulian			if (rpipe->pipe_map.cnt == 0) {
658111028Sjeff				rpipe->pipe_state &= ~PIPE_DIRECTW;
659105854Sjulian				wakeup(rpipe);
660111028Sjeff			}
661111028Sjeff#endif
662105854Sjulian		} else {
663111028Sjeff			/*
664111028Sjeff			 * detect EOF condition
665111028Sjeff			 * read returns 0 on EOF, no need to set error
666111028Sjeff			 */
667105854Sjulian			if (rpipe->pipe_state & PIPE_EOF)
668111028Sjeff				break;
669111028Sjeff
670111028Sjeff			/*
671111028Sjeff			 * If the "write-side" has been blocked, wake it up now.
672111028Sjeff			 */
673105854Sjulian			if (rpipe->pipe_state & PIPE_WANTW) {
674111028Sjeff				rpipe->pipe_state &= ~PIPE_WANTW;
675111028Sjeff				wakeup(rpipe);
676111028Sjeff			}
677111028Sjeff
678111028Sjeff			/*
679111028Sjeff			 * Break if some data was read.
680111028Sjeff			 */
681111028Sjeff			if (nread > 0)
682111028Sjeff				break;
683111028Sjeff
684111028Sjeff			/*
685111028Sjeff			 * Unlock the pipe buffer for our remaining processing.
686111595Sdavidxu			 * We will either break out with an error or we will
687111028Sjeff			 * sleep and relock to loop.
688111028Sjeff			 */
689111028Sjeff			pipeunlock(rpipe);
690111028Sjeff
691112397Sdavidxu			/*
692112397Sdavidxu			 * Handle non-blocking mode operation or
693111028Sjeff			 * wait for more data.
694111028Sjeff			 */
695111028Sjeff			if (fp->f_flag & FNONBLOCK) {
696111028Sjeff				error = EAGAIN;
697111028Sjeff			} else {
698111028Sjeff				rpipe->pipe_state |= PIPE_WANTR;
699111028Sjeff				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
700111028Sjeff				    PRIBIO | PCATCH,
701111028Sjeff				    "piperd", 0)) == 0)
702111028Sjeff					error = pipelock(rpipe, 1);
703111028Sjeff			}
704105854Sjulian			if (error)
705105854Sjulian				goto unlocked_error;
706111028Sjeff		}
707111028Sjeff	}
708105854Sjulian#ifdef MAC
709111028Sjefflocked_error:
710111028Sjeff#endif
711111028Sjeff	pipeunlock(rpipe);
712111028Sjeff
713111028Sjeff	/* XXX: should probably do this before getting any locks. */
714111028Sjeff	if (error == 0)
715111028Sjeff		vfs_timestamp(&rpipe->pipe_atime);
716111028Sjeffunlocked_error:
717111028Sjeff	--rpipe->pipe_busy;
718105854Sjulian
719111028Sjeff	/*
720105854Sjulian	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
721105854Sjulian	 */
722105854Sjulian	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
723105854Sjulian		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
724103410Smini		wakeup(rpipe);
725103410Smini	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
726103410Smini		/*
727103410Smini		 * Handle write blocking hysteresis.
728103410Smini		 */
729103410Smini		if (rpipe->pipe_state & PIPE_WANTW) {
730103410Smini			rpipe->pipe_state &= ~PIPE_WANTW;
731103410Smini			wakeup(rpipe);
732103464Speter		}
733103464Speter	}
734103464Speter
735103464Speter	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
736103463Speter		pipeselwakeup(rpipe);
737103410Smini
738103463Speter	PIPE_UNLOCK(rpipe);
739113626Sjhb	return (error);
740112888Sjeff}
741113626Sjhb
742103410Smini#ifndef PIPE_NODIRECT
743103410Smini/*
744103410Smini * Map the sending processes' buffer into kernel space and wire it.
745103410Smini * This is similar to a physical write operation.
746103410Smini */
747103410Sministatic int
748103410Sminipipe_build_write_buffer(wpipe, uio)
749103410Smini	struct pipe *wpipe;
750103410Smini	struct uio *uio;
751103410Smini{
752103410Smini	u_int size;
753103410Smini	int i;
754103464Speter
755103464Speter	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
756103464Speter	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
757103464Speter		("Clone attempt on non-direct write pipe!"));
758103463Speter
759103410Smini	size = (u_int) uio->uio_iov->iov_len;
760103463Speter	if (size > wpipe->pipe_buffer.size)
761103463Speter		size = wpipe->pipe_buffer.size;
762103463Speter
763103410Smini	if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
764103410Smini	    (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
765103410Smini	    wpipe->pipe_map.ms, PIPENPAGES)) < 0)
766112888Sjeff		return (EFAULT);
767103410Smini
768103410Smini/*
769103410Smini * set up the control block
770103410Smini */
771103410Smini	wpipe->pipe_map.npages = i;
772103410Smini	wpipe->pipe_map.pos =
77399026Sjulian	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
77499026Sjulian	wpipe->pipe_map.cnt = size;
77599026Sjulian
77699026Sjulian/*
77799026Sjulian * and update the uio data
77899026Sjulian */
779104437Speter
780107126Sjeff	uio->uio_iov->iov_len -= size;
78199026Sjulian	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
78299026Sjulian	if (uio->uio_iov->iov_len == 0)
783104437Speter		uio->uio_iov++;
784104437Speter	uio->uio_resid -= size;
785104437Speter	uio->uio_offset += size;
786104437Speter	return (0);
787104437Speter}
788104437Speter
789104437Speter/*
790104437Speter * unmap and unwire the process buffer
791107126Sjeff */
792104437Speterstatic void
793104437Speterpipe_destroy_write_buffer(wpipe)
794104437Speter	struct pipe *wpipe;
795104437Speter{
796107126Sjeff
797107126Sjeff	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
798103367Sjulian	vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
799107126Sjeff	wpipe->pipe_map.npages = 0;
800107126Sjeff}
801103367Sjulian
802111028Sjeff/*
803111028Sjeff * In the case of a signal, the writing process might go away.  This
80499026Sjulian * code copies the data into the circular buffer so that the source
80599026Sjulian * pages can be freed without loss of data.
80699026Sjulian */
807103002Sjulianstatic void
80899026Sjulianpipe_clone_write_buffer(wpipe)
80999026Sjulian	struct pipe *wpipe;
81099026Sjulian{
81199026Sjulian	struct uio uio;
812111028Sjeff	struct iovec iov;
81399026Sjulian	int size;
814111028Sjeff	int pos;
81599026Sjulian
81699026Sjulian	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
817103410Smini	size = wpipe->pipe_map.cnt;
818105854Sjulian	pos = wpipe->pipe_map.pos;
819105854Sjulian
820105854Sjulian	wpipe->pipe_buffer.in = size;
821105854Sjulian	wpipe->pipe_buffer.out = 0;
822105854Sjulian	wpipe->pipe_buffer.cnt = size;
823111028Sjeff	wpipe->pipe_state &= ~PIPE_DIRECTW;
824105854Sjulian
825111028Sjeff	PIPE_UNLOCK(wpipe);
826105854Sjulian	iov.iov_base = wpipe->pipe_buffer.buffer;
827105854Sjulian	iov.iov_len = size;
828105854Sjulian	uio.uio_iov = &iov;
829111028Sjeff	uio.uio_iovcnt = 1;
830111028Sjeff	uio.uio_offset = 0;
831111028Sjeff	uio.uio_resid = size;
832111028Sjeff	uio.uio_segflg = UIO_SYSSPACE;
833111028Sjeff	uio.uio_rw = UIO_READ;
834111028Sjeff	uio.uio_td = curthread;
835111028Sjeff	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
836111028Sjeff	PIPE_LOCK(wpipe);
837111028Sjeff	pipe_destroy_write_buffer(wpipe);
838111028Sjeff}
839111028Sjeff
840111028Sjeff/*
841105854Sjulian * This implements the pipe buffer write mechanism.  Note that only
842105854Sjulian * a direct write OR a normal pipe write can be pending at any given time.
843105854Sjulian * If there are any characters in the pipe buffer, the direct write will
844105854Sjulian * be deferred until the receiving process grabs all of the bytes from
845105854Sjulian * the pipe buffer.  Then the direct mapping write is set-up.
846111028Sjeff */
847105854Sjulianstatic int
848111028Sjeffpipe_direct_write(wpipe, uio)
849105854Sjulian	struct pipe *wpipe;
850105854Sjulian	struct uio *uio;
851105854Sjulian{
852111028Sjeff	int error;
85399026Sjulian
85499026Sjulianretry:
85599026Sjulian	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
85699026Sjulian	error = pipelock(wpipe, 1);
857105854Sjulian	if (wpipe->pipe_state & PIPE_EOF)
858105854Sjulian		error = EPIPE;
859105854Sjulian	if (error) {
860111028Sjeff		pipeunlock(wpipe);
86199026Sjulian		goto error1;
86299026Sjulian	}
863111028Sjeff	while (wpipe->pipe_state & PIPE_DIRECTW) {
864111028Sjeff		if (wpipe->pipe_state & PIPE_WANTR) {
86599026Sjulian			wpipe->pipe_state &= ~PIPE_WANTR;
866105854Sjulian			wakeup(wpipe);
867105854Sjulian		}
868111028Sjeff		pipeselwakeup(wpipe);
869111028Sjeff		wpipe->pipe_state |= PIPE_WANTW;
870111028Sjeff		pipeunlock(wpipe);
871105854Sjulian		error = msleep(wpipe, PIPE_MTX(wpipe),
872105854Sjulian		    PRIBIO | PCATCH, "pipdww", 0);
873105854Sjulian		if (error)
874111028Sjeff			goto error1;
875105854Sjulian		else
876105854Sjulian			goto retry;
877105854Sjulian	}
878105854Sjulian	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
879105854Sjulian	if (wpipe->pipe_buffer.cnt > 0) {
880105854Sjulian		if (wpipe->pipe_state & PIPE_WANTR) {
881111028Sjeff			wpipe->pipe_state &= ~PIPE_WANTR;
882111028Sjeff			wakeup(wpipe);
883111028Sjeff		}
884105854Sjulian		pipeselwakeup(wpipe);
885105854Sjulian		wpipe->pipe_state |= PIPE_WANTW;
886111028Sjeff		pipeunlock(wpipe);
887111028Sjeff		error = msleep(wpipe, PIPE_MTX(wpipe),
888105854Sjulian		    PRIBIO | PCATCH, "pipdwc", 0);
889105854Sjulian		if (error)
89099026Sjulian			goto error1;
891105854Sjulian		else
892105854Sjulian			goto retry;
893105854Sjulian	}
894105854Sjulian
895105854Sjulian	wpipe->pipe_state |= PIPE_DIRECTW;
896105854Sjulian
897105854Sjulian	PIPE_UNLOCK(wpipe);
898105854Sjulian	error = pipe_build_write_buffer(wpipe, uio);
899105854Sjulian	PIPE_LOCK(wpipe);
900105854Sjulian	if (error) {
901111028Sjeff		wpipe->pipe_state &= ~PIPE_DIRECTW;
902111028Sjeff		pipeunlock(wpipe);
903111028Sjeff		goto error1;
904111028Sjeff	}
905111028Sjeff
90699026Sjulian	error = 0;
90799026Sjulian	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
90899026Sjulian		if (wpipe->pipe_state & PIPE_EOF) {
90999026Sjulian			pipe_destroy_write_buffer(wpipe);
910103367Sjulian			pipeselwakeup(wpipe);
911103367Sjulian			pipeunlock(wpipe);
912103367Sjulian			error = EPIPE;
913103367Sjulian			goto error1;
914103367Sjulian		}
915111119Simp		if (wpipe->pipe_state & PIPE_WANTR) {
916103367Sjulian			wpipe->pipe_state &= ~PIPE_WANTR;
917103367Sjulian			wakeup(wpipe);
918103367Sjulian		}
919103367Sjulian		pipeselwakeup(wpipe);
920103367Sjulian		pipeunlock(wpipe);
921103367Sjulian		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
922103367Sjulian		    "pipdwt", 0);
923103367Sjulian		pipelock(wpipe, 0);
924111119Simp	}
925103367Sjulian
926103367Sjulian	if (wpipe->pipe_state & PIPE_EOF)
927103367Sjulian		error = EPIPE;
92899026Sjulian	if (wpipe->pipe_state & PIPE_DIRECTW) {
92999026Sjulian		/*
93099026Sjulian		 * this bit of trickery substitutes a kernel buffer for
93199026Sjulian		 * the process that might be going away.
93299026Sjulian		 */
93399026Sjulian		pipe_clone_write_buffer(wpipe);
934111119Simp	} else {
93599026Sjulian		pipe_destroy_write_buffer(wpipe);
93699026Sjulian	}
93799026Sjulian	pipeunlock(wpipe);
938103367Sjulian	return (error);
939103367Sjulian
940103367Sjulianerror1:
941103367Sjulian	wakeup(wpipe);
942103367Sjulian	return (error);
943103367Sjulian}
944103367Sjulian#endif
945103367Sjulian
946103367Sjulianstatic int
947103367Sjulianpipe_write(fp, uio, active_cred, flags, td)
948103367Sjulian	struct file *fp;
949103367Sjulian	struct uio *uio;
950103367Sjulian	struct ucred *active_cred;
951103367Sjulian	struct thread *td;
952103367Sjulian	int flags;
953103367Sjulian{
954103367Sjulian	int error = 0;
955103367Sjulian	int desiredsize, orig_resid;
95699026Sjulian	struct pipe *wpipe, *rpipe;
95799026Sjulian
95899026Sjulian	rpipe = fp->f_data;
95999026Sjulian	wpipe = rpipe->pipe_peer;
96099026Sjulian
961107719Sjulian	PIPE_LOCK(rpipe);
962107719Sjulian	error = pipelock(wpipe, 1);
96399026Sjulian	if (error) {
96499026Sjulian		PIPE_UNLOCK(rpipe);
96599026Sjulian		return (error);
96699026Sjulian	}
96799026Sjulian	/*
968104031Sjulian	 * detect loss of pipe read side, issue SIGPIPE if lost.
969104031Sjulian	 */
97099026Sjulian	if (wpipe->pipe_present != PIPE_ACTIVE ||
97199026Sjulian	    (wpipe->pipe_state & PIPE_EOF)) {
97299026Sjulian		pipeunlock(wpipe);
97399026Sjulian		PIPE_UNLOCK(rpipe);
974104503Sjmallett		return (EPIPE);
975104031Sjulian	}
976104031Sjulian#ifdef MAC
977104031Sjulian	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
978111028Sjeff	if (error) {
979103410Smini		pipeunlock(wpipe);
98099026Sjulian		PIPE_UNLOCK(rpipe);
981104503Sjmallett		return (error);
982104503Sjmallett	}
983104503Sjmallett#endif
984104031Sjulian	++wpipe->pipe_busy;
985111028Sjeff
986104031Sjulian	/* Choose a larger size if it's advantageous */
987108338Sjulian	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
988108338Sjulian	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
989104031Sjulian		if (piperesizeallowed != 1)
990108338Sjulian			break;
991108338Sjulian		if (amountpipekva > maxpipekva / 2)
992108338Sjulian			break;
993108338Sjulian		if (desiredsize == BIG_PIPE_SIZE)
994108338Sjulian			break;
995111028Sjeff		desiredsize = desiredsize * 2;
996111028Sjeff	}
997111028Sjeff
998111028Sjeff	/* Choose a smaller size if we're in a OOM situation */
999111028Sjeff	if ((amountpipekva > (3 * maxpipekva) / 4) &&
1000111028Sjeff		(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
1001111028Sjeff		(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
1002104031Sjulian		(piperesizeallowed == 1))
1003104031Sjulian		desiredsize = SMALL_PIPE_SIZE;
1004104031Sjulian
1005104031Sjulian	/* Resize if the above determined that a new size was necessary */
1006104031Sjulian	if ((desiredsize != wpipe->pipe_buffer.size) &&
1007104031Sjulian		((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
1008104031Sjulian		PIPE_UNLOCK(wpipe);
1009104031Sjulian		pipespace(wpipe, desiredsize);
1010108338Sjulian		PIPE_LOCK(wpipe);
1011107034Sdavidxu	}
1012104031Sjulian	if (wpipe->pipe_buffer.size == 0) {
1013104126Sjulian		/*
1014104031Sjulian		 * This can only happen for reverse direction use of pipes
1015104031Sjulian		 * in a complete OOM situation.
1016111028Sjeff		 */
1017111028Sjeff		error = ENOMEM;
1018111028Sjeff		--wpipe->pipe_busy;
1019111028Sjeff		pipeunlock(wpipe);
1020111028Sjeff		PIPE_UNLOCK(wpipe);
1021111028Sjeff		return (error);
1022111028Sjeff	}
1023104126Sjulian
1024104031Sjulian	pipeunlock(wpipe);
1025104031Sjulian
1026104126Sjulian	orig_resid = uio->uio_resid;
1027104031Sjulian
1028111028Sjeff	while (uio->uio_resid) {
1029104031Sjulian		int space;
1030107034Sdavidxu
1031107034Sdavidxu		pipelock(wpipe, 0);
1032107034Sdavidxu		if (wpipe->pipe_state & PIPE_EOF) {
1033107034Sdavidxu			pipeunlock(wpipe);
1034107034Sdavidxu			error = EPIPE;
1035111028Sjeff			break;
1036111028Sjeff		}
1037111028Sjeff#ifndef PIPE_NODIRECT
1038108338Sjulian		/*
1039104031Sjulian		 * If the transfer is large, we can gain performance if
104099026Sjulian		 * we do process-to-process copies directly.
1041104031Sjulian		 * If the write is non-blocking, we don't use the
1042104031Sjulian		 * direct write mechanism.
1043111028Sjeff		 *
1044104031Sjulian		 * The direct write mechanism will detect the reader going
1045104031Sjulian		 * away on us.
1046111028Sjeff		 */
1047104031Sjulian		if (uio->uio_segflg == UIO_USERSPACE &&
1048104126Sjulian		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
1049104031Sjulian		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
1050104031Sjulian		    (fp->f_flag & FNONBLOCK) == 0) {
1051104031Sjulian			pipeunlock(wpipe);
1052111028Sjeff			error = pipe_direct_write(wpipe, uio);
1053104031Sjulian			if (error)
1054104031Sjulian				break;
1055104031Sjulian			continue;
1056104126Sjulian		}
1057104126Sjulian#endif
1058104126Sjulian
1059104031Sjulian		/*
1060104031Sjulian		 * Pipe buffered writes cannot be coincidental with
1061104126Sjulian		 * direct writes.  We wait until the currently executing
1062104031Sjulian		 * direct write is completed before we start filling the
1063104031Sjulian		 * pipe buffer.  We break out if a signal occurs or the
1064104126Sjulian		 * reader goes away.
1065104031Sjulian		 */
1066104031Sjulian		if (wpipe->pipe_state & PIPE_DIRECTW) {
1067104126Sjulian			if (wpipe->pipe_state & PIPE_WANTR) {
106899026Sjulian				wpipe->pipe_state &= ~PIPE_WANTR;
1069104031Sjulian				wakeup(wpipe);
107099026Sjulian			}
107199026Sjulian			pipeselwakeup(wpipe);
107299026Sjulian			wpipe->pipe_state |= PIPE_WANTW;
1073107034Sdavidxu			pipeunlock(wpipe);
1074107034Sdavidxu			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1075107034Sdavidxu			    "pipbww", 0);
1076111028Sjeff			if (error)
1077107034Sdavidxu				break;
1078107034Sdavidxu			else
1079107034Sdavidxu				continue;
1080111028Sjeff		}
1081111028Sjeff
1082107034Sdavidxu		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1083107034Sdavidxu
1084111976Sdavidxu		/* Writes of size <= PIPE_BUF must be atomic. */
1085111032Sjulian		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1086111976Sdavidxu			space = 0;
1087111028Sjeff
1088107034Sdavidxu		if (space > 0) {
1089107034Sdavidxu			int size;	/* Transfer size */
1090111028Sjeff			int segsize;	/* first segment to transfer */
1091111028Sjeff
1092111028Sjeff			/*
1093111028Sjeff			 * Transfer size is minimum of uio transfer
1094111028Sjeff			 * and free space in pipe buffer.
1095111028Sjeff			 */
1096111028Sjeff			if (space > uio->uio_resid)
1097111028Sjeff				size = uio->uio_resid;
1098111028Sjeff			else
1099107034Sdavidxu				size = space;
1100111028Sjeff			/*
1101107034Sdavidxu			 * First segment to transfer is minimum of
1102107034Sdavidxu			 * transfer size and contiguous space in
1103111028Sjeff			 * pipe buffer.  If first segment to transfer
1104111515Sdavidxu			 * is less than the transfer size, we've got
1105111028Sjeff			 * a wraparound in the buffer.
1106107034Sdavidxu			 */
1107111515Sdavidxu			segsize = wpipe->pipe_buffer.size -
1108107034Sdavidxu				wpipe->pipe_buffer.in;
1109107034Sdavidxu			if (segsize > size)
1110107034Sdavidxu				segsize = size;
1111111028Sjeff
1112112397Sdavidxu			/* Transfer first segment */
1113107034Sdavidxu
1114111028Sjeff			PIPE_UNLOCK(rpipe);
1115107034Sdavidxu			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1116111028Sjeff					segsize, uio);
1117111028Sjeff			PIPE_LOCK(rpipe);
1118111028Sjeff
1119111028Sjeff			if (error == 0 && segsize < size) {
1120107034Sdavidxu				KASSERT(wpipe->pipe_buffer.in + segsize ==
1121111028Sjeff					wpipe->pipe_buffer.size,
1122111515Sdavidxu					("Pipe buffer wraparound disappeared"));
1123111515Sdavidxu				/*
1124111515Sdavidxu				 * Transfer remaining part now, to
1125111515Sdavidxu				 * support atomic writes.  Wraparound
1126111515Sdavidxu				 * happened.
1127111515Sdavidxu				 */
1128111515Sdavidxu
1129111515Sdavidxu				PIPE_UNLOCK(rpipe);
1130111515Sdavidxu				error = uiomove(
1131107034Sdavidxu				    &wpipe->pipe_buffer.buffer[0],
1132111515Sdavidxu				    size - segsize, uio);
1133111028Sjeff				PIPE_LOCK(rpipe);
1134111028Sjeff			}
1135111028Sjeff			if (error == 0) {
1136111028Sjeff				wpipe->pipe_buffer.in += size;
1137111028Sjeff				if (wpipe->pipe_buffer.in >=
1138107034Sdavidxu				    wpipe->pipe_buffer.size) {
1139112397Sdavidxu					KASSERT(wpipe->pipe_buffer.in ==
1140112397Sdavidxu						size - segsize +
1141112397Sdavidxu						wpipe->pipe_buffer.size,
1142112397Sdavidxu						("Expected wraparound bad"));
1143112397Sdavidxu					wpipe->pipe_buffer.in = size - segsize;
1144112397Sdavidxu				}
1145111028Sjeff
1146111028Sjeff				wpipe->pipe_buffer.cnt += size;
1147111028Sjeff				KASSERT(wpipe->pipe_buffer.cnt <=
1148111028Sjeff					wpipe->pipe_buffer.size,
114999026Sjulian					("Pipe buffer overflow"));
115099026Sjulian			}
115199026Sjulian			pipeunlock(wpipe);
1152107719Sjulian			if (error != 0)
1153107719Sjulian				break;
1154107719Sjulian		} else {
115599026Sjulian			/*
115699026Sjulian			 * If the "read-side" has been blocked, wake it up now.
115799026Sjulian			 */
115899026Sjulian			if (wpipe->pipe_state & PIPE_WANTR) {
115999026Sjulian				wpipe->pipe_state &= ~PIPE_WANTR;
116099026Sjulian				wakeup(wpipe);
116199026Sjulian			}
116299026Sjulian
116399026Sjulian			/*
116499026Sjulian			 * don't block on non-blocking I/O
116599026Sjulian			 */
116699026Sjulian			if (fp->f_flag & FNONBLOCK) {
116799026Sjulian				error = EAGAIN;
116899026Sjulian				pipeunlock(wpipe);
116999026Sjulian				break;
1170102581Sjulian			}
1171102581Sjulian
1172102581Sjulian			/*
117399026Sjulian			 * We have no more space and have something to offer,
117499026Sjulian			 * wake up select/poll.
117599026Sjulian			 */
117699026Sjulian			pipeselwakeup(wpipe);
1177104695Sjulian
1178104695Sjulian			wpipe->pipe_state |= PIPE_WANTW;
1179104695Sjulian			pipeunlock(wpipe);
1180104695Sjulian			error = msleep(wpipe, PIPE_MTX(rpipe),
1181104695Sjulian			    PRIBIO | PCATCH, "pipewr", 0);
118299026Sjulian			if (error != 0)
118399026Sjulian				break;
1184102581Sjulian		}
1185103002Sjulian	}
1186103002Sjulian
1187103002Sjulian	pipelock(wpipe, 0);
1188102581Sjulian	--wpipe->pipe_busy;
1189103002Sjulian
1190113641Sjulian	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1191111115Sdavidxu		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1192111115Sdavidxu		wakeup(wpipe);
1193103002Sjulian	} else if (wpipe->pipe_buffer.cnt > 0) {
1194103002Sjulian		/*
1195103002Sjulian		 * If we have put any characters in the buffer, we wake up
1196103002Sjulian		 * the reader.
1197103002Sjulian		 */
1198103002Sjulian		if (wpipe->pipe_state & PIPE_WANTR) {
1199103002Sjulian			wpipe->pipe_state &= ~PIPE_WANTR;
1200103216Sjulian			wakeup(wpipe);
1201103002Sjulian		}
120299026Sjulian	}
1203104695Sjulian
1204111028Sjeff	/*
1205111028Sjeff	 * Don't return EPIPE if I/O was successful
1206111028Sjeff	 */
1207111028Sjeff	if ((wpipe->pipe_buffer.cnt == 0) &&
1208111028Sjeff	    (uio->uio_resid == 0) &&
1209111028Sjeff	    (error == EPIPE)) {
1210111028Sjeff		error = 0;
1211111028Sjeff	}
1212111028Sjeff
1213111028Sjeff	if (error == 0)
1214111028Sjeff		vfs_timestamp(&wpipe->pipe_mtime);
1215104695Sjulian
1216111028Sjeff	/*
1217104695Sjulian	 * We have something to offer,
1218108338Sjulian	 * wake up select/poll.
1219104695Sjulian	 */
1220111028Sjeff	if (wpipe->pipe_buffer.cnt)
1221105854Sjulian		pipeselwakeup(wpipe);
1222111028Sjeff
1223105854Sjulian	pipeunlock(wpipe);
1224105854Sjulian	PIPE_UNLOCK(rpipe);
1225111028Sjeff	return (error);
1226105854Sjulian}
1227113244Sdavidxu
1228105854Sjulian/* ARGSUSED */
1229113244Sdavidxustatic int
1230105854Sjulianpipe_truncate(fp, length, active_cred, td)
1231105854Sjulian	struct file *fp;
1232107719Sjulian	off_t length;
1233103002Sjulian	struct ucred *active_cred;
1234103002Sjulian	struct thread *td;
123599026Sjulian{
1236112888Sjeff
1237112993Speter	return (EINVAL);
1238112993Speter}
1239112993Speter
1240112993Speter/*
124199026Sjulian * we implement a very minimal set of ioctls for compatibility with sockets.
1242112993Speter */
1243112993Speterstatic int
124499026Sjulianpipe_ioctl(fp, cmd, data, active_cred, td)
124599026Sjulian	struct file *fp;
124699026Sjulian	u_long cmd;
1247107719Sjulian	void *data;
1248107719Sjulian	struct ucred *active_cred;
1249107719Sjulian	struct thread *td;
1250107719Sjulian{
1251107719Sjulian	struct pipe *mpipe = fp->f_data;
1252107719Sjulian	int error;
1253107719Sjulian
1254107719Sjulian	PIPE_LOCK(mpipe);
1255107719Sjulian
1256107719Sjulian#ifdef MAC
1257107719Sjulian	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1258107719Sjulian	if (error) {
1259107719Sjulian		PIPE_UNLOCK(mpipe);
1260107719Sjulian		return (error);
1261107719Sjulian	}
1262107719Sjulian#endif
1263107719Sjulian
1264107719Sjulian	error = 0;
1265107719Sjulian	switch (cmd) {
1266107719Sjulian
1267107719Sjulian	case FIONBIO:
126899026Sjulian		break;
126999026Sjulian
1270103002Sjulian	case FIOASYNC:
1271103002Sjulian		if (*(int *)data) {
127299026Sjulian			mpipe->pipe_state |= PIPE_ASYNC;
127399026Sjulian		} else {
127499026Sjulian			mpipe->pipe_state &= ~PIPE_ASYNC;
127599026Sjulian		}
127699026Sjulian		break;
127799026Sjulian
127899026Sjulian	case FIONREAD:
127999026Sjulian		if (mpipe->pipe_state & PIPE_DIRECTW)
128099026Sjulian			*(int *)data = mpipe->pipe_map.cnt;
128199026Sjulian		else
1282111028Sjeff			*(int *)data = mpipe->pipe_buffer.cnt;
1283111028Sjeff		break;
1284111028Sjeff
1285111028Sjeff	case FIOSETOWN:
1286111028Sjeff		PIPE_UNLOCK(mpipe);
1287111028Sjeff		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
128899026Sjulian		goto out_unlocked;
1289103002Sjulian
1290103002Sjulian	case FIOGETOWN:
129199026Sjulian		*(int *)data = fgetown(&mpipe->pipe_sigio);
129299026Sjulian		break;
129399026Sjulian
129499026Sjulian	/* This is deprecated, FIOSETOWN should be used instead. */
129599026Sjulian	case TIOCSPGRP:
129699026Sjulian		PIPE_UNLOCK(mpipe);
1297113641Sjulian		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1298113641Sjulian		goto out_unlocked;
1299113641Sjulian
1300113641Sjulian	/* This is deprecated, FIOGETOWN should be used instead. */
1301113641Sjulian	case TIOCGPGRP:
1302113641Sjulian		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1303113641Sjulian		break;
1304113641Sjulian
1305113641Sjulian	default:
1306113641Sjulian		error = ENOTTY;
1307113641Sjulian		break;
1308113641Sjulian	}
1309113641Sjulian	PIPE_UNLOCK(mpipe);
1310111028Sjeffout_unlocked:
1311111028Sjeff	return (error);
1312111028Sjeff}
1313111028Sjeff
1314105854Sjulianstatic int
1315111028Sjeffpipe_poll(fp, events, active_cred, td)
1316111028Sjeff	struct file *fp;
1317111028Sjeff	int events;
1318111028Sjeff	struct ucred *active_cred;
1319111028Sjeff	struct thread *td;
1320111028Sjeff{
1321111028Sjeff	struct pipe *rpipe = fp->f_data;
1322111028Sjeff	struct pipe *wpipe;
1323111028Sjeff	int revents = 0;
1324111028Sjeff#ifdef MAC
1325111028Sjeff	int error;
1326111028Sjeff#endif
1327111028Sjeff
1328111028Sjeff	wpipe = rpipe->pipe_peer;
1329111028Sjeff	PIPE_LOCK(rpipe);
1330111028Sjeff#ifdef MAC
1331111028Sjeff	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
1332111028Sjeff	if (error)
1333111028Sjeff		goto locked_error;
1334111028Sjeff#endif
1335111028Sjeff	if (events & (POLLIN | POLLRDNORM))
1336111028Sjeff		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1337111028Sjeff		    (rpipe->pipe_buffer.cnt > 0))
1338111028Sjeff			revents |= events & (POLLIN | POLLRDNORM);
1339111028Sjeff
1340105854Sjulian	if (events & (POLLOUT | POLLWRNORM))
1341105854Sjulian		if (wpipe->pipe_present != PIPE_ACTIVE ||
1342105854Sjulian		    (wpipe->pipe_state & PIPE_EOF) ||
1343111028Sjeff		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1344105854Sjulian		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1345105854Sjulian			revents |= events & (POLLOUT | POLLWRNORM);
1346105854Sjulian
1347105854Sjulian	if ((events & POLLINIGNEOF) == 0) {
1348105854Sjulian		if (rpipe->pipe_state & PIPE_EOF) {
1349105854Sjulian			revents |= (events & (POLLIN | POLLRDNORM));
1350111028Sjeff			if (wpipe->pipe_present != PIPE_ACTIVE ||
1351111028Sjeff			    (wpipe->pipe_state & PIPE_EOF))
1352111028Sjeff				revents |= POLLHUP;
1353111028Sjeff		}
1354111028Sjeff	}
1355111028Sjeff
1356111028Sjeff	if (revents == 0) {
1357111028Sjeff		if (events & (POLLIN | POLLRDNORM)) {
1358111028Sjeff			selrecord(td, &rpipe->pipe_sel);
1359111028Sjeff			if (SEL_WAITING(&rpipe->pipe_sel))
1360111028Sjeff				rpipe->pipe_state |= PIPE_SEL;
1361111028Sjeff		}
1362111028Sjeff
1363111028Sjeff		if (events & (POLLOUT | POLLWRNORM)) {
1364105854Sjulian			selrecord(td, &wpipe->pipe_sel);
1365111028Sjeff			if (SEL_WAITING(&wpipe->pipe_sel))
1366111028Sjeff				wpipe->pipe_state |= PIPE_SEL;
1367111028Sjeff		}
1368111028Sjeff	}
1369111028Sjeff#ifdef MAC
1370111028Sjefflocked_error:
1371111028Sjeff#endif
1372105854Sjulian	PIPE_UNLOCK(rpipe);
1373105854Sjulian
1374105854Sjulian	return (revents);
1375105854Sjulian}
1376105854Sjulian
1377105854Sjulian/*
1378105854Sjulian * We shouldn't need locks here as we're doing a read and this should
1379111028Sjeff * be a natural race.
1380111028Sjeff */
1381111028Sjeffstatic int
1382111028Sjeffpipe_stat(fp, ub, active_cred, td)
1383111028Sjeff	struct file *fp;
1384111028Sjeff	struct stat *ub;
1385111028Sjeff	struct ucred *active_cred;
1386111028Sjeff	struct thread *td;
1387111028Sjeff{
1388111028Sjeff	struct pipe *pipe = fp->f_data;
1389111028Sjeff#ifdef MAC
1390111028Sjeff	int error;
1391111028Sjeff
1392111028Sjeff	PIPE_LOCK(pipe);
1393111028Sjeff	error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
1394111028Sjeff	PIPE_UNLOCK(pipe);
1395111028Sjeff	if (error)
1396111028Sjeff		return (error);
1397105854Sjulian#endif
139899026Sjulian	bzero(ub, sizeof(*ub));
1399103410Smini	ub->st_mode = S_IFIFO;
1400108338Sjulian	ub->st_blksize = PAGE_SIZE;
140199026Sjulian	if (pipe->pipe_state & PIPE_DIRECTW)
140299026Sjulian		ub->st_size = pipe->pipe_map.cnt;
1403111028Sjeff	else
140499026Sjulian		ub->st_size = pipe->pipe_buffer.cnt;
140599026Sjulian	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
140699026Sjulian	ub->st_atim = pipe->pipe_atime;
140799026Sjulian	ub->st_mtim = pipe->pipe_mtime;
1408104695Sjulian	ub->st_ctim = pipe->pipe_ctime;
1409104695Sjulian	ub->st_uid = fp->f_cred->cr_uid;
1410111028Sjeff	ub->st_gid = fp->f_cred->cr_gid;
1411111028Sjeff	/*
1412111028Sjeff	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1413104695Sjulian	 * XXX (st_dev, st_ino) should be unique.
1414111028Sjeff	 */
1415104695Sjulian	return (0);
1416104695Sjulian}
141799026Sjulian
1418111028Sjeff/* ARGSUSED */
1419106182Sdavidxustatic int
142099026Sjulianpipe_close(fp, td)
142199026Sjulian	struct file *fp;
1422104695Sjulian	struct thread *td;
1423103002Sjulian{
1424103002Sjulian	struct pipe *cpipe = fp->f_data;
1425111028Sjeff
1426113244Sdavidxu	fp->f_ops = &badfileops;
1427113244Sdavidxu	fp->f_data = NULL;
1428113244Sdavidxu	funsetown(&cpipe->pipe_sigio);
1429111028Sjeff	pipeclose(cpipe);
1430111028Sjeff	return (0);
1431111028Sjeff}
1432111028Sjeff
1433112888Sjeffstatic void
1434111041Sdavidxupipe_free_kmem(cpipe)
1435111041Sdavidxu	struct pipe *cpipe;
1436112888Sjeff{
1437111028Sjeff
1438111028Sjeff	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1439104695Sjulian	    ("pipe_free_kmem: pipe mutex locked"));
1440111028Sjeff
1441104695Sjulian	if (cpipe->pipe_buffer.buffer != NULL) {
144299026Sjulian		atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
144399026Sjulian		vm_map_remove(pipe_map,
1444111033Sjeff		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1445111033Sjeff		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1446103410Smini		cpipe->pipe_buffer.buffer = NULL;
1447111033Sjeff	}
1448111033Sjeff#ifndef PIPE_NODIRECT
1449103410Smini	{
1450103410Smini		cpipe->pipe_map.cnt = 0;
1451103410Smini		cpipe->pipe_map.pos = 0;
1452111033Sjeff		cpipe->pipe_map.npages = 0;
1453111033Sjeff	}
1454111033Sjeff#endif
1455111033Sjeff}
1456111033Sjeff
1457103410Smini/*
1458111033Sjeff * shutdown the pipe
1459103410Smini */
1460111033Sjeffstatic void
1461111033Sjeffpipeclose(cpipe)
1462103410Smini	struct pipe *cpipe;
1463111033Sjeff{
1464111033Sjeff	struct pipepair *pp;
1465111033Sjeff	struct pipe *ppipe;
1466111033Sjeff
1467111033Sjeff	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1468103410Smini
1469111033Sjeff	PIPE_LOCK(cpipe);
1470111033Sjeff	pipelock(cpipe, 0);
1471111033Sjeff	pp = cpipe->pipe_pair;
1472111033Sjeff
1473111033Sjeff	pipeselwakeup(cpipe);
1474111033Sjeff
1475111033Sjeff	/*
1476111033Sjeff	 * If the other side is blocked, wake it up saying that
1477111033Sjeff	 * we want to close it down.
1478111033Sjeff	 */
1479111033Sjeff	cpipe->pipe_state |= PIPE_EOF;
1480111033Sjeff	while (cpipe->pipe_busy) {
1481111033Sjeff		wakeup(cpipe);
1482111033Sjeff		cpipe->pipe_state |= PIPE_WANT;
1483103410Smini		pipeunlock(cpipe);
1484111033Sjeff		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1485103410Smini		pipelock(cpipe, 0);
1486111033Sjeff	}
1487111033Sjeff
1488103410Smini
1489103410Smini	/*
1490112397Sdavidxu	 * Disconnect from peer, if any.
1491112397Sdavidxu	 */
1492112397Sdavidxu	ppipe = cpipe->pipe_peer;
1493112397Sdavidxu	if (ppipe->pipe_present == PIPE_ACTIVE) {
1494112397Sdavidxu		pipeselwakeup(ppipe);
1495112397Sdavidxu
1496112397Sdavidxu		ppipe->pipe_state |= PIPE_EOF;
1497112397Sdavidxu		wakeup(ppipe);
1498112397Sdavidxu		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
1499112397Sdavidxu	}
1500112397Sdavidxu
1501112397Sdavidxu	/*
1502112397Sdavidxu	 * Mark this endpoint as free.  Release kmem resources.  We
1503112397Sdavidxu	 * don't mark this endpoint as unused until we've finished
1504112397Sdavidxu	 * doing that, or the pipe might disappear out from under
1505112397Sdavidxu	 * us.
1506112397Sdavidxu	 */
1507112397Sdavidxu	PIPE_UNLOCK(cpipe);
1508112397Sdavidxu	pipe_free_kmem(cpipe);
1509112397Sdavidxu	PIPE_LOCK(cpipe);
1510112397Sdavidxu	cpipe->pipe_present = PIPE_CLOSING;
1511112397Sdavidxu	pipeunlock(cpipe);
1512112397Sdavidxu
1513112397Sdavidxu	/*
1514112397Sdavidxu	 * knlist_clear() may sleep dropping the PIPE_MTX. Set the
1515112397Sdavidxu	 * PIPE_FINALIZED, that allows other end to free the
1516112397Sdavidxu	 * pipe_pair, only after the knotes are completely dismantled.
1517112397Sdavidxu	 */
1518112397Sdavidxu	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1519112397Sdavidxu	cpipe->pipe_present = PIPE_FINALIZED;
1520112397Sdavidxu	knlist_destroy(&cpipe->pipe_sel.si_note);
1521112397Sdavidxu
1522103410Smini	/*
1523111028Sjeff	 * If both endpoints are now closed, release the memory for the
1524105900Sjulian	 * pipe pair.  If not, unlock.
1525105900Sjulian	 */
1526105900Sjulian	if (ppipe->pipe_present == PIPE_FINALIZED) {
1527105900Sjulian		PIPE_UNLOCK(cpipe);
1528105900Sjulian#ifdef MAC
1529111028Sjeff		mac_pipe_destroy(pp);
1530111028Sjeff#endif
1531113793Sdavidxu		uma_zfree(pipe_zone, cpipe->pipe_pair);
1532105900Sjulian	} else
1533111028Sjeff		PIPE_UNLOCK(cpipe);
1534113793Sdavidxu}
1535105900Sjulian
1536105900Sjulian/*ARGSUSED*/
1537105900Sjulianstatic int
1538105900Sjulianpipe_kqfilter(struct file *fp, struct knote *kn)
1539113686Sjhb{
1540111028Sjeff	struct pipe *cpipe;
1541105900Sjulian
1542112071Sdavidxu	cpipe = kn->kn_fp->f_data;
1543105900Sjulian	PIPE_LOCK(cpipe);
1544105900Sjulian	switch (kn->kn_filter) {
1545105900Sjulian	case EVFILT_READ:
1546113686Sjhb		kn->kn_fop = &pipe_rfiltops;
1547105900Sjulian		break;
1548105900Sjulian	case EVFILT_WRITE:
1549105900Sjulian		kn->kn_fop = &pipe_wfiltops;
1550105900Sjulian		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
1551108338Sjulian			/* other end of pipe has been closed */
1552105900Sjulian			PIPE_UNLOCK(cpipe);
1553105900Sjulian			return (EPIPE);
1554111028Sjeff		}
1555111028Sjeff		cpipe = cpipe->pipe_peer;
1556111028Sjeff		break;
1557111028Sjeff	default:
1558111028Sjeff		PIPE_UNLOCK(cpipe);
1559113793Sdavidxu		return (EINVAL);
1560113793Sdavidxu	}
1561113793Sdavidxu
1562113793Sdavidxu	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1563111028Sjeff	PIPE_UNLOCK(cpipe);
1564105900Sjulian	return (0);
1565113793Sdavidxu}
1566111115Sdavidxu
1567111115Sdavidxustatic void
1568111115Sdavidxufilt_pipedetach(struct knote *kn)
1569113793Sdavidxu{
1570113793Sdavidxu	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1571113793Sdavidxu
1572113793Sdavidxu	PIPE_LOCK(cpipe);
1573111115Sdavidxu	if (kn->kn_filter == EVFILT_WRITE)
1574105900Sjulian		cpipe = cpipe->pipe_peer;
1575105900Sjulian	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1576105900Sjulian	PIPE_UNLOCK(cpipe);
1577105900Sjulian}
1578105900Sjulian
1579103410Smini/*ARGSUSED*/
1580103410Sministatic int
1581103410Sminifilt_piperead(struct knote *kn, long hint)
158299026Sjulian{
158399026Sjulian	struct pipe *rpipe = kn->kn_fp->f_data;
158499026Sjulian	struct pipe *wpipe = rpipe->pipe_peer;
158599026Sjulian	int ret;
1586103410Smini
1587103410Smini	PIPE_LOCK(rpipe);
1588103410Smini	kn->kn_data = rpipe->pipe_buffer.cnt;
158999026Sjulian	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
159099026Sjulian		kn->kn_data = rpipe->pipe_map.cnt;
1591103838Sjulian
159299026Sjulian	if ((rpipe->pipe_state & PIPE_EOF) ||
1593113793Sdavidxu	    wpipe->pipe_present != PIPE_ACTIVE ||
1594111028Sjeff	    (wpipe->pipe_state & PIPE_EOF)) {
1595111115Sdavidxu		kn->kn_flags |= EV_EOF;
1596104695Sjulian		PIPE_UNLOCK(rpipe);
1597107060Sdavidxu		return (1);
159899026Sjulian	}
1599111028Sjeff	ret = kn->kn_data > 0;
1600110190Sjulian	PIPE_UNLOCK(rpipe);
1601104695Sjulian	return ret;
1602111028Sjeff}
1603111028Sjeff
1604111028Sjeff/*ARGSUSED*/
1605108338Sjulianstatic int
1606103410Sminifilt_pipewrite(struct knote *kn, long hint)
1607111028Sjeff{
1608111028Sjeff	struct pipe *rpipe = kn->kn_fp->f_data;
1609111028Sjeff	struct pipe *wpipe = rpipe->pipe_peer;
1610103410Smini
1611111028Sjeff	PIPE_LOCK(rpipe);
1612111515Sdavidxu	if (wpipe->pipe_present != PIPE_ACTIVE ||
1613111028Sjeff	    (wpipe->pipe_state & PIPE_EOF)) {
1614111028Sjeff		kn->kn_data = 0;
1615111028Sjeff		kn->kn_flags |= EV_EOF;
1616111515Sdavidxu		PIPE_UNLOCK(rpipe);
1617111515Sdavidxu		return (1);
1618111515Sdavidxu	}
1619111028Sjeff	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1620108338Sjulian	if (wpipe->pipe_state & PIPE_DIRECTW)
1621113793Sdavidxu		kn->kn_data = 0;
1622113793Sdavidxu
1623111028Sjeff	PIPE_UNLOCK(rpipe);
1624111028Sjeff	return (kn->kn_data >= PIPE_BUF);
1625111028Sjeff}
1626111028Sjeff