sys_pipe.c revision 76166
113675Sdyson/*
213675Sdyson * Copyright (c) 1996 John S. Dyson
313675Sdyson * All rights reserved.
413675Sdyson *
513675Sdyson * Redistribution and use in source and binary forms, with or without
613675Sdyson * modification, are permitted provided that the following conditions
713675Sdyson * are met:
813675Sdyson * 1. Redistributions of source code must retain the above copyright
913675Sdyson *    notice immediately at the beginning of the file, without modification,
1013675Sdyson *    this list of conditions, and the following disclaimer.
1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright
1213675Sdyson *    notice, this list of conditions and the following disclaimer in the
1313675Sdyson *    documentation and/or other materials provided with the distribution.
1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author
1513675Sdyson *    John S. Dyson.
1614037Sdyson * 4. Modifications may be freely made to this file if the above conditions
1713675Sdyson *    are met.
1813675Sdyson *
1950477Speter * $FreeBSD: head/sys/kern/sys_pipe.c 76166 2001-05-01 08:13:21Z markm $
2013675Sdyson */
2113675Sdyson
2213675Sdyson/*
2313675Sdyson * This file contains a high-performance replacement for the socket-based
2413675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2513675Sdyson * all features of sockets, but does do everything that pipes normally
2613675Sdyson * do.
2713675Sdyson */
2813675Sdyson
2913907Sdyson/*
3013907Sdyson * This code has two modes of operation, a small write mode and a large
3113907Sdyson * write mode.  The small write mode acts like conventional pipes with
3213907Sdyson * a kernel buffer.  If the writer's buffer is less than PIPE_MINDIRECT, then the
3313907Sdyson * "normal" pipe buffering is done.  If the writer's buffer is between PIPE_MINDIRECT
3413907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3513907Sdyson * the receiving process can copy it directly from the pages in the sending
3613907Sdyson * process.
3713907Sdyson *
3813907Sdyson * If the sending process receives a signal, it is possible that it will
3913913Sdyson * go away, and certainly its address space can change, because control
4013907Sdyson * is returned back to the user-mode side.  In that case, the pipe code
4113907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable
4213907Sdyson * kernel buffer, and the receiving process will grab the data from the
4313907Sdyson * pageable kernel buffer.  Since signals don't happen all that often,
4413907Sdyson * the copy operation is normally eliminated.
4513907Sdyson *
4613907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4713907Sdyson * happen for small transfers so that the system will not spend all of
4813913Sdyson * its time context switching.  PIPE_SIZE is constrained by the
4913907Sdyson * amount of kernel virtual memory.
5013907Sdyson */
5113907Sdyson
5213675Sdyson#include <sys/param.h>
5313675Sdyson#include <sys/systm.h>
5424131Sbde#include <sys/fcntl.h>
5513675Sdyson#include <sys/file.h>
5613675Sdyson#include <sys/filedesc.h>
5724206Sbde#include <sys/filio.h>
5876166Smarkm#include <sys/lock.h>
5924206Sbde#include <sys/ttycom.h>
6013675Sdyson#include <sys/stat.h>
6129356Speter#include <sys/poll.h>
6270834Swollman#include <sys/selinfo.h>
6313675Sdyson#include <sys/signalvar.h>
6413675Sdyson#include <sys/sysproto.h>
6513675Sdyson#include <sys/pipe.h>
6676166Smarkm#include <sys/proc.h>
6755112Sbde#include <sys/vnode.h>
6834924Sbde#include <sys/uio.h>
6959288Sjlemon#include <sys/event.h>
7013675Sdyson
7113675Sdyson#include <vm/vm.h>
7213675Sdyson#include <vm/vm_param.h>
7313675Sdyson#include <vm/vm_object.h>
7413675Sdyson#include <vm/vm_kern.h>
7513675Sdyson#include <vm/vm_extern.h>
7613675Sdyson#include <vm/pmap.h>
7713675Sdyson#include <vm/vm_map.h>
7813907Sdyson#include <vm/vm_page.h>
7927899Sdyson#include <vm/vm_zone.h>
8013675Sdyson
8114037Sdyson/*
8214037Sdyson * Use this define if you want to disable *fancy* VM things.  Expect an
8314037Sdyson * approx 30% decrease in transfer rate.  This could be useful for
8414037Sdyson * NetBSD or OpenBSD.
8514037Sdyson */
8614037Sdyson/* #define PIPE_NODIRECT */
8714037Sdyson
8814037Sdyson/*
8914037Sdyson * interfaces to the outside world
9014037Sdyson */
9113675Sdysonstatic int pipe_read __P((struct file *fp, struct uio *uio,
9251418Sgreen		struct ucred *cred, int flags, struct proc *p));
9313675Sdysonstatic int pipe_write __P((struct file *fp, struct uio *uio,
9451418Sgreen		struct ucred *cred, int flags, struct proc *p));
9513675Sdysonstatic int pipe_close __P((struct file *fp, struct proc *p));
9629356Speterstatic int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
9729356Speter		struct proc *p));
9872521Sjlemonstatic int pipe_kqfilter __P((struct file *fp, struct knote *kn));
9952983Speterstatic int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
10036735Sdfrstatic int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
10113675Sdyson
/*
 * File-operations table installed as f_ops on both descriptors created by
 * pipe().  This is a positional initializer, so the entries must stay in
 * the member order of struct fileops (declared outside this file).
 */
10272521Sjlemonstatic struct fileops pipeops = {
10372521Sjlemon	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
10472521Sjlemon	pipe_stat, pipe_close
10572521Sjlemon};
10613675Sdyson
10759288Sjlemonstatic void	filt_pipedetach(struct knote *kn);
10859288Sjlemonstatic int	filt_piperead(struct knote *kn, long hint);
10959288Sjlemonstatic int	filt_pipewrite(struct knote *kn, long hint);
11059288Sjlemon
/*
 * Filter-operation tables for the read and write sides of a pipe.  Both are
 * positional initializers, so the entries must follow the member order of
 * struct filterops (declared outside this file).  The two tables differ
 * only in the last entry: filt_piperead vs. filt_pipewrite.
 * NOTE(review): the leading 1 and NULL are presumably the "is a descriptor"
 * flag and an unused attach hook -- confirm against <sys/event.h>.
 */
11172521Sjlemonstatic struct filterops pipe_rfiltops =
11272521Sjlemon	{ 1, NULL, filt_pipedetach, filt_piperead };
11372521Sjlemonstatic struct filterops pipe_wfiltops =
11472521Sjlemon	{ 1, NULL, filt_pipedetach, filt_pipewrite };
11559288Sjlemon
11672521Sjlemon
11713675Sdyson/*
11813675Sdyson * Default pipe buffer size(s), this can be kind-of large now because pipe
11913675Sdyson * space is pageable.  The pipe code will try to maintain locality of
12013675Sdyson * reference for performance reasons, so small amounts of outstanding I/O
12113675Sdyson * will not wipe the cache.
12213675Sdyson */
/* pipe_read() wakes a blocked writer once the buffered count is below MINPIPESIZE. */
12313907Sdyson#define MINPIPESIZE (PIPE_SIZE/3)
12413907Sdyson#define MAXPIPESIZE (2*PIPE_SIZE/3)
12513675Sdyson
12613907Sdyson/*
12713907Sdyson * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
12813907Sdyson * is there so that on large systems, we don't exhaust it.
12913907Sdyson */
13013907Sdyson#define MAXPIPEKVA (8*1024*1024)
13113907Sdyson
13213907Sdyson/*
13313907Sdyson * Limit for direct transfers, we cannot, of course limit
13413907Sdyson * the amount of kva for pipes in general though.
13513907Sdyson */
13613907Sdyson#define LIMITPIPEKVA (16*1024*1024)
13717163Sdyson
13817163Sdyson/*
13917163Sdyson * Limit the number of "big" pipes
14017163Sdyson */
14117163Sdyson#define LIMITBIGPIPES	32
/* Count of pipes grown to BIG_PIPE_SIZE; incremented in pipe_write(). */
14233181Seivindstatic int nbigpipe;
14317163Sdyson
/*
 * Running total, in bytes, of kva charged to pipes: pipespace() adds the
 * circular-buffer size, and the direct-write code adds or removes the
 * size of its mapping window.
 */
14417124Sbdestatic int amountpipekva;
14513907Sdyson
/* Internal helpers; all are private to this file. */
14613675Sdysonstatic void pipeclose __P((struct pipe *cpipe));
14713675Sdysonstatic void pipeinit __P((struct pipe *cpipe));
14813907Sdysonstatic __inline int pipelock __P((struct pipe *cpipe, int catch));
14913675Sdysonstatic __inline void pipeunlock __P((struct pipe *cpipe));
15014122Speterstatic __inline void pipeselwakeup __P((struct pipe *cpipe));
/* The direct-write helpers are compiled out when PIPE_NODIRECT is defined. */
15114037Sdyson#ifndef PIPE_NODIRECT
15213907Sdysonstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
15313907Sdysonstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe));
15413907Sdysonstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
15513907Sdysonstatic void pipe_clone_write_buffer __P((struct pipe *wpipe));
15614037Sdyson#endif
15713907Sdysonstatic void pipespace __P((struct pipe *cpipe));
15813675Sdyson
/* Zone from which struct pipe is allocated; created on the first pipe() call. */
15933181Seivindstatic vm_zone_t pipe_zone;
16027899Sdyson
16113675Sdyson/*
16213675Sdyson * The pipe system call for the DTYPE_PIPE type of pipes
16313675Sdyson */
16413675Sdyson
/*
 * Create a connected pair of pipe descriptors for process p.
 *
 * Two struct pipe are taken from pipe_zone (the zone is created on first
 * use) and initialized with pipeinit(); both get PIPE_DIRECTOK set.  A file
 * is allocated for each with falloc(), opened FREAD | FWRITE with type
 * DTYPE_PIPE and pipeops as its operations, and the two pipes are linked
 * through pipe_peer.  The descriptor numbers are stored in p->p_retval[0]
 * and p->p_retval[1].
 *
 * Returns 0 on success, or the falloc() error after releasing what had
 * already been set up.  uap is not used.
 */
16513675Sdyson/* ARGSUSED */
16613675Sdysonint
16730994Sphkpipe(p, uap)
16813675Sdyson	struct proc *p;
16913675Sdyson	struct pipe_args /* {
17013675Sdyson		int	dummy;
17113675Sdyson	} */ *uap;
17213675Sdyson{
17313675Sdyson	register struct filedesc *fdp = p->p_fd;
17413675Sdyson	struct file *rf, *wf;
17513675Sdyson	struct pipe *rpipe, *wpipe;
17613675Sdyson	int fd, error;
17713675Sdyson
	/* Lazily create the allocation zone for struct pipe. */
17827899Sdyson	if (pipe_zone == NULL)
17927923Sdyson		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);
18027899Sdyson
18127899Sdyson	rpipe = zalloc( pipe_zone);
18213675Sdyson	pipeinit(rpipe);
18313907Sdyson	rpipe->pipe_state |= PIPE_DIRECTOK;
18427899Sdyson	wpipe = zalloc( pipe_zone);
18513675Sdyson	pipeinit(wpipe);
18613907Sdyson	wpipe->pipe_state |= PIPE_DIRECTOK;
18713675Sdyson
18870915Sdwmalone	error = falloc(p, &rf, &fd);
18970915Sdwmalone	if (error) {
		/* No file refers to either pipe yet, so close them directly. */
19070915Sdwmalone		pipeclose(rpipe);
19170915Sdwmalone		pipeclose(wpipe);
19270915Sdwmalone		return (error);
19370915Sdwmalone	}
	/*
	 * Take an extra reference on rf for the rest of this function; it is
	 * released by the fdrop() on each exit path below.
	 */
19470915Sdwmalone	fhold(rf);
19570915Sdwmalone	p->p_retval[0] = fd;
19670915Sdwmalone
19770803Sdwmalone	/*
19870803Sdwmalone	 * Warning: once we've gotten past allocation of the fd for the
19970803Sdwmalone	 * read-side, we can only drop the read side via fdrop() in order
20070803Sdwmalone	 * to avoid races against processes which manage to dup() the read
20170803Sdwmalone	 * side while we are blocked trying to allocate the write side.
20270803Sdwmalone	 */
20313675Sdyson	rf->f_flag = FREAD | FWRITE;
20413675Sdyson	rf->f_type = DTYPE_PIPE;
20549413Sgreen	rf->f_data = (caddr_t)rpipe;
20613675Sdyson	rf->f_ops = &pipeops;
20713675Sdyson	error = falloc(p, &wf, &fd);
20870915Sdwmalone	if (error) {
		/*
		 * If the descriptor slot still points at rf, clear it and
		 * drop the reference the table held; the second fdrop()
		 * releases the reference taken by fhold() above.
		 */
20970915Sdwmalone		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
21070915Sdwmalone			fdp->fd_ofiles[p->p_retval[0]] = NULL;
21170915Sdwmalone			fdrop(rf, p);
21270915Sdwmalone		}
21370915Sdwmalone		fdrop(rf, p);
21470915Sdwmalone		/* rpipe has been closed by fdrop(). */
21570915Sdwmalone		pipeclose(wpipe);
21670915Sdwmalone		return (error);
21770915Sdwmalone	}
21813675Sdyson	wf->f_flag = FREAD | FWRITE;
21913675Sdyson	wf->f_type = DTYPE_PIPE;
22049413Sgreen	wf->f_data = (caddr_t)wpipe;
22113675Sdyson	wf->f_ops = &pipeops;
22230994Sphk	p->p_retval[1] = fd;
22313675Sdyson
	/* Cross-link the two ends so each can find its peer. */
22413675Sdyson	rpipe->pipe_peer = wpipe;
22513675Sdyson	wpipe->pipe_peer = rpipe;
	/* Release the reference taken by fhold() above. */
22668883Sdillon	fdrop(rf, p);
22713675Sdyson
22813675Sdyson	return (0);
22913675Sdyson}
23013675Sdyson
23113909Sdyson/*
23213909Sdyson * Allocate kva for pipe circular buffer, the space is pageable
23313909Sdyson */
23413675Sdysonstatic void
23513907Sdysonpipespace(cpipe)
23613675Sdyson	struct pipe *cpipe;
23713675Sdyson{
23813688Sdyson	int npages, error;
23913675Sdyson
24013907Sdyson	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
24113675Sdyson	/*
24213675Sdyson	 * Create an object, I don't like the idea of paging to/from
24313675Sdyson	 * kernel_object.
24414037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
24513675Sdyson	 */
24613675Sdyson	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
24713688Sdyson	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);
24813675Sdyson
24913675Sdyson	/*
25013675Sdyson	 * Insert the object into the kernel map, and allocate kva for it.
25113675Sdyson	 * The map entry is, by default, pageable.
25214037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
25313675Sdyson	 */
25413688Sdyson	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
25513907Sdyson		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
25613907Sdyson		cpipe->pipe_buffer.size, 1,
25713688Sdyson		VM_PROT_ALL, VM_PROT_ALL, 0);
25813675Sdyson
25913688Sdyson	if (error != KERN_SUCCESS)
26013688Sdyson		panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
26113907Sdyson	amountpipekva += cpipe->pipe_buffer.size;
26213907Sdyson}
26313688Sdyson
26413907Sdyson/*
26513907Sdyson * initialize and allocate VM and memory for pipe
26613907Sdyson */
26713907Sdysonstatic void
26813907Sdysonpipeinit(cpipe)
26913907Sdyson	struct pipe *cpipe;
27013907Sdyson{
27113907Sdyson
27213675Sdyson	cpipe->pipe_buffer.in = 0;
27313675Sdyson	cpipe->pipe_buffer.out = 0;
27413675Sdyson	cpipe->pipe_buffer.cnt = 0;
27513907Sdyson	cpipe->pipe_buffer.size = PIPE_SIZE;
27617163Sdyson
27713907Sdyson	/* Buffer kva gets dynamically allocated */
27813907Sdyson	cpipe->pipe_buffer.buffer = NULL;
27917124Sbde	/* cpipe->pipe_buffer.object = invalid */
28013675Sdyson
28113675Sdyson	cpipe->pipe_state = 0;
28213675Sdyson	cpipe->pipe_peer = NULL;
28313675Sdyson	cpipe->pipe_busy = 0;
28455112Sbde	vfs_timestamp(&cpipe->pipe_ctime);
28524101Sbde	cpipe->pipe_atime = cpipe->pipe_ctime;
28624101Sbde	cpipe->pipe_mtime = cpipe->pipe_ctime;
28713675Sdyson	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
28813907Sdyson
28914037Sdyson#ifndef PIPE_NODIRECT
29013907Sdyson	/*
29113907Sdyson	 * pipe data structure initializations to support direct pipe I/O
29213907Sdyson	 */
29313907Sdyson	cpipe->pipe_map.cnt = 0;
29413907Sdyson	cpipe->pipe_map.kva = 0;
29513907Sdyson	cpipe->pipe_map.pos = 0;
29613907Sdyson	cpipe->pipe_map.npages = 0;
29717124Sbde	/* cpipe->pipe_map.ms[] = invalid */
29814037Sdyson#endif
29913675Sdyson}
30013675Sdyson
30113675Sdyson
30213675Sdyson/*
30313675Sdyson * lock a pipe for I/O, blocking other access
30413675Sdyson */
30513675Sdysonstatic __inline int
30613907Sdysonpipelock(cpipe, catch)
30713675Sdyson	struct pipe *cpipe;
30813907Sdyson	int catch;
30913675Sdyson{
31013776Sdyson	int error;
31113675Sdyson	while (cpipe->pipe_state & PIPE_LOCK) {
31213675Sdyson		cpipe->pipe_state |= PIPE_LWANT;
31343301Sdillon		if ((error = tsleep( cpipe,
31443301Sdillon			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) != 0) {
31513776Sdyson			return error;
31613675Sdyson		}
31713675Sdyson	}
31813675Sdyson	cpipe->pipe_state |= PIPE_LOCK;
31913675Sdyson	return 0;
32013675Sdyson}
32113675Sdyson
32213675Sdyson/*
32313675Sdyson * unlock a pipe I/O lock
32413675Sdyson */
32513675Sdysonstatic __inline void
32613675Sdysonpipeunlock(cpipe)
32713675Sdyson	struct pipe *cpipe;
32813675Sdyson{
32913675Sdyson	cpipe->pipe_state &= ~PIPE_LOCK;
33013675Sdyson	if (cpipe->pipe_state & PIPE_LWANT) {
33113675Sdyson		cpipe->pipe_state &= ~PIPE_LWANT;
33214177Sdyson		wakeup(cpipe);
33313675Sdyson	}
33413675Sdyson}
33513675Sdyson
33614037Sdysonstatic __inline void
33714037Sdysonpipeselwakeup(cpipe)
33814037Sdyson	struct pipe *cpipe;
33914037Sdyson{
34014037Sdyson	if (cpipe->pipe_state & PIPE_SEL) {
34114037Sdyson		cpipe->pipe_state &= ~PIPE_SEL;
34214037Sdyson		selwakeup(&cpipe->pipe_sel);
34314037Sdyson	}
34441086Struckman	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
34541086Struckman		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
34659288Sjlemon	KNOTE(&cpipe->pipe_sel.si_note, 0);
34714037Sdyson}
34814037Sdyson
/*
 * pipe_read: read entry of the pipeops table.
 *
 * Copies data out of the pipe referenced by fp->f_data into uio.  Each pass
 * of the loop takes data from the circular kernel buffer if it holds any,
 * otherwise (unless PIPE_NODIRECT is defined) from a pending direct write
 * mapped at pipe_map.kva.  With no data available the loop ends at EOF or
 * once something has already been read; otherwise an FNONBLOCK descriptor
 * gets EAGAIN and a blocking one sleeps until woken, then retries.
 *
 * Returns 0 or an errno value (from pipelock(), uiomove(), tsleep(), or
 * EAGAIN).  When the loop finishes without error the pipe's access time
 * is updated.  cred, flags and p are not used.
 *
 * Note: the old-style parameter declarations below list p before flags;
 * the parameter order is the one given in the identifier list
 * (fp, uio, cred, flags, p), which matches the prototype.
 */
34913675Sdyson/* ARGSUSED */
35013675Sdysonstatic int
35151418Sgreenpipe_read(fp, uio, cred, flags, p)
35213675Sdyson	struct file *fp;
35313675Sdyson	struct uio *uio;
35413675Sdyson	struct ucred *cred;
35551418Sgreen	struct proc *p;
35645311Sdt	int flags;
35713675Sdyson{
35813675Sdyson
35913675Sdyson	struct pipe *rpipe = (struct pipe *) fp->f_data;
36047748Salc	int error;
36113675Sdyson	int nread = 0;
36218863Sdyson	u_int size;
36313675Sdyson
	/* Count this reader as active; see the PIPE_WANT handling at the end. */
36413675Sdyson	++rpipe->pipe_busy;
36547748Salc	error = pipelock(rpipe, 1);
36647748Salc	if (error)
36747748Salc		goto unlocked_error;
36847748Salc
36913675Sdyson	while (uio->uio_resid) {
37013907Sdyson		/*
37113907Sdyson		 * normal pipe buffer receive
37213907Sdyson		 */
37313675Sdyson		if (rpipe->pipe_buffer.cnt > 0) {
			/*
			 * Largest contiguous run starting at 'out', clamped
			 * to the bytes buffered and the bytes still requested.
			 */
37418863Sdyson			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
37513675Sdyson			if (size > rpipe->pipe_buffer.cnt)
37613675Sdyson				size = rpipe->pipe_buffer.cnt;
37718863Sdyson			if (size > (u_int) uio->uio_resid)
37818863Sdyson				size = (u_int) uio->uio_resid;
37947748Salc
38047748Salc			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
38113675Sdyson					size, uio);
38213675Sdyson			if (error) {
38313675Sdyson				break;
38413675Sdyson			}
			/* Advance the consumer index, wrapping at the end. */
38513675Sdyson			rpipe->pipe_buffer.out += size;
38613675Sdyson			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
38713675Sdyson				rpipe->pipe_buffer.out = 0;
38813675Sdyson
38913675Sdyson			rpipe->pipe_buffer.cnt -= size;
39047748Salc
39147748Salc			/*
39247748Salc			 * If there is no more to read in the pipe, reset
39347748Salc			 * its pointers to the beginning.  This improves
39447748Salc			 * cache hit stats.
39547748Salc			 */
39647748Salc			if (rpipe->pipe_buffer.cnt == 0) {
39747748Salc				rpipe->pipe_buffer.in = 0;
39847748Salc				rpipe->pipe_buffer.out = 0;
39947748Salc			}
40013675Sdyson			nread += size;
40114037Sdyson#ifndef PIPE_NODIRECT
40213907Sdyson		/*
40313907Sdyson		 * Direct copy, bypassing a kernel buffer.
40413907Sdyson		 */
40513907Sdyson		} else if ((size = rpipe->pipe_map.cnt) &&
40647748Salc			   (rpipe->pipe_state & PIPE_DIRECTW)) {
40747748Salc			caddr_t	va;
40818863Sdyson			if (size > (u_int) uio->uio_resid)
40918863Sdyson				size = (u_int) uio->uio_resid;
41047748Salc
41147748Salc			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
41247748Salc			error = uiomove(va, size, uio);
41313907Sdyson			if (error)
41413907Sdyson				break;
41513907Sdyson			nread += size;
41613907Sdyson			rpipe->pipe_map.pos += size;
41713907Sdyson			rpipe->pipe_map.cnt -= size;
			/*
			 * All mapped data consumed: end the direct write and
			 * wake the writer sleeping on this pipe.
			 */
41813907Sdyson			if (rpipe->pipe_map.cnt == 0) {
41913907Sdyson				rpipe->pipe_state &= ~PIPE_DIRECTW;
42013907Sdyson				wakeup(rpipe);
42113907Sdyson			}
42214037Sdyson#endif
42313675Sdyson		} else {
42413675Sdyson			/*
42513675Sdyson			 * detect EOF condition
42613675Sdyson			 */
42713675Sdyson			if (rpipe->pipe_state & PIPE_EOF) {
42814802Sdyson				/* XXX error = ? */
42913675Sdyson				break;
43013675Sdyson			}
43143623Sdillon
43213675Sdyson			/*
43313675Sdyson			 * If the "write-side" has been blocked, wake it up now.
43413675Sdyson			 */
43513675Sdyson			if (rpipe->pipe_state & PIPE_WANTW) {
43613675Sdyson				rpipe->pipe_state &= ~PIPE_WANTW;
43713675Sdyson				wakeup(rpipe);
43813675Sdyson			}
43943623Sdillon
44043623Sdillon			/*
44147748Salc			 * Break if some data was read.
44243623Sdillon			 */
44347748Salc			if (nread > 0)
44413675Sdyson				break;
44516960Sdyson
44643623Sdillon			/*
44747748Salc			 * Unlock the pipe buffer for our remaining processing.  We
44847748Salc			 * will either break out with an error or we will sleep and
44947748Salc			 * relock to loop.
45043623Sdillon			 */
45147748Salc			pipeunlock(rpipe);
45243623Sdillon
45313675Sdyson			/*
45447748Salc			 * Handle non-blocking mode operation or
45547748Salc			 * wait for more data.
45613675Sdyson			 */
45747748Salc			if (fp->f_flag & FNONBLOCK)
45847748Salc				error = EAGAIN;
45947748Salc			else {
46047748Salc				rpipe->pipe_state |= PIPE_WANTR;
46147748Salc				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
46247748Salc					error = pipelock(rpipe, 1);
46313675Sdyson			}
			/* The lock is not held on this path, so skip pipeunlock(). */
46447748Salc			if (error)
46547748Salc				goto unlocked_error;
46613675Sdyson		}
46713675Sdyson	}
46847748Salc	pipeunlock(rpipe);
46913675Sdyson
47024101Sbde	if (error == 0)
47155112Sbde		vfs_timestamp(&rpipe->pipe_atime);
47247748Salcunlocked_error:
47347748Salc	--rpipe->pipe_busy;
47413913Sdyson
47547748Salc	/*
47647748Salc	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
47747748Salc	 */
47813675Sdyson	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
47913675Sdyson		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
48013675Sdyson		wakeup(rpipe);
48113675Sdyson	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
48213675Sdyson		/*
48347748Salc		 * Handle write blocking hysteresis.
48413675Sdyson		 */
48513675Sdyson		if (rpipe->pipe_state & PIPE_WANTW) {
48613675Sdyson			rpipe->pipe_state &= ~PIPE_WANTW;
48713675Sdyson			wakeup(rpipe);
48813675Sdyson		}
48913675Sdyson	}
49014037Sdyson
	/*
	 * At least PIPE_BUF bytes are free in the buffer: notify select,
	 * SIGIO and kqueue waiters via pipeselwakeup().
	 */
49114802Sdyson	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
49214037Sdyson		pipeselwakeup(rpipe);
49314037Sdyson
49413675Sdyson	return error;
49513675Sdyson}
49613675Sdyson
49714037Sdyson#ifndef PIPE_NODIRECT
49813907Sdyson/*
49913907Sdyson * Map the sending process's buffer into kernel space and wire it.
50013907Sdyson * This is similar to a physical write operation.
50113907Sdyson */
/*
 * Only the first iovec of uio is used, and at most pipe_buffer.size bytes
 * of it.  Each page in that range is faulted in for reading, wired, and
 * recorded in pipe_map.ms[]; the pages are then entered at pipe_map.kva
 * (allocated on first use).  On success pipe_map.npages/pos/cnt describe
 * the mapping and the iovec and uio are advanced past the bytes taken.
 *
 * Returns 0 on success.  Returns EFAULT if a page cannot be faulted in or
 * has no physical address; pages wired so far are unwired first and the
 * uio is left untouched.
 */
50213675Sdysonstatic int
50313907Sdysonpipe_build_write_buffer(wpipe, uio)
50413907Sdyson	struct pipe *wpipe;
50513675Sdyson	struct uio *uio;
50613675Sdyson{
50718863Sdyson	u_int size;
50813907Sdyson	int i;
50913907Sdyson	vm_offset_t addr, endaddr, paddr;
51013907Sdyson
	/* Clamp the transfer to the pipe's buffer size. */
51118863Sdyson	size = (u_int) uio->uio_iov->iov_len;
51213907Sdyson	if (size > wpipe->pipe_buffer.size)
51313907Sdyson		size = wpipe->pipe_buffer.size;
51413907Sdyson
	/* Walk every page touched by [iov_base, iov_base + size). */
51540286Sdg	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
51640286Sdg	for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
51713907Sdyson		addr < endaddr;
51813907Sdyson		addr += PAGE_SIZE, i+=1) {
51913907Sdyson
52013907Sdyson		vm_page_t m;
52113907Sdyson
52251474Sdillon		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
52351474Sdillon		    (paddr = pmap_kextract(addr)) == 0) {
52413907Sdyson			int j;
			/* Undo the wiring done for the earlier pages. */
52513907Sdyson			for(j=0;j<i;j++)
52640700Sdg				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
52713907Sdyson			return EFAULT;
52813907Sdyson		}
52913907Sdyson
53013907Sdyson		m = PHYS_TO_VM_PAGE(paddr);
53113907Sdyson		vm_page_wire(m);
53213907Sdyson		wpipe->pipe_map.ms[i] = m;
53313907Sdyson	}
53413907Sdyson
53513907Sdyson/*
53613907Sdyson * set up the control block
53713907Sdyson */
53813907Sdyson	wpipe->pipe_map.npages = i;
	/* pos is the offset of the data within the first mapped page. */
53913907Sdyson	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
54013907Sdyson	wpipe->pipe_map.cnt = size;
54113907Sdyson
54213907Sdyson/*
54313907Sdyson * and map the buffer
54413907Sdyson */
54513907Sdyson	if (wpipe->pipe_map.kva == 0) {
54613912Sdyson		/*
54713912Sdyson		 * We need to allocate space for an extra page because the
54813912Sdyson		 * address range might (will) span pages at times.
54913912Sdyson		 */
55013907Sdyson		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
55113912Sdyson			wpipe->pipe_buffer.size + PAGE_SIZE);
55213912Sdyson		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
55313907Sdyson	}
55413907Sdyson	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
55513907Sdyson		wpipe->pipe_map.npages);
55613907Sdyson
55713907Sdyson/*
55813907Sdyson * and update the uio data
55913907Sdyson */
56013907Sdyson
56113907Sdyson	uio->uio_iov->iov_len -= size;
56213907Sdyson	uio->uio_iov->iov_base += size;
	/* Move on to the next iovec once this one is fully consumed. */
56313907Sdyson	if (uio->uio_iov->iov_len == 0)
56413907Sdyson		uio->uio_iov++;
56513907Sdyson	uio->uio_resid -= size;
56613907Sdyson	uio->uio_offset += size;
56713907Sdyson	return 0;
56813907Sdyson}
56913907Sdyson
57013907Sdyson/*
57113907Sdyson * unmap and unwire the process buffer
57213907Sdyson */
57313907Sdysonstatic void
57413907Sdysonpipe_destroy_write_buffer(wpipe)
57513907Sdysonstruct pipe *wpipe;
57613907Sdyson{
57713907Sdyson	int i;
57817163Sdyson	if (wpipe->pipe_map.kva) {
57917163Sdyson		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
58013907Sdyson
58113907Sdyson		if (amountpipekva > MAXPIPEKVA) {
58213907Sdyson			vm_offset_t kva = wpipe->pipe_map.kva;
58313907Sdyson			wpipe->pipe_map.kva = 0;
58413907Sdyson			kmem_free(kernel_map, kva,
58513912Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
58613912Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
58713907Sdyson		}
58813907Sdyson	}
58913907Sdyson	for (i=0;i<wpipe->pipe_map.npages;i++)
59040700Sdg		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
59113907Sdyson}
59213907Sdyson
59313907Sdyson/*
59413907Sdyson * In the case of a signal, the writing process might go away.  This
59513907Sdyson * code copies the data into the circular buffer so that the source
59613907Sdyson * pages can be freed without loss of data.
59713907Sdyson */
59813907Sdysonstatic void
59913907Sdysonpipe_clone_write_buffer(wpipe)
60013907Sdysonstruct pipe *wpipe;
60113907Sdyson{
60213907Sdyson	int size;
60313907Sdyson	int pos;
60413907Sdyson
60513907Sdyson	size = wpipe->pipe_map.cnt;
60613907Sdyson	pos = wpipe->pipe_map.pos;
60713907Sdyson	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
60813907Sdyson			(caddr_t) wpipe->pipe_buffer.buffer,
60913907Sdyson			size);
61013907Sdyson
61113907Sdyson	wpipe->pipe_buffer.in = size;
61213907Sdyson	wpipe->pipe_buffer.out = 0;
61313907Sdyson	wpipe->pipe_buffer.cnt = size;
61413907Sdyson	wpipe->pipe_state &= ~PIPE_DIRECTW;
61513907Sdyson
61613907Sdyson	pipe_destroy_write_buffer(wpipe);
61713907Sdyson}
61813907Sdyson
61913907Sdyson/*
62013907Sdyson * This implements the pipe buffer write mechanism.  Note that only
62113907Sdyson * a direct write OR a normal pipe write can be pending at any given time.
62213907Sdyson * If there are any characters in the pipe buffer, the direct write will
62313907Sdyson * be deferred until the receiving process grabs all of the bytes from
62413907Sdyson * the pipe buffer.  Then the direct mapping write is set-up.
62513907Sdyson */
/*
 * Returns 0 once the reader has cleared PIPE_DIRECTW (all mapped data
 * consumed).  Returns EPIPE if PIPE_EOF is seen, the error from
 * pipe_build_write_buffer() if the mapping cannot be built, or the
 * tsleep() error if a sleep fails.  If the final wait fails while the
 * direct write is still pending, the remaining data is first copied into
 * the pipe's kernel buffer with pipe_clone_write_buffer().
 */
62613907Sdysonstatic int
62713907Sdysonpipe_direct_write(wpipe, uio)
62813907Sdyson	struct pipe *wpipe;
62913907Sdyson	struct uio *uio;
63013907Sdyson{
63113907Sdyson	int error;
63213951Sdysonretry:
	/* Wait for any direct write already in progress to finish. */
63313907Sdyson	while (wpipe->pipe_state & PIPE_DIRECTW) {
63413951Sdyson		if ( wpipe->pipe_state & PIPE_WANTR) {
63513951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
63613951Sdyson			wakeup(wpipe);
63713951Sdyson		}
63813992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
63913907Sdyson		error = tsleep(wpipe,
64013907Sdyson				PRIBIO|PCATCH, "pipdww", 0);
64114802Sdyson		if (error)
64213907Sdyson			goto error1;
64314802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
64414802Sdyson			error = EPIPE;
64514802Sdyson			goto error1;
64614802Sdyson		}
64713907Sdyson	}
64813907Sdyson	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	/*
	 * Buffered data must drain first: sleep once, then start over from
	 * the top since the pipe state may have changed meanwhile.
	 */
64913951Sdyson	if (wpipe->pipe_buffer.cnt > 0) {
65013951Sdyson		if ( wpipe->pipe_state & PIPE_WANTR) {
65113951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
65213951Sdyson			wakeup(wpipe);
65313951Sdyson		}
65413951Sdyson
65513992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
65613907Sdyson		error = tsleep(wpipe,
65713907Sdyson				PRIBIO|PCATCH, "pipdwc", 0);
65814802Sdyson		if (error)
65913907Sdyson			goto error1;
66014802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
66114802Sdyson			error = EPIPE;
66214802Sdyson			goto error1;
66313907Sdyson		}
66413951Sdyson		goto retry;
66513907Sdyson	}
66613907Sdyson
	/* Claim the direct-write slot before building the mapping. */
66713951Sdyson	wpipe->pipe_state |= PIPE_DIRECTW;
66813951Sdyson
66913907Sdyson	error = pipe_build_write_buffer(wpipe, uio);
67013907Sdyson	if (error) {
67113907Sdyson		wpipe->pipe_state &= ~PIPE_DIRECTW;
67213907Sdyson		goto error1;
67313907Sdyson	}
67413907Sdyson
67513907Sdyson	error = 0;
	/*
	 * Sleep until the reader clears PIPE_DIRECTW, a sleep fails, or
	 * EOF is seen.  Each pass wakes a waiting reader and notifies
	 * select/SIGIO/kqueue waiters that data is available.
	 */
67613907Sdyson	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
67713907Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
67813907Sdyson			pipelock(wpipe, 0);
67913907Sdyson			pipe_destroy_write_buffer(wpipe);
68013907Sdyson			pipeunlock(wpipe);
68114037Sdyson			pipeselwakeup(wpipe);
68214802Sdyson			error = EPIPE;
68314802Sdyson			goto error1;
68413907Sdyson		}
68513992Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
68613992Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
68713992Sdyson			wakeup(wpipe);
68813992Sdyson		}
68914037Sdyson		pipeselwakeup(wpipe);
69013907Sdyson		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
69113907Sdyson	}
69213907Sdyson
69313907Sdyson	pipelock(wpipe,0);
69413907Sdyson	if (wpipe->pipe_state & PIPE_DIRECTW) {
69513907Sdyson		/*
69613907Sdyson		 * this bit of trickery substitutes a kernel buffer for
69713907Sdyson		 * the process that might be going away.
69813907Sdyson		 */
69913907Sdyson		pipe_clone_write_buffer(wpipe);
70013907Sdyson	} else {
		/* Reader took everything; just release the mapping. */
70113907Sdyson		pipe_destroy_write_buffer(wpipe);
70213907Sdyson	}
70313907Sdyson	pipeunlock(wpipe);
70413907Sdyson	return error;
70513907Sdyson
70613907Sdysonerror1:
	/* Wake anyone sleeping on the pipe before reporting the failure. */
70713907Sdyson	wakeup(wpipe);
70813907Sdyson	return error;
70913907Sdyson}
71014037Sdyson#endif
71113907Sdyson
71216960Sdysonstatic int
71351418Sgreenpipe_write(fp, uio, cred, flags, p)
71416960Sdyson	struct file *fp;
71513907Sdyson	struct uio *uio;
71616960Sdyson	struct ucred *cred;
71751418Sgreen	struct proc *p;
71845311Sdt	int flags;
71913907Sdyson{
72013675Sdyson	int error = 0;
72113913Sdyson	int orig_resid;
72213675Sdyson
72316960Sdyson	struct pipe *wpipe, *rpipe;
72416960Sdyson
72516960Sdyson	rpipe = (struct pipe *) fp->f_data;
72616960Sdyson	wpipe = rpipe->pipe_peer;
72716960Sdyson
72813675Sdyson	/*
72913675Sdyson	 * detect loss of pipe read side, issue SIGPIPE if lost.
73013675Sdyson	 */
73116960Sdyson	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
73213774Sdyson		return EPIPE;
73313675Sdyson	}
73413675Sdyson
73517163Sdyson	/*
73617163Sdyson	 * If it is advantageous to resize the pipe buffer, do
73717163Sdyson	 * so.
73817163Sdyson	 */
73917163Sdyson	if ((uio->uio_resid > PIPE_SIZE) &&
74017163Sdyson		(nbigpipe < LIMITBIGPIPES) &&
74117163Sdyson		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
74217163Sdyson		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
74317163Sdyson		(wpipe->pipe_buffer.cnt == 0)) {
74417163Sdyson
74517163Sdyson		if (wpipe->pipe_buffer.buffer) {
74617163Sdyson			amountpipekva -= wpipe->pipe_buffer.size;
74717163Sdyson			kmem_free(kernel_map,
74817163Sdyson				(vm_offset_t)wpipe->pipe_buffer.buffer,
74917163Sdyson				wpipe->pipe_buffer.size);
75017163Sdyson		}
75117163Sdyson
75217163Sdyson#ifndef PIPE_NODIRECT
75317163Sdyson		if (wpipe->pipe_map.kva) {
75417163Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
75517163Sdyson			kmem_free(kernel_map,
75617163Sdyson				wpipe->pipe_map.kva,
75717163Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
75817163Sdyson		}
75917163Sdyson#endif
76017163Sdyson
76117163Sdyson		wpipe->pipe_buffer.in = 0;
76217163Sdyson		wpipe->pipe_buffer.out = 0;
76317163Sdyson		wpipe->pipe_buffer.cnt = 0;
76417163Sdyson		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
76517163Sdyson		wpipe->pipe_buffer.buffer = NULL;
76617163Sdyson		++nbigpipe;
76717163Sdyson
76817163Sdyson#ifndef PIPE_NODIRECT
76917163Sdyson		wpipe->pipe_map.cnt = 0;
77017163Sdyson		wpipe->pipe_map.kva = 0;
77117163Sdyson		wpipe->pipe_map.pos = 0;
77217163Sdyson		wpipe->pipe_map.npages = 0;
77317163Sdyson#endif
77417163Sdyson
77517163Sdyson	}
77617163Sdyson
77717163Sdyson
77813907Sdyson	if( wpipe->pipe_buffer.buffer == NULL) {
77913907Sdyson		if ((error = pipelock(wpipe,1)) == 0) {
78013907Sdyson			pipespace(wpipe);
78113907Sdyson			pipeunlock(wpipe);
78213907Sdyson		} else {
78313907Sdyson			return error;
78413907Sdyson		}
78513907Sdyson	}
78613907Sdyson
78713675Sdyson	++wpipe->pipe_busy;
78813913Sdyson	orig_resid = uio->uio_resid;
78913675Sdyson	while (uio->uio_resid) {
79013907Sdyson		int space;
79114037Sdyson#ifndef PIPE_NODIRECT
79213907Sdyson		/*
79313907Sdyson		 * If the transfer is large, we can gain performance if
79413907Sdyson		 * we do process-to-process copies directly.
79516416Sdyson		 * If the write is non-blocking, we don't use the
79616416Sdyson		 * direct write mechanism.
79758505Sdillon		 *
79858505Sdillon		 * The direct write mechanism will detect the reader going
79958505Sdillon		 * away on us.
80013907Sdyson		 */
80117163Sdyson		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
80217163Sdyson		    (fp->f_flag & FNONBLOCK) == 0 &&
80317163Sdyson			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
80413907Sdyson			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
80513907Sdyson			error = pipe_direct_write( wpipe, uio);
80613907Sdyson			if (error) {
80713907Sdyson				break;
80813907Sdyson			}
80913907Sdyson			continue;
81013907Sdyson		}
81114037Sdyson#endif
81213907Sdyson
81313907Sdyson		/*
81413907Sdyson		 * Pipe buffered writes cannot be coincidental with
81513907Sdyson		 * direct writes.  We wait until the currently executing
81613907Sdyson		 * direct write is completed before we start filling the
81758505Sdillon		 * pipe buffer.  We break out if a signal occurs or the
81858505Sdillon		 * reader goes away.
81913907Sdyson		 */
82013907Sdyson	retrywrite:
82113907Sdyson		while (wpipe->pipe_state & PIPE_DIRECTW) {
82213992Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
82313992Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
82413992Sdyson				wakeup(wpipe);
82513992Sdyson			}
82658505Sdillon			error = tsleep(wpipe, PRIBIO|PCATCH, "pipbww", 0);
82758505Sdillon			if (wpipe->pipe_state & PIPE_EOF)
82858505Sdillon				break;
82913907Sdyson			if (error)
83013907Sdyson				break;
83113907Sdyson		}
83258505Sdillon		if (wpipe->pipe_state & PIPE_EOF) {
83358505Sdillon			error = EPIPE;
83458505Sdillon			break;
83558505Sdillon		}
83613907Sdyson
83713907Sdyson		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
83814644Sdyson
83914644Sdyson		/* Writes of size <= PIPE_BUF must be atomic. */
84013913Sdyson		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
84113913Sdyson			space = 0;
84213907Sdyson
84317163Sdyson		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
84413907Sdyson			if ((error = pipelock(wpipe,1)) == 0) {
84554534Stegge				int size;	/* Transfer size */
84654534Stegge				int segsize;	/* first segment to transfer */
84713907Sdyson				/*
84813907Sdyson				 * It is possible for a direct write to
84913907Sdyson				 * slip in on us... handle it here...
85013907Sdyson				 */
85113907Sdyson				if (wpipe->pipe_state & PIPE_DIRECTW) {
85213907Sdyson					pipeunlock(wpipe);
85313907Sdyson					goto retrywrite;
85413907Sdyson				}
85554534Stegge				/*
85654534Stegge				 * If a process blocked in uiomove, our
85754534Stegge				 * value for space might be bad.
85858505Sdillon				 *
85958505Sdillon				 * XXX will we be ok if the reader has gone
86058505Sdillon				 * away here?
86154534Stegge				 */
86254534Stegge				if (space > wpipe->pipe_buffer.size -
86354534Stegge				    wpipe->pipe_buffer.cnt) {
86454534Stegge					pipeunlock(wpipe);
86554534Stegge					goto retrywrite;
86654534Stegge				}
86754534Stegge
86854534Stegge				/*
86954534Stegge				 * Transfer size is minimum of uio transfer
87054534Stegge				 * and free space in pipe buffer.
87154534Stegge				 */
87254534Stegge				if (space > uio->uio_resid)
87354534Stegge					size = uio->uio_resid;
87454534Stegge				else
87554534Stegge					size = space;
87654534Stegge				/*
87754534Stegge				 * First segment to transfer is minimum of
87854534Stegge				 * transfer size and contiguous space in
87954534Stegge				 * pipe buffer.  If first segment to transfer
88054534Stegge				 * is less than the transfer size, we've got
88154534Stegge				 * a wraparound in the buffer.
88254534Stegge				 */
88354534Stegge				segsize = wpipe->pipe_buffer.size -
88454534Stegge					wpipe->pipe_buffer.in;
88554534Stegge				if (segsize > size)
88654534Stegge					segsize = size;
88754534Stegge
88854534Stegge				/* Transfer first segment */
88954534Stegge
89054534Stegge				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
89154534Stegge						segsize, uio);
89254534Stegge
89354534Stegge				if (error == 0 && segsize < size) {
89454534Stegge					/*
89554534Stegge					 * Transfer remaining part now, to
89654534Stegge					 * support atomic writes.  Wraparound
89754534Stegge					 * happened.
89854534Stegge					 */
89954534Stegge					if (wpipe->pipe_buffer.in + segsize !=
90054534Stegge					    wpipe->pipe_buffer.size)
90154534Stegge						panic("Expected pipe buffer wraparound disappeared");
90254534Stegge
90354534Stegge					error = uiomove(&wpipe->pipe_buffer.buffer[0],
90454534Stegge							size - segsize, uio);
90554534Stegge				}
90654534Stegge				if (error == 0) {
90754534Stegge					wpipe->pipe_buffer.in += size;
90854534Stegge					if (wpipe->pipe_buffer.in >=
90954534Stegge					    wpipe->pipe_buffer.size) {
91054534Stegge						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
91154534Stegge							panic("Expected wraparound bad");
91254534Stegge						wpipe->pipe_buffer.in = size - segsize;
91354534Stegge					}
91454534Stegge
91554534Stegge					wpipe->pipe_buffer.cnt += size;
91654534Stegge					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
91754534Stegge						panic("Pipe buffer overflow");
91854534Stegge
91954534Stegge				}
92013675Sdyson				pipeunlock(wpipe);
92113675Sdyson			}
92213675Sdyson			if (error)
92313675Sdyson				break;
92413675Sdyson
92513675Sdyson		} else {
92613675Sdyson			/*
92713675Sdyson			 * If the "read-side" has been blocked, wake it up now.
92813675Sdyson			 */
92913675Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
93013675Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
93113675Sdyson				wakeup(wpipe);
93213675Sdyson			}
93314037Sdyson
93413675Sdyson			/*
93513675Sdyson			 * don't block on non-blocking I/O
93613675Sdyson			 */
93716960Sdyson			if (fp->f_flag & FNONBLOCK) {
93813907Sdyson				error = EAGAIN;
93913675Sdyson				break;
94013675Sdyson			}
94113907Sdyson
94214037Sdyson			/*
94314037Sdyson			 * We have no more space and have something to offer,
94429356Speter			 * wake up select/poll.
94514037Sdyson			 */
94614037Sdyson			pipeselwakeup(wpipe);
94714037Sdyson
94813675Sdyson			wpipe->pipe_state |= PIPE_WANTW;
94943301Sdillon			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
95013675Sdyson				break;
95113675Sdyson			}
95213675Sdyson			/*
95313675Sdyson			 * If read side wants to go away, we just issue a signal
95413675Sdyson			 * to ourselves.
95513675Sdyson			 */
95613675Sdyson			if (wpipe->pipe_state & PIPE_EOF) {
95713774Sdyson				error = EPIPE;
95813907Sdyson				break;
95913675Sdyson			}
96013675Sdyson		}
96113675Sdyson	}
96213675Sdyson
96314644Sdyson	--wpipe->pipe_busy;
96413675Sdyson	if ((wpipe->pipe_busy == 0) &&
96513675Sdyson		(wpipe->pipe_state & PIPE_WANT)) {
96613675Sdyson		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
96713675Sdyson		wakeup(wpipe);
96813675Sdyson	} else if (wpipe->pipe_buffer.cnt > 0) {
96913675Sdyson		/*
97013675Sdyson		 * If we have put any characters in the buffer, we wake up
97113675Sdyson		 * the reader.
97213675Sdyson		 */
97313675Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
97413675Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
97513675Sdyson			wakeup(wpipe);
97613675Sdyson		}
97713675Sdyson	}
97813909Sdyson
97913909Sdyson	/*
98013909Sdyson	 * Don't return EPIPE if I/O was successful
98113909Sdyson	 */
98213907Sdyson	if ((wpipe->pipe_buffer.cnt == 0) &&
98313907Sdyson		(uio->uio_resid == 0) &&
98413907Sdyson		(error == EPIPE))
98513907Sdyson		error = 0;
98613913Sdyson
98724101Sbde	if (error == 0)
98855112Sbde		vfs_timestamp(&wpipe->pipe_mtime);
98924101Sbde
99014037Sdyson	/*
99114037Sdyson	 * We have something to offer,
99229356Speter	 * wake up select/poll.
99314037Sdyson	 */
99414177Sdyson	if (wpipe->pipe_buffer.cnt)
99514037Sdyson		pipeselwakeup(wpipe);
99613907Sdyson
99713675Sdyson	return error;
99813675Sdyson}
99913675Sdyson
100013675Sdyson/*
100113675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets.
100213675Sdyson */
100313675Sdysonint
100413675Sdysonpipe_ioctl(fp, cmd, data, p)
100513675Sdyson	struct file *fp;
100636735Sdfr	u_long cmd;
100713675Sdyson	register caddr_t data;
100813675Sdyson	struct proc *p;
100913675Sdyson{
101013675Sdyson	register struct pipe *mpipe = (struct pipe *)fp->f_data;
101113675Sdyson
101213675Sdyson	switch (cmd) {
101313675Sdyson
101413675Sdyson	case FIONBIO:
101513675Sdyson		return (0);
101613675Sdyson
101713675Sdyson	case FIOASYNC:
101813675Sdyson		if (*(int *)data) {
101913675Sdyson			mpipe->pipe_state |= PIPE_ASYNC;
102013675Sdyson		} else {
102113675Sdyson			mpipe->pipe_state &= ~PIPE_ASYNC;
102213675Sdyson		}
102313675Sdyson		return (0);
102413675Sdyson
102513675Sdyson	case FIONREAD:
102614037Sdyson		if (mpipe->pipe_state & PIPE_DIRECTW)
102714037Sdyson			*(int *)data = mpipe->pipe_map.cnt;
102814037Sdyson		else
102914037Sdyson			*(int *)data = mpipe->pipe_buffer.cnt;
103013675Sdyson		return (0);
103113675Sdyson
103241086Struckman	case FIOSETOWN:
103341086Struckman		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
103441086Struckman
103541086Struckman	case FIOGETOWN:
103641086Struckman		*(int *)data = fgetown(mpipe->pipe_sigio);
103713675Sdyson		return (0);
103813675Sdyson
103941086Struckman	/* This is deprecated, FIOSETOWN should be used instead. */
104041086Struckman	case TIOCSPGRP:
104141086Struckman		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
104241086Struckman
104341086Struckman	/* This is deprecated, FIOGETOWN should be used instead. */
104418863Sdyson	case TIOCGPGRP:
104541086Struckman		*(int *)data = -fgetown(mpipe->pipe_sigio);
104613675Sdyson		return (0);
104713675Sdyson
104813675Sdyson	}
104917124Sbde	return (ENOTTY);
105013675Sdyson}
105113675Sdyson
105213675Sdysonint
105329356Speterpipe_poll(fp, events, cred, p)
105413675Sdyson	struct file *fp;
105529356Speter	int events;
105629356Speter	struct ucred *cred;
105713675Sdyson	struct proc *p;
105813675Sdyson{
105913675Sdyson	register struct pipe *rpipe = (struct pipe *)fp->f_data;
106013675Sdyson	struct pipe *wpipe;
106129356Speter	int revents = 0;
106213675Sdyson
106313675Sdyson	wpipe = rpipe->pipe_peer;
106429356Speter	if (events & (POLLIN | POLLRDNORM))
106529356Speter		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
106629356Speter		    (rpipe->pipe_buffer.cnt > 0) ||
106729356Speter		    (rpipe->pipe_state & PIPE_EOF))
106829356Speter			revents |= events & (POLLIN | POLLRDNORM);
106913675Sdyson
107029356Speter	if (events & (POLLOUT | POLLWRNORM))
107129356Speter		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
107243311Sdillon		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
107343311Sdillon		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
107429356Speter			revents |= events & (POLLOUT | POLLWRNORM);
107513675Sdyson
107629356Speter	if ((rpipe->pipe_state & PIPE_EOF) ||
107729356Speter	    (wpipe == NULL) ||
107829356Speter	    (wpipe->pipe_state & PIPE_EOF))
107929356Speter		revents |= POLLHUP;
108029356Speter
108129356Speter	if (revents == 0) {
108229356Speter		if (events & (POLLIN | POLLRDNORM)) {
108329356Speter			selrecord(p, &rpipe->pipe_sel);
108429356Speter			rpipe->pipe_state |= PIPE_SEL;
108513675Sdyson		}
108613675Sdyson
108729356Speter		if (events & (POLLOUT | POLLWRNORM)) {
108830164Speter			selrecord(p, &wpipe->pipe_sel);
108930164Speter			wpipe->pipe_state |= PIPE_SEL;
109013907Sdyson		}
109113675Sdyson	}
109229356Speter
109329356Speter	return (revents);
109413675Sdyson}
109513675Sdyson
109652983Speterstatic int
109752983Speterpipe_stat(fp, ub, p)
109852983Speter	struct file *fp;
109952983Speter	struct stat *ub;
110052983Speter	struct proc *p;
110113675Sdyson{
110252983Speter	struct pipe *pipe = (struct pipe *)fp->f_data;
110352983Speter
110413675Sdyson	bzero((caddr_t)ub, sizeof (*ub));
110517124Sbde	ub->st_mode = S_IFIFO;
110613907Sdyson	ub->st_blksize = pipe->pipe_buffer.size;
110713675Sdyson	ub->st_size = pipe->pipe_buffer.cnt;
110813675Sdyson	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
110934901Sphk	ub->st_atimespec = pipe->pipe_atime;
111034901Sphk	ub->st_mtimespec = pipe->pipe_mtime;
111134901Sphk	ub->st_ctimespec = pipe->pipe_ctime;
111260404Schris	ub->st_uid = fp->f_cred->cr_uid;
111360404Schris	ub->st_gid = fp->f_cred->cr_gid;
111417124Sbde	/*
111560404Schris	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
111617124Sbde	 * XXX (st_dev, st_ino) should be unique.
111717124Sbde	 */
111813675Sdyson	return 0;
111913675Sdyson}
112013675Sdyson
112113675Sdyson/* ARGSUSED */
112213675Sdysonstatic int
112313675Sdysonpipe_close(fp, p)
112413675Sdyson	struct file *fp;
112513675Sdyson	struct proc *p;
112613675Sdyson{
112713675Sdyson	struct pipe *cpipe = (struct pipe *)fp->f_data;
112816322Sgpalmer
112949413Sgreen	fp->f_ops = &badfileops;
113049413Sgreen	fp->f_data = NULL;
113141086Struckman	funsetown(cpipe->pipe_sigio);
113213675Sdyson	pipeclose(cpipe);
113313675Sdyson	return 0;
113413675Sdyson}
113513675Sdyson
113613675Sdyson/*
113713675Sdyson * shutdown the pipe
113813675Sdyson */
113913675Sdysonstatic void
114013675Sdysonpipeclose(cpipe)
114113675Sdyson	struct pipe *cpipe;
114213675Sdyson{
114313907Sdyson	struct pipe *ppipe;
114413675Sdyson	if (cpipe) {
114513907Sdyson
114614037Sdyson		pipeselwakeup(cpipe);
114713907Sdyson
114813675Sdyson		/*
114913675Sdyson		 * If the other side is blocked, wake it up saying that
115013675Sdyson		 * we want to close it down.
115113675Sdyson		 */
115213675Sdyson		while (cpipe->pipe_busy) {
115313675Sdyson			wakeup(cpipe);
115413675Sdyson			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
115513675Sdyson			tsleep(cpipe, PRIBIO, "pipecl", 0);
115613675Sdyson		}
115713675Sdyson
115813675Sdyson		/*
115913675Sdyson		 * Disconnect from peer
116013675Sdyson		 */
116143301Sdillon		if ((ppipe = cpipe->pipe_peer) != NULL) {
116214037Sdyson			pipeselwakeup(ppipe);
116313907Sdyson
116413907Sdyson			ppipe->pipe_state |= PIPE_EOF;
116513907Sdyson			wakeup(ppipe);
116613907Sdyson			ppipe->pipe_peer = NULL;
116713675Sdyson		}
116813675Sdyson
116913675Sdyson		/*
117013675Sdyson		 * free resources
117113675Sdyson		 */
117213907Sdyson		if (cpipe->pipe_buffer.buffer) {
117317163Sdyson			if (cpipe->pipe_buffer.size > PIPE_SIZE)
117417163Sdyson				--nbigpipe;
117513907Sdyson			amountpipekva -= cpipe->pipe_buffer.size;
117613907Sdyson			kmem_free(kernel_map,
117713907Sdyson				(vm_offset_t)cpipe->pipe_buffer.buffer,
117813907Sdyson				cpipe->pipe_buffer.size);
117913907Sdyson		}
118014037Sdyson#ifndef PIPE_NODIRECT
118113907Sdyson		if (cpipe->pipe_map.kva) {
118213912Sdyson			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
118313907Sdyson			kmem_free(kernel_map,
118413907Sdyson				cpipe->pipe_map.kva,
118513912Sdyson				cpipe->pipe_buffer.size + PAGE_SIZE);
118613907Sdyson		}
118714037Sdyson#endif
118827899Sdyson		zfree(pipe_zone, cpipe);
118913675Sdyson	}
119013675Sdyson}
119159288Sjlemon
119272521Sjlemon/*ARGSUSED*/
119359288Sjlemonstatic int
119472521Sjlemonpipe_kqfilter(struct file *fp, struct knote *kn)
119559288Sjlemon{
119659288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
119759288Sjlemon
119872521Sjlemon	switch (kn->kn_filter) {
119972521Sjlemon	case EVFILT_READ:
120072521Sjlemon		kn->kn_fop = &pipe_rfiltops;
120172521Sjlemon		break;
120272521Sjlemon	case EVFILT_WRITE:
120372521Sjlemon		kn->kn_fop = &pipe_wfiltops;
120472521Sjlemon		break;
120572521Sjlemon	default:
120672521Sjlemon		return (1);
120772521Sjlemon	}
120872521Sjlemon
120959288Sjlemon	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
121059288Sjlemon	return (0);
121159288Sjlemon}
121259288Sjlemon
121359288Sjlemonstatic void
121459288Sjlemonfilt_pipedetach(struct knote *kn)
121559288Sjlemon{
121659288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
121759288Sjlemon
121860938Sjake	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
121959288Sjlemon}
122059288Sjlemon
122159288Sjlemon/*ARGSUSED*/
122259288Sjlemonstatic int
122359288Sjlemonfilt_piperead(struct knote *kn, long hint)
122459288Sjlemon{
122559288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
122659288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
122759288Sjlemon
122859288Sjlemon	kn->kn_data = rpipe->pipe_buffer.cnt;
122959288Sjlemon	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
123059288Sjlemon		kn->kn_data = rpipe->pipe_map.cnt;
123159288Sjlemon
123259288Sjlemon	if ((rpipe->pipe_state & PIPE_EOF) ||
123359288Sjlemon	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
123459288Sjlemon		kn->kn_flags |= EV_EOF;
123559288Sjlemon		return (1);
123659288Sjlemon	}
123759288Sjlemon	return (kn->kn_data > 0);
123859288Sjlemon}
123959288Sjlemon
124059288Sjlemon/*ARGSUSED*/
124159288Sjlemonstatic int
124259288Sjlemonfilt_pipewrite(struct knote *kn, long hint)
124359288Sjlemon{
124459288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
124559288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
124659288Sjlemon
124759288Sjlemon	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
124859288Sjlemon		kn->kn_data = 0;
124959288Sjlemon		kn->kn_flags |= EV_EOF;
125059288Sjlemon		return (1);
125159288Sjlemon	}
125259288Sjlemon	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
125365855Sjlemon	if (wpipe->pipe_state & PIPE_DIRECTW)
125459288Sjlemon		kn->kn_data = 0;
125559288Sjlemon
125659288Sjlemon	return (kn->kn_data >= PIPE_BUF);
125759288Sjlemon}
1258