sys_pipe.c revision 83366
/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: head/sys/kern/sys_pipe.c 83366 2001-09-12 08:38:13Z julian $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
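
/*
 * A sketch of how pipe_write() below picks a mode, with PIPE_MINDIRECT
 * assumed to be 8192 as in <sys/pipe.h>:
 *
 *	if (iov_len >= PIPE_MINDIRECT && the write is blocking &&
 *	    kva is available)
 *		pipe_direct_write();	wire the user pages; the reader
 *					copies straight out of them
 *	else
 *		uiomove();		copy into the pageable kernel
 *					ring buffer
 *
 * So a blocking 8 KB write goes direct, while a 512 byte write (or any
 * non-blocking write) is simply buffered.
 */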

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td));
static int pipe_close __P((struct file *fp, struct thread *td));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct thread *td));
static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct thread *td));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct thread *td));

static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };


/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)
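
/*
 * Example of the hysteresis these limits give (assuming PIPE_SIZE is
 * 16384, its usual value in <sys/pipe.h>): MINPIPESIZE is 5461 and
 * MAXPIPESIZE is 10922.  pipe_read() below only wakes a blocked writer
 * once the buffer has drained below MINPIPESIZE, so writers are not
 * woken up for every byte a reader consumes.
 */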

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general though.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipekva;

static void pipeclose __P((struct pipe *cpipe));
static void pipe_free_kmem __P((struct pipe *cpipe));
static int pipe_create __P((struct pipe **cpipep));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static int pipespace __P((struct pipe *cpipe, int size));

static vm_zone_t pipe_zone;

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
int
pipe(td, uap)
	struct thread *td;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	fhold(rf);
	td->td_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(td, &wf, &fd);
	if (error) {
		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
			fdp->fd_ofiles[td->td_retval[0]] = NULL;
			fdrop(rf, td);
		}
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	td->td_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, td);

	return (0);
}
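
/*
 * Userland view of the descriptors allocated above -- a minimal sketch,
 * not kernel code.  td_retval[0] is returned as fds[0] and td_retval[1]
 * as fds[1]:
 *
 *	int fds[2];
 *	char buf[6];
 *
 *	if (pipe(fds) == -1)
 *		err(1, "pipe");
 *	write(fds[1], "hello", 6);
 *	read(fds[0], buf, 6);		buf now contains "hello"
 *
 * Note that both struct files are opened FREAD | FWRITE: these pipes
 * are bidirectional, so either descriptor may be read or written.
 */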

/*
 * Allocate kva for pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it
 * fails, it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	struct vm_object *object;
	caddr_t buffer;
	int npages, error;

	GIANT_REQUIRED;

	npages = round_page(size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	object = vm_object_allocate(OBJT_DEFAULT, npages);
	buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(kernel_map, object, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS) {
		vm_object_deallocate(object);
		return (ENOMEM);
	}

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.object = object;
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	amountpipekva += cpipe->pipe_buffer.size;
	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
static int
pipe_create(cpipep)
	struct pipe **cpipep;
{
	struct pipe *cpipe;
	int error;

	*cpipep = zalloc(pipe_zone);
	if (*cpipep == NULL)
		return (ENOMEM);

	cpipe = *cpipep;

	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
	cpipe->pipe_buffer.object = NULL;
#ifndef PIPE_NODIRECT
	cpipe->pipe_map.kva = 0;
#endif
	/*
	 * protect so pipeclose() doesn't follow a junk pointer
	 * if pipespace() fails.
	 */
	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = 0;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
	/* cpipe->pipe_map.ms[] = invalid */
#endif

	error = pipespace(cpipe, PIPE_SIZE);
	if (error)
		return (error);

	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}
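
/*
 * Typical caller pattern for the pair above, as used by pipe_read() and
 * pipe_write() below:
 *
 *	if ((error = pipelock(cpipe, 1)) == 0) {
 *		... manipulate cpipe->pipe_buffer ...
 *		pipeunlock(cpipe);
 *	}
 *
 * With a non-zero 'catch' the tsleep() above is signal-interruptible,
 * so callers must be prepared for pipelock() to fail with EINTR or
 * ERESTART.
 */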

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva +
			    rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.  We
			 * will either break out with an error or we will sleep and
			 * relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	GIANT_REQUIRED;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

/*
 * set up the control block
 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

/*
 * and map the buffer
 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
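		/*
		 * For example, with 4 KB pages an 8192 byte user buffer
		 * starting at address 0x1234 spans three pages
		 * (0x1000-0x3fff), one more than 8192/PAGE_SIZE would
		 * suggest; the extra PAGE_SIZE below absorbs that
		 * partial page at either end.
		 */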
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

/*
 * and update the uio data
 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	GIANT_REQUIRED;

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
	    (caddr_t) wpipe->pipe_buffer.buffer, size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 */
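
/*
 * Rough sequence of a successful direct write, as implemented below:
 *
 *	writer (pipe_direct_write)	reader (pipe_read)
 *	--------------------------	------------------
 *	wait out any previous direct
 *	  write or buffered data
 *	set PIPE_DIRECTW, wire the
 *	  user pages, sleep "pipdwt"
 *					uiomove() from pipe_map.kva
 *					clear PIPE_DIRECTW, wakeup
 *	pipe_destroy_write_buffer()
 *
 * If a signal cuts the sleep short while PIPE_DIRECTW is still set,
 * pipe_clone_write_buffer() above first rescues the unread bytes into
 * the kernel buffer.
 */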
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct thread *td;
	int flags;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return (EPIPE);
	}
	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				nbigpipe++;
			pipeunlock(wpipe);
		}
	}

	/*
	 * If an early error occurred, unbusy and return, waking up any
	 * pending readers.
	 */
	if (error) {
		--wpipe->pipe_busy;
		if ((wpipe->pipe_busy == 0) &&
		    (wpipe->pipe_state & PIPE_WANT)) {
			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
			wakeup(wpipe);
		}
		return (error);
	}

	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;
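		/*
		 * Example, assuming PIPE_BUF is 512 (its usual value):
		 * if two writers each write() 300 bytes while only 200
		 * bytes are free, space is forced to 0 and each writer
		 * sleeps below instead of copying a fragment, so the two
		 * 300 byte writes land in the buffer whole, without
		 * interleaving.
		 */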

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
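				/*
				 * Worked example with a 16384 byte buffer,
				 * in == 16000 and size == 1000: segsize is
				 * 384, so the first uiomove() below fills
				 * the tail of the buffer, the second copies
				 * the remaining 616 bytes to offset 0, and
				 * 'in' ends up at 616.
				 */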
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, td)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct thread *td;
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}

int
pipe_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(curthread, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(curthread, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}

static int
pipe_stat(fp, ub, td)
	struct file *fp;
	struct stat *ub;
	struct thread *td;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{
	GIANT_REQUIRED;

	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != 0) {
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}
		/*
		 * free resources
		 */
		pipe_free_kmem(cpipe);
		zfree(pipe_zone, cpipe);
	}
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_hook;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}